<a href="https://colab.research.google.com/github/KhoomeiK/MindMapResearch/blob/master/HAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## DATA PREP STUFF

In [0]:
# download dataset and labels
! pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# downloaded1 = drive.CreateFile({'id': '1oZb283stxpZn8Dn8i8e2Vh6P8d6Voj4Y'}) 
# downloaded1.GetContentFile('dataset.zip')  

# ! unzip dataset.zip
# ! rm -rf cse198f_shiv/data
# ! rm -rf cse198f_shiv/diagnostics
# ! rm -rf cse198f_shiv/models
# ! rm cse198f_shiv/vectors.py
# ! ls cse198f_shiv

downloaded2 = drive.CreateFile({'id': '1-1nQU2lUwBnEyNot0EeVK72X92bdqVAu'}) 
downloaded2.GetContentFile('labels.pkl')

downloaded = drive.CreateFile({'id': '1-XYU2MCNbhS8ir_7ToC5DvrUMsQy9-9E'}) 
downloaded.GetContentFile('embeddings.zip')

# ! unzip embeddings.zip

In [0]:
# read dataset into memory
from os import listdir
from os.path import isfile, join
import pandas as pd

mypath = 'cse198f_shiv'
csvs = [f for f in listdir(mypath) if isfile(join(mypath, f))]
print(len(csvs))

data = []
names = []
for csv in csvs:
    if csv[-4:] == '.csv':
        try:
            data.append(pd.read_csv(join(mypath, csv), encoding='CP1252'))
            names.append(csv[:-4])
        except:
            try:
                data.append(pd.read_csv(join(mypath, csv), encoding='UTF8'))
                names.append(csv[:-4])
            except:
                continue
print(len(data))

# pd.reset_option('all')
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)

2052
1980


In [0]:
# # download embedding tools
# ! ls *
# ! mkdir fastText
# ! curl https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip > fastText/crawl-300d-2M.vec.zip
# ! unzip fastText/crawl-300d-2M.vec.zip -d fastText/
# ! mkdir encoder
# ! curl -Lo encoder/infersent2.pkl https://dl.fbaipublicfiles.com/infersent/infersent2.pkl
# ! curl https://raw.githubusercontent.com/facebookresearch/InferSent/master/models.py > models.py
# ! mkdir embeddings

In [0]:
# generate embeddings of dataset
import torch, os
import numpy as np
import tensorflow_hub as hub

from absl import logging
from models import InferSent

import pickle
import time

MODEL_PATH = 'encoder/infersent2.pkl'
W2V_PATH = 'fastText/crawl-300d-2M.vec'

def load_infersent_model(model_path=MODEL_PATH, word_embeddings_path=W2V_PATH):
	params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
	infersent = InferSent(params_model).cuda()
	infersent.load_state_dict(torch.load(model_path))
	infersent.set_w2v_path(word_embeddings_path)
	infersent.build_vocab_k_words(K=100000)
	return infersent

def get_infersent_vectors(sentences, model):
	return model.encode(sentences, tokenize=False, verbose=False)

def get_user_data_embeddings(comments, model):
	# model = load_infersent_model()
	embedding = get_infersent_vectors(comments, model)
	return embedding

dataEmbeddings = []
model = load_infersent_model()

start = time.time()
for i, name in enumerate(names):
	comments = list(data[i]['text']) if 'text' in data[i] else []
	if len(comments) >= 1:
		print(i, name)
		try:
			embeddings = get_user_data_embeddings(data[i]['text'], model)
			print(len(embeddings), 'comments')
			dataEmbeddings.append(embeddings)
			with open('embeddings/%s.pkl' % name, 'wb') as pkl:
				pickle.dump(embeddings, pkl)
		except:
			print('ERROR')

print(time.time() - start)

## MODEL RUN

In [0]:
# load labels and user embeddings and create Users data obj
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from csv import reader
import pickle
from os import listdir
import random

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Users(Dataset):
    def __init__(self, embedPath, labelPath, split): 
        self.users, self.labels = [], []
        users = {}
        labels = pickle.load(open(labelPath, 'rb')) # { username: [ depressionPercent, vaderScore ] }
        fileList = listdir(embedPath)

        if type(split) == float:
            userList = random.sample(fileList, int(len(fileList) * split))
            for user in userList: # must download and unzip embeddings.zip first
                users[user[:-4]] = pickle.load(open('%s/%s' % (embedPath, user), 'rb'))
            
            intersection = list(set(users.keys()).intersection(set(labels.keys())))
            for i in intersection:
                userTensor = torch.tensor(users[i])
                if list(userTensor.shape)[0] != 1: # exclude single comment users
                    self.users.append(userTensor)
                    self.labels.append(torch.tensor(labels[i]))

            self.usernames = users.keys()
    
        elif type(split) == Users:
            for user in fileList: # must download and unzip embeddings.zip first
                if user[:-4] not in split.usernames:
                    users[user[:-4]] = pickle.load(open('%s/%s' % (embedPath, user), 'rb'))
            
            intersection = list(set(users.keys()).intersection(set(labels.keys())))
            for i in intersection:
                userTensor = torch.tensor(users[i])
                if list(userTensor.shape)[0] != 1: # exclude single comment users
                    self.users.append(userTensor)
                    self.labels.append(torch.tensor(labels[i] * 100))
        
        else:
            print('ERROR')

        print(len(self), self[0])

    def __getitem__(self, i):
        return self.users[i], self.labels[i][0] * 100 # remove [0]
    
    def __len__(self):
        assert len(self.users) == len(self.labels)
        return len(self.users)

def my_collate(batch):
    data = [item[0] for item in batch]
    target = [item[1] for item in batch]
    target = torch.Tensor(target)
    return [data, target]

train = Users('embeddings', 'labels.pkl', 0.8)
test = Users('embeddings', 'labels.pkl', train)
dataloader_params = {'shuffle': True}
# dataloader_params = {'batch_size': 4, 'shuffle': True, 'collate_fn': my_collate}
train, test = DataLoader(train, **dataloader_params), DataLoader(test, **dataloader_params)

1387 (tensor([[ 0.0075, -0.1444, -0.0628,  ..., -0.1045, -0.0613, -0.0468],
        [ 0.0075, -0.1444, -0.0628,  ..., -0.1045, -0.0613, -0.0468],
        [ 0.0075, -0.0414, -0.0156,  ...,  0.0176, -0.0470,  0.0129],
        ...,
        [ 0.0075, -0.1234, -0.0439,  ..., -0.0574, -0.0229,  0.0077],
        [ 0.0075, -0.1444, -0.0628,  ..., -0.1045, -0.0613, -0.0468],
        [ 0.0075,  0.0129, -0.0454,  ...,  0.0199, -0.0122, -0.0152]]), tensor(0.4477))
346 (tensor([[ 0.0075, -0.0605, -0.0110,  ..., -0.0185,  0.0840, -0.0117],
        [ 0.0075,  0.1375,  0.1038,  ...,  0.0616,  0.0318,  0.0148],
        [ 0.0075, -0.0453,  0.0519,  ...,  0.0325, -0.0033, -0.0314],
        ...,
        [ 0.0075, -0.0919, -0.0320,  ..., -0.0037,  0.0052, -0.0085],
        [ 0.0075, -0.0738, -0.0455,  ..., -0.0152, -0.0375, -0.0201],
        [ 0.0075,  0.0739,  0.1000,  ...,  0.0450,  0.0023, -0.0258]]), tensor(0.4408))


In [0]:
def matrix_mul(input, weight, bias=False):
    feature_list = []
    _input = input.squeeze(0)
    for feature in _input:
        feature = feature.unsqueeze(1).reshape(1, 300)
        feature = torch.mm(feature, weight)
        if isinstance(bias, torch.nn.parameter.Parameter):
            feature = feature + bias.expand(feature.size()[0], bias.size()[1])
        feature = torch.tanh(feature).unsqueeze(0)
        feature_list.append(feature)

    return torch.cat(feature_list, 0).squeeze(1)

def element_wise_mul(input1, input2):
    _input1 = input1.squeeze(0)
    feature_list = []
    for feature_1, feature_2 in zip(_input1, input2):
        feature = feature_1 * feature_2
        feature_list.append(feature.unsqueeze(0))

    output = torch.cat(feature_list, 0)
    return torch.sum(output, 0).unsqueeze(0)

class HAN(nn.Module):
    def __init__(
        self,
        batch_size=4,
        embedding_dimension=4096, # from glove to infersent?
        hidden_size=150, 
        n_layers=1, # multiple?
    ):
        super(HAN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.batch_size = batch_size

        self.sent_weight = nn.Parameter(torch.randn(2 * hidden_size, 2 * hidden_size))
        self.sent_bias = nn.Parameter(torch.randn(1, 2 * hidden_size))
        self.context_weight = nn.Parameter(torch.randn(2 * hidden_size, 1))

        self.gru = nn.GRU(embedding_dimension, hidden_size, bidirectional=True)
        self.fc = nn.Linear(2 * hidden_size, 1) 
        self._create_weights(mean=0.005)

    def _create_weights(self, mean=0.0, std=0.01):
        self.sent_weight.data.normal_(mean, std)
        self.sent_bias.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)

    def forward(self, user):
        f_output, h_output = self.gru(user)
        output = matrix_mul(f_output, self.sent_weight, self.sent_bias)
        output = matrix_mul(output, self.context_weight).permute(1, 0)
        output = F.softmax(output)
        output = element_wise_mul(f_output, output.permute(1, 0)).squeeze(0)
        output = self.fc(output)
        output = F.leaky_relu(output)

        return output #, h_output

In [0]:
model = HAN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.MSELoss() # nn.BCELoss()
epochs = 10
model.train()

train_losses = []
for epoch in range(epochs):
    total = 0
    losses = []
    for X, Y in train:
        # loss = 0
        # X, Y = [x.unsqueeze(0).to(device) for x in X], [y.to(device) for y in Y]
        # for x, y in zip(X, Y):
        #     pred = model(x)
        #     # print(pred, y)
        #     if loss == 0:
        #         loss = criterion(pred, y)
        #     else:
        #         loss += criterion(pred, y)
        # loss /= 4
        X, Y = X.to(device), Y.to(device) # torch.FloatTensor(([0.0] if Y.item() < 0.41 else [1.0])).to(device)
        pred = model(X)
        loss = criterion(pred, Y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        total += 1

        if total % 200 == 0:
            print(pred.item(), Y.item(), loss.item())
    
    epoch_loss = sum(losses) / total
    print(epoch, epoch_loss)
    train_losses.append(epoch_loss)

print(train_losses)



0.3716188967227936 0.35443854331970215 0.0002951645292341709
0.5040928721427917 0.46450501680374146 0.0015671983128413558
0.42707765102386475 0.3636363744735718 0.004024795722216368
0.38586175441741943 0.465646892786026 0.006365668494254351
0.4353763163089752 0.4037621021270752 0.0009994584834203124
0.3826005458831787 0.24485798180103302 0.018973013386130333
0.458107590675354 0.4326900541782379 0.0006460511358454823
0.4417169690132141 0.46510547399520874 0.0005470221512950957
0.351027250289917 0.359570175409317 7.298157288460061e-05
0.3529101610183716 0.30369848012924194 0.0024217895697802305
0.3943758010864258 0.35791105031967163 0.00132967799436301
0.3796095848083496 0.4725065529346466 0.008629846386611462
0.4237464666366577 0.2994723618030548 0.015444053336977959
0.38918307423591614 0.44595348834991455 0.0032228799536824226
0.3935449421405792 0.46333834528923035 0.004871119279414415
0.38462355732917786 0.45765116810798645 0.005333031993359327
0.35553690791130066 0.408041387796402 0.

In [0]:
# from matplotlib import pyplot as plt

# plt.xticks(range(len(train_losses)))
# plt.xlabel('epoch')
# plt.ylabel('loss')
# plt.plot(train_losses, '-ro')

# torch.save(model.state_dict(), 'model_save.pkl')
# upload = drive.CreateFile({'title': 'model_save.pkl'})
# upload.SetContentFile('model_save.pkl')
# upload.Upload()

In [0]:
torch.no_grad()
total = 0
losses = []

for X, Y in test:
    # X, Y = [x.unsqueeze(0).to(device) for x in X], [y.to(device) for y in Y]
    # for x, y in zip(X, Y):
    #     pred = model(x)
    #     loss = criterion(pred, y)
        
    #     # print('pred', pred.item())
    #     # print('targ', y.item())
    #     # print('loss', loss.item())
        
    #     total += 1
    #     losses.append(loss.item())

    X, Y = X.to(device), Y.to(device)
    pred = model(X)
    loss = criterion(pred, Y)

    losses.append(loss.item() if loss.item() < 1 else 0)
    total += 1
    
    # if total % 20 == 0:
    if loss.item() > 0.1:
        print(pred.item(), Y.item(), loss.item())

test_loss = sum(losses) / total
print(test_loss)

ERROR! Session/line number was not unique in database. History logging moved to new session 59
0.4764004349708557 0.9855071902275085 0.2591896951198578




0.609584629535675 0.23547010123729706 0.13996167480945587
0.6329982876777649 0.2545824646949768 0.14319853484630585
0.4782411456108093 1.1830201148986816 0.49671339988708496
0.4327390789985657 0.8695651888847351 0.19081704318523407
0.6160134673118591 0.21960419416427612 0.15714031457901
0.4969024658203125 1.4568158388137817 0.921433687210083
0.5348581671714783 1.2681158781051636 0.5376668572425842
0.7009385824203491 0.3614157438278198 0.11527575552463531
0.5761994123458862 3.4042551517486572 7.997900009155273
0.4319576621055603 0.8669046759605408 0.18917889893054962
0.02323049226526953
