<a href="https://colab.research.google.com/github/KhoomeiK/MindMapResearch/blob/master/HAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## DATA PREP STUFF

In [0]:
# download dataset and labels
! pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# downloaded1 = drive.CreateFile({'id': '1oZb283stxpZn8Dn8i8e2Vh6P8d6Voj4Y'}) 
# downloaded1.GetContentFile('dataset.zip')  

downloaded2 = drive.CreateFile({'id': '1-1nQU2lUwBnEyNot0EeVK72X92bdqVAu'}) 
downloaded2.GetContentFile('labels.pkl')

downloaded = drive.CreateFile({'id': '1-XYU2MCNbhS8ir_7ToC5DvrUMsQy9-9E'}) 
downloaded.GetContentFile('embeddings.zip')

# ! unzip dataset.zip
# ! rm -rf cse198f_shiv/data
# ! rm -rf cse198f_shiv/diagnostics
# ! rm -rf cse198f_shiv/models
# ! rm cse198f_shiv/vectors.py
# ! ls cse198f_shiv

In [0]:
# read dataset into memory
from os import listdir
from os.path import isfile, join
import pandas as pd

mypath = 'cse198f_shiv'
csvs = [f for f in listdir(mypath) if isfile(join(mypath, f))]
print(len(csvs))

data = []
names = []
for csv in csvs:
    if csv[-4:] == '.csv':
        try:
            data.append(pd.read_csv(join(mypath, csv), encoding='CP1252'))
            names.append(csv[:-4])
        except:
            try:
                data.append(pd.read_csv(join(mypath, csv), encoding='UTF8'))
                names.append(csv[:-4])
            except:
                continue
print(len(data))

# pd.reset_option('all')
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)

2052
1980


In [0]:
# # download embedding tools
# ! ls *
# ! mkdir fastText
# ! curl https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip > fastText/crawl-300d-2M.vec.zip
# ! unzip fastText/crawl-300d-2M.vec.zip -d fastText/
# ! mkdir encoder
# ! curl -Lo encoder/infersent2.pkl https://dl.fbaipublicfiles.com/infersent/infersent2.pkl
# ! curl https://raw.githubusercontent.com/facebookresearch/InferSent/master/models.py > models.py
# ! mkdir embeddings

In [0]:
# generate embeddings of dataset
import torch, os
import numpy as np
import tensorflow_hub as hub

from absl import logging
from models import InferSent

import pickle
import time

MODEL_PATH = 'encoder/infersent2.pkl'
W2V_PATH = 'fastText/crawl-300d-2M.vec'

def load_infersent_model(model_path=MODEL_PATH, word_embeddings_path=W2V_PATH):
	params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
	infersent = InferSent(params_model).cuda()
	infersent.load_state_dict(torch.load(model_path))
	infersent.set_w2v_path(word_embeddings_path)
	infersent.build_vocab_k_words(K=100000)
	return infersent

def get_infersent_vectors(sentences, model):
	return model.encode(sentences, tokenize=False, verbose=False)

def get_user_data_embeddings(comments, model):
	# model = load_infersent_model()
	embedding = get_infersent_vectors(comments, model)
	return embedding

dataEmbeddings = []
model = load_infersent_model()

start = time.time()
for i, name in enumerate(names):
	comments = list(data[i]['text']) if 'text' in data[i] else []
	if len(comments) >= 1:
		print(i, name)
		try:
			embeddings = get_user_data_embeddings(data[i]['text'], model)
			print(len(embeddings), 'comments')
			dataEmbeddings.append(embeddings)
			with open('embeddings/%s.pkl' % name, 'wb') as pkl:
				pickle.dump(embeddings, pkl)
		except:
			print('ERROR')

print(time.time() - start)

## MODEL RUN

In [15]:
# load labels and user embeddings and create Users data obj
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from csv import reader
import pickle
from os import listdir
import random

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Users(Dataset):
    def __init__(self, embedPath, labelPath, split): 
        self.users, self.labels = [], []
        users = {}
        labels = pickle.load(open(labelPath, 'rb')) # { username: [ depressionPercent, vaderScore ] }
        fileList = listdir(embedPath)

        if type(split) == float:
            userList = random.sample(fileList, int(len(fileList) * split))
            for user in userList: # must download and unzip embeddings.zip first
                users[user[:-4]] = pickle.load(open('%s/%s' % (embedPath, user), 'rb'))
            
            intersection = list(set(users.keys()).intersection(set(labels.keys())))
            for i in intersection:
                userTensor = torch.tensor(users[i])
                if list(userTensor.shape)[0] != 1: # exclude single comment users
                    self.users.append(userTensor)
                    self.labels.append(torch.tensor(labels[i]))

            self.usernames = users.keys()
    
        elif type(split) == Users:
            for user in fileList: # must download and unzip embeddings.zip first
                if user[:-4] not in split.usernames:
                    users[user[:-4]] = pickle.load(open('%s/%s' % (embedPath, user), 'rb'))
            
            intersection = list(set(users.keys()).intersection(set(labels.keys())))
            for i in intersection:
                userTensor = torch.tensor(users[i])
                if list(userTensor.shape)[0] != 1: # exclude single comment users
                    self.users.append(userTensor)
                    self.labels.append(torch.tensor(labels[i]))
        
        else:
            print('ERROR')

        print(len(self), self[0])

    def __getitem__(self, i):
        return self.users[i], self.labels[i][0] # remove [0]
    
    def __len__(self):
        assert len(self.users) == len(self.labels)
        return len(self.users)

# ! unzip embeddings.zip
train = Users('embeddings', 'labels.pkl', 0.8)
test = Users('embeddings', 'labels.pkl', train)
train, test = DataLoader(train), DataLoader(test)

1387 (tensor([[ 7.4689e-03, -1.3480e-01, -4.2769e-02,  ..., -6.6668e-02,
         -4.1580e-02, -1.9568e-02],
        [ 7.4689e-03, -4.9213e-02, -7.3091e-02,  ...,  3.4554e-02,
         -4.8900e-02, -1.5373e-02],
        [ 7.4689e-03, -2.1242e-02, -7.1628e-02,  ..., -8.6076e-03,
          7.6879e-03, -4.9480e-02],
        ...,
        [ 7.4689e-03,  7.0529e-02,  1.0929e-01,  ...,  5.5297e-02,
          3.7981e-02, -6.8325e-03],
        [ 7.4689e-03, -1.5866e-02, -8.3607e-05,  ..., -2.0220e-02,
          1.3644e-02, -1.8471e-02],
        [ 7.4689e-03, -7.7452e-03,  5.7486e-02,  ...,  2.2026e-02,
          8.7726e-02, -1.9869e-02]]), tensor(0.0047))
346 (tensor([[ 0.0075, -0.0341,  0.0681,  ...,  0.0356,  0.0389, -0.0088],
        [ 0.0075, -0.0313,  0.1253,  ...,  0.0469,  0.0337,  0.0126],
        [ 0.0075, -0.0559, -0.0130,  ...,  0.0399,  0.0449, -0.0196],
        ...,
        [ 0.0075,  0.0356,  0.0487,  ...,  0.0215,  0.0350, -0.0248],
        [ 0.0075, -0.0140, -0.0516,  ...,  0.00

In [0]:
def matrix_mul(input, weight, bias=False):
    feature_list = []
    _input = input.squeeze(0)
    for feature in _input:
        feature = feature.unsqueeze(1).reshape(1, 300)
        f = torch.mm(feature, weight)
        if isinstance(bias, torch.nn.parameter.Parameter):
            feature = feature + bias.expand(feature.size()[0], bias.size()[1])
        feature = torch.tanh(feature).unsqueeze(0)
        feature_list.append(feature)

    return torch.cat(feature_list, 0).squeeze(1)

def element_wise_mul(input1, input2):
    _input1 = input1.squeeze(0)
    feature_list = []
    for feature_1, feature_2 in zip(_input1, input2):
        feature = feature_1 * feature_2
        feature_list.append(feature.unsqueeze(0))

    output = torch.cat(feature_list, 0)
    return torch.sum(output, 0).unsqueeze(0)

class HAN(nn.Module):
    def __init__(
        self,
        batch_size=8,
        embedding_dimension=4096, # from glove to infersent?
        hidden_size=150, 
        n_layers=1, # multiple?
    ):
        super(HAN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.batch_size = batch_size

        self.sent_weight = nn.Parameter(torch.randn(2 * hidden_size, 2 * hidden_size))
        self.sent_bias = nn.Parameter(torch.randn(1, 2 * hidden_size))
        self.context_weight = nn.Parameter(torch.randn(2 * hidden_size, 1))

        self.gru = nn.GRU(embedding_dimension, hidden_size, bidirectional=True)
        self.fc = nn.Linear(2 * hidden_size, 1)
        # self.sent_softmax = nn.Softmax()
        # self.fc_softmax = nn.Softmax()
        self._create_weights(mean=0.0, std=0.05)

    def _create_weights(self, mean=0.0, std=0.05):
        self.sent_weight.data.normal_(mean, std)
        self.sent_bias.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)

    def forward(self, user):
        f_output, h_output = self.gru(user)
        output = matrix_mul(f_output, self.sent_weight, self.sent_bias)
        output = matrix_mul(output, self.context_weight).permute(1, 0)
        output = F.softmax(output)
        output = element_wise_mul(f_output, output.permute(1, 0)).squeeze(0)
        output = self.fc(output)

        return output #, h_output

In [131]:
model = HAN().to(device)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.MSELoss()
train_losses, losses = [], []
model.train()

for epoch in range(3):
    total = 0
    for X, Y in train:
        X, Y = X.to(device), Y.to(device)
        optimizer.zero_grad()
        pred = model(X)
        loss = criterion(pred, Y)
        loss.backward()
        optimizer.step()

        total += 1
        losses.append(loss.item())
    
    epoch_loss = sum(losses) / total
    print(epoch_loss)
    train_losses.append(epoch_loss)



0.0006346032317407751
0.0006575405189248062
0.0006616856136274855


In [145]:
test_losses, losses = [], []
torch.no_grad()
total = 0

for X, Y in test:
    X, Y = X.to(device), Y.to(device)
    optimizer.zero_grad()
    pred = model(X)
    loss = criterion(pred, Y)
    # print('pred', pred.item())
    # print('targ', Y.item())
    # print('loss', loss.item())

    total += 1
    losses.append(loss.item())

    # if total == 50:
    #     break

epoch_loss = sum(losses) / total
print(epoch_loss)
test_losses.append(epoch_loss)



4.271625992840745e-06
