<a href="https://colab.research.google.com/github/KhoomeiK/MindMapResearch/blob/master/HAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## MODEL DEFINITION

In [0]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from csv import reader

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
class Users(Dataset):
    """Dataset of (user, label) pairs read from a two-column CSV file.

    Each non-blank row of the file at ``path`` must contain exactly two
    fields: the user record and its label. Both are stored as the raw
    strings produced by ``csv.reader``.

    NOTE(review): converting the user field into the nested
    [[sentence vector, ...], ...] tensor structure is still TODO; the
    on-disk encoding of that field is not defined anywhere in this file.

    Args:
        path: location of the CSV file to read.

    Raises:
        ValueError: if a non-blank row does not have exactly two fields.
    """

    def __init__(self, path): # TODO: data reading
        # These lists must exist before anything is appended to them.
        self.users = []
        self.labels = []

        # csv.reader needs an iterable of lines; handing it the path string
        # would make it iterate over the characters of the path instead.
        with open(path, newline='') as handle:
            for row_number, row in enumerate(reader(handle), start=1):
                if not row:
                    continue  # csv.reader yields [] for blank lines
                if len(row) != 2:
                    raise ValueError(
                        'row %d of %r has %d fields, expected 2 (user, label)'
                        % (row_number, path, len(row))
                    )
                user, label = row
                self.users.append(user)
                self.labels.append(label)

        # Sanity print; guarded so an empty file does not raise IndexError.
        print(len(self), self[0] if len(self) else None)

    def __getitem__(self, i):
        """Return the ``(user, label)`` pair at index ``i``."""
        return self.users[i], self.labels[i]

    def __len__(self):
        """Return the number of stored users."""
        assert len(self.users) == len(self.labels)
        return len(self.users)

In [0]:
class HAN(nn.Module):
    """Hierarchical attention network over the posts of a single user.

    Two levels, following Yang et al. (NAACL 2016):
    https://www.cc.gatech.edu/~dyang888/docs/naacl16.pdf

        1. A bidirectional GRU reads the sentence vectors of one post and an
           attention layer pools its states into one post vector.
        2. A second bidirectional GRU reads the post vectors and a second
           attention layer pools its states into one user vector.

    Args:
        batch_size: stored on the module; not used by ``forward``, which
            processes one user per call.
        embedding_dimension: size of each input sentence vector.
        hidden_size: GRU hidden size per direction; annotations and pooled
            vectors therefore have ``2 * hidden_size`` features.
        n_layers: number of stacked GRU layers at both levels.

    NOTE(review): the planned final "linear output" layer is still missing
    because the number of target classes is not visible in this file. The
    returned vector is a softmax over the ``2 * hidden_size`` user features.
    """

    def __init__(
        self,
        batch_size=16,
        embedding_dimension=300, # from glove to infersent?
        hidden_size=150,
        n_layers=1, # multiple?
    ):
        super(HAN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.batch_size = batch_size

        # A bidirectional GRU concatenates forward and backward states.
        annotation_size = hidden_size * 2

        self.gru1 = nn.GRU( # pass through sent vecs
            embedding_dimension,
            hidden_size,
            num_layers=n_layers,
            bidirectional=True
        )
        self.mlp1 = nn.Linear(annotation_size, hidden_size) # attention mlp
        # The context vector is compared with the MLP output, so it must have
        # hidden_size rows (not 2 * hidden_size).
        self.contextW1 = nn.Parameter(torch.empty(hidden_size, 1))

        self.gru2 = nn.GRU( # pass through post vecs
            annotation_size, # each post is a pooled annotation vector
            hidden_size,
            num_layers=n_layers,
            bidirectional=True
        )
        self.mlp2 = nn.Linear(annotation_size, hidden_size) # attention mlp
        self.contextW2 = nn.Parameter(torch.empty(hidden_size, 1))

        # torch.empty / torch.Tensor leave memory uninitialised (possibly
        # NaN or inf), so give the context vectors a defined starting point.
        nn.init.xavier_uniform_(self.contextW1)
        nn.init.xavier_uniform_(self.contextW2)

    def init_hidden(self):
        """No-op: the GRUs start from their default zero hidden state."""
        pass

    def _to_sequence(self, items):
        """Convert one sequence of vectors into a 2-D tensor on the model's device."""
        reference = self.contextW1
        if torch.is_tensor(items):
            sequence = items
        elif len(items) > 0 and torch.is_tensor(items[0]):
            sequence = torch.stack(list(items))
        else:
            sequence = torch.as_tensor(items)
        sequence = sequence.to(device=reference.device, dtype=reference.dtype)
        if sequence.dim() != 2 or sequence.size(0) == 0:
            raise ValueError(
                'expected a non-empty (length, features) sequence, got shape %s'
                % (tuple(sequence.shape),)
            )
        return sequence

    @staticmethod
    def _annotate(sequence, gru):
        """Run ``gru`` over a (length, features) sequence; return (length, 2 * hidden)."""
        # nn.GRU returns (output, h_n); only the per-step outputs are needed.
        # A batch axis of size one is added for the GRU and removed afterwards.
        annotations, _ = gru(sequence.unsqueeze(1))
        return annotations.squeeze(1)

    @staticmethod
    def _attend(annotations, mlp, context):
        """Pool (length, 2 * hidden) annotations into one (2 * hidden,) vector."""
        hidden = torch.tanh(mlp(annotations))          # (length, hidden)
        weights = F.softmax(hidden @ context, dim=0)   # (length, 1), sums to 1
        return torch.sum(weights * annotations, dim=0)

    def forward(self, user):
        """Encode one user.

        Args:
            user: iterable of posts; each post is a non-empty sequence of
                sentence vectors with ``embedding_dimension`` features
                (a 2-D tensor, a list of 1-D tensors, or nested lists/arrays).

        Returns:
            1-D tensor of ``2 * hidden_size`` values that sum to 1.

        Raises:
            ValueError: if ``user`` has no posts or a post has no sentences.
        """
        post_vectors = []
        for post in user:
            sentence_annotations = self._annotate(self._to_sequence(post), self.gru1)
            post_vectors.append(
                self._attend(sentence_annotations, self.mlp1, self.contextW1)
            )
        if not post_vectors:
            raise ValueError('user must contain at least one post')

        post_annotations = self._annotate(torch.stack(post_vectors), self.gru2)
        user_vector = self._attend(post_annotations, self.mlp2, self.contextW2)

        return F.softmax(user_vector, dim=-1) # classify

## DATA PREPARATION

In [0]:
# Colab-only cell: installs PyDrive, authenticates the current Colab user and
# downloads one Google Drive file to the local file 'dataset.zip'.
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Order matters: the Colab user must be authenticated before the application
# default credentials are requested and handed to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Drive file id of the dataset archive (presumably the zip unpacked by the
# commented-out shell commands below -- confirm).
fileId = '1oZb283stxpZn8Dn8i8e2Vh6P8d6Voj4Y'
downloaded = drive.CreateFile({'id': fileId}) 
downloaded.GetContentFile('dataset.zip')  

# ! unzip dataset.zip
# ! rm -rf cse198f_shiv/data
# ! rm -rf cse198f_shiv/diagnostics
# ! rm -rf cse198f_shiv/models
# ! rm cse198f_shiv/vectors.py
# ! ls cse198f_shiv

In [0]:
from os import listdir
from os.path import isfile, join
import pandas as pd

# Load every '.csv' file found directly inside `mypath` into a DataFrame.
mypath = 'cse198f_shiv'
csvs = [f for f in listdir(mypath) if isfile(join(mypath, f))]
print(len(csvs))

# Encodings tried in order for each file; the first one that parses wins.
ENCODINGS = ('CP1252', 'UTF8')

data = []     # one DataFrame per successfully parsed CSV file
names = []    # file name without '.csv'; names[i] belongs to data[i]
skipped = []  # CSV files that could not be parsed with any encoding
for fname in csvs:
    if not fname.endswith('.csv'):
        continue
    for encoding in ENCODINGS:
        try:
            frame = pd.read_csv(join(mypath, fname), encoding=encoding)
        except Exception:
            # Best effort: try the next encoding. `Exception` (rather than a
            # bare except) lets KeyboardInterrupt/SystemExit propagate.
            continue
        data.append(frame)
        names.append(fname[:-4])
        break
    else:
        # No encoding worked; remember the file instead of dropping it silently.
        skipped.append(fname)
print(len(data))

# pd.reset_option('all')
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)

2052
1980


In [0]:
# ! ls *
# ! mkdir fastText
# ! curl https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip > fastText/crawl-300d-2M.vec.zip
# ! unzip fastText/crawl-300d-2M.vec.zip -d fastText/
# ! mkdir encoder
# ! curl -Lo encoder/infersent2.pkl https://dl.fbaipublicfiles.com/infersent/infersent2.pkl
# ! curl https://raw.githubusercontent.com/facebookresearch/InferSent/master/models.py > models.py

In [0]:
import torch, os
import numpy as np
import tensorflow_hub as hub

from absl import logging
from models import InferSent
# from users_data_clean import get_user_data

# Default locations of the InferSent checkpoint and the fastText word vectors;
# they match the targets of the commented-out download commands in the
# previous cell.
MODEL_PATH = 'encoder/infersent2.pkl'
# USE_MODEL_PATH = 'google_use/4.tar.gz'
W2V_PATH = 'fastText/crawl-300d-2M.vec'

def load_infersent_model(model_path=MODEL_PATH, word_embeddings_path=W2V_PATH):
	"""Build an InferSent encoder, load its weights and prepare its vocabulary.

	The model is placed on the GPU when CUDA is available and on the CPU
	otherwise, so the notebook also runs on CPU-only runtimes.

	Args:
		model_path: path of the InferSent state-dict checkpoint.
		word_embeddings_path: path of the word-vector file handed to
			``set_w2v_path``.

	Returns:
		The InferSent model after ``build_vocab_k_words(K=100000)``.
	"""
	params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
	target = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	infersent = InferSent(params_model)
	# map_location lets a checkpoint saved from a GPU load on a CPU-only host.
	# NOTE: torch.load unpickles the file; only load checkpoints you trust.
	infersent.load_state_dict(torch.load(model_path, map_location=target))
	infersent = infersent.to(target)
	infersent.set_w2v_path(word_embeddings_path)
	infersent.build_vocab_k_words(K=100000)
	return infersent

def get_infersent_vectors(sentences, model):
	"""Encode sentences with ``model.encode`` (no tokenisation, not verbose).

	Args:
		sentences: a list of sentence strings, or a single sentence string.
			A single string is wrapped in a one-element list so that an
			encoder which iterates over its argument as a batch of sentences
			does not treat every character as a separate sentence.
		model: object exposing ``encode(sentences, tokenize=..., verbose=...)``.

	Returns:
		Whatever ``model.encode`` returns for the batch.
	"""
	if isinstance(sentences, str):
		sentences = [sentences]
	return model.encode(sentences, tokenize=False, verbose=False)
 
# def load_google_use_model(model_path=USE_MODEL_PATH):
# 	# return hub.Module("https://tfhub.dev/google/universal_sentence_encoder/1")
# 	return hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# def get_google_use_vectors(sentences, model):
# 	logging.set_verbosity(logging.ERROR)
# 	message_embeddings = model(sentences)
# 	result = np.array(message_embeddings).tolist()
# 	return result

def get_user_data_embeddings(data, limit=64):
	"""Embed the 'text' column of each user frame and attach the result.

	For every frame among the first ``limit`` entries of ``data``, each value
	of its 'text' column is passed to ``get_infersent_vectors``. The results
	are stored in an 'embeddings' column of that frame (in place) and one
	line per user is written to the file 'embeddings.data'.

	Args:
		data: list of pandas DataFrames, index-aligned with the module-level
			``names`` list.
		limit: number of leading frames to process; ``None`` processes all.
			Defaults to 64.

	Returns:
		The list ``data[:limit]``. Frames whose processing failed are still
		in the list but have no new 'embeddings' column.
	"""
	model = load_infersent_model()
	embedded = data[:limit]
	with open('embeddings.data', 'w') as file:
		for ct, user in enumerate(embedded):
			try:
				embeddings = [get_infersent_vectors(sent, model) for sent in list(user['text'])]
				# One record per line so consecutive users do not run together.
				# NOTE(review): this writes the Python repr of the vectors, which
				# numpy abbreviates with '...' for large arrays; the file is not a
				# lossless dump.
				file.write('%s %s\n' % (names[ct], embeddings))
				if 'embeddings' in user.columns:
					# Re-running the cell overwrites instead of stacking
					# duplicate 'embeddings' columns.
					user['embeddings'] = embeddings
				else:
					# Position 5 as before, clamped for frames with fewer columns.
					user.insert(min(5, len(user.columns)), 'embeddings', embeddings)
				print("USER", ct)
			except Exception as err:
				# Best effort per user, but report the failure instead of hiding it.
				print("SKIPPED USER", ct, repr(err))
				continue
	return embedded
	# return [user.insert(5, 'embeddings', [get_infersent_vectors(sent, model) for sent in list(user['text'])], True) for user in embedded]

# Embed the loaded user frames and show the first one as a sanity check.
embedded = get_user_data_embeddings(data)
print(embedded[0])

In [0]:
# init Users
# data = tqdm_notebook(DataLoader(), leave=False)
# l = embedded[0]['embeddings'][0][0]
# t = torch.FloatTensor(l)

tensor([ 0.0075, -0.0278, -0.0668,  ..., -0.0105, -0.0648,  0.0080])

## MODEL RUN

In [0]:
# Train the HAN for four epochs, stepping the optimizer once per (X, Y) pair.
# NOTE(review): `data` must yield (X, Y) pairs here, e.g. a DataLoader over
# `Users` (optionally wrapped in a tqdm progress bar); that wiring is still
# commented out in the cell above.
model = HAN().to(device)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.MSELoss()
losses = []        # loss of every optimisation step, across all epochs
train_losses = []  # mean loss of each epoch
model.train()

for epoch in range(4):
    epoch_losses = []  # reset per epoch so the mean covers this epoch only
    for X, Y in data:
        optimizer.zero_grad()
        pred = model(X)
        loss = criterion(pred, Y)
        loss.backward()
        optimizer.step()

        step_loss = loss.item()
        losses.append(step_loss)
        epoch_losses.append(step_loss)
        # Only a tqdm-style wrapper has set_description; a plain iterable does not.
        if hasattr(data, 'set_description'):
            data.set_description(f'Loss: {step_loss:.3f}')

    # Average over this epoch's steps; NaN when the iterable was empty.
    epoch_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float('nan')
    train_losses.append(epoch_loss)

    # print instead of tqdm.write: tqdm is never imported in this notebook.
    print(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')