In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from math import ceil, floor
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import os
from argparse import Namespace

import ast

from gensim.models import Word2Vec

flags = Namespace(
    train_file='dv9ogm_corpus.txt',
    seq_size=8,
    batch_size=64,
    embedding_size=64,
    lstm_size=64,
    gradients_norm=5,
    initial_words=['<c>', '<s>'],
    predict_top_k=5,
    checkpoint_path='checkpoint',
) 

In [143]:
class SubmissionCorpusDataset(Dataset):
    def __init__(self, corpus_filepath, window_size):
        with open(corpus_filepath, 'r') as f:
            text_dict = f.read()
    
        self.corpus = ast.literal_eval(text_dict)
        self.judgement_categories = self.corpus.keys()
        self.window_size = window_size
        self.embedding_size = 100
        
        self.all_comments = []
        all_words = []
        for judgement_category in self.judgement_categories:
            for (comment_score, comment_body) in self.corpus[judgement_category]:
                self.all_comments.append(comment_body)
                for token in comment_body:
                    all_words.append(token)
        self.w2v = Word2Vec(self.all_comments, min_count=1)
        self.vocab_size = len(self.w2v.wv.vocab)
    
    def get_w2v(self):
        return(self.w2v)
    
    def vectorize_comment(self, comment):
        vectorized_comment = np.array(list(map(lambda token: self.w2v.wv[token], comment)))
        #Sanity check
        vectorized_comment.reshape((len(comment),self.embedding_size))
        return(vectorized_comment)
    
    def apply_windowing(self, vectorized_comment):
        num_windows = vectorized_comment.shape[0]-self.window_size
        windows = np.zeros((num_windows, self.window_size-1, self.embedding_size))
        y = np.zeros((num_windows, self.embedding_size))
        
        for i in range(num_windows):
            windows[i,:,:] = vectorized_comment[i:i+self.window_size-1,:]
            y[i,:] = vectorized_comment[i+self.window_size,:]
        
        return(windows,y)
    
    def __len__(self):
        return(len(self.all_comments))

    def __getitem__(self, index):
        comment = self.all_comments[index]
        vectorized_comment = self.vectorize_comment(comment)
        X,y = self.apply_windowing(vectorized_comment)
        X_tensor = torch.DoubleTensor(X)
        y_tensor = torch.DoubleTensor(y)
        return(X_tensor,y_tensor)

In [144]:
window_size=8
corpus_filename = 'dv9ogm_corpus.txt'
corpus_dataset = SubmissionCorpusDataset(corpus_filename,window_size=window_size)

In [145]:
class CommentGRU(nn.Module):
    def __init__(self, window_size, embedding_size, gru_size):
        super(CommentGRU, self).__init__()
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.gru_size = gru_size
        
        self.gru = nn.GRU(embedding_size,
                          gru_size,
                          batch_first=True)
        
        self.dense = nn.Linear(gru_size, embedding_size)
        
    def forward(self, x):
        print(type(x))
        output, state = self.gru(x.double())
        logits = self.dense(output)
        #return(logits, state)
        return(logits)

In [146]:
def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    net.eval()

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for w in words:
        ix = torch.tensor([[vocab_to_int[w]]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))
    
    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])

    words.append(int_to_vocab[choice])
    for _ in range(100):
        ix = torch.tensor([[choice]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

        _, top_ix = torch.topk(output[0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        words.append(int_to_vocab[choice])

    print(' '.join(words))

In [147]:
class CommentGeneratorGRU():
    def __init__(self, comment_dataset, gru_size, embedding_size=100, window_size=8):
        self.comment_dataset = comment_dataset
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.w2v = self.comment_dataset.get_w2v()
        self.model = CommentGRU(window_size-1, embedding_size, gru_size)
    
    def train(self, epochs, batch_size):
        trainloader = torch.utils.data.DataLoader(self.comment_dataset, batch_size=batch_size, shuffle=True)
        for epoch in range(epochs):
            #criterion = nn.CrossEntropyLoss()
            criterion = nn.CosineEmbeddingLoss()
            optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
            
            # TRAINING ROUND
            for i, (batch_data, batch_labels) in enumerate(trainloader):
                batch_data = Variable(batch_data)
                #batch_labels = Variable(batch)
                # zero the parameter gradients
                optimizer.zero_grad()
                
                # forward + backward + optimize
                outputs = self.model(batch_data.view((-1, self.window_size-1, self.embedding_size)))

                loss = criterion(outputs, batch_labels.view(-1, self.embedding_size))
                loss.backward()
                optimizer.step()

                train_running_loss += loss.detach().item()
                train_acc += get_accuracy(outputs, labels, batch_size)

            model.eval()
            print('Epoch:  %d | Loss: %.4f | Train Accuracy: %.2f' 
                  %(epoch, train_running_loss / i, train_acc/i))
                

In [148]:
comment_generator = CommentGeneratorGRU(corpus_dataset, gru_size=32, embedding_size=100, window_size=8)

In [149]:
comment_generator.train(epochs=1, batch_size=1)

NameError: name 'Variable' is not defined

In [53]:
'''
class SubmissionCorpusDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, corpus, root_dir):
        self.corpus = corpus
        self.judgement_categories = corpus.keys()
        self.vocabulary_size
        self.embedding_size
        
    def build_word2vec(self):
        all_comments = []
        all_words = []
        category_inds = np.arange(len(self.judgements))
        for category_ind in category_inds:
            for comment_ind in np.arange(len(self.comment_corpus[self.judgements[category_ind]])):
                all_comments.append(self.comment_corpus[self.judgements[category_ind]][comment_ind])
                #for token in self.comment_corpus[self.judgements[category_ind]][comment_ind]:
                #    all_words.append(token)

        return(Word2Vec(all_comments, min_count=1))

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.root_dir,
                                self.landmarks_frame.iloc[idx, 0])
        image = io.imread(img_name)
        landmarks = self.landmarks_frame.iloc[idx, 1:]
        landmarks = np.array([landmarks])
        landmarks = landmarks.astype('float').reshape(-1, 2)
        sample = {'image': image, 'landmarks': landmarks}

        if self.transform:
            sample = self.transform(sample)

        return sample
'''

'\nclass SubmissionCorpusDataset(Dataset):\n    """Face Landmarks dataset."""\n\n    def __init__(self, corpus, root_dir):\n        self.corpus = corpus\n        self.judgement_categories = corpus.keys()\n        self.vocabulary_size\n        self.embedding_size\n        \n    def build_word2vec(self):\n        all_comments = []\n        all_words = []\n        category_inds = np.arange(len(self.judgements))\n        for category_ind in category_inds:\n            for comment_ind in np.arange(len(self.comment_corpus[self.judgements[category_ind]])):\n                all_comments.append(self.comment_corpus[self.judgements[category_ind]][comment_ind])\n                #for token in self.comment_corpus[self.judgements[category_ind]][comment_ind]:\n                #    all_words.append(token)\n\n        return(Word2Vec(all_comments, min_count=1))\n\n    def __len__(self):\n        return len(self.landmarks_frame)\n\n    def __getitem__(self, idx):\n        if torch.is_tensor(idx):\n       