In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from math import ceil, floor
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import os
from argparse import Namespace

import ast

from gensim.models import Word2Vec

flags = Namespace(
    train_file='dv9ogm_corpus.txt',
    seq_size=8,
    batch_size=64,
    embedding_size=64,
    lstm_size=64,
    gradients_norm=5,
    initial_words=['<c>', '<s>'],
    predict_top_k=5,
    checkpoint_path='checkpoint',
) 

In [None]:
class SubmissionCorpusDataset(Dataset):
    def __init__(self, corpus, root_dir, window_size):
        self.corpus = corpus
        self.judgement_categories = corpus.keys()
        self.vocabulary_size
        self.embedding_size
        self.window_size = window_size
        self.embedding_size = 100
        
        self.all_comments = []
        all_words = []
        for judgement_category in judgement_categories:
            for (comment_score, comment_body) in corpus[judgement_category]:
                self.all_comments.append(comment_body)
                for token in comment_body:
                    all_words.append(token)
        self.w2v = Word2Vec(self.all_comments, min_count=1))
        
        #word_counts = Counter(all_words)
        #sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
        #int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
        #vocab_to_int = {w: k for k, w in int_to_vocab.items()}
        #n_vocab = len(int_to_vocab)
    
    def get_w2v(self):
        return(self.w2v)
    
    def vectorize_comment(self, comment):
        vectorized_comment = np.array(list(map(lambda token: self.w2v.wv[token], comment)))
        #Sanity check
        vectorized_comment.reshape((len(comment),self.embedding_size))
        return(vectorized_comment)
    
    def apply_windowing(self, vectorized_comment):
        num_windows = vectorized_comment.shape[0]-self.window_size
        windows = np.zeros((num_widnows, self.window_size, self.embedding_size))
        y = np.zeros((num_windows, self.embedding_size))
        
        for i in range(num_windows):
            windows[i,:,:] = vectorized_comment[i:i+self.window_size-1,:]
            y[i,:] = vectorized_comment[i+self.window_size,:]
        
        return(windows,y)
    
    def __len__(self):
        return(len(self.all_comments))

    def __getitem__(self, index):
        comment = self.all_comments[index]
        vectorized_comment = self.vectorize_comment(sample_comments)
        X,y = apply_windowing(vectorized_comment)
        return(X,y)

In [2]:
def get_data_from_file(train_file, batch_size, seq_size):
    with open(train_file, 'r') as f:
        text_dict = f.read()
    
    corpus = ast.literal_eval(text_dict)
    judgement_categories = corpus.keys()
    all_comments = []
    all_words = []
    for judgement_category in judgement_categories:
        for (comment_score, comment_body) in corpus[judgement_category]:
            all_comments.append(comment_body)
            for token in comment_body:
                all_words.append(token)
    
    word_counts = Counter(all_words)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)
    
    int_text = [vocab_to_int[w] for w in all_words]
    num_batches = int(len(int_text) / (seq_size * batch_size))
    in_text = int_text[:num_batches * batch_size * seq_size]
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return(int_to_vocab, vocab_to_int, n_vocab, in_text, out_text)

In [3]:
def get_batches(in_text, out_text, batch_size, seq_size):
    num_batches = np.prod(in_text.shape) // (seq_size * batch_size)
    for i in range(0, num_batches * seq_size, seq_size):
        yield(in_text[:, i:i+seq_size], out_text[:, i:i+seq_size])

In [4]:
class RNNModule(nn.Module):
    def __init__(self, n_vocab, seq_size, embedding_size, gru_size):
        super(RNNModule, self).__init__()
        self.seq_size = seq_size
        self.gru_size = gru_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm = nn.GRU(embedding_size,
                            gru_size,
                            batch_first=True)
        self.dense = nn.Linear(gru_size, n_vocab)
        
    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.dense(output)

        return(logits, state)
    
    def zero_state(self, batch_size):
        return(torch.zeros(1, batch_size, self.lstm_size),
               torch.zeros(1, batch_size, self.lstm_size))

In [5]:
def get_loss_and_train_op(net, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    return(criterion, optimizer)

In [6]:
def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    net.eval()

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for w in words:
        ix = torch.tensor([[vocab_to_int[w]]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))
    
    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])

    words.append(int_to_vocab[choice])
    for _ in range(100):
        ix = torch.tensor([[choice]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

        _, top_ix = torch.topk(output[0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        words.append(int_to_vocab[choice])

    print(' '.join(words))

In [9]:
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(
        flags.train_file, flags.batch_size, flags.seq_size)

    net = RNNModule(n_vocab, flags.seq_size,
                    flags.embedding_size, flags.lstm_size)
    net = net.to(device)

    criterion, optimizer = get_loss_and_train_op(net, 0.01)

    iteration = 0
    
    
    for e in range(1000):
        batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
        state_h, state_c = net.zero_state(flags.batch_size)
        
        # Transfer data to GPU
        state_h = state_h.to(device)
        state_c = state_c.to(device)
        for x, y in batches:
            iteration += 1
            
            # Tell it we are in training mode
            net.train()

            # Reset all gradients
            optimizer.zero_grad()

            # Transfer data to GPU
            x = torch.tensor(x).to(device)
            y = torch.tensor(y).to(device)

            logits, (state_h, state_c) = net(x, (state_h, state_c))
            loss = criterion(logits.transpose(1, 2), y)

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss_value = loss.item()

            # Perform back-propagation
            loss.backward(retain_graph=True)

            # Update the network's parameters
            optimizer.step()
            
            loss.backward()

            _ = torch.nn.utils.clip_grad_norm_(
                net.parameters(), flags.gradients_norm)

            optimizer.step()
            
            if iteration % 100 == 0:
                print('Epoch: {}/{}'.format(e, 200),
                      'Iteration: {}'.format(iteration),
                      'Loss: {}'.format(loss_value))

            if iteration % 1000 == 0:
                predict(device, net, flags.initial_words, n_vocab,
                        vocab_to_int, int_to_vocab, top_k=5)
                torch.save(net.state_dict(),
                           'checkpoint_pt/model-{}.pth'.format(iteration))
    
    #predict(device, net, flags.initial_words, n_vocab, vocab_to_int, int_to_vocab, top_k=5)
                

In [10]:
main()

Epoch: 14/200 Iteration: 100 Loss: 0.8135014772415161
Epoch: 28/200 Iteration: 200 Loss: 0.3087753653526306
Epoch: 42/200 Iteration: 300 Loss: 0.14227239787578583
Epoch: 57/200 Iteration: 400 Loss: 0.05680321156978607
Epoch: 71/200 Iteration: 500 Loss: 0.0325569249689579
Epoch: 85/200 Iteration: 600 Loss: 0.02645493485033512
Epoch: 99/200 Iteration: 700 Loss: 0.0689525380730629
Epoch: 114/200 Iteration: 800 Loss: 0.021873164921998978
Epoch: 128/200 Iteration: 900 Loss: 0.012522201985120773
Epoch: 142/200 Iteration: 1000 Loss: 0.009632288478314877
<c> <s> I 'm not saying you 're the telling her body , inviting ( for her career third of her lavishly . HAS 'd have reacted the third , and your friends with you is - Your that made and 's `` some and `` keep womb with it does it does no kids because or reacted the more I she does , apologize all upset of your points never , because 's say a whole your weird fucking carried about trying , when nature you not understand that 's why would want 

Epoch: 442/200 Iteration: 3100 Loss: 0.7462729811668396
Epoch: 457/200 Iteration: 3200 Loss: 0.04539420083165169
Epoch: 471/200 Iteration: 3300 Loss: 0.0103163942694664
Epoch: 485/200 Iteration: 3400 Loss: 0.007112163584679365
Epoch: 499/200 Iteration: 3500 Loss: 0.006461644545197487
Epoch: 514/200 Iteration: 3600 Loss: 0.006300212815403938
Epoch: 528/200 Iteration: 3700 Loss: 0.004717531148344278
Epoch: 542/200 Iteration: 3800 Loss: 0.0070439293049275875
Epoch: 557/200 Iteration: 3900 Loss: 0.04547470435500145
Epoch: 571/200 Iteration: 4000 Loss: 0.004118445795029402
<c> <s> I 'm not saying you 're the telling her body , inviting ( for her career third of her lavishly . HAS 'd have reacted the third , and your friends with you is - Your that made and 's `` some and `` keep womb with it does it does no kids because or reacted the more I she does , apologize all upset of your points never , because 's say a whole your weird fucking carried about trying , when nature you not understand t

Epoch: 728/200 Iteration: 5100 Loss: 0.0005937307723797858
Epoch: 742/200 Iteration: 5200 Loss: 0.556297242641449
Epoch: 757/200 Iteration: 5300 Loss: 0.04635617136955261
Epoch: 771/200 Iteration: 5400 Loss: 0.014352145604789257
Epoch: 785/200 Iteration: 5500 Loss: 0.009756459854543209
Epoch: 799/200 Iteration: 5600 Loss: 0.006746279075741768
Epoch: 814/200 Iteration: 5700 Loss: 0.006420835852622986
Epoch: 828/200 Iteration: 5800 Loss: 0.006312592886388302
Epoch: 842/200 Iteration: 5900 Loss: 0.004382756073027849
Epoch: 857/200 Iteration: 6000 Loss: 0.034440841525793076
<c> <s> I 'm not saying you 're the telling her body , inviting ( for her career third of her lavishly . HAS 'd have reacted the third , and your friends with you is - Your that made and 's `` some and `` keep womb with it does it does no kids because or reacted the more I she does , apologize all upset of your points never , because 's say a whole your weird fucking carried about trying , when nature you not understand

In [53]:
'''
class SubmissionCorpusDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, corpus, root_dir):
        self.corpus = corpus
        self.judgement_categories = corpus.keys()
        self.vocabulary_size
        self.embedding_size
        
    def build_word2vec(self):
        all_comments = []
        all_words = []
        category_inds = np.arange(len(self.judgements))
        for category_ind in category_inds:
            for comment_ind in np.arange(len(self.comment_corpus[self.judgements[category_ind]])):
                all_comments.append(self.comment_corpus[self.judgements[category_ind]][comment_ind])
                #for token in self.comment_corpus[self.judgements[category_ind]][comment_ind]:
                #    all_words.append(token)

        return(Word2Vec(all_comments, min_count=1))

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.root_dir,
                                self.landmarks_frame.iloc[idx, 0])
        image = io.imread(img_name)
        landmarks = self.landmarks_frame.iloc[idx, 1:]
        landmarks = np.array([landmarks])
        landmarks = landmarks.astype('float').reshape(-1, 2)
        sample = {'image': image, 'landmarks': landmarks}

        if self.transform:
            sample = self.transform(sample)

        return sample
'''

'\nclass SubmissionCorpusDataset(Dataset):\n    """Face Landmarks dataset."""\n\n    def __init__(self, corpus, root_dir):\n        self.corpus = corpus\n        self.judgement_categories = corpus.keys()\n        self.vocabulary_size\n        self.embedding_size\n        \n    def build_word2vec(self):\n        all_comments = []\n        all_words = []\n        category_inds = np.arange(len(self.judgements))\n        for category_ind in category_inds:\n            for comment_ind in np.arange(len(self.comment_corpus[self.judgements[category_ind]])):\n                all_comments.append(self.comment_corpus[self.judgements[category_ind]][comment_ind])\n                #for token in self.comment_corpus[self.judgements[category_ind]][comment_ind]:\n                #    all_words.append(token)\n\n        return(Word2Vec(all_comments, min_count=1))\n\n    def __len__(self):\n        return len(self.landmarks_frame)\n\n    def __getitem__(self, idx):\n        if torch.is_tensor(idx):\n       