In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
import string

In [None]:
# read data
# load the training corpus and keep its 'text' column, which the later cells tokenize
df = pd.read_csv('train.csv')
text = df['text']

In [None]:
# function to parse text; punctuation in the table below is considered its own word
# Translation table that pads each punctuation mark with spaces so that
# str.split() emits it as a separate token.
_PUNCTUATION_TABLE = str.maketrans({mark: ' ' + mark + ' ' for mark in '.,?!-:;'})


def split_text(sentence):
    """Lower-case ``sentence`` and split it into word and punctuation tokens.

    The marks . , ? ! - : ; are each returned as their own token; everything
    else is split on whitespace.

    Args:
        sentence: the raw text to tokenize (a ``str``).

    Returns:
        A list of lower-cased tokens, e.g. ``"Hi, you!"`` becomes
        ``['hi', ',', 'you', '!']``. An empty or whitespace-only string
        yields an empty list.
    """
    # One pass in C instead of growing a string character by character.
    return sentence.lower().translate(_PUNCTUATION_TABLE).split()

In [None]:
# find top 5000 words
# tally how often every token occurs in the corpus
wordCount = defaultdict(int)
avglen = 0
for d in text:
    for w in split_text(d):
        wordCount[w] += 1
# rank as (count, word) pairs, most frequent first (equal counts fall back to word order, descending)
counts = sorted(((wordCount[token], token) for token in wordCount), reverse=True)
# keep the 5000 most frequent words and build both directions of the id mapping
words = [token for _, token in counts[:5000]]
wordId = {token: index for index, token in enumerate(words)}
idToWord = dict(enumerate(words))
wordSet = set(words)

In [None]:
# process data
# tokenize every document and drop all tokens that are not in the top 5000
parsed_text = [
    [token for token in split_text(document) if token in wordSet]
    for document in text
]

In [None]:
# read in glove model (don't do this if not using glove)
def loadGloveModel(gloveFile):
    """Load word vectors from a whitespace-separated text file.

    Each non-empty line is expected to hold a word followed by its vector
    components, e.g. ``the 0.1 0.2 0.3``.

    Args:
        gloveFile: path of the UTF-8 encoded vector file.

    Returns:
        A dict mapping each word to a 1-D ``numpy`` array of floats.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component is not a valid float.
    """
    model = {}
    # the context manager closes the file even if a line fails to parse
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # blank line (e.g. trailing newline at end of file): nothing to load
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model
# load the vectors from glove50d.txt (only needed when the glove variant of the helpers is used)
glove = loadGloveModel('glove50d.txt')

In [None]:
import torch.nn as nn
# some constants to set
# number of consecutive words fed to the network for each prediction
seq_length = 10
# embedding size; also passed to RNN as the LSTM input size and hidden size
embedding_length = 50
# vocabulary size handed to the model (matches the top-5000 word list built above)
vocab_len = 5000
# step size of the manual gradient update in train()
learning_rate = 0.01
# this is the loss function for the network (use crossentropy to get useable probabilities)
# NOTE(review): RNN.forward already returns log_softmax output and CrossEntropyLoss
# normalizes again internally; nn.NLLLoss looks like the matching choice - confirm
criterion = nn.CrossEntropyLoss()
# number of stacked LSTM layers
num_layers = 3

In [None]:
# some helpful torch functions
import torch
# look up the vocabulary id of a word
# (for the glove representation, return torch.from_numpy(glove[key]) for words found in glove instead)
def word_tensor(word):
    """Return the id stored in ``wordId`` for ``word`` (matched in lower case).

    A word that is not in ``wordId`` yields an empty list.
    """
    key = word.lower()
    return wordId.get(key, [])
# build the network input for a sentence (if using glove the tensor should be
# len(sentence), 1, glove vec length and row li should be set with tensor[li][0] = word_tensor(word))
def sentence_tensor(sentence):
    """Encode a list of words as a ``(len(sentence), 1, 1)`` long tensor of word ids."""
    ids = torch.zeros((len(sentence), 1, 1), dtype=torch.long)
    for position, token in enumerate(sentence):
        ids[position] = word_tensor(token)
    return ids
# given an output vector return the highest scoring word
def wordFromOutput(output):
    """Look up, in ``idToWord``, the word whose entry in ``output`` is largest."""
    _, best_index = output.topk(1)
    return idToWord[best_index[0].item()]
# bundles training example i: its words, its target word, and both in tensor form
def parseTrainingExample(i):
    """Return ``(sequence, target, sequence_tensor, target_tensor)`` for example ``i``.

    ``sequence`` and ``target`` come from ``train_sentences`` and ``train_target``;
    the two tensors hold the corresponding word ids.
    """
    sequence, target = train_sentences[i], train_target[i]
    encoded_sequence = sentence_tensor(sequence)
    # one-element id tensor used as the target when computing the loss
    encoded_target = torch.tensor([wordId[target]], dtype=torch.long)
    return sequence, target, encoded_sequence, encoded_target

In [None]:
# helper to tokenize new sentences, keeping only words the model knows
def parse_text(sentence):
    """Tokenize ``sentence`` with ``split_text`` and drop tokens missing from ``wordSet``."""
    return [token for token in split_text(sentence) if token in wordSet]

In [None]:
# slide a window of seq_length words over every document to build the training set
# train_sentences[k] is a window of seq_length words
train_sentences = []
# train_target[k] is the word that follows window k (the word to be predicted)
train_target = []
for t in parsed_text:
    # documents with at most seq_length words produce no examples
    for i in range(len(t) - seq_length):
        train_sentences.append(t[i:i + seq_length])
        train_target.append(t[i + seq_length])

In [None]:
# define the model
import torch.nn.functional as F
# if you want to use glove instead remove the embedding layer and just put the input directly into the lstm
class RNN(nn.Module):
    """Next-word predictor: embedding -> multi-layer LSTM -> linear layer -> log-softmax.

    Args:
        input_size: feature size the LSTM expects; it must equal ``embedding_dim``
            because the embeddings are fed straight into the LSTM.
        hidden_size: size of the LSTM hidden and cell states.
        output_size: unused (the output layer is sized by ``vocab_size``); kept so
            existing constructor calls keep working.
        num_layers: number of stacked LSTM layers.
        vocab_size: number of distinct word ids (embedding rows and output scores).
        embedding_dim: length of each word embedding.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, vocab_size, embedding_dim):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        # remembered so initHidden() matches this instance instead of a notebook global
        self.num_layers = num_layers
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size, hidden_size, dropout=.2,num_layers=num_layers)
        self.l2o = nn.Linear(hidden_size, vocab_size)

    def forward(self, input, hidden):
        """Run the sequence through the LSTM and score the next word.

        Args:
            input: long tensor of word ids shaped ``(sequence length, 1, 1)``.
            hidden: ``(h, c)`` LSTM state, e.g. from ``initHidden()``.

        Returns:
            ``(output, hidden)``: a 1-D tensor of log-probabilities over the
            vocabulary, and the LSTM state after the last word.

        Raises:
            ValueError: if ``input`` contains no words.
        """
        if input.size(0) == 0:
            raise ValueError("input sequence must contain at least one word id")
        embeds = self.word_embeddings(input)
        # step over the words actually supplied rather than a global sequence length
        for step in range(embeds.size(0)):
            lstm_out, hidden = self.lstm(embeds[step], hidden)
        # only the output after the final word is used for the prediction
        output = self.l2o(lstm_out[0][0])
        output = F.log_softmax(output, dim=-1)
        return output, hidden

    def initHidden(self):
        """Return a zeroed ``(h, c)`` state pair for a batch of one sequence."""
        return (torch.zeros(self.num_layers, 1, self.hidden_size),
                torch.zeros(self.num_layers, 1, self.hidden_size))
# positional arguments: input_size, hidden_size, output_size, num_layers, vocab_size, embedding_dim
# (embedding_length is used for both the LSTM input size and the hidden size)
rnn = RNN(embedding_length, embedding_length, vocab_len, num_layers, vocab_len, embedding_length)

# to load a pretrained model
# rnn.load_state_dict(torch.load('rnn_top5000_first_attempt'))
# rnn.eval()

In [None]:
# function to run one iteration of the network
# text_tensor is a tensor representation of the input word sequence
# prediction_tensor is a length 1 tensor that contains the word id of the target word
def train(text_tensor, prediction_tensor):
    """Run one gradient-descent step on a single training example.

    Args:
        text_tensor: word-id tensor for the input sequence.
        prediction_tensor: length-1 long tensor holding the target word id.

    Returns:
        ``(output, hidden, loss)``: the network output for the sequence, the
        final LSTM state, and the loss as a Python float.
    """
    # initialize the hidden state (zero it)
    hidden = rnn.initHidden()
    # zero the gradient
    rnn.zero_grad()

    # run the network over the sequence
    output, hidden = rnn(text_tensor, hidden)
    # compute the loss and calculate auto-grad
    loss = criterion(output.view(1,-1), prediction_tensor)
    loss.backward()

    # Plain SGD step: subtract learning_rate * gradient from every parameter.
    # Done under no_grad so the update itself is not recorded by autograd.
    with torch.no_grad():
        for p in rnn.parameters():
            if p.grad is None:
                # parameter did not take part in this forward pass
                continue
            p.add_(p.grad, alpha=-learning_rate)

    return output, hidden, loss.item()

In [None]:
import time
import math
import random

# some constants for the training step
# number of passes over the training examples
n_epochs = 30
# number of training examples
n_iters = len(train_sentences)
# print a progress line every print_every iterations
print_every = 500
# record an averaged loss every plot_every iterations
plot_every = 1000

# Keep track of losses for plotting
# running sum of losses since the last recorded point
current_loss = 0
# recorded loss averages, in training order
all_losses = []

# format the time elapsed since a given timestamp
def timeSince(since):
    """Return the time elapsed since ``since`` (a ``time.time()`` value) as ``'<m>m <s>s'``."""
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

start = time.time()
# total number of training steps planned, used for the progress percentage
total_steps = n_epochs * n_iters
# steps completed so far, counted across epochs so the print / plot intervals
# stay exact at epoch boundaries
step = 0
for epoch in range(n_epochs):
    # start at index 0 so the first training example is used as well
    for example_index in range(n_iters):

        sequence, target, sequence_tensor, target_tensor = parseTrainingExample(example_index)
        output, hidden, loss = train(sequence_tensor, target_tensor)
        current_loss += loss
        step += 1

        # Print step number, progress, elapsed time, loss, input sequence and guess
        if step % print_every == 0:
            guess = wordFromOutput(output)
            correct = '✓' if guess == target else '✗ (%s)' % target
            print('%d %d%% (%s) %.4f %s / %s %s' % (step, step / total_steps * 100, timeSince(start), loss, sequence, guess, correct))

        # Add current loss avg to list of losses
        if step % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0

In [None]:
# this will save the model, use a different file name for different models
# only the state_dict is written; reload it with rnn.load_state_dict(torch.load(<file name>))
torch.save(rnn.state_dict(), 'filename')

In [None]:
# plot the average loss over training time
# this just shows whether training accomplished anything
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# each point is one averaged loss recorded by the training loop in all_losses
plt.figure()
plt.plot(all_losses)

In [None]:
import math
# Just return a vector of log-probabilities of the next word given a sequence
def evaluate(sequence_tensor):
    """Score the next word for ``sequence_tensor`` without training side effects.

    Args:
        sequence_tensor: word-id tensor for the input sequence.

    Returns:
        The network output: log-probabilities over the vocabulary.
    """
    # dropout must be off for inference; remember the mode so it can be restored
    was_training = rnn.training
    rnn.eval()
    try:
        with torch.no_grad():
            output, _ = rnn(sequence_tensor, rnn.initHidden())
    finally:
        rnn.train(was_training)
    return output

# this is the seed sentence; it should contain seq_length words that are all in the model vocabulary
input_sentence = ['The', 'details', 'were', 'examined', 'and', 'it', 'was', 'found', 'that', 'the']
# fail early with a readable message instead of an obscure tensor error
unknown_words = [word for word in input_sentence if word.lower() not in wordId]
if unknown_words:
    raise ValueError('seed words missing from the vocabulary: %s' % unknown_words)
vec = []
# each pass generates one sentence: the seed followed by 10 sampled words
for j in range(1):
    out = list(input_sentence)
    for i in range(10):
        # score the next word from the most recent seq_length words
        with torch.no_grad():
            output = evaluate(sentence_tensor(out[-seq_length:]))
        # the outputs are log probs so use the exponential function
        # (in float64, the precision numpy checks the probabilities in)
        z = output.detach().double().exp().numpy()
        # due to floating point error the sum is usually 1 + epsilon for some tiny epsilon
        # the random choice function requires sum = 1 so normalize the l1 norm of the vector to be 1
        z = z / z.sum()
        # randomly sample the distribution and append the selected word
        choice = np.random.choice(len(z), p=z)
        out.append(idToWord[choice])
    vec.append(out)
    

In [None]:
# show the generated sentences (one list of words per sentence)
vec

In [None]:
# save the generated text
# NOTE(review): this rebinds df, which earlier held the data read from train.csv
df = pd.DataFrame(vec)
# header=None: presumably no column-label row is written - confirm
df.to_csv("100_tests_3.csv", header=None)