# All models discussed in the paper are the same

This implementation uses modelV3.py, the final revision of the model. V1 and V2 are also included with the code and are discussed in the paper.

## Imports

Lots of imports here, by section they are...

1. General imports, mainly for logging/reading
2. Pytorch imports for the model and its training/validation/testing
3. Imports from custom python files, such as the pytorch model, data processing, and dataset structuring
4. Matplotlib for plotting loss

In [None]:
import os
import yaml
import numpy as np
import pandas as pd
from datetime import datetime

import torch
from torch.autograd import Variable
import torch.nn as nn
from torch.utils.data import DataLoader

from modelV3 import Siamese_lstm
from process import get_embedding, save_embed, load_embed
from process import data_process
from dataset import vocDS, VocabMaker

import matplotlib.pyplot as plt

In [None]:
# Load the run configuration. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the whole session.
# NOTE: yaml.full_load should only be used on trusted, local config files.
with open("config.yaml") as config_file:
    config = yaml.full_load(config_file)

## Data Preprocessing

1. Reads the dataset in its entirety
2. Cleans the sentences of any punctuation and converts all letters to lower case
3. Drops all numbering data from the dataframe, leaving only questions and labels
4. Splits the data 80:20 for training and testing
5. Based on the inputs, takes the requested number of rows from the training and test sets (this is because training on all ~430,000 question pairs wasn't feasible in the given time)


Note: The values within this function were 60,000 and 6,000 in the baseline models

In [None]:
""" Preprocess Data"""
# Run the project's preprocessing step (process.data_process).
# The reading step later in this notebook loads 'ctr.csv' and 'ct.csv' and
# annotates them with lengths 100000 and 20000, so the two arguments are
# presumably the training and test subset sizes -- confirm in process.py.
data_process(100000, 20000)

In [None]:
""" Reading the Data written by the data_process function """

train_data = pd.read_csv('ctr.csv') #len 100000
test_data = pd.read_csv('ct.csv') #len 20000

# Split the training subset roughly 80:20 into train / validation rows.
# A dedicated, seeded generator keeps the split identical from run to run.
# With an unseeded mask, resuming from a saved checkpoint would draw a new
# split and let rows used for validation earlier leak into training.
split_rng = np.random.RandomState(0)
msk = split_rng.rand(len(train_data)) < 0.8
train = train_data[msk]
valid = train_data[~msk]

# Gather every sentence (both question columns) to put into the vocab.
train_sents = train_data['question1'].tolist() + train_data['question2'].tolist()
test_sents = test_data['question1'].tolist() + test_data['question2'].tolist()
all_sents = train_sents + test_sents
# NOTE(review): the test sentences appear twice in this list (train, test,
# test). Whether the repetition matters depends on how the vocabulary is
# built in dataset.py -- confirm that the ids it produces match the ones the
# embedding is built from.
all_sents_test = all_sents + test_sents

# Create the datasets together with their vocabulary.
trainDS = vocDS(train, all_sents)
validDS = vocDS(valid, all_sents)
testDS = vocDS(test_data, all_sents_test)

# Values should correspond with the inputs to data_process above.
print ('Testing data size', train_data.shape[0], test_data.shape[0])

## Embeddings

When the embedding is created, it looks up each word of the vocab in the pretrained embedding: if a vector for that word exists, it is used; otherwise a new vector is created and assigned to that word. During this process it also records the percentage of vocab words that are not covered by the pretrained embedding. The presaved embedding had an 82% vocab coverage.

In [None]:
# Flip the in-memory 'embedding_saved' flag of the loaded config (the YAML
# file itself is not rewritten here).
# NOTE(review): this looks like a manual toggle for forcing the other branch
# of the embedding step; running it a second time flips the flag back.
# Confirm it is meant to run on a plain top-to-bottom execution.
config['model']['embedding_saved'] = not config['model']['embedding_saved']

In [None]:
# Reuse the cached embedding when the flag says one exists; otherwise build
# it from the pretrained vectors and cache it for the next run.
if config['model']['embedding_saved']:
    embed_dict = load_embed('embedding.txt')
    print("Loaded embedding")
else:
    print("Creating embedding")
    full_embed_path = 'glove.6B.300d.txt'
    embed_dict = get_embedding(trainDS.vocab.id2word, full_embed_path)

    save_embed(embed_dict, 'embedding.txt')
    # The flag is falsy on this branch, so this marks the embedding as saved.
    config['model']['embedding_saved'] = True

In [None]:
# Create the embedding layer: one 300-dimensional row per embed_dict entry.
vocab_size = len(embed_dict)
embedding = nn.Embedding(vocab_size, 300)

# Stack one vector per word, following the iteration order of id2word.
# NOTE(review): the layer is sized from len(embed_dict) while the rows come
# from trainDS.vocab.id2word; presumably both have the same length -- confirm.
embed_list = [embed_dict[token] for token in trainDS.vocab.id2word]
weight_matrix = np.array(embed_list)

# Install the stacked vectors as frozen (requires_grad=False) float weights.
pretrained_weights = torch.from_numpy(weight_matrix).type(torch.FloatTensor)
embedding.weight = nn.Parameter(pretrained_weights, requires_grad=False)

# The Model

### Siamese LSTM Model

This implementation is a single-layer LSTM (due to computational bottlenecks) which takes two sentences as input. For each sentence it creates an encoding, performs some operations on the two encodings, and concatenates them to produce a long tensor. The tensor is then passed into a sequential double linear model as its classifier, which takes the now long tensor and outputs a length-two tensor as the model's prediction (to get a numerical prediction between 0 and 1, this tensor is passed into a Softmax function).


#### Main Hyperparameters
1. Embedding size = 300
2. Hidden size = 50
3. Learning rate .01

In [None]:
#Model time
# Instantiate the Siamese LSTM imported from modelV3, handing it the
# embedding layer prepared earlier in this notebook.
siamese = Siamese_lstm(embedding)

In [None]:
# Loss: cross-entropy between the model output and the class label.
criterion = torch.nn.CrossEntropyLoss()

# Plain SGD over the parameters that still require gradients, with the
# learning rate taken from the config.
learning_rate = config['training']['learning_rate']
trainable_params = [param for param in siamese.parameters() if param.requires_grad]
optimizer = torch.optim.SGD(trainable_params, lr=learning_rate)

In [None]:
#Log stuff
# %-style templates filled in by the training loop:
#   train: timestamp, epoch, iteration, total iterations, mean training loss
#   valid: timestamp, epoch, mean validation loss
train_log_string = '%s :: Epoch %i :: Iter %i / %i :: train loss: %0.4f'
valid_log_string = '%s :: Epoch %i :: valid loss: %0.4f\n'

# When True, the checkpoint-loading step restores 'Sgd4Best.pt' instead of
# 'Sgd4Long.pt'.
loadBest = False

In [None]:
# Restore training state from a checkpoint when one exists, otherwise start
# from scratch. loadBest selects the best-validation checkpoint; the default
# is the most recently saved one.
checkpoint_path = 'Sgd4Best.pt' if loadBest else 'Sgd4Long.pt'

if os.path.exists(checkpoint_path):
    print('Loading checkpoint: %s' % checkpoint_path)
    ckpt = torch.load(checkpoint_path)
    epoch = ckpt['epoch']
    siamese.load_state_dict(ckpt['siamese'])
    optimizer.load_state_dict(ckpt['optimizer'])
    loss_states = ckpt['loss_states']
    print("Epoch: " + str(epoch))
else:
    # Also covers loadBest=True with a missing 'Sgd4Best.pt', so epoch and
    # loss_states are always defined before the training loop runs.
    epoch = 0
    loss_states = []
    print ('Fresh start!')

# Training

### For each epoch

1. Load our training dataset into a torch DataLoader (each epoch the data is randomly ordered to prevent the model from learning a pattern)
2. For each question pair in the dataset we first get the words and labels
3. Clear the gradients
4. Pass our sentences through our Siamese Model
5. Compute the loss based on the label and the prediction
6. Backpropagate
7. Record loss states every 4000 iterations

## Validation

Same as above without the backpropagation

## Saving Model

After every epoch the model is saved to the Long save point. If the validation loss of the new state is also lower than the current record, the model is additionally saved to the Best save point.

In [None]:
"""Training"""

# Per-epoch mean losses, kept for visualization.
train_loss_record = []
valid_loss_record = []
# Validation loss an epoch has to beat before 'Sgd4Best.pt' is overwritten.
# NOTE(review): 0.3809 is hard-coded, presumably the best loss of an earlier
# run -- confirm. On a fresh start no "best" checkpoint is written until an
# epoch beats this value.
best_record = 0.3809


while epoch < 40:

    print ('Start Epoch{} Training...'.format(epoch))

    # Validation below switches the model to eval mode, so training mode has
    # to be re-enabled at the start of every epoch.
    siamese.train()

    train_loss = []      # losses since the last progress log
    train_loss_sum = []  # every loss of this epoch
    # A new shuffled loader per epoch; one question pair per step.
    train_dataloader = DataLoader(dataset=trainDS, shuffle=True, num_workers=2, batch_size=1)

    for idx, data in enumerate(train_dataloader, 0):

        s1, s2, label = data

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        output = siamese(s1, s2)
        output = output.squeeze(0)

        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        # Keep plain Python floats: cheaper to average and to checkpoint than
        # tensors, and np.mean handles them the same way.
        loss_value = loss.item()
        train_loss.append(loss_value)
        train_loss_sum.append(loss_value)

        # Every 4000 iterations: store the window for graphing and log it.
        if ((idx + 1) % 4000) == 0:
            loss_states.append(train_loss)
            print(train_log_string % (datetime.now(), epoch, idx + 1, len(train), np.mean(train_loss)))
            train_loss = []


    # Record the epoch-level training loss.
    epoch_train_loss = np.mean(train_loss_sum)
    print ('Train Loss at epoch{}: {}\n'.format(epoch, epoch_train_loss))
    train_loss_record.append(epoch_train_loss)

    # Validation
    print ('Epoch{} Validating...'.format(epoch))

    valid_loss = []
    valid_dataloader = DataLoader(dataset=validDS, shuffle=True, num_workers=2, batch_size=1)

    # Evaluation only: eval mode disables train-time behaviour of layers such
    # as dropout, and no_grad avoids building an autograd graph that is never
    # back-propagated.
    siamese.eval()
    with torch.no_grad():
        for s1, s2, label in valid_dataloader:
            output = siamese(s1, s2)
            output = output.squeeze(0)
            valid_loss.append(criterion(output, label).item())

    epoch_valid_loss = np.mean(valid_loss)
    print(valid_log_string % (datetime.now(), epoch, epoch_valid_loss))
    valid_loss_record.append(epoch_valid_loss)

    # Incremented before saving, so a checkpoint stores the number of
    # completed epochs, which is also the epoch to resume from.
    epoch += 1

    state_dict = {
        'epoch': epoch,
        'siamese': siamese.state_dict(),
        'optimizer': optimizer.state_dict(),
        'loss_states': loss_states
    }

    # Keep a separate copy of the best model seen so far.
    if epoch_valid_loss < best_record:
        best_record = epoch_valid_loss
        torch.save(state_dict, 'Sgd4Best.pt')
        print ('Model improved!\n')

    # Always save the latest state so a long run can be resumed.
    torch.save(state_dict, 'Sgd4Long.pt')
    print ('Model saved!\n')

## Testing

1. Using a softmax function to get out the prediction from our model, since the output is length 2
2. Since the predictions are continuous values, they are mapped to either 0 or 1 based on whether they are less than 0.5 or not
3. Next all true/false positive/negatives are calculated

In [None]:
""" Testing """

# Load the testing data, one question pair per step.
dL = DataLoader(dataset=testDS, shuffle=True, num_workers=2, batch_size=1)
# Softmax over the class dimension turns the model output into probabilities.
sm = nn.Softmax(dim=1)


# Confusion-matrix counts used for precision and recall.
true_pos = 0
true_neg = 0
false_pos = 0
false_neg = 0

# Inference only: eval mode plus no_grad (no autograd graph is needed).
siamese.eval()
with torch.no_grad():
    for idx, data in enumerate(dL, 0):
        s1, s2, label = data

        # Same argument order as in training and validation, so the model is
        # evaluated the way it was trained.
        output = siamese(s1, s2)
        output = output.squeeze(0)
        # Probability of class 1, as a plain Python number.
        res = sm(output)[:, 1].item()
        truth = label.item()

        predict = 0 if res < .5 else 1

        if truth == 1 and predict == 1:
            true_pos += 1
        elif truth == 0 and predict == 1:
            false_pos += 1
        elif truth == 1 and predict == 0:
            false_neg += 1
        elif truth == 0 and predict == 0:
            true_neg += 1
        else:
            print("Not a valid prediction/label")

## Statistics

1. Displayed below are results at different epochs and a graph of the loss states for the indicated epochs

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Each metric falls back to 0.0 when it is undefined (for example no positive
# predictions, or no positive labels) instead of raising ZeroDivisionError.
recall = _safe_ratio(true_pos, true_pos + false_neg)
precision = _safe_ratio(true_pos, true_pos + false_pos)
accuracy = _safe_ratio(true_pos + true_neg, len(testDS))
f1 = 2 * _safe_ratio(precision * recall, precision + recall)
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("Accuracy: " + str(accuracy))
print("F1 score: " + str(f1))

In [None]:
# One point per recorded window of training losses.
loss_means = [np.mean(window) for window in loss_states]
x = list(range(len(loss_means)))

plt.plot(x, loss_means)
# epoch is incremented after every finished epoch, so it already equals the
# number of completed epochs; subtracting one under-reported it.
plt.title("Mean training loss over length of " + str(epoch) + " epochs")
plt.ylabel("Loss")
plt.show()