In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
from collections import Counter
import pandas as pd
import random
import matplotlib.pyplot as plt
import itertools

# Load Data

## .tsv to list

In [2]:
def load_data(filename):
    """Read a tab-separated SNLI file into sentence pairs and integer labels.

    The first two columns of the file are taken as the sentence pair and the
    ``label`` column is encoded as entailment=0, contradiction=1, neutral=2.

    Args:
        filename: Path to a ``.tsv`` file with a header row and a ``label``
            column.

    Returns:
        A tuple ``(data, labels)``. ``data`` is a list with one
        ``[first_column, second_column]`` entry per row; ``labels`` is a
        pandas Series with one encoded label per row. Label values outside
        the three known classes are passed through unchanged.
    """
    label_to_id = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
    file = pd.read_csv(filename, sep='\t')
    # One vectorised conversion instead of two ``iloc`` lookups per row.
    data = file.iloc[:, :2].values.tolist()
    # Explicit mapping; unknown values fall through untouched, as before.
    labels = file['label'].map(lambda label: label_to_id.get(label, label))
    return data, labels

In [3]:
# Sentence pairs and encoded labels for the training and validation splits.
train_data, train_targets = load_data('snli_train.tsv')
val_data, val_targets = load_data('snli_val.tsv')

# First 10000 rows of the raw training frame.
file = pd.read_csv('snli_train.tsv', sep='\t').iloc[:10000]

In [4]:
# Keep only the first 10000 training examples; pairs and labels are cut identically.
train_data, train_targets = train_data[:10000], train_targets[:10000]

In [5]:
len(train_data)


10000

## Tokenization

In [6]:
# Whitespace-tokenise both sentences of every training pair.
train_data_tokens = [[pair[side].split() for side in (0, 1)] for pair in train_data]

In [7]:
# Whitespace-tokenise both sentences of every validation pair.
val_data_tokens = [[pair[side].split() for side in (0, 1)] for pair in val_data]

In [8]:
print ("Train dataset size is {}".format(len(train_data_tokens)))
print ("Val dataset size is {}".format(len(val_data_tokens)))

Train dataset size is 10000
Val dataset size is 1000


In [9]:
train_data_tokens[0]

[['A',
  'young',
  'girl',
  'in',
  'a',
  'pink',
  'shirt',
  'sitting',
  'on',
  'a',
  'dock',
  'viewing',
  'a',
  'body',
  'of',
  'water',
  '.'],
 ['A',
  'young',
  'girl',
  'watching',
  'the',
  'sunset',
  'over',
  'the',
  'water',
  '.']]

## fastText Word Embeddings

In [10]:
# Number of word vectors read from the top of the file.
VOCAB_LIMIT = 50000

# Each entry of `lines` is [word, float, float, ...] for one row of the file.
# The encoding is given explicitly so the result does not depend on the
# platform default.
with open('wiki-news-300d-1M.vec', encoding='utf-8') as f:
    # Skip the first line: it is never used as a word vector.
    next(f, None)
    lines = []
    for line in itertools.islice(f, VOCAB_LIMIT):
        v = line.split()
        if not v:
            # Blank line: ignore it instead of storing an empty row that
            # would break the vocabulary construction later on.
            continue
        lines.append([v[0]] + [float(value) for value in v[1:]])

In [11]:
# Vocabulary and embedding table are built in lock-step so that, for every
# token, token2id[token] is both its position in id2token and its row in embed.
# Ids 0 and 1 are reserved for padding and unknown tokens (all-zero vectors).
word_dict = {'PAD': 0, 'UNK': 1}
id2token = ['PAD', 'UNK']
embed = [[0 for _ in range(300)], [0 for _ in range(300)]]

for entry in lines:
    word, vector = entry[0], entry[1:]
    if word in word_dict:
        # A repeated word, or a literal 'PAD'/'UNK' in the vector file, would
        # otherwise shift ids away from their embedding rows; keep the first
        # occurrence and drop the rest.
        continue
    word_dict[word] = len(id2token)
    id2token.append(word)
    embed.append(vector)

# Left as np.matrix so the type of this object is unchanged for any later use.
embedding_matrix = np.matrix(embed)

# token2id is the same dict object as word_dict.
token2id = word_dict

In [12]:
print(len(id2token))

50002


In [13]:
random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 26683 ; token handmade
Token handmade; token id 26683


In [14]:
id2token[106]

'A'

In [15]:
UNK_IDX = 1


def token2index_dataset(tokens_data):
    """Convert tokenised sentence pairs into pairs of vocabulary ids.

    Every token is looked up in ``token2id``; tokens that are not in the
    vocabulary become ``UNK_IDX``.

    Args:
        tokens_data: Sequence of pairs, each pair holding two token lists.

    Returns:
        A list with one ``[ids_of_first, ids_of_second]`` entry per pair.
    """
    return [
        [[token2id.get(token, UNK_IDX) for token in pair[side]] for side in range(2)]
        for pair in tokens_data
    ]

# Apply the same token-to-id conversion to both splits.
train_data_indices, val_data_indices = (
    token2index_dataset(split_tokens)
    for split_tokens in (train_data_tokens, val_data_tokens)
)

In [16]:
print ("Train dataset size is {}".format(len(train_data_indices)))
print ("Val dataset size is {}".format(len(val_data_indices)))

Train dataset size is 10000
Val dataset size is 1000


In [17]:
train_data_indices[0]

[[106,
  802,
  1830,
  8,
  9,
  6265,
  7167,
  4388,
  17,
  9,
  12229,
  5335,
  9,
  563,
  6,
  358,
  4],
 [106, 802, 1830, 2255, 3, 13985, 94, 3, 358, 4]]

## PyTorch DataLoader

In [18]:
MAX_SENTENCE_LENGTH = 25


class SNLIDataset(Dataset):
    """Dataset of id-encoded sentence pairs with one label per pair.

    Each item is ``[token_idx, lengths, label]``: ``token_idx`` holds the two
    sentences cut to at most MAX_SENTENCE_LENGTH ids, ``lengths`` holds the
    two lengths after cutting, and ``label`` is the matching target entry.
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: list of [ids_of_sentence_1, ids_of_sentence_2] pairs
        @param target_list: labels, one per pair (must have the same length)
        """
        assert (len(data_list) == len(target_list))
        self.data_list = data_list
        self.target_list = target_list

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        pair = self.data_list[key]
        clipped = [pair[side][:MAX_SENTENCE_LENGTH] for side in (0, 1)]
        lengths = [len(sentence) for sentence in clipped]
        return [clipped, lengths, self.target_list[key]]

def SNLI_collate_func(batch):
    """
    Customized function for DataLoader that pads both sentences of every item
    to MAX_SENTENCE_LENGTH and stores each pair side by side in one row.

    @param: batch - list of [token_idx, lengths, label] items, where token_idx
                    holds the two id lists of a sentence pair and lengths
                    holds their two lengths
    @return: [data, lengths, labels] with
             data    - int64 tensor of shape (len(batch), 2 * MAX_SENTENCE_LENGTH);
                       the first MAX_SENTENCE_LENGTH columns hold sentence 1,
                       the remaining columns hold sentence 2, padded with 0
             lengths - LongTensor built from the per-item length pairs
             labels  - LongTensor built from the per-item labels
    """
    label_list = []
    length_list = []
    # Allocate the padded matrix up front with a fixed integer dtype. Building
    # it from per-sentence arrays made the dtype depend on the content (an
    # empty sentence produced a float array and turned the whole batch float).
    data = np.zeros((len(batch), 2 * MAX_SENTENCE_LENGTH), dtype=np.int64)

    for row, datum in enumerate(batch):
        token_idx, lengths, label = datum[0], datum[1], datum[2]
        label_list.append(label)
        length_list.append(lengths)
        for slot in range(2):
            tokens = token_idx[slot][:MAX_SENTENCE_LENGTH]
            if len(tokens) == 0:
                # Nothing to copy; the row segment stays all padding.
                continue
            start = slot * MAX_SENTENCE_LENGTH
            data[row, start:start + len(tokens)] = tokens

    return [torch.from_numpy(data), torch.LongTensor(length_list), torch.LongTensor(label_list)]

In [19]:
BATCH_SIZE = 40

# One dataset per split.
train_dataset = SNLIDataset(train_data_indices, train_targets)
val_dataset = SNLIDataset(val_data_indices, val_targets)

# Both loaders share batch size and collate function; only training is shuffled.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=SNLI_collate_func,
)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=SNLI_collate_func,
)

In [20]:
for i, (data, lengths, labels) in enumerate(train_loader):
    print (len(data))
    print (lengths)
    break
print (data[0])

40
tensor([[12,  7],
        [11,  5],
        [25, 12],
        [16,  8],
        [12,  5],
        [11,  6],
        [ 8, 10],
        [20, 11],
        [ 8,  5],
        [ 9,  7],
        [24,  5],
        [25,  8],
        [21,  6],
        [ 7,  6],
        [17,  9],
        [21, 15],
        [ 7,  8],
        [15,  9],
        [17,  8],
        [ 9,  7],
        [ 6,  6],
        [ 9, 10],
        [11,  5],
        [10, 11],
        [ 7,  8],
        [20,  8],
        [11,  7],
        [ 9, 10],
        [ 7,  7],
        [ 8, 13],
        [24, 11],
        [15,  6],
        [14,  6],
        [20,  8],
        [12,  5],
        [14, 10],
        [25, 11],
        [ 7, 11],
        [17, 11],
        [12,  7]])
tensor([ 1442,   884,   355,     8,   291,   606,  4301,    34,    17,     9,
        14283,     4,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,  1442,   271,    34,    17,     9,
        14283,     4,     0,     0,     0,

In [21]:
# Embedding table as a float32 tensor; it is handed to nn.Embedding.from_pretrained below.
weight = torch.tensor(embed, dtype=torch.float32)
class RNN(nn.Module):
    """Sentence-pair classifier built from two bidirectional GRU encoders.

    Each sentence of a pair is encoded by its own bidirectional GRU. The final
    forward and backward hidden states of both encoders are concatenated and
    passed through two fully connected layers to produce class logits.
    """

    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size (must match the width of the global `weight`)
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in each GRU encoder
        # num_classes: number of output classes
        # vocab_size: vocabulary size (unused: the table comes from `weight`)
        super(RNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        # The embedding table is taken from the module-level `weight` tensor.
        self.embedding = nn.Embedding.from_pretrained(weight)
        self.dropout = nn.Dropout(p=0.25)
        # num_layers is forwarded so the encoders agree with init_hidden;
        # previously any value other than 1 produced a hidden-state shape error.
        self.bi_gru1 = nn.GRU(emb_size, hidden_size, num_layers=num_layers,
                              batch_first=True, bidirectional=True)
        self.bi_gru2 = nn.GRU(emb_size, hidden_size, num_layers=num_layers,
                              batch_first=True, bidirectional=True)

        # 2 FC layers; the input is forward+backward state of both sentences
        self.linear1 = nn.Linear(4 * hidden_size, 100)
        self.activation = nn.ReLU()
        # Output width follows num_classes instead of a hard-coded 3.
        self.linear2 = nn.Linear(100, num_classes)

    def init_hidden(self, batch_size, device=None):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers * 2 directions, batch_size, hidden_size)
        # `device` lets the state live next to the input tensors.
        hidden = torch.zeros(self.num_layers * 2, batch_size, self.hidden_size, device=device)
        return hidden

    def _encode(self, gru, embedded, seq_lengths):
        # Run one bidirectional GRU over padded embeddings and return the
        # concatenated final forward/backward state of its last layer,
        # in the original batch order.

        # A length of 0 cannot be packed; treat it as a single padding step.
        seq_lengths = seq_lengths.clamp(min=1)
        # Packing here expects sequences ordered from longest to shortest.
        sorted_lengths, sort_idx = torch.sort(seq_lengths, descending=True)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded[sort_idx],
                                                         sorted_lengths.cpu(),
                                                         batch_first=True)
        h0 = self.init_hidden(embedded.size(0), device=embedded.device)
        _, h_n = gru(packed, h0)
        # Undo the sort so row i belongs to example i again.
        _, unsort_idx = torch.sort(sort_idx, descending=False)
        h_n = h_n[:, unsort_idx, :]
        # The last two entries are the forward and backward state of the top
        # layer (for a single layer these are entries 0 and 1).
        return torch.cat([h_n[-2], h_n[-1]], dim=-1)

    def forward(self, x, lengths):
        # x: token ids, sentence 1 in the first MAX_SENTENCE_LENGTH columns and
        #    sentence 2 in the columns after that
        # lengths: (batch, 2) lengths of sentence 1 and sentence 2
        # returns: (batch, num_classes) unnormalised class scores

        # get embedding of tokens (dropout is only active in training mode)
        embedded = self.dropout(self.embedding(x))

        premise = self._encode(self.bi_gru1,
                               embedded[:, :MAX_SENTENCE_LENGTH, :],
                               lengths[:, 0])
        hypothesis = self._encode(self.bi_gru2,
                                  embedded[:, MAX_SENTENCE_LENGTH:, :],
                                  lengths[:, 1])
        bi_gru_out = torch.cat((premise, hypothesis), dim=1)

        # FC layers
        rnn_out = self.linear1(bi_gru_out)
        activate = self.activation(rnn_out)
        logits = self.dropout(activate)
        logits = self.linear2(logits)

        return logits

        #######################################################

In [93]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, lengths_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, lengths_batch), dim=1)
#         print(model(data_batch, lengths_batch))
#         print(outputs)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


model = RNN(emb_size=300, hidden_size=300, num_layers=1, num_classes=3, vocab_size=len(id2token)) # num_layers

learning_rate = 0.005
num_epochs = 10 # number epoch to train
LOG_LOSS_EVERY = 64    # record the training loss every this many steps
VALIDATE_EVERY = 100   # run validation every this many steps

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
# Only parameters that require gradients are handed to the optimizer.
parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(parameters, lr=learning_rate)
# Multiply the learning rate by 0.9 after every epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
# Train the model
total_step = len(train_loader)

loss_l = []  # sampled training losses
acc_l = []   # validation accuracies, one per validation run
for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        # test_model() leaves the model in eval mode, so switch back each step.
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data, lengths)
        loss = criterion(outputs, labels)
        if i % LOG_LOSS_EVERY == 0:
            loss_l.append(loss.item())

        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every VALIDATE_EVERY iterations
        if i > 0 and i % VALIDATE_EVERY == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, total_step, val_acc))
            acc_l.append(val_acc)
    # Advance the schedule only after this epoch's optimizer steps. Stepping
    # it at the top of the loop decays the rate before any update has been
    # made, so the configured initial learning rate is never used.
    scheduler.step()

Epoch: [1/10], Step: [101/2500], Validation Acc: 54.2
Epoch: [1/10], Step: [201/2500], Validation Acc: 52.9
Epoch: [1/10], Step: [301/2500], Validation Acc: 56.2
Epoch: [1/10], Step: [401/2500], Validation Acc: 58.1
Epoch: [1/10], Step: [501/2500], Validation Acc: 61.2
Epoch: [1/10], Step: [601/2500], Validation Acc: 61.8
Epoch: [1/10], Step: [701/2500], Validation Acc: 62.1
Epoch: [1/10], Step: [801/2500], Validation Acc: 59.9
Epoch: [1/10], Step: [901/2500], Validation Acc: 62.3
Epoch: [1/10], Step: [1001/2500], Validation Acc: 61.5
Epoch: [1/10], Step: [1101/2500], Validation Acc: 63.5
Epoch: [1/10], Step: [1201/2500], Validation Acc: 63.1
Epoch: [1/10], Step: [1301/2500], Validation Acc: 63.2
Epoch: [1/10], Step: [1401/2500], Validation Acc: 62.8
Epoch: [1/10], Step: [1501/2500], Validation Acc: 61.9
Epoch: [1/10], Step: [1601/2500], Validation Acc: 63.2
Epoch: [1/10], Step: [1701/2500], Validation Acc: 64.6
Epoch: [1/10], Step: [1801/2500], Validation Acc: 63.1
Epoch: [1/10], Step

Epoch: [7/10], Step: [801/2500], Validation Acc: 66.1
Epoch: [7/10], Step: [901/2500], Validation Acc: 68.0
Epoch: [7/10], Step: [1001/2500], Validation Acc: 68.0
Epoch: [7/10], Step: [1101/2500], Validation Acc: 68.0
Epoch: [7/10], Step: [1201/2500], Validation Acc: 67.0
Epoch: [7/10], Step: [1301/2500], Validation Acc: 67.5
Epoch: [7/10], Step: [1401/2500], Validation Acc: 67.9
Epoch: [7/10], Step: [1501/2500], Validation Acc: 67.2
Epoch: [7/10], Step: [1601/2500], Validation Acc: 68.1
Epoch: [7/10], Step: [1701/2500], Validation Acc: 67.2
Epoch: [7/10], Step: [1801/2500], Validation Acc: 68.1
Epoch: [7/10], Step: [1901/2500], Validation Acc: 66.8
Epoch: [7/10], Step: [2001/2500], Validation Acc: 68.3
Epoch: [7/10], Step: [2101/2500], Validation Acc: 67.2
Epoch: [7/10], Step: [2201/2500], Validation Acc: 68.2
Epoch: [7/10], Step: [2301/2500], Validation Acc: 67.0
Epoch: [7/10], Step: [2401/2500], Validation Acc: 67.1
Epoch: [8/10], Step: [101/2500], Validation Acc: 67.6
Epoch: [8/10]

KeyboardInterrupt: 

In [23]:
# Curve of the training losses collected in loss_l, followed by the raw values.
fig, ax = plt.subplots()
ax.plot(loss_l)
plt.show()
loss_l

NameError: name 'loss_l' is not defined

## Cases

In [22]:
# Rebuild the network with 600 hidden units and restore saved weights onto the CPU.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
model = RNN(emb_size=300, hidden_size=600, num_layers=1, num_classes=3, vocab_size=len(id2token))
checkpoint_state = torch.load('rnnmodel.pt', map_location='cpu')
model.load_state_dict(checkpoint_state)

In [25]:

def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    i = 0
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
        print('predicted:{}'.format(predicted))
        print('label:{}'.format(labels.view_as(predicted)))
        i +=1
        if i >=1:
            break
    return (100 * correct / total)

# Validation batches in dataset order (no shuffling), evaluated with the current model.
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=SNLI_collate_func,
)

val_acc = test_model(val_loader, model)

predicted:tensor([[1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [2],
        [1],
        [0],
        [2],
        [1],
        [0],
        [1],
        [2],
        [2],
        [2],
        [1],
        [2],
        [0],
        [2],
        [0],
        [1],
        [1],
        [2],
        [0],
        [0],
        [0],
        [2],
        [2],
        [2],
        [1],
        [0],
        [0],
        [1],
        [2],
        [1],
        [0],
        [2]])
label:tensor([[1],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [2],
        [1],
        [0],
        [0],
        [2],
        [1],
        [0],
        [0],
        [2],
        [2],
        [0],
        [0],
        [0],
        [2],
        [0],
        [0],
        [1],
        [2],
        [1],
        [0],
        [0],
        [2],
        [2],
        [2],
        [1],
        [2],
        [1],
        

In [26]:
# First 10000 rows of the raw validation frame.
file = pd.read_csv('snli_val.tsv', sep='\t').iloc[:10000]

In [30]:
file
# ['entailment', 'contradiction', 'neutral'], [0,1,2]

Unnamed: 0,sentence1,sentence2,label
0,"Three women on a stage , one wearing red shoes...",There are two women standing on the stage,contradiction
1,"Four people sit on a subway two read books , o...","Multiple people are on a subway together , wit...",entailment
2,bicycles stationed while a group of people soc...,People get together near a stand of bicycles .,entailment
3,Man in overalls with two horses .,a man in overalls with two horses,entailment
4,Man observes a wavelength given off by an elec...,The man is examining what wavelength is given ...,entailment
5,Two people are in a green forest .,The forest is not dead .,entailment
6,Two men are listening to music through headpho...,Two men listen to music .,entailment
7,"Two women , one walking her dog the other push...",There is a snowstorm .,contradiction
8,A group of numbered participants walk down the...,Participants wait for the beginning of the wal...,neutral
9,Three people and a white dog are sitting in th...,Three dogs and a person are sitting in the snow .,contradiction
