<a href="https://colab.research.google.com/github/JinHAN7/1011_HW2/blob/master/Copy_of_NLP_HW2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Preparation and Preprocessing 
- Install PyTorch 0.4.1
- Cuda version: 9.2
- Load data and preprocessing for training

In [0]:
# Query the NVIDIA driver (nvidia-smi) for the GPU attached to this runtime.
# The command fails when the runtime has no GPU / driver available.
!nvidia-smi

In [0]:
# http://pytorch.org/
# Build the download URL of the PyTorch 0.4.1 wheel that matches this interpreter
# and the CUDA runtime found on the machine, then install it with pip.
# NOTE(review): `wheel.pep425tags` is an internal helper that newer `wheel`
# releases may no longer provide — confirm before re-running on a fresh runtime.
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Wheel tag fragment: '<implementation><version>-<abi tag>'.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# IPython shell capture (list of output lines): sed rewrites the version suffix of
# each cudart shared library listed by ldconfig into a 'cuXY' tag.
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
# Use the first detected CUDA tag only when an NVIDIA device node exists; otherwise install the CPU build.
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

# {accelerator} and {platform} are interpolated from the Python variables above.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
import torch

In [0]:
# Report the installed PyTorch version together with the CUDA and cuDNN
# versions exposed by that build.

import torch

for caption, value in (
    ("PyTorch version: ", torch.__version__),
    ("CUDA Version: ", torch.version.cuda),
    ("cuDNN version is: ", torch.backends.cudnn.version()),
):
    # Caption and value go on separate lines, one print call each.
    print(caption)
    print(value)

In [0]:
# Second GPU status query (same command as the first cell), run after the PyTorch install.
!nvidia-smi

In [0]:
# Connect to Google Drive so the pre-trained embedding file stored there can be read.
# The mount point /content/drive is the prefix used by the embedding path in load_ft.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
#load necessary library
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle as pkl

In [0]:
# Load pre-trained embedding and add <pad> and <unk>
PAD_IDX = 0
UNK_IDX = 1
def load_ft(words_to_load, path='/content/drive/My Drive/Colab Notebooks/wiki-news-300d-1M.vec', emb_dim=300):
    """
    Read the first `words_to_load` word vectors from a fastText text-format (.vec) file.

    Two special tokens are placed in front of the loaded vocabulary: '<pad>' at
    PAD_IDX and '<unk>' at UNK_IDX. Their embedding rows stay all-zero.

    @param words_to_load: maximum number of word vectors to read from the file
    @param path: location of the .vec file (defaults to the copy on the mounted Drive)
    @param emb_dim: dimensionality of every vector in the file
    @return: (token2id, id2token, loaded_embeddings) where token2id maps token -> row,
             id2token is the inverse list, and loaded_embeddings is a numpy array of
             shape (words_to_load + 2, emb_dim). Rows beyond the number of words
             actually present in the file remain zero.
    """
    loaded_embeddings = np.zeros((words_to_load + 2, emb_dim))
    token2id = {'<pad>': PAD_IDX, '<unk>': UNK_IDX}
    id2token = ['<pad>', '<unk>']

    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            # id2token already holds the two special tokens.
            if len(id2token) - 2 >= words_to_load:
                break
            s = line.split()
            if not s:
                # Blank line (e.g. trailing newline at end of file).
                continue
            if line_no == 0 and len(s) == 2:
                # fastText .vec files start with a "<vocab size> <dim>" header.
                # It is not a word vector: without this check the count would be
                # stored as a token whose vector is the dimension repeated emb_dim times.
                continue
            idx = len(id2token)
            loaded_embeddings[idx, :] = np.asarray(s[1:], dtype=float)
            id2token.append(s[0])
            token2id[s[0]] = idx
    return token2id, id2token, loaded_embeddings

In [0]:
# call load_ft to load fast_text embedding
# Read up to 500,000 lines of the fastText file.
# ft_emb has two extra leading rows for the '<pad>' and '<unk>' entries.
token2id, id2token,ft_emb = load_ft(500000)

In [0]:
# Sanity check: index of the '<unk>' placeholder added by load_ft (UNK_IDX).
token2id['<unk>']

In [0]:
# Look up the literal token 'UNK'; raises KeyError unless the loaded fastText
# vocabulary contains it. df2idx uses this entry as its out-of-vocabulary fallback.
token2id['UNK']

In [0]:
# Same lookup for the lowercase token 'unk'.
token2id['unk']

In [0]:
# Repeat of the earlier 'UNK' lookup.
token2id['UNK']

In [0]:
def df2idx(fname):
    """
    Load a tab-separated SNLI-style file and convert it to index form.

    The file must provide the columns 'sentence1', 'sentence2' and 'label'.
    Sentences are split on whitespace and every token is replaced by its row in
    the module-level `token2id` vocabulary.

    @param fname: path of the .tsv file
    @return: (indexed_data, label, df) where indexed_data is a list of
             (sentence1 ids, sentence2 ids) tuples, label is a numpy array with
             entailment=0, contradiction=1, neutral=2, and df is the data frame
             extended with the 'sent1_idx' / 'sent2_idx' columns.
    """
    df = pd.read_csv(fname, sep="\t", index_col=False)
    # change the label to numerical value (labels outside the mapping are kept as-is)
    label_ids = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
    df['label'] = df['label'].map(lambda value: label_ids.get(value, value))

    # Out-of-vocabulary tokens keep using the pre-trained 'UNK' vector when the
    # loaded vocabulary has one; otherwise fall back to the '<unk>' placeholder
    # instead of failing with a KeyError.
    oov_idx = token2id.get('UNK', UNK_IDX)

    def encode(sentence):
        # convert token to idx
        return [token2id.get(token, oov_idx) for token in sentence.split()]

    # Series.map always yields one list per row; row-wise DataFrame.apply may
    # try to expand equally long lists into columns.
    df['sent1_idx'] = df['sentence1'].map(encode)
    df['sent2_idx'] = df['sentence2'].map(encode)
    # convert df to data list and label list
    indexed_data = list(zip(df.sent1_idx, df.sent2_idx))
    label = np.array(df.label)
    return indexed_data, label, df

In [0]:
# Index the validation and training splits. Each call returns the list of
# (sentence1 ids, sentence2 ids) pairs, the numeric labels and the augmented data frame.
val_data, val_targets, val_df = df2idx("/content/snli_val.tsv")
train_data, train_targets, train_df = df2idx("/content/snli_train.tsv")

In [0]:
# Get a subset of training set to calculate the training accuracy
import random
# Sample from the actual size of the training set instead of a hard-coded 100000:
# a smaller set would raise IndexError and a larger one would never be sampled past
# its first 100000 rows. At most 10000 examples are drawn.
subset_size = min(10000, len(train_data))
subset_idx = random.sample(range(len(train_data)), subset_size)
subset_train_data = [train_data[i] for i in subset_idx]
subset_train_targets = [train_targets[i] for i in subset_idx]

In [0]:
# Report the longest tokenised sentence on each side of the training pairs
print('The max length of sentence 1 is {}'.format(max(len(pair[0]) for pair in train_data)))
print('The max length of sentence 2 is {}'.format(max(len(pair[1]) for pair in train_data)))


In [0]:
# Fixed lengths (in tokens) that sentence 1 / sentence 2 are truncated and padded to.
# NOTE(review): presumably the maxima printed for the training set in the previous cell — confirm.
MAX_SENTENCE1_LENGTH = 82
MAX_SENTENCE2_LENGTH = 41

import numpy as np
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    PyTorch Dataset over sentence pairs and their labels.
    Each entry of the data list holds the token ids of sentence 1 and sentence 2.
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: sequence of (sentence1 token ids, sentence2 token ids) pairs
        @param target_list: sequence of labels, aligned with data_list

        """
        self.data_list = data_list
        self.target_list = target_list
        # Every pair needs exactly one label.
        assert len(self.data_list) == len(self.target_list)

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Return [sentence1 ids, sentence2 ids, len(sentence1), len(sentence2), label]
        for dataset[key]; both sentences are cut to their maximum length first.
        """
        pair = self.data_list[key]
        first = pair[0][:MAX_SENTENCE1_LENGTH]
        second = pair[1][:MAX_SENTENCE2_LENGTH]
        return [first, second, len(first), len(second), self.target_list[key]]

def newsgroup_collate_func(batch):
    """
    Customized collate function for DataLoader. Every sentence 1 is right-padded
    with PAD_IDX to MAX_SENTENCE1_LENGTH and every sentence 2 to
    MAX_SENTENCE2_LENGTH, so all batches share the same width.

    @param batch: list of items as produced by NewsGroupDataset.__getitem__, i.e.
                  [sentence1 ids, sentence2 ids, len(sentence1), len(sentence2), label]
    @return: [sentence1 ids (LongTensor, batch x MAX_SENTENCE1_LENGTH),
              sentence1 lengths (LongTensor),
              sentence2 ids (LongTensor, batch x MAX_SENTENCE2_LENGTH),
              sentence2 lengths (LongTensor),
              labels (LongTensor)]
    """
    token1_data_list = []
    token2_data_list = []
    label_list = []
    token1_length_list = []
    token2_length_list = []
    for datum in batch:
        label_list.append(datum[4])
        token1_length_list.append(datum[2])
        token2_length_list.append(datum[3])
        # padding
        # dtype is forced to int64: an empty sentence would otherwise become a
        # float64 array and turn the whole batch into floats, which an embedding
        # lookup cannot index with.
        token1_data_list.append(np.pad(np.asarray(datum[0], dtype=np.int64),
                                       pad_width=(0, MAX_SENTENCE1_LENGTH - datum[2]),
                                       mode="constant", constant_values=PAD_IDX))
        token2_data_list.append(np.pad(np.asarray(datum[1], dtype=np.int64),
                                       pad_width=(0, MAX_SENTENCE2_LENGTH - datum[3]),
                                       mode="constant", constant_values=PAD_IDX))
    return [torch.from_numpy(np.array(token1_data_list, dtype=np.int64)), torch.LongTensor(token1_length_list),
            torch.from_numpy(np.array(token2_data_list, dtype=np.int64)), torch.LongTensor(token2_length_list),
            torch.LongTensor(label_list)]


In [0]:
# Build one DataLoader per split. All three use the same batch size and the
# custom collate function that pads every sentence to its fixed maximum length.
BATCH_SIZE = 32
# Validation split: used by train_model for the periodic validation accuracy.
val_dataset = NewsGroupDataset(val_data, val_targets)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

# Full training split: iterated over by train_model.
train_dataset =  NewsGroupDataset(train_data, train_targets)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)
# Random training subset: used by train_model to report a training accuracy.
subset_train_dataset =  NewsGroupDataset(subset_train_data, subset_train_targets)
subset_train_loader = torch.utils.data.DataLoader(dataset=subset_train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

In [0]:
class RNN(nn.Module):
    """
    Bidirectional-GRU classifier for sentence pairs.

    Both sentences share one embedding table and one GRU. Each sentence is
    represented by the concatenated final forward and backward hidden states of
    the last GRU layer; the two sentence vectors are concatenated and fed to a
    two-layer feed-forward classifier that returns log-probabilities.
    """
    def __init__(self, hidden_size, num_layers, num_classes, pre_trained_emb):
        # RNN Accepts the following hyperparams:
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        # pre_trained_emb : pre_trained embedding matrix of shape (vocabulary size, embedding size);
        #                   its values are copied into the embedding layer
        super(RNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        # Kept so init_weights() can (re)load the values later.
        self._pre_trained_emb = pre_trained_emb
        # embedding module
        self.embedding = nn.Embedding(pre_trained_emb.shape[0], pre_trained_emb.shape[1], padding_idx=PAD_IDX)
        # create bi-directional GRU in pytorch(batch_first: the first dim is batch, 2nd is sequence dim, 3rd is embedding dim)
        self.rnn = nn.GRU(pre_trained_emb.shape[1], hidden_size, num_layers, bidirectional=True, batch_first=True)
        # create decoder layer: input is [sentence1 ; sentence2], each 2 * hidden_size wide
        self.linear1 = nn.Linear(hidden_size * 4, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()
        # Previously only the SHAPE of pre_trained_emb was used and the embedding
        # stayed randomly initialised. Load the values here; the weights remain
        # trainable so existing optimizer set-ups keep working. Call
        # init_weights(is_static=True) to freeze them (older PyTorch releases may
        # then require passing only the trainable parameters to the optimizer).
        self.init_weights(is_static=False)

    def init_weights(self, is_static=True):
        """
        Copy the pre-trained matrix into the embedding layer.

        @param is_static: when True the embedding weights are frozen
                          (requires_grad = False); when False they stay trainable.
        """
        weights = torch.as_tensor(self._pre_trained_emb)
        # In-place copy keeps the parameter object (and its device) intact.
        with torch.no_grad():
            self.embedding.weight.copy_(weights)
        self.embedding.weight.requires_grad = not is_static

    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (2 * num_layers, batch_size, hidden_size) because the GRU is bidirectional
        # NOTE(review): the start state is random, so two forward passes over the
        # same input give different outputs, also in eval mode.
        hidden = torch.randn(2 * self.num_layers, batch_size, self.hidden_size)

        return hidden

    def _encode_sentence(self, token_data, token_lengths, hidden):
        """Run one padded batch through embedding + GRU; returns (batch, 2 * hidden_size)."""
        # pack_padded_sequence wants the batch sorted by decreasing length
        sorted_lengths, idx_sort = torch.sort(token_lengths, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        embedded = self.embedding(token_data.index_select(0, idx_sort))
        # Lengths are moved to the CPU, which recent PyTorch versions require.
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, sorted_lengths.cpu(), batch_first=True)
        _, hidden_out = self.rnn(packed, hidden)
        # The last two slices are the forward / backward states of the LAST layer
        # (identical to indices 0 and 1 when num_layers == 1).
        sentence_vec = torch.cat((hidden_out[-2], hidden_out[-1]), dim=1)
        # restore the original batch order
        return sentence_vec.index_select(0, idx_unsort)

    def forward(self, token1_data, token1_lengths, token2_data, token2_lengths):
        """
        @param token1_data: LongTensor (batch, seq_len) with padded sentence-1 ids
        @param token1_lengths: LongTensor (batch) with the true sentence-1 lengths
        @param token2_data: LongTensor (batch, seq_len) with padded sentence-2 ids
        @param token2_lengths: LongTensor (batch) with the true sentence-2 lengths
        @return: log-probabilities of shape (batch, num_classes)
        """
        batch_size = token1_data.size(0)
        # reset hidden state; it lives on the same device as the model's weights
        # instead of being moved to CUDA unconditionally
        self.hidden = self.init_hidden(batch_size).to(self.embedding.weight.device)

        hidden_out1 = self._encode_sentence(token1_data, token1_lengths, self.hidden)
        hidden_out2 = self._encode_sentence(token2_data, token2_lengths, self.hidden)
        # concatenate two encoded sentences
        out_cat = torch.cat((hidden_out1, hidden_out2), dim=1)

        hidden1 = self.linear1(out_cat)
        hidden1 = self.relu(hidden1)
        out = self.linear2(hidden1)
        preds = F.log_softmax(out, 1)
        return preds


In [0]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data1, lengths1,data2, lengths2, labels in loader:
#         data1 = Variable(data1)  
#         lengths1 = Variable(lengths1)
#         data2 = Variable(data2)  
#         lengths2 = Variable(lengths2)# Convert torch tensor to Variable: change image from a vector of size 784 to a matrix of 28 x 28
#         labels = Variable(labels)
        if use_cuda and torch.cuda.is_available():
            data1 = data1.cuda()
            lengths1  = lengths1.cuda()
            data2 = data2.cuda()
            lengths2  = lengths2.cuda()
            labels = labels.cuda()
        data1_batch, lengths1_batch,data2_batch, lengths2_batch, label_batch = data1, lengths1, data2, lengths2,labels
        outputs =model(data1_batch, lengths1_batch,data2_batch, lengths2_batch)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [0]:
def train_model(loader, model):
    """
    Train `model` with Adam and negative log-likelihood loss.

    Reads the notebook-level settings `learning_rate` and `num_epochs`, and
    evaluates on the notebook-level `val_loader` and `subset_train_loader`.

    @param loader: DataLoader yielding (data1, lengths1, data2, lengths2, labels)
    @param model: module returning log-probabilities for model(data1, lengths1, data2, lengths2)
    @return: (val_acc_ls, train_acc_ls) - validation accuracies and training-subset
             accuracies, one entry per evaluation point (every 400 batches)
    """
    criterion = torch.nn.NLLLoss()
    # Only hand trainable parameters to the optimizer so that frozen weights
    # (requires_grad = False) are skipped.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable_params, lr=learning_rate)
    # Batches follow the device of the model's weights.
    device = next(model.parameters()).device
    val_acc_ls = []
    train_acc_ls = []
    for epoch in range(num_epochs):
        for i, (data1, lengths1, data2, lengths2, labels) in enumerate(loader):
            data1 = data1.to(device)
            lengths1 = lengths1.to(device)
            data2 = data2.to(device)
            lengths2 = lengths2.to(device)
            labels = labels.to(device)
            # test_model() leaves the model in eval mode, so switch back every step
            model.train()
            optimizer.zero_grad()
            # Forward pass
            outputs = model(data1, lengths1, data2, lengths2)
            loss = criterion(outputs, labels)
            # Backward and optimize
            loss.backward()
            optimizer.step()
            # validate every 400 iterations (skipping the very first batch)
            if i > 0 and i % 400 == 0:
                val_acc = test_model(val_loader, model)
                train_acc = test_model(subset_train_loader, model)
                train_acc_ls.append(train_acc)
                val_acc_ls.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}, Training Acc: {}'.format(
                           epoch+1, num_epochs, i+1, len(loader), val_acc, train_acc))
    return  val_acc_ls, train_acc_ls



# 2. RNN Tuning

## 2.1 Hidden Size Tuning for RNN
- Ways of interacting two sentences: Concatenation 
- Learning_rate 3e-4
- Number of Epochs: 10
- Embedding Weights: Freeze All
- Hidden size list (50, 100, 200, 300, 400)

In [0]:
# Notebook-level training settings read by train_model and by the training cells.
learning_rate = 3e-4
num_epochs = 10 # number epoch to train
use_cuda = True

def find_best_hiddensize_RNN(hidden_ls):
    """
    Train one single-layer RNN model per hidden size and collect the accuracy curves.

    @param hidden_ls: iterable of hidden sizes to try
    @return: dict mapping 'hidden_size_<n>' to the (validation accuracies,
             training-subset accuracies) tuple returned by train_model.
             Each trained model's state dict is also saved to
             'RNN_hidden_size_<n>.pkl' in the working directory.
    """
    performance = {}
    for hidden_size in hidden_ls:
        print('---------RNN_HIDDEN_SIZE: {}-------------'.format(hidden_size))
        model = RNN(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
        if use_cuda and torch.cuda.is_available():
            model.cuda()

        # train_model returns accuracies (validation first, training subset second),
        # not a loss, so the names reflect that.
        val_acc, train_acc = train_model(train_loader, model)
        performance['hidden_size_{}'.format(hidden_size)] = (val_acc, train_acc)
        torch.save(model.state_dict(), 'RNN_hidden_size_{}.pkl'.format(hidden_size))
    return performance

In [0]:
# This single call runs the whole hidden-size sweep, but it takes a long time.
# Since the Google Colab runtime often disconnects during long runs, the cells
# below train the same configurations one by one as a fallback.

hidden_ls = [50,100,200,300,400]
hidden_record = find_best_hiddensize_RNN(hidden_ls)


In [0]:
hidden_size = 50
model = RNN(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'RNN_hidden_size_{}.pkl'.format(hidden_size))
trainacc_performance['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
!cp 'RNN_hidden_size_50.pkl' 'drive/My Drive/Colab Notebooks/RNN_hidden_size_50.pkl'

In [0]:
# Second training run for hidden size 50, recorded in a separate `performance` dict.
# It saves to the same 'RNN_hidden_size_50.pkl' file name as the previous cell.
performance = {}
hidden_size = 50
model = RNN(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
# NOTE: train_model returns (validation accuracies, training-subset accuracies),
# so `train_loss` holds validation accuracies and `val_acc` holds training accuracies.
train_loss, val_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'RNN_hidden_size_{}.pkl'.format(hidden_size))
performance['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
# Train the concatenation RNN with hidden size 100 and record its accuracy curves.
# The dict is only created when missing, so results already stored for other
# hidden sizes are not wiped.
if 'trainacc_performance' not in globals():
    trainacc_performance = {}
hidden_size = 100
model = RNN(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
# train_model returns exactly two lists (validation and training-subset
# accuracies); unpacking three names raised a ValueError.
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'RNN_hidden_size_{}.pkl'.format(hidden_size))
trainacc_performance['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)

In [0]:
# Keep a copy of the hidden-size-100 checkpoint on Google Drive.
!cp 'RNN_hidden_size_100.pkl' 'drive/My Drive/Colab Notebooks/RNN_hidden_size_100.pkl'

In [0]:
# Second training run for hidden size 100, added to the existing `performance` dict
# (the re-initialisation below is deliberately commented out).
# It saves to the same 'RNN_hidden_size_100.pkl' file name as the earlier cell.
#performance = {}
hidden_size = 100
model = RNN(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
# NOTE: train_model returns (validation accuracies, training-subset accuracies),
# so `train_loss` holds validation accuracies and `val_acc` holds training accuracies.
train_loss, val_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'RNN_hidden_size_{}.pkl'.format(hidden_size))
performance['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
hidden_size = 200
model = RNN(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
train_loss, val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'RNN_hidden_size_{}.pkl'.format(hidden_size))
trainacc_performance['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc,train_acc)
!cp 'RNN_hidden_size_200.pkl' 'drive/My Drive/Colab Notebooks/RNN_hidden_size_200.pkl'

In [0]:
# Second training run for hidden size 200, recorded in `performance`.
# It saves to the same 'RNN_hidden_size_200.pkl' file name as the previous cell.
hidden_size = 200
model = RNN(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
# NOTE: train_model returns (validation accuracies, training-subset accuracies),
# so `train_loss` holds validation accuracies and `val_acc` holds training accuracies.
train_loss, val_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'RNN_hidden_size_{}.pkl'.format(hidden_size))
performance['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
# Train the concatenation RNN with hidden size 300; the (validation, training-subset)
# accuracy curves go into `trainacc_performance`, which must already exist.
hidden_size = 300
model = RNN(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'RNN_hidden_size_{}.pkl'.format(hidden_size))
trainacc_performance['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
# Keep a copy of the checkpoint on Google Drive.
!cp 'RNN_hidden_size_300.pkl' 'drive/My Drive/Colab Notebooks/RNN_hidden_size_300.pkl'

In [0]:
# Second training run for hidden size 300, recorded in `performance`.
# It saves to the same 'RNN_hidden_size_300.pkl' file name as the previous cell.
hidden_size = 300
model = RNN(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
# NOTE: train_model returns (validation accuracies, training-subset accuracies),
# so `train_loss` holds validation accuracies and `val_acc` holds training accuracies.
train_loss, val_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'RNN_hidden_size_{}.pkl'.format(hidden_size))
performance['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
# Train the concatenation RNN with hidden size 400; the (validation, training-subset)
# accuracy curves go into `trainacc_performance`, which must already exist.
hidden_size = 400
model = RNN(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'RNN_hidden_size_{}.pkl'.format(hidden_size))
trainacc_performance['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
# Keep a copy of the checkpoint on Google Drive.
!cp 'RNN_hidden_size_400.pkl' 'drive/My Drive/Colab Notebooks/RNN_hidden_size_400.pkl'

In [0]:
import pickle
# Persist the accuracy curves of the hidden-size runs. The context manager
# closes the file even when pickling fails.
with open("rnn_hidden_size_record_new.pkl","wb") as f:
    pickle.dump(trainacc_performance,f)

In [0]:
# Download the pickled accuracy record from the Colab runtime to the local machine.
from google.colab import files
files.download('rnn_hidden_size_record_new.pkl') 

In [0]:
# Second training run for hidden size 400, recorded in `performance`.
# It saves to the same 'RNN_hidden_size_400.pkl' file name as the earlier cell.
hidden_size = 400
model = RNN(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
# NOTE: train_model returns (validation accuracies, training-subset accuracies),
# so `train_loss` holds validation accuracies and `val_acc` holds training accuracies.
train_loss, val_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'RNN_hidden_size_{}.pkl'.format(hidden_size))
performance['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
# Download the checkpoints of all five hidden sizes from the Colab runtime.
# Relies on `files` having been imported from google.colab in an earlier cell.
files.download('RNN_hidden_size_50.pkl')
files.download('RNN_hidden_size_100.pkl') 
files.download('RNN_hidden_size_200.pkl') 
files.download('RNN_hidden_size_300.pkl')
files.download('RNN_hidden_size_400.pkl')

## 2.2 Combine two encoded sentences with element-wise multiplication for RNN
- Instead of concatenation of two encoded sentences, do element-wise multiplication
- Freeze all embedding weights
- Also tuning hidden size
- Hidden size list (100, 200, 300, 400, 800)

In [0]:
class RNN_mul(nn.Module):
    """
    Bidirectional-GRU classifier for sentence pairs that combines the two
    sentence encodings by element-wise multiplication.

    Both sentences share one embedding table and one GRU. Each sentence is
    represented by the concatenated final forward and backward hidden states of
    the last GRU layer; the product of the two sentence vectors is fed to a
    two-layer feed-forward classifier that returns log-probabilities.
    """
    def __init__(self, hidden_size, num_layers, num_classes, pre_trained_emb):
        # RNN Accepts the following hyperparams
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        # pre_trained_emb: pre_trained fast text matrix of shape (vocabulary size, embedding size);
        #                  its values are copied into the embedding layer
        super(RNN_mul, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        # Kept so init_weights() can (re)load the values later.
        self._pre_trained_emb = pre_trained_emb
        # embedding module
        self.embedding = nn.Embedding(pre_trained_emb.shape[0], pre_trained_emb.shape[1], padding_idx=PAD_IDX)
        # create RNN in pytorch(batch_first: the first dim is batch, 2nd is sequence dim, 3rd is embedding dim)
        self.rnn = nn.GRU(pre_trained_emb.shape[1], hidden_size, num_layers, bidirectional=True, batch_first=True)

        # the element-wise product keeps the width of one sentence vector (2 * hidden_size)
        self.linear1 = nn.Linear(hidden_size * 2, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()
        # Previously only the SHAPE of pre_trained_emb was used and the embedding
        # stayed randomly initialised. Load the values here; the weights remain
        # trainable so existing optimizer set-ups keep working. Call
        # init_weights(is_static=True) to freeze them (older PyTorch releases may
        # then require passing only the trainable parameters to the optimizer).
        self.init_weights(is_static=False)

    def init_weights(self, is_static=True):
        """
        Copy the pre-trained matrix into the embedding layer.

        @param is_static: when True the embedding weights are frozen
                          (requires_grad = False); when False they stay trainable.
        """
        weights = torch.as_tensor(self._pre_trained_emb)
        # In-place copy keeps the parameter object (and its device) intact.
        with torch.no_grad():
            self.embedding.weight.copy_(weights)
        self.embedding.weight.requires_grad = not is_static

    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (2 * num_layers, batch_size, hidden_size) because the GRU is bidirectional
        # NOTE(review): the start state is random, so two forward passes over the
        # same input give different outputs, also in eval mode.
        hidden = torch.randn(2 * self.num_layers, batch_size, self.hidden_size)

        return hidden

    def _encode_sentence(self, token_data, token_lengths, hidden):
        """Run one padded batch through embedding + GRU; returns (batch, 2 * hidden_size)."""
        # pack_padded_sequence wants the batch sorted by decreasing length
        sorted_lengths, idx_sort = torch.sort(token_lengths, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        embedded = self.embedding(token_data.index_select(0, idx_sort))
        # Lengths are moved to the CPU, which recent PyTorch versions require.
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, sorted_lengths.cpu(), batch_first=True)
        _, hidden_out = self.rnn(packed, hidden)
        # The last two slices are the forward / backward states of the LAST layer
        # (identical to indices 0 and 1 when num_layers == 1).
        sentence_vec = torch.cat((hidden_out[-2], hidden_out[-1]), dim=1)
        # restore the original batch order
        return sentence_vec.index_select(0, idx_unsort)

    def forward(self, token1_data, token1_lengths, token2_data, token2_lengths):
        """
        @param token1_data: LongTensor (batch, seq_len) with padded sentence-1 ids
        @param token1_lengths: LongTensor (batch) with the true sentence-1 lengths
        @param token2_data: LongTensor (batch, seq_len) with padded sentence-2 ids
        @param token2_lengths: LongTensor (batch) with the true sentence-2 lengths
        @return: log-probabilities of shape (batch, num_classes)
        """
        batch_size = token1_data.size(0)
        # reset hidden state; it lives on the same device as the model's weights
        # instead of being moved to CUDA unconditionally
        self.hidden = self.init_hidden(batch_size).to(self.embedding.weight.device)

        hidden_out1 = self._encode_sentence(token1_data, token1_lengths, self.hidden)
        hidden_out2 = self._encode_sentence(token2_data, token2_lengths, self.hidden)
        # interact the two encoded sentences by element-wise multiplication
        out_cat = torch.mul(hidden_out1, hidden_out2)

        hidden1 = self.linear1(out_cat)
        hidden1 = self.relu(hidden1)
        out = self.linear2(hidden1)
        preds = F.log_softmax(out, 1)
        return preds


In [0]:
# Start the results dict for the multiplication model and train hidden size 300.
# Stores the (validation, training-subset) accuracy curves returned by train_model.
rnn_mul_trainacc = {}
hidden_size = 300
model = RNN_mul(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'mul_RNN_hidden_size_{}.pkl'.format(hidden_size))
rnn_mul_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
# Keep a copy of the checkpoint on Google Drive.
!cp 'mul_RNN_hidden_size_300.pkl' 'drive/My Drive/Colab Notebooks/mul_RNN_hidden_size_300.pkl'

In [0]:
# Second training run of the multiplication model with hidden size 300, kept in
# `model_mul` and recorded in a separate `record_multiply_rnn` dict.
record_multiply_rnn = {}
hidden_size = 300
model_mul = RNN_mul(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_mul.cuda()
# NOTE: train_model returns (validation accuracies, training-subset accuracies),
# so `train_loss` holds validation accuracies and `val_acc` holds training accuracies.
train_loss, val_acc = train_model(train_loader,model_mul)
torch.save(model_mul.state_dict(), 'RNN_mul_hidden_size_{}.pkl'.format(hidden_size))
record_multiply_rnn['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
# Validation accuracy (in percent) of the multiplication RNN with hidden size 300.
test_model(val_loader, model_mul)

In [0]:
# Persist the multiplication-model record. The context manager closes the file
# even when pickling fails.
with open("rnn_mul_hidden_size300_record.pkl","wb") as g:
    pkl.dump(record_multiply_rnn,g)

In [0]:
# Download the pickled record from the Colab runtime to the local machine.
from google.colab import files
files.download('rnn_mul_hidden_size300_record.pkl')

In [0]:
# Keep a copy of the hidden-size-300 multiplication checkpoint in the Drive folder.
!cp 'RNN_mul_hidden_size_300.pkl' 'drive/My Drive/Colab Notebooks'

In [0]:
rnn_mul_trainacc = {}
hidden_size = 400
model = RNN_mul(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'mul_RNN_hidden_size_{}.pkl'.format(hidden_size))
rnn_mul_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
!cp 'mul_RNN_hidden_size_400.pkl' 'drive/My Drive/Colab Notebooks/mul_RNN_hidden_size_400.pkl'

In [0]:
import pickle
# Snapshot of the multiplication-model accuracy curves collected so far.
# The context manager closes the file even when pickling fails.
with open("rnn_mul_hidden_new1.pkl","wb") as f:
    pickle.dump(rnn_mul_trainacc,f)

In [0]:
# Download the first snapshot of the accuracy curves.
from google.colab import files
files.download('rnn_mul_hidden_new1.pkl')

In [0]:
# Show which hidden sizes have been recorded so far.
rnn_mul_trainacc.keys()

In [0]:
# Second training run of the multiplication model with hidden size 400.
# Rebinds `model_mul` and adds to `record_multiply_rnn`.
hidden_size = 400
model_mul = RNN_mul(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_mul.cuda()
# NOTE: train_model returns (validation accuracies, training-subset accuracies),
# so `train_loss` holds validation accuracies and `val_acc` holds training accuracies.
train_loss, val_acc = train_model(train_loader,model_mul)
torch.save(model_mul.state_dict(), 'RNN_mul_hidden_size_{}.pkl'.format(hidden_size))
record_multiply_rnn['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
# Validation accuracy (in percent) of the multiplication RNN with hidden size 400.
test_model(val_loader, model_mul)

In [0]:
# Persist the multiplication-model record and download it. The context manager
# closes the file before the download starts, even when pickling fails.
with open("rnn_mul_record2.pkl","wb") as g:
    pkl.dump(record_multiply_rnn,g)
from google.colab import files
files.download('rnn_mul_record2.pkl')

In [0]:
# Keep a copy of the hidden-size-400 multiplication checkpoint in the Drive folder.
!cp 'RNN_mul_hidden_size_400.pkl' 'drive/My Drive/Colab Notebooks'

In [0]:
# Train the multiplication model with hidden size 200; the (validation,
# training-subset) accuracy curves go into the existing `rnn_mul_trainacc` dict.
hidden_size = 200
model = RNN_mul(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'mul_RNN_hidden_size_{}.pkl'.format(hidden_size))
rnn_mul_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
# Keep a copy of the checkpoint on Google Drive.
!cp 'mul_RNN_hidden_size_200.pkl' 'drive/My Drive/Colab Notebooks/mul_RNN_hidden_size_200.pkl'


In [0]:
import pickle
# Snapshot of the multiplication-model accuracy curves collected so far.
# The context manager closes the file even when pickling fails.
with open("rnn_mul_hidden_new2.pkl","wb") as f:
    pickle.dump(rnn_mul_trainacc,f)

In [0]:
# Download the second snapshot of the accuracy curves.
files.download('rnn_mul_hidden_new2.pkl')

In [0]:
# Second training run of the multiplication model with hidden size 200, kept in
# its own variable `model_mul_200` and added to `record_multiply_rnn`.
hidden_size = 200
model_mul_200= RNN_mul(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_mul_200.cuda()
# NOTE: train_model returns (validation accuracies, training-subset accuracies),
# so `train_loss` holds validation accuracies and `val_acc` holds training accuracies.
train_loss, val_acc = train_model(train_loader,model_mul_200)
torch.save(model_mul_200.state_dict(), 'RNN_mul_hidden_size_{}.pkl'.format(hidden_size))
record_multiply_rnn['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
# Validation accuracy (in percent) of the multiplication RNN with hidden size 200.
test_model(val_loader, model_mul_200)

In [0]:
# Persist the multiplication-model record and download it. The context manager
# closes the file before the download starts, even when pickling fails.
with open("rnn_mul_record3.pkl","wb") as g:
    pkl.dump(record_multiply_rnn,g)
from google.colab import files
files.download('rnn_mul_record3.pkl')

In [0]:
# Keep a copy of the hidden-size-200 multiplication checkpoint in the Drive folder.
!cp 'RNN_mul_hidden_size_200.pkl' 'drive/My Drive/Colab Notebooks'

In [0]:
# Download the hidden-size-200 multiplication checkpoint from the Colab runtime.
files.download('RNN_mul_hidden_size_200.pkl')

In [0]:
# Train the multiplication model with hidden size 100; the (validation,
# training-subset) accuracy curves go into the existing `rnn_mul_trainacc` dict.
hidden_size = 100
model = RNN_mul(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'mul_RNN_hidden_size_{}.pkl'.format(hidden_size))
rnn_mul_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
# Keep a copy of the checkpoint on Google Drive.
!cp 'mul_RNN_hidden_size_100.pkl' 'drive/My Drive/Colab Notebooks/mul_RNN_hidden_size_100.pkl'


In [0]:
import pickle
# Snapshot of the multiplication-model accuracy curves collected so far.
# The context manager closes the file even when pickling fails.
with open("rnn_mul_hidden_new3.pkl","wb") as f:
    pickle.dump(rnn_mul_trainacc,f)

In [0]:
# Download the third snapshot of the accuracy curves.
files.download("rnn_mul_hidden_new3.pkl")

In [0]:
# Second training run of the multiplication model with hidden size 100, kept in
# its own variable `model_mul_100` and added to `record_multiply_rnn`.
hidden_size = 100
model_mul_100= RNN_mul(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_mul_100.cuda()
# NOTE: train_model returns (validation accuracies, training-subset accuracies),
# so `train_loss` holds validation accuracies and `val_acc` holds training accuracies.
train_loss, val_acc = train_model(train_loader,model_mul_100)
torch.save(model_mul_100.state_dict(), 'RNN_mul_hidden_size_{}.pkl'.format(hidden_size))
record_multiply_rnn['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
# Validation accuracy (in percent) of the multiplication RNN with hidden size 100.
test_model(val_loader, model_mul_100)

In [0]:
# Persist the multiplication-model record and download it. The context manager
# closes the file before the download starts, even when pickling fails.
with open("rnn_mul_record4.pkl","wb") as g:
    pkl.dump(record_multiply_rnn,g)
from google.colab import files
files.download('rnn_mul_record4.pkl')

In [0]:
!cp 'RNN_mul_hidden_size_100.pkl' 'drive/My Drive/Colab Notebooks'

In [0]:
files.download('RNN_mul_hidden_size_100.pkl')

In [0]:
learning_rate = 3e-4
num_epochs = 10 # number epoch to train
use_cuda = True

In [0]:
hidden_size = 800
model = RNN_mul(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'mul_RNN_hidden_size_{}.pkl'.format(hidden_size))
rnn_mul_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
!cp 'mul_RNN_hidden_size_800.pkl' 'drive/My Drive/Colab Notebooks/mul_RNN_hidden_size_800.pkl'


In [0]:
import pickle
f = open("rnn_mul_hidden_new4.pkl","wb")
pickle.dump(rnn_mul_trainacc,f)
f.close()

In [0]:
files.download("rnn_mul_hidden_new4.pkl")

In [0]:
# Train RNN_mul with hidden_size 800 and keep its (train_loss, val_acc) record.
record_mul_800 = {}
hidden_size = 800
model_mul_800 = RNN_mul(hidden_size = hidden_size, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_mul_800.cuda()
train_loss, val_acc = train_model(train_loader,model_mul_800)
# Record the metrics before touching the disk so a failed save cannot lose them.
record_mul_800['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)
# Every other Drive path in this notebook uses the folder "Colab Notebooks";
# the previous path here dropped the final "s", which points at a different folder.
torch.save(model_mul_800.state_dict(), 'drive/My Drive/Colab Notebooks/RNN_mul_hidden_size_{}.pkl'.format(hidden_size))

In [0]:
torch.save(model_mul_800.state_dict(), 'RNN_mul_hidden_size_{}.pkl'.format(hidden_size))


In [0]:
!cp 'RNN_mul_hidden_size_800.pkl' 'drive/My Drive/Colab Notebooks'

In [0]:
record_mul_800['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
#val acc of hiddensize 800 with multiplication concat
val_acc_rnn_mul800 = test_model(val_loader, model_mul_800)
val_acc_rnn_mul800

In [0]:
rnn_mul_100 = RNN_mul(hidden_size = 100,  num_layers = 1, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  rnn_mul_100.cuda()
rnn_mul_100.load_state_dict(torch.load('drive/My Drive/Colab Notebooks/RNN_mul_hidden_size_100.pkl'))
test_model(val_loader, rnn_mul_100)


In [0]:
# Persist the hidden_size 800 results, then download the pickle.
# The context manager closes the file even if pickling fails.
with open("rnn_mul_800_record.pkl","wb") as g:
    pkl.dump(record_mul_800,g)
from google.colab import files
files.download('rnn_mul_800_record.pkl')

# 3. CNN Tuning

## 3.1 Hidden Size Tuning for CNN
- Interaction between the two encoded sentences: concatenation
- Embedding weights: all frozen
- Kernel size: 3
- Hidden size list: (50, 100, 200, 300, 400, 800)

In [0]:
class CNN(nn.Module):
    """
    CNN sentence-pair classifier that concatenates the two encoded sentences.

    Both sentences go through the same embedding layer and the same two
    Conv1d + ReLU layers, are max-pooled over time, concatenated, and then
    classified by a two-layer feed-forward network that returns
    log-probabilities over the classes.
    """
    def __init__(self,  hidden_size, kernel_size, padding_size, num_layers, num_classes, pre_trained_emb):
        """
        @param hidden_size: number of output channels of each convolution
        @param kernel_size: convolution window, i.e. how many tokens are read at once
        @param padding_size: padding passed to both convolutions
        @param num_layers: only stored on the instance; two conv layers are always built
        @param num_classes: size of the output layer
        @param pre_trained_emb: embedding matrix; its shape fixes (vocab size, embedding dim)
                                and it is kept so that init_weights() can copy it in
        """
        super(CNN, self).__init__()

        self.num_layers, self.hidden_size, self.kernel_size ,self.padding_size= num_layers, hidden_size,kernel_size,padding_size
        # Kept on the instance because init_weights() needs it; a plain attribute is not part of state_dict.
        self.pre_trained_emb = pre_trained_emb
        self.embedding = nn.Embedding(pre_trained_emb.shape[0], pre_trained_emb.shape[1], padding_idx=PAD_IDX)
        # The embedding size is the number of input channels, hidden_size the number of output channels.
        self.conv1 = nn.Conv1d( pre_trained_emb.shape[1], hidden_size, kernel_size, padding=padding_size)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size, padding=padding_size)

        self.linear1 = nn.Linear(hidden_size*2, hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size, num_classes)

    def init_weights(self, is_static=True):
        """
        Copy the pre-trained vectors given to the constructor into the embedding layer.
        @param is_static: when True the embedding weights stop requiring gradients
        """
        weights = torch.from_numpy(self.pre_trained_emb).float()
        # In-place copy keeps the existing Parameter object, and therefore its device.
        self.embedding.weight.data.copy_(weights)
        if is_static:
            self.embedding.weight.requires_grad = False

    def _encode(self, token_data):
        """Encode a (batch, seq_len) tensor of token ids into a (batch, hidden_size) tensor."""
        embedded = self.embedding(token_data)
        # Conv1d expects (batch, channels, seq_len), so the last two dims are swapped once.
        hidden = F.relu(self.conv1(embedded.transpose(1,2)))
        hidden = F.relu(self.conv2(hidden))
        # Max-pooling over time: reducing the sequence dim directly avoids a later squeeze().
        return hidden.max(dim=2)[0]

    def forward(self, token1_data, token1_lengths,token2_data, token2_lengths):
        """
        @param token1_data, token2_data: (batch, seq_len) token id tensors of the two sentences
        @param token1_lengths, token2_lengths: accepted for interface compatibility; not used
        @return: (batch, num_classes) log-probabilities
        """
        hidden_sent1 = self._encode(token1_data)
        hidden_sent2 = self._encode(token2_data)
        # Concatenate on the feature dim; the batch dim survives even for a batch of one.
        hidden = torch.cat((hidden_sent1, hidden_sent2), dim=1)
        fc1_out = self.linear1(hidden)
        fc1_out = self.relu(fc1_out)
        fc2_out = self.linear2(fc1_out)
        preds = F.log_softmax(fc2_out, 1)
        return preds
        

In [0]:
learning_rate = 3e-4
num_epochs = 10 # number epoch to train
use_cuda = True

In [0]:
cnn_hidden_trainacc = {}
hidden_size = 50
model = CNN(hidden_size = hidden_size,  kernel_size = 3,padding_size = 1,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'new_CNN_hidden_size_{}.pkl'.format(hidden_size))
cnn_hidden_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
!cp 'new_CNN_hidden_size_50.pkl' 'drive/My Drive/Colab Notebooks/new_CNN_hidden_size_50.pkl'


In [0]:
import pickle
f = open("cnn_cat_hidden50_trainacc.pkl","wb")
pickle.dump(cnn_hidden_trainacc,f)
f.close()

In [0]:
files.download('cnn_cat_hidden50_trainacc.pkl')

In [0]:
performance_CNN = {}
hidden_size = 50
model_CNN = CNN(hidden_size = hidden_size, kernel_size = 3,padding_size = 1, num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_CNN.cuda()
train_loss, val_acc = train_model(train_loader,model_CNN)
torch.save(model_CNN.state_dict(), 'CNN_hidden_size_{}.pkl'.format(hidden_size))
performance_CNN['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
import pickle
g = open("cnn_hidden_size_record.pkl","wb")
pickle.dump(performance_CNN,g)
g.close()

In [0]:
files.download('cnn_hidden_size_record.pkl') 

In [0]:
#cnn_hidden_trainacc = {}
hidden_size = 100
model = CNN(hidden_size = hidden_size,  kernel_size = 3,padding_size = 1,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'new_CNN_hidden_size_{}.pkl'.format(hidden_size))
cnn_hidden_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
!cp 'new_CNN_hidden_size_100.pkl' 'drive/My Drive/Colab Notebooks/new_CNN_hidden_size_100.pkl'

In [0]:
#cnn_hidden_trainacc = {}
hidden_size = 100
model = CNN(hidden_size = hidden_size,  kernel_size = 3,padding_size = 1,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
#torch.save(model.state_dict(), 'new_CNN_hidden_size_{}.pkl'.format(hidden_size))
cnn_hidden_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
#!cp 'new_CNN_hidden_size_100.pkl' 'drive/My Drive/Colab Notebooks/new_CNN_hidden_size_100.pkl'


In [0]:
import pickle
g = open("cnn_hidden100_acc.pkl","wb")
pickle.dump(cnn_hidden_trainacc,g)
g.close()

In [0]:
from google.colab import files
files.download('cnn_hidden100_acc.pkl') 

In [0]:
cnn_hidden_trainacc.keys()

In [0]:
performance_CNN = {}
hidden_size = 100
model_CNN = CNN(hidden_size = hidden_size, kernel_size = 3,padding_size = 1, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_CNN.cuda()
train_loss, val_acc = train_model(train_loader,model_CNN)
torch.save(model_CNN.state_dict(), 'CNN_hidden_size_{}.pkl'.format(hidden_size))
performance_CNN['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
from google.colab import files
files.download('CNN_hidden_size_100.pkl') 

In [0]:
#cnn_hidden_trainacc = {}
hidden_size = 200
model = CNN(hidden_size = hidden_size,  kernel_size = 3,padding_size = 1,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'new_CNN_hidden_size_{}.pkl'.format(hidden_size))
cnn_hidden_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
!cp 'new_CNN_hidden_size_200.pkl' 'drive/My Drive/Colab Notebooks/new_CNN_hidden_size_200.pkl'

In [0]:
#cnn_hidden_trainacc = {}
hidden_size = 200
model = CNN(hidden_size = hidden_size,  kernel_size = 3,padding_size = 1,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
#torch.save(model.state_dict(), 'new_CNN_hidden_size_{}.pkl'.format(hidden_size))
cnn_hidden_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
#!cp 'new_CNN_hidden_size_200.pkl' 'drive/My Drive/Colab Notebooks/new_CNN_hidden_size_200.pkl'

In [0]:
cnn_hidden_trainacc = {}
cnn_hidden_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)

In [0]:
import pickle
g = open("cnn_hidden200_acc.pkl","wb")
pickle.dump(cnn_hidden_trainacc,g)
g.close()

In [0]:
from google.colab import files
files.download('cnn_hidden200_acc.pkl') 

In [0]:
hidden_size = 200
model_CNN = CNN(hidden_size = hidden_size, kernel_size = 3, padding_size = 1, num_layers=1, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_CNN.cuda()
train_loss, val_acc = train_model(train_loader,model_CNN)
torch.save(model_CNN.state_dict(), 'CNN_hidden_size_{}.pkl'.format(hidden_size))
performance_CNN['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
cnn_hidden_trainacc = {}
hidden_size = 300
model = CNN(hidden_size = hidden_size,  kernel_size = 3,padding_size = 1,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
#torch.save(model.state_dict(), 'new_CNN_hidden_size_{}.pkl'.format(hidden_size))
cnn_hidden_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
#!cp 'new_CNN_hidden_size_300.pkl' 'drive/My Drive/Colab Notebooks/new_CNN_hidden_size_300.pkl'

In [0]:
import pickle
g = open("cnn_hidden300_acc.pkl","wb")
pickle.dump(cnn_hidden_trainacc,g)
g.close()

In [0]:
from google.colab import files
files.download('cnn_hidden300_acc.pkl')

In [0]:
cnn_hidden_trainacc.keys()

In [0]:
performance_CNN = {}
hidden_size = 300
model_CNN = CNN(hidden_size = hidden_size, kernel_size = 3, padding_size = 1, num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_CNN.cuda()
train_loss, val_acc = train_model(train_loader,model_CNN)
torch.save(model_CNN.state_dict(), 'CNN_hidden_size_{}.pkl'.format(hidden_size))
performance_CNN['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
!cp 'CNN_hidden_size_300.pkl' 'drive/My Drive/Colab Notebooks/CNN_hidden_size_300.pkl'

In [0]:
test_model(val_loader, model_CNN)

In [0]:
performance_CNN = {}
hidden_size = 300
model_CNN = CNN(hidden_size = hidden_size, kernel_size = 3, padding_size = 1, num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_CNN.cuda()
train_loss, val_acc = train_model(train_loader,model_CNN)
torch.save(model_CNN.state_dict(), 'CNN_hidden_size_{}.pkl'.format(hidden_size))
performance_CNN['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
cnn_hidden_trainacc = {}
hidden_size = 400
model = CNN(hidden_size = hidden_size,  kernel_size = 3,padding_size = 1,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'new_CNN_hidden_size_{}.pkl'.format(hidden_size))
cnn_hidden_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
!cp 'new_CNN_hidden_size_400.pkl' 'drive/My Drive/Colab Notebooks/new_CNN_hidden_size_400.pkl'

In [0]:
import pickle
g = open("cnn_hidden_size400_new.pkl","wb")
pickle.dump(cnn_hidden_trainacc,g)
g.close()

In [0]:
from google.colab import files
files.download('cnn_hidden_size400_new.pkl') 

In [0]:
hidden_size = 400
model_CNN = CNN(hidden_size = hidden_size, kernel_size = 3, padding_size = 1, num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_CNN.cuda()
train_loss, val_acc = train_model(train_loader,model_CNN)
torch.save(model_CNN.state_dict(), 'CNN_hidden_size_{}.pkl'.format(hidden_size))
performance_CNN['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
#cnn_hidden_trainacc = {}
hidden_size = 800
model = CNN(hidden_size = hidden_size,  kernel_size = 3,padding_size = 1,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
#torch.save(model.state_dict(), 'new_CNN_hidden_size_{}.pkl'.format(hidden_size))
cnn_hidden_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
#!cp 'new_CNN_hidden_size_800.pkl' 'drive/My Drive/Colab Notebooks/new_CNN_hidden_size_800.pkl'

In [0]:
cnn_hidden_trainacc.keys()

In [0]:
import pickle
g = open("cnn_hidden_size800_new.pkl","wb")
pickle.dump(cnn_hidden_trainacc,g)
g.close()

In [0]:
import pickle
g = open("cnn_hidden_size800_new.pkl","wb")
pickle.dump(cnn_hidden_trainacc,g)
g.close()

In [0]:
from google.colab import files
files.download('cnn_hidden_size800_new.pkl') 

In [0]:
hidden_size = 800
model_CNN = CNN(hidden_size = hidden_size, kernel_size = 3, padding_size = 1, num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_CNN.cuda()
train_loss, val_acc = train_model(train_loader,model_CNN)
torch.save(model_CNN.state_dict(), 'CNN_hidden_size_{}.pkl'.format(hidden_size))
performance_CNN['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
model_cnn = CNN(hidden_size = 300, kernel_size = 3, padding_size = 1, num_layers = 2, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_cnn.cuda()
model_cnn.load_state_dict(torch.load('CNN_hidden_size_300.pkl'))

In [0]:
# VAL ACC for CNN with hidden_size 300 and kernel_size 3 :65.6
test_model(val_loader, model_cnn)

In [0]:
# VAL ACC for CNN with hidden_size 400 and kernel_size 3: 64.9
model_cnn = CNN(hidden_size = 400, kernel_size = 3, padding_size = 1, num_layers = 2, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_cnn.cuda()
model_cnn.load_state_dict(torch.load('CNN_hidden_size_400.pkl'))
test_model(val_loader, model_cnn)

In [0]:
# VAL ACC for CNN with hidden_size 800 and kernel_size 3: 65.0
model_cnn = CNN(hidden_size = 800, kernel_size = 3, padding_size = 1, num_layers = 2, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_cnn.cuda()
model_cnn.load_state_dict(torch.load('CNN_hidden_size_800.pkl'))
test_model(val_loader, model_cnn)

In [0]:
# Persist the CNN hidden-size results; the context manager closes the file even if pickling fails.
with open("cnn_hidden_size_record3.pkl","wb") as g:
    pkl.dump(performance_CNN,g)

In [0]:
from google.colab import files
files.download('cnn_hidden_size_record3.pkl') 

## 3.2  Kernel Size Tuning for CNN
- Hidden size 400
- Kernel size list (3, 5, 7)

In [0]:
cnn_kernel_trainacc = {}
kernel_size = 5
model = CNN(hidden_size = 400,  kernel_size = kernel_size,padding_size = 2,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'new_CNN_kernel_size_{}.pkl'.format(hidden_size))
cnn_kernel_trainacc['kernel_size_{}'.format(kernel_size)] = (val_acc,train_acc)


In [0]:
g = open("cnn_kernel5_acc.pkl","wb")
pkl.dump(cnn_kernel_trainacc,g)
g.close()

In [0]:
from google.colab import files
files.download('cnn_kernel5_acc.pkl')

In [0]:
!cp 'new_CNN_kernel_size_400.pkl' 'drive/My Drive/Colab Notebooks/new_CNN_kernel_size_5.pkl'

In [0]:
cnn_kernel_record = {}
kernel_size = 5
model_CNN = CNN(hidden_size = 400, kernel_size = kernel_size, padding_size = 2,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_CNN.cuda()
train_loss, val_acc = train_model(train_loader,model_CNN)
torch.save(model_CNN.state_dict(), 'CNN_kernel_size_{}.pkl'.format(kernel_size))
cnn_kernel_record['kernel_size_{}'.format(kernel_size)] = (train_loss, val_acc)

In [0]:
# VAL ACC for CNN with hidden_size 400 and kernel_size 5: 62
model_cnn = CNN(hidden_size = 400, kernel_size = 5, padding_size = 2, num_layers = 2, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_cnn.cuda()
model_cnn.load_state_dict(torch.load('CNN_kernel_size_5.pkl'))
test_model(val_loader, model_cnn)

In [0]:
#cnn_kernel_trainacc = {}
kernel_size = 7
model = CNN(hidden_size = 400,  kernel_size = kernel_size,padding_size = 3,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
torch.save(model.state_dict(), 'new_CNN_kernel_size_{}.pkl'.format(kernel_size))
cnn_kernel_trainacc['kernel_size_{}'.format(kernel_size)] = (val_acc,train_acc)
!cp 'new_CNN_kernel_size_7.pkl' 'drive/My Drive/Colab Notebooks/new_CNN_kernel_size_7.pkl'

In [0]:
g = open("cnn_kernel7_acc.pkl","wb")
pkl.dump(cnn_kernel_trainacc,g)
g.close()

In [0]:
from google.colab import files
files.download('cnn_kernel7_acc.pkl')

In [0]:
cnn_kernel_record1 = {}
kernel_size = 7
model_CNN = CNN(hidden_size = 400, kernel_size = kernel_size, padding_size = 3,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_CNN.cuda()
train_loss, val_acc = train_model(train_loader,model_CNN)
torch.save(model_CNN.state_dict(), 'CNN_kernel_size_{}.pkl'.format(kernel_size))
cnn_kernel_record1['kernel_size_{}'.format(kernel_size)] = (train_loss, val_acc)

In [0]:
!cp 'CNN_kernel_size_7.pkl' 'drive/My Drive/Colab Notebooks/CNN_kernel_size_7.pkl'

In [0]:
test_model(val_loader, model_CNN)

In [0]:
g = open("cnn_kernel_size_record1.pkl","wb")
pkl.dump(cnn_kernel_record1,g)
g.close()

In [0]:
files.download('cnn_kernel_size_record1.pkl')

In [0]:
# VAL ACC for CNN with hidden_size 800 and kernel_size 3: 65
# NOTE(review): this header used to say "hidden_size 400 and kernel_size 7", but the cell
# builds a hidden_size 800 / kernel_size 3 model and loads CNN_hidden_size_800.pkl from Drive.
# Confirm which configuration the 65 was measured on.
model_cnn = CNN(hidden_size = 800, kernel_size = 3, padding_size = 1, num_layers = 2, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_cnn.cuda()
model_cnn.load_state_dict(torch.load('drive/My Drive/Colab Notebooks/CNN_hidden_size_800.pkl'))
test_model(val_loader, model_cnn)

In [0]:
# VAL ACC for CNN with hidden_size 400 and kernel_size 5: 62
model_cnn = CNN(hidden_size = 400, kernel_size = 5, padding_size = 2, num_layers = 2, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model_cnn.cuda()
model_cnn.load_state_dict(torch.load('CNN_kernel_size_5.pkl'))
test_model(val_loader, model_cnn)

In [0]:
!cp 'CNN_hidden_size_800.pkl' 'drive/My Drive/Colab Notebooks'

In [0]:
g = open("cnn_kernel_size_record.pkl","wb")
pkl.dump(cnn_kernel_record,g)
g.close()

In [0]:
files.download('cnn_kernel_size_record.pkl')

## 3.3 Combine the two encoded sentences with element-wise multiplication for CNN
- Kernel size: 3
- Hidden size list: (200, 300, 400)

In [0]:
class CNN_mul(nn.Module):
    """
    CNN sentence-pair classifier that combines the two encoded sentences by
    element-wise multiplication.

    Both sentences go through the same embedding layer and the same two
    Conv1d + ReLU layers, are max-pooled over time, multiplied element-wise,
    and then classified by a two-layer feed-forward network that returns
    log-probabilities over the classes.
    """
    def __init__(self,  hidden_size, kernel_size, padding_size, num_layers, num_classes, pre_trained_emb):
        """
        @param hidden_size: number of output channels of each convolution
        @param kernel_size: convolution window, i.e. how many tokens are read at once
        @param padding_size: padding passed to both convolutions
        @param num_layers: only stored on the instance; two conv layers are always built
        @param num_classes: size of the output layer
        @param pre_trained_emb: embedding matrix; its shape fixes (vocab size, embedding dim)
                                and it is kept so that init_weights() can copy it in
        """
        super(CNN_mul, self).__init__()

        self.num_layers, self.hidden_size, self.kernel_size ,self.padding_size= num_layers, hidden_size,kernel_size,padding_size
        # Kept on the instance because init_weights() needs it; a plain attribute is not part of state_dict.
        self.pre_trained_emb = pre_trained_emb
        self.embedding = nn.Embedding(pre_trained_emb.shape[0], pre_trained_emb.shape[1], padding_idx=PAD_IDX)
        # The embedding size is the number of input channels, hidden_size the number of output channels.
        self.conv1 = nn.Conv1d( pre_trained_emb.shape[1], hidden_size, kernel_size, padding=padding_size)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size, padding=padding_size)

        # The element-wise product keeps the feature size at hidden_size (no doubling as with concatenation).
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size, num_classes)

    def init_weights(self, is_static=True):
        """
        Copy the pre-trained vectors given to the constructor into the embedding layer.
        @param is_static: when True the embedding weights stop requiring gradients
        """
        weights = torch.from_numpy(self.pre_trained_emb).float()
        # In-place copy keeps the existing Parameter object, and therefore its device.
        self.embedding.weight.data.copy_(weights)
        if is_static:
            self.embedding.weight.requires_grad = False

    def _encode(self, token_data):
        """Encode a (batch, seq_len) tensor of token ids into a (batch, hidden_size) tensor."""
        embedded = self.embedding(token_data)
        # Conv1d expects (batch, channels, seq_len), so the last two dims are swapped once.
        hidden = F.relu(self.conv1(embedded.transpose(1,2)))
        hidden = F.relu(self.conv2(hidden))
        # Max-pooling over time: reducing the sequence dim directly avoids a later squeeze().
        return hidden.max(dim=2)[0]

    def forward(self, token1_data, token1_lengths,token2_data, token2_lengths):
        """
        @param token1_data, token2_data: (batch, seq_len) token id tensors of the two sentences
        @param token1_lengths, token2_lengths: accepted for interface compatibility; not used
        @return: (batch, num_classes) log-probabilities
        """
        hidden_sent1 = self._encode(token1_data)
        hidden_sent2 = self._encode(token2_data)
        # Element-wise product; the batch dim survives even for a batch of one.
        hidden = torch.mul(hidden_sent1, hidden_sent2)
        fc1_out = self.linear1(hidden)
        fc1_out = self.relu(fc1_out)
        fc2_out = self.linear2(fc1_out)
        preds = F.log_softmax(fc2_out, 1)
        return preds

In [0]:
cnn_mul_hidden_trainacc = {}
hidden_size = 200
model = CNN_mul(hidden_size = hidden_size,  kernel_size = 3,padding_size = 1,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
#torch.save(model.state_dict(), 'new_CNN_hidden_size_{}.pkl'.format(hidden_size))
cnn_mul_hidden_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
#!cp 'new_CNN_hidden_size_800.pkl' 'drive/My Drive/Colab Notebooks/new_CNN_hidden_size_800.pkl'

In [0]:
import pickle
g = open("cnn_mul_hidden200_acc.pkl","wb")
pickle.dump(cnn_mul_hidden_trainacc,g)
g.close()

In [0]:
from google.colab import files
files.download('cnn_mul_hidden200_acc.pkl')

In [0]:
# VAL ACC for CNN with hidden_size 200 and kernel_size 3:
cnn_mul_record = {}
hidden_size = 200
cnn_mul_200 = CNN_mul(hidden_size = hidden_size ,kernel_size = 3, padding_size = 1, num_layers = 2, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  cnn_mul_200.cuda()
train_loss, val_acc = train_model(train_loader,cnn_mul_200)
torch.save(cnn_mul_200.state_dict(), 'CNN_mul_hidden_size_{}.pkl'.format(hidden_size))
cnn_mul_record['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
test_model(val_loader, cnn_mul_200)

In [0]:
#cnn_mul_hidden_trainacc = {}
hidden_size = 300
model = CNN_mul(hidden_size = hidden_size,  kernel_size = 3,padding_size = 1,num_layers=2, num_classes=3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
val_acc,train_acc = train_model(train_loader,model)
#torch.save(model.state_dict(), 'new_CNN_hidden_size_{}.pkl'.format(hidden_size))
cnn_mul_hidden_trainacc['hidden_size_{}'.format(hidden_size)] = (val_acc,train_acc)
#!cp 'new_CNN_hidden_size_800.pkl' 'drive/My Drive/Colab Notebooks/new_CNN_hidden_size_800.pkl'

In [0]:
import pickle
g = open("cnn_mul_hidden300_acc.pkl","wb")
pickle.dump(cnn_mul_hidden_trainacc,g)
g.close()

In [0]:
files.download('cnn_mul_hidden300_acc.pkl')

In [0]:
# VAL ACC for CNN with hidden_size 300 and kernel_size 3:
#cnn_mul_record = {}
hidden_size = 300
cnn_mul_300 = CNN_mul(hidden_size = hidden_size ,kernel_size = 3, padding_size = 1, num_layers = 2, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  cnn_mul_300.cuda()
train_loss, val_acc = train_model(train_loader,cnn_mul_300)
torch.save(cnn_mul_300.state_dict(), 'CNN_mul_hidden_size_{}.pkl'.format(hidden_size))
cnn_mul_record['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
test_model(val_loader, cnn_mul_300)

In [0]:
!cp 'CNN_mul_hidden_size_200.pkl' 'drive/My Drive/Colab Notebooks'
!cp 'CNN_mul_hidden_size_300.pkl' 'drive/My Drive/Colab Notebooks'

In [0]:
# VAL ACC for CNN with hidden_size 400 and kernel_size 3:
#cnn_mul_record = {}
hidden_size = 400
cnn_mul_400 = CNN_mul(hidden_size = hidden_size ,kernel_size = 3, padding_size = 1, num_layers = 2, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  cnn_mul_400.cuda()
train_loss, val_acc = train_model(train_loader,cnn_mul_400)
torch.save(cnn_mul_400.state_dict(), 'CNN_mul_hidden_size_{}.pkl'.format(hidden_size))
cnn_mul_record['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)

In [0]:
test_model(val_loader, cnn_mul_400)

In [0]:
# This section only trains hidden sizes 200, 300 and 400, so `cnn_mul_800` does not
# exist unless it was created separately; evaluate it only when it is available.
if 'cnn_mul_800' in globals():
  print(test_model(val_loader, cnn_mul_800))

In [0]:
g = open("cnn_mul_record.pkl","wb")
pkl.dump(cnn_mul_record,g)
g.close()

In [0]:
from google.colab import files
files.download('cnn_mul_record.pkl')

In [0]:
!cp 'CNN_mul_hidden_size_200.pkl' 'drive/My Drive/Colab Notebooks'
!cp 'CNN_mul_hidden_size_300.pkl' 'drive/My Drive/Colab Notebooks'
!cp 'CNN_mul_hidden_size_400.pkl' 'drive/My Drive/Colab Notebooks'
#!cp 'CNN_mul_hidden_size_800.pkl' 'drive/My Drive/Colab Notebooks'

# 4. Pick the best model and find 3 correct and 3 incorrect examples

In [0]:
rnn_mul_800 = RNN_mul(hidden_size = 800, num_layers = 1, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  rnn_mul_800.cuda()
rnn_mul_800.load_state_dict(torch.load('drive/My Drive/Colab Notebooks/RNN_mul_hidden_size_800.pkl'))
test_model(val_loader, rnn_mul_800)

In [0]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data1, lengths1,data2, lengths2, labels in loader:
        data1_batch, lengths1_batch,data2_batch, lengths2_batch, label_batch = data1, lengths1, data2, lengths2,labels
        outputs =model(data1_batch, lengths1_batch,data2_batch, lengths2_batch)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [0]:
#val_dataset_best = NewsGroupDataset(val_data_indices[0:200], y_val[0:200])
val_loader_best = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=200,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)


In [0]:
# Error analysis: run the best model on the first batch of val_loader_best only and
# collect the positions, predictions and true labels of the misclassified examples.
rnn_mul_800.eval()
data1, lengths1, data2, lengths2, labels = next(iter(val_loader_best))
if use_cuda and torch.cuda.is_available():
  data1 = data1.cuda()
  lengths1 = lengths1.cuda()
  data2 = data2.cuda()
  lengths2 = lengths2.cuda()
  labels = labels.cuda()
data1_batch, lengths1_batch, data2_batch, lengths2_batch, label_batch = data1, lengths1, data2, lengths2, labels
# Inference only, so gradients are not tracked.
with torch.no_grad():
  outputs = rnn_mul_800(data1_batch, lengths1_batch, data2_batch, lengths2_batch)
predicted = outputs.max(1, keepdim=True)[1]

# Move predictions and labels to the host once instead of once per wrong index.
predicted_flat = predicted.flatten().cpu().numpy()
label_flat = label_batch.flatten().cpu().numpy()
is_wrong = predicted.eq(labels.view_as(predicted)) == 0
wrong_idx = [i[0] for i in is_wrong.nonzero().cpu().numpy()]
wrong_predict = [predicted_flat[i] for i in wrong_idx]
true_label = [label_flat[i] for i in wrong_idx]

print('Wrong index :', wrong_idx)
print('Wrong predict: ', wrong_predict)
print('True label: ', true_label)
        

In [0]:
len(wrong_idx)

### Incorrect 1

In [0]:
print('Sent1:\n ', ' '.join([id2token[i] for i in val_data[1][0] if i!=0]))
print('Sent2:\n ', ' '.join([id2token[i] for i in val_data[1][1] if i!=0]))
print('True Label:',  val_targets[1])
print('Wrong Predict:', wrong_predict[wrong_idx.index(1)])

### Incorrect 2

In [0]:
print('Sent1:\n ', ' '.join([id2token[i] for i in val_data[23][0] if i!=0]))
print('Sent2:\n ', ' '.join([id2token[i] for i in val_data[23][1] if i!=0]))
print('True Label:',  val_targets[23])
print('Wrong Predict:', wrong_predict[wrong_idx.index(23)])

### Incorrect 3

In [0]:
print('Sent1:\n ', ' '.join([id2token[i] for i in val_data[36][0] if i!=0]))
print('Sent2:\n ', ' '.join([id2token[i] for i in val_data[36][1] if i!=0]))
print('True Label:',  val_targets[36])
print('Wrong Predict:', wrong_predict[wrong_idx.index(36)])

### Correct 1

In [0]:
print('Sent1:\n ', ' '.join([id2token[i] for i in val_data[2][0] if i!=0]))
print('Sent2:\n ', ' '.join([id2token[i] for i in val_data[2][1] if i!=0]))
print('True Label:',  val_targets[2])
print('Predict:', predicted.flatten().cpu().numpy()[2])

### Correct 2

In [0]:
print('Sent1:\n ', ' '.join([id2token[i] for i in val_data[27][0] if i!=0]))
print('Sent2:\n ', ' '.join([id2token[i] for i in val_data[27][1] if i!=0]))
print('True Label:',  val_targets[27])
print('Predict:', predicted.flatten().cpu().numpy()[27])

### Correct 3

In [0]:
print('Sent1:\n ', ' '.join([id2token[i] for i in val_data[199][0] if i!=0]))
print('Sent2:\n ', ' '.join([id2token[i] for i in val_data[199][1] if i!=0]))
print('True Label:',  val_targets[199])
print('Predict:', predicted.flatten().cpu().numpy()[199])

# Evaluation on _mnli val_

In [0]:
test_df = pd.read_csv('mnli_val.tsv', sep="\t", index_col=False )
test_df.head()

In [0]:
test_df['genre'].unique()

In [0]:
def df2idx_mnli(fname, genre):
    """Load one genre of an MNLI tsv file and convert its sentences to token indices.

    Uses the notebook-level `token2id` mapping and `UNK_IDX` for tokens that
    are not in the mapping.

    Parameters
    ----------
    fname : str
        Path to a tab-separated file with (at least) the columns `genre`,
        `label`, `sentence1` and `sentence2`.
    genre : str
        Only rows whose `genre` column equals this value are kept.

    Returns
    -------
    indexed_data : list of (list of int, list of int)
        One (sentence1 indices, sentence2 indices) pair per kept row.
    label : numpy.ndarray
        Labels of the kept rows with 'entailment' -> 0, 'contradiction' -> 1,
        'neutral' -> 2 (any other value is left unchanged).
    df : pandas.DataFrame
        The filtered frame including the added `sent1_idx` / `sent2_idx` columns.
    """
    df = pd.read_csv(fname, sep="\t", index_col=False)
    # Take an explicit copy: the columns assigned below must land in an
    # independent frame, not in a slice of the full one (SettingWithCopyWarning).
    df = df[df.genre == genre].copy()

    # change the label to numerical value
    for label_name, label_id in (('entailment', 0), ('contradiction', 1), ('neutral', 2)):
        df.loc[df['label'] == label_name, 'label'] = label_id

    def to_indices(sentence):
        # Whitespace tokenization; unknown tokens fall back to UNK_IDX.
        return [token2id.get(token, UNK_IDX) for token in sentence.split()]

    # convert token to idx
    # Series.apply always yields one list per row; a row-wise DataFrame.apply
    # may expand equal-length lists into columns and breaks on an empty frame.
    df['sent1_idx'] = df['sentence1'].apply(to_indices)
    df['sent2_idx'] = df['sentence2'].apply(to_indices)

    # convert df to data list and label list
    indexed_data = list(zip(df.sent1_idx, df.sent2_idx))
    label = np.array(df.label)
    return indexed_data, label, df

In [0]:
# Index every genre of the MNLI validation file. The third return value
# (the DataFrame) is not needed here and is dropped into `_`.
(
    (fic_test_data, fic_test_targets, _),
    (tel_test_data, tel_test_targets, _),
    (sla_test_data, sla_test_targets, _),
    (gov_test_data, gov_test_targets, _),
    (tra_test_data, tra_test_targets, _),
) = [
    df2idx_mnli("mnli_val.tsv", genre_name)
    for genre_name in ('fiction', 'telephone', 'slate', 'government', 'travel')
]

In [0]:
# Index every genre of the MNLI training file. The third return value
# (the DataFrame) is not needed here and is dropped into `_`.
(
    (fic_train_data, fic_train_targets, _),
    (tel_train_data, tel_train_targets, _),
    (sla_train_data, sla_train_targets, _),
    (gov_train_data, gov_train_targets, _),
    (tra_train_data, tra_train_targets, _),
) = [
    df2idx_mnli("mnli_train.tsv", genre_name)
    for genre_name in ('fiction', 'telephone', 'slate', 'government', 'travel')
]

In [0]:
# One dataset per genre for the MNLI validation split.
fic_test_dataset = NewsGroupDataset(fic_test_data, fic_test_targets)
tel_test_dataset = NewsGroupDataset(tel_test_data, tel_test_targets)
sla_test_dataset = NewsGroupDataset(sla_test_data, sla_test_targets)
gov_test_dataset = NewsGroupDataset(gov_test_data, gov_test_targets)
tra_test_dataset = NewsGroupDataset(tra_test_data, tra_test_targets)

# One loader per dataset, all with identical settings; shuffle=False keeps
# the examples in file order.
fic_test_loader, tel_test_loader, sla_test_loader, gov_test_loader, tra_test_loader = (
    torch.utils.data.DataLoader(dataset=genre_dataset,
                                batch_size=BATCH_SIZE,
                                collate_fn=newsgroup_collate_func,
                                shuffle=False)
    for genre_dataset in (fic_test_dataset, tel_test_dataset, sla_test_dataset,
                          gov_test_dataset, tra_test_dataset)
)

In [0]:
# One dataset per genre for the MNLI training split.
fic_train_dataset = NewsGroupDataset(fic_train_data, fic_train_targets)
tel_train_dataset = NewsGroupDataset(tel_train_data, tel_train_targets)
sla_train_dataset = NewsGroupDataset(sla_train_data, sla_train_targets)
gov_train_dataset = NewsGroupDataset(gov_train_data, gov_train_targets)
tra_train_dataset = NewsGroupDataset(tra_train_data, tra_train_targets)

# One loader per dataset, all with identical settings.
# NOTE(review): these loaders feed fine_tuning() but are built with
# shuffle=False, so every epoch sees the batches in the same order --
# confirm this is intended before changing it.
fic_train_loader, tel_train_loader, sla_train_loader, gov_train_loader, tra_train_loader = (
    torch.utils.data.DataLoader(dataset=genre_dataset,
                                batch_size=BATCH_SIZE,
                                collate_fn=newsgroup_collate_func,
                                shuffle=False)
    for genre_dataset in (fic_train_dataset, tel_train_dataset, sla_train_dataset,
                          gov_train_dataset, tra_train_dataset)
)

## 5.1 Evaluation of RNN

### 5.1.1 Load Best RNN Model
- hidden size: 800
- concatenate two encoded sentences with element-wise multiplication

In [0]:
# Rebuild the RNN with hidden size 800 and restore its saved weights.
rnn_mul_800 = RNN_mul(hidden_size = 800, num_layers = 1, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
    rnn_mul_800.cuda()
# Deserialize the checkpoint onto the CPU so it can be restored even when no
# GPU is present; load_state_dict then copies the values into the model's
# parameters on whichever device they live.
rnn_mul_800.load_state_dict(
    torch.load('drive/My Drive/Colab Notebooks/RNN_mul_hidden_size_800.pkl',
               map_location=lambda storage, loc: storage))
#test_model(val_loader, rnn_mul_800)

### 5.1.2 Evaluate across genres and generate val acc table 

In [0]:
# Score the restored RNN on each genre's validation loader; the loader order
# below matches the order of `genres`.
genres = ['fiction', 'telephone', 'slate', 'government', 'travel']
rnn_val_acc_genre = [
    test_model(genre_loader, rnn_mul_800)
    for genre_loader in (fic_test_loader, tel_test_loader, sla_test_loader,
                         gov_test_loader, tra_test_loader)
]
rnn_mnli_acc = pd.DataFrame({'Genre': genres, 'Validation Accuracy': rnn_val_acc_genre})

In [0]:
# Display the per-genre RNN table (columns 'Genre' and 'Validation Accuracy').
rnn_mnli_acc

In [0]:
# Write the per-genre RNN table to MNLI_rnn_acc.csv in the working directory.
rnn_mnli_acc.to_csv('MNLI_rnn_acc.csv')

## 5.2 Evaluation of CNN

### 5.2.1 Load Best CNN Model
- hidden size: 300
- concatenate two encoded sentences with element-wise multiplication

In [0]:
# Rebuild the CNN with hidden size 300 and restore its saved weights.
cnn_300 = CNN(hidden_size = 300, kernel_size = 3, padding_size = 1,num_layers = 2, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
    cnn_300.cuda()
# Deserialize the checkpoint onto the CPU so it can be restored even when no
# GPU is present; load_state_dict then copies the values into the model's
# parameters on whichever device they live.
cnn_300.load_state_dict(
    torch.load('drive/My Drive/Colab Notebooks/CNN_hidden_size_300.pkl',
               map_location=lambda storage, loc: storage))
#test_model(val_loader, cnn_300)

### 5.2.2 Evaluate across genres and generate val acc table

In [0]:
# Score the restored CNN on each genre's validation loader; the loader order
# below matches the order of `genres`.
genres = ['fiction', 'telephone', 'slate', 'government', 'travel']
cnn_val_acc_genre = [
    test_model(genre_loader, cnn_300)
    for genre_loader in (fic_test_loader, tel_test_loader, sla_test_loader,
                         gov_test_loader, tra_test_loader)
]
cnn_mnli_acc = pd.DataFrame({'Genre': genres, 'Validation Accuracy': cnn_val_acc_genre})

In [0]:
# Display the per-genre CNN table (columns 'Genre' and 'Validation Accuracy').
cnn_mnli_acc

In [0]:
# Write the per-genre CNN table to MNLI_cnn_acc.csv in the working directory.
cnn_mnli_acc.to_csv('MNLI_cnn_acc.csv')

# Fine-Tuning MultiNLI

In [0]:
def fine_tuning(train_loader,val_loader, model):
    """Continue training `model` on `train_loader`, validating periodically.

    Reads the notebook-level globals `learning_rate`, `num_epochs` and
    `use_cuda`, and calls the notebook's `test_model` helper for validation.

    Args:
        train_loader: iterable yielding
            (data1, lengths1, data2, lengths2, labels) batches; must support len().
        val_loader: loader handed to `test_model` at every checkpoint.
        model: module called as model(data1, lengths1, data2, lengths2); its
            output is fed to NLLLoss together with `labels`. Updated in place.

    Returns:
        (train_loss_ls, val_acc_ls): for every checkpoint, the training loss
        of the batch just processed and the value returned by `test_model`.
    """
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    total_step = len(train_loader)
    # The device decision does not change during training; evaluate it once.
    on_gpu = use_cuda and torch.cuda.is_available()
    train_loss_ls = []
    val_acc_ls = []
    for epoch in range(num_epochs):
        for i, (data1, lengths1, data2, lengths2, labels) in enumerate(train_loader):
            if on_gpu:
                data1 = data1.cuda()
                lengths1 = lengths1.cuda()
                data2 = data2.cuda()
                lengths2 = lengths2.cuda()
                labels = labels.cuda()
            # Re-enter training mode on every step: the validation call below
            # presumably switches the model to eval mode (TODO confirm in test_model).
            model.train()
            optimizer.zero_grad()
            # Forward pass
            outputs = model(data1, lengths1, data2, lengths2)
            loss = criterion(outputs, labels)
            # Backward and optimize
            loss.backward()
            optimizer.step()
            # validate every 50 iterations (skipping the very first batch)
            if i > 0 and i % 50 == 0:
                # Only read the loss value at checkpoints, so the scalar is
                # not pulled off the device on every batch.
                train_loss = loss.item()
                val_acc = test_model(val_loader, model)
                train_loss_ls.append(train_loss)
                val_acc_ls.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}, Training Loss: {}'.format(
                           epoch+1, num_epochs, i+1, total_step, val_acc,train_loss))
    return train_loss_ls, val_acc_ls



In [0]:
# Results of each fine-tuning run, keyed by a short genre tag.
fine_tuning_dic = dict()
# Number of epochs read by fine_tuning() through the global `num_epochs`.
num_epochs = 10

In [0]:
# Fine-tune a fresh copy of the saved RNN (hidden size 800) on the fiction genre.
rnn_fic = RNN_mul(hidden_size = 800, num_layers = 1, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
    rnn_fic.cuda()
# Deserialize the checkpoint onto the CPU so it can be restored even when no
# GPU is present; load_state_dict then copies the values into the model's
# parameters on whichever device they live.
rnn_fic.load_state_dict(
    torch.load('drive/My Drive/Colab Notebooks/RNN_mul_hidden_size_800.pkl',
               map_location=lambda storage, loc: storage))
# Keep the (training losses, validation accuracies) history under the genre tag.
fine_tuning_dic['fic'] = fine_tuning(fic_train_loader, fic_test_loader, rnn_fic)
torch.save(rnn_fic.state_dict(), 'RNN_finetuning_FIC.pkl')
# NOTE(review): unlike the other genres, the copy to Drive is disabled here.
#!cp 'RNN_finetuning_FIC.pkl' 'drive/My Drive/Colab Notebooks/RNN_finetuning_FIC.pkl'

In [0]:
# Evaluate rnn_fic on the fiction validation loader.
test_model(fic_test_loader,rnn_fic)

In [0]:
num_epochs = 5
rnn_tel = RNN_mul(hidden_size = 800, num_layers = 1, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  rnn_tel.cuda()
rnn_tel.load_state_dict(torch.load('drive/My Drive/Colab Notebooks/RNN_mul_hidden_size_800.pkl'))
fine_tuning_dic['tel'] = fine_tuning(tel_train_loader,tel_test_loader,rnn_tel )
torch.save(rnn_tel.state_dict(), 'RNN_finetuning_TEL.pkl')
#cnn_mul_record['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)
!cp 'RNN_finetuning_TEL.pkl' 'drive/My Drive/Colab Notebooks/RNN_finetuning_TEL.pkl'

In [0]:
# Evaluate rnn_tel on the telephone validation loader.
test_model(tel_test_loader,rnn_tel)

In [0]:
num_epochs = 5
rnn_sla= RNN_mul(hidden_size = 800, num_layers = 1, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  rnn_sla.cuda()
rnn_sla.load_state_dict(torch.load('drive/My Drive/Colab Notebooks/RNN_mul_hidden_size_800.pkl'))
fine_tuning_dic['sla'] = fine_tuning(sla_train_loader,sla_test_loader,rnn_sla )
torch.save(rnn_sla.state_dict(), 'RNN_finetuning_SLA.pkl')
#cnn_mul_record['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)
!cp 'RNN_finetuning_SLA.pkl' 'drive/My Drive/Colab Notebooks/RNN_finetuning_SLA.pkl'

In [0]:
# Evaluate rnn_sla on the slate validation loader.
test_model(sla_test_loader,rnn_sla)

In [0]:
num_epochs = 5
rnn_gov= RNN_mul(hidden_size = 800, num_layers = 1, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  rnn_gov.cuda()
rnn_gov.load_state_dict(torch.load('drive/My Drive/Colab Notebooks/RNN_mul_hidden_size_800.pkl'))
fine_tuning_dic['gov'] = fine_tuning(gov_train_loader,gov_test_loader,rnn_gov )
torch.save(rnn_gov.state_dict(), 'RNN_finetuning_GOV.pkl')
#cnn_mul_record['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)
!cp 'RNN_finetuning_GOV.pkl' 'drive/My Drive/Colab Notebooks/RNN_finetuning_GOV.pkl'

In [0]:
# Evaluate rnn_gov on the government validation loader.
test_model(gov_test_loader,rnn_gov)

In [0]:
num_epochs = 5
rnn_tra= RNN_mul(hidden_size = 800, num_layers = 1, num_classes = 3, pre_trained_emb = ft_emb)
use_cuda = True
if use_cuda and torch.cuda.is_available():
  rnn_tra.cuda()
rnn_tra.load_state_dict(torch.load('drive/My Drive/Colab Notebooks/RNN_mul_hidden_size_800.pkl'))
fine_tuning_dic['tra'] = fine_tuning(tra_train_loader,tra_test_loader,rnn_gov )
torch.save(rnn_tra.state_dict(), 'RNN_finetuning_TRA.pkl')
#cnn_mul_record['hidden_size_{}'.format(hidden_size)] = (train_loss, val_acc)
!cp 'RNN_finetuning_TRA.pkl' 'drive/My Drive/Colab Notebooks/RNN_finetuning_TRA.pkl'

In [0]:
# Evaluate rnn_tra on the travel validation loader.
test_model(tra_test_loader,rnn_tra)