In [38]:
# importing the libraries 
import torch 
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import nltk 
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from datasets import load_dataset
import numpy as np
import pandas as pd
import random
from torch import cuda
from pprint import pprint
import re

[nltk_data] Downloading package punkt to /home/turning/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/turning/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
# defining the CONSTANTS 
EXCLUDE_STOPWORDS = True
BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 0.001
EMBEDDING_DIM = 100 
HIDDEN_DIM = 100
GLOVE_PATH = 'glove/glove.6B.100d.txt'
DEVICE = 'cuda'
if cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'


In [49]:
dataset = load_dataset("multi_nli", "default")


Found cached dataset multi_nli (/home/turning/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)
100%|██████████| 3/3 [00:00<00:00, 112.06it/s]


In [None]:
glove = {}
with open(GLOVE_PATH, 'r') as f:
    for line in f:
        line = line.split()
        glove[line[0]] = torch.tensor([float(x) for x in line[1:]])

# create a list of stopwords
stop_words = stopwords.words('english')

glove['<unk>'] = torch.mean(torch.stack(list(glove.values())), dim=0)
glove['<pad>'] = torch.zeros(EMBEDDING_DIM)
glove['<start>'] = torch.rand(EMBEDDING_DIM)
glove['<end>'] = torch.rand(EMBEDDING_DIM)

In [41]:
# making the word_2_idx and idx_2_word dictionaries and the embedding matrix
word_2_idx = {'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3}
idx_2_word = {0: '<pad>', 1: '<unk>', 2: '<start>', 3: '<end>'}
embedding_matrix = np.zeros((len(glove.values()), EMBEDDING_DIM))
embedding_matrix[0] = glove['<pad>']
embedding_matrix[1] = glove['<unk>']
embedding_matrix[2] = glove['<start>']
embedding_matrix[3] = glove['<end>']

for i, word in enumerate(glove.keys()):
    if word not in word_2_idx:
        word_2_idx[word] = len(word_2_idx)
        idx_2_word[len(idx_2_word)] = word
        embedding_matrix[word_2_idx[word]] = glove[word]

# convert the embedding matrix to a tensor
embedding_matrix = torch.FloatTensor(embedding_matrix)



In [50]:
new_dataset = {}
new_dataset['train'] = dataset['train'][:5]
new_dataset['validation'] = dataset['validation_matched'][:5]
dataset=new_dataset


In [51]:

random.seed(42)
random.shuffle(new_dataset['train']['premise'])
random.seed(42)
random.shuffle(new_dataset['train']['hypothesis'])
random.seed(42)
random.shuffle(new_dataset['train']['label'])
random.seed(42)
random.shuffle(new_dataset['validation']['premise'])
random.seed(42)
random.shuffle(new_dataset['validation']['hypothesis'])
random.seed(42)
random.shuffle(new_dataset['validation']['label'])



['Product and geography are what make cream skimming work. ',
 'You lose the things to the following level if the people recall.',
 'A member of my team will execute your orders with immense precision.',
 'This information belongs to them.',
 'The tennis shoes have a range of prices.']
['Conceptually cream skimming has two basic dimensions - product and '
 'geography.',
 'you know during the season and i guess at at your level uh you lose them to '
 'the next level if if they decide to recall the the parent team the Braves '
 'decide to call to recall a guy from triple A then a double A guy goes up to '
 'replace him and a single A guy goes up to replace him',
 'One of our number will carry out your instructions minutely.',
 'How do you know? All this is their information again.',
 'yeah i tell you what though if you go price some of those tennis shoes i can '
 "see why now you know they're getting up in the hundred dollar range"]
['This information belongs to them.',
 'You lose the th

In [22]:
raw_datasets = {'train': [], 'validation':[]}
cat_to_name={'entailment': 0, 'neutral': 1, 'contradiction': 2}
#entailment (0), neutral (1), contradiction (2)




def preprocessing(sentence, stop_words_remove):
    sentence = sentence.split(' ')
    if stop_words_remove:
        sentence = [word.lower() for word in sentence if word.lower() not in stop_words]
    else:
        sentence = [word.lower() for word in sentence]
    sentence = ['<start> '] + sentence+ ['<end>']
    sentence = [word_2_idx[word] if word in word_2_idx else word_2_idx['<unk>'] for word in sentence]
    return sentence


# convertng the dataset into list of dicts 
raw_datasets = {'train': [], 'validation':[]}
for i in dataset:
    # for j in (range(len(dataset[i]['genre']))):
    print(len(dataset[i]))

    for j in range(len(dataset[i]['premise'])):

        if dataset[i]['label'][j]== -1:
            continue
       
        tokens = preprocessing(dataset[i]['premise'][j], EXCLUDE_STOPWORDS)
        tokens_hypothesis = preprocessing(dataset[i]['hypothesis'][j], EXCLUDE_STOPWORDS)
        
      
        
        raw_datasets[i].append({'premise': tokens, 'hypothesis': tokens_hypothesis, 'label': dataset[i]['label'][j]})        

            

In [26]:
dataset_pretrain = {'train': [], 'validation':[]}
dataset_nli = {'train': [], 'validation':[]}
for i in raw_datasets:
    for j in raw_datasets[i]:
        j['premise'] = torch.LongTensor(j['premise'])
        j['hypothesis'] = torch.LongTensor(j['hypothesis'])
        j['label'] = torch.LongTensor([j['label']])
        dataset_pretrain[i].append({'sentence': j['premise'], 'label': j['premise'][1:]})
        dataset_pretrain[i].append({'sentence': j['hypothesis'], 'label': j['hypothesis'][1:]})
        dataset_nli[i].append({'sentence': (j['premise'] , j['hypothesis']), 'label': j['label']})



In [28]:
class PretrainDataset(Dataset):
    def __init__(self, data):
        random.shuffle(data)
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]['sentence'], self.data[idx]['label']
    
pretrain_dataset = {'train': PretrainDataset(dataset_pretrain['train']), 'validation': PretrainDataset(dataset_pretrain['validation'])}
nli_dataset = {'train': PretrainDataset(dataset_nli['train']), 'validation': PretrainDataset(dataset_nli['validation']) }

print(nli_dataset['train'][0])

(([1, 160, 16544, 5079, 1, 113, 1572, 140454, 11637, 4226, 1253, 245, 5079, 53149, 2159, 643, 6066, 33033, 6, 1, 3], [1, 245, 5079, 53149, 3792, 3466, 3854, 9346, 6, 1, 3]), 2)


In [31]:
def custom_collate(batch):
    sentences = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    
    # Pad sequences to the maximum length in the batch
    padded_sentences = torch.nn.utils.rnn.pad_sequence(sentences, batch_first=True)
    padded_labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return  padded_sentences,padded_labels

def custom_collate_nli(batch):
    premises, hypothesis = [item[0][0] for item in batch], [item[0][1] for item in batch]
    labels = [item[1] for item in batch]
  
    # Pad sequences to the maximum length in the batch
    padded_premises = torch.nn.utils.rnn.pad_sequence(premises, batch_first=True)
    padded_hypothesis = torch.nn.utils.rnn.pad_sequence(hypothesis, batch_first=True)
    labels = torch.LongTensor(labels)
    
    return  (padded_premises, padded_hypothesis),labels

In [32]:
pretrain_loaders={}
nli_loaders={}
for i in pretrain_dataset:
    pretrain_loaders[i] = DataLoader(pretrain_dataset[i], batch_size=BATCH_SIZE, collate_fn=custom_collate)
    nli_loaders[i] = DataLoader(nli_dataset[i], batch_size=BATCH_SIZE, collate_fn=custom_collate_nli)


In [36]:
# defing the model which we are going to pretrain
class ELMo(nn.Module):
    '''this class implements the ELMo model using the BI-LSTM architecture like by stacking two LSTM layers 
    the model is just the head and needs body such as linear layer , mlp , etc based on the task  '''
    def __init__(self, embedding_dim,  hidden_dim1, hidden_dim2 ,batch_size, num_layers=2):
        super(ELMo, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.embedding= nn.Embedding.from_pretrained(embedding_matrix)
        self.embedding.weight.requires_grad = False
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim1, num_layers=1, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_dim1*2, hidden_dim2, num_layers=1, batch_first=True, bidirectional=True)
        self.weight1 = nn.Parameter(torch.randn(1))
        self.weight2 = nn.Parameter(torch.randn(1))
        self.lambda1 = nn.Parameter(torch.randn(1))


    def forward(self, input): 
        # input = [batch_size, seq_len]
        # getting the imput embeddings 
        input_embeddings = self.embedding(input) # [batch_size, seq_len, embedding_dim]
        # passing the embeddings to the first LSTM layer
        output1 , (hidden1, cell1) = self.lstm1(input_embeddings) # [batch_size, seq_len, hidden_dim1]

        # passing the output of the first LSTM layer to the second LSTM layer
        output2 , (hidden2, cell2) = self.lstm2(output1) # [batch_size, seq_len, hidden_dim2]
        # adding the two outputs of the LSTM layers
        
        weighted_output = self.lambda1*((self.weight1 * output1) +( self.weight2 * output2))

        return weighted_output
        

In [37]:
class Language_model(nn.Module):
    '''this class implements the language model using the ELMo model as the head and a linear layer as the body'''
    def __init__(self, Elmo_model, vocab_size, embedding_dim):
        super(Language_model, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.elmo = Elmo_model
        self.linear = nn.Linear(self.embedding_dim, self.vocab_size)
    def forward(self, input):
        # input = [batch_size, seq_len]
        # getting the imput embeddings 
        elmo_output = self.elmo(input) # [batch_size, seq_len, embedding_dim]
        output = self.linear(elmo_output) # [batch_size, seq_len, vocab_size]
        output = F.log_softmax(output, dim=2).permute(0,2,1)[:,:,:-1] # [batch_size, vocab_size, seq_len-1]
        return output
    

In [None]:
class NLI(nn.Module): 

    def __init__(self, Elmo_model, embedding_dim, num_classes=3):
        super(NLI, self).__init__()

        self.embedding_dim = embedding_dim
        self.num_classes = num_classes
        self.elmo = Elmo_model
        self.linear = nn.Linear(self.embedding_dim*2,25)
        self.dropout = nn.Dropout(0.5)
        self.linear2 = nn.Linear(25, self.num_classes)
        
    def forward(self, input):
        # input = [batch_size,(premise, hypothesis)]
        premise = input[:,0] # [batch_size, seq_len]
        hypothesis = input[:,1] # [batch_size, seq_len]
        # getting the imput embeddings
        elmo_output_premise = self.elmo(premise) # [batch_size, seq_len, embedding_dim]
        elmo_output_hypothesis = self.elmo(hypothesis) # [batch_size, seq_len, embedding_dim]
        sentence_embeddings_premise = []
        sentence_embeddings_hypothesis = []
        for i in range(len(elmo_output_premise)):
            sentence_embeddings_premise.append(torch.mean(elmo_output_premise[i], dim=0))
        for i in range(len(elmo_output_hypothesis)):
            sentence_embeddings_hypothesis.append(torch.mean(elmo_output_hypothesis[i], dim=0))
        sentence_embeddings_input = sentence_embeddings_premise + sentence_embeddings_hypothesis
        sentence_embeddings = torch.stack(sentence_embeddings_input)  # convert list to tensor
        output1 = self.linear(sentence_embeddings) # [batch_size, num_classes]
        output1 = self.dropout(output1)
        output = self.linear2(output1)
        output = F.log_softmax(output, dim=1) # [batch_size, num_classes]

        return output
    