In [14]:
# importing the libraries 
import torch 
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import nltk 
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from datasets import load_dataset
import numpy as np
import pandas as pd
import random
from torch import cuda


[nltk_data] Downloading package punkt to /home/turning/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/turning/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# defining the CONSTANTS 

EXCLUDE_STOPWORDS = True
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 0.001
EMBEDDING_DIM = 100 
HIDDEN_DIM = 100
GLOVE_PATH = 'glove/glove.6B.100d.txt'
DEVICE = 'cuda'
if cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'



In [16]:
# downloading the dataset and loading the glove embeddings 
dataset = load_dataset("sst", "default")
glove = {}
with open(GLOVE_PATH, 'r') as f:
    for line in f:
        line = line.split()
        glove[line[0]] = torch.tensor([float(x) for x in line[1:]])

# create a list of stopwords
stop_words = stopwords.words('english')

glove['<unk>'] = torch.mean(torch.stack(list(glove.values())), dim=0)
glove['<pad>'] = torch.zeros(EMBEDDING_DIM)
glove['<start>'] = torch.rand(EMBEDDING_DIM)
glove['<end>'] = torch.rand(EMBEDDING_DIM)

Found cached dataset sst (/home/turning/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)
100%|██████████| 3/3 [00:00<00:00, 1461.94it/s]


In [17]:
# making the word_2_idx and idx_2_word dictionaries and the embedding matrix
word_2_idx = {'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3}
idx_2_word = {0: '<pad>', 1: '<unk>', 2: '<start>', 3: '<end>'}
embedding_matrix = np.zeros((len(glove.values()), EMBEDDING_DIM))
embedding_matrix[0] = glove['<pad>']
embedding_matrix[1] = glove['<unk>']
embedding_matrix[2] = glove['<start>']
embedding_matrix[3] = glove['<end>']

for i, word in enumerate(glove.keys()):
    if word not in word_2_idx:
        word_2_idx[word] = len(word_2_idx)
        idx_2_word[len(idx_2_word)] = word
        embedding_matrix[word_2_idx[word]] = glove[word]

# convert the embedding matrix to a tensor
embedding_matrix = torch.FloatTensor(embedding_matrix)

In [18]:
# defing the model which we are going to pretrain
class ELMo(nn.Module):
    '''this class implements the ELMo model using the BI-LSTM architecture like by stacking two LSTM layers'''
    def __init__(self, embedding_dim, vocab_size,  hidden_dim1, hidden_dim2 ,batch_size, num_layers=2):
        super(ELMo, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.vocb_size =  vocab_size
        self.embedding= nn.Embedding.from_pretrained(embedding_matrix)
        self.embedding.weight.requires_grad = False
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim1, num_layers=1, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_dim1*2, hidden_dim2, num_layers=1, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim2*2, vocab_size)


    def forward(self, input): 
        # input = [batch_size, seq_len]
        # getting the imput embeddings 
        input_embeddings = self.embedding(input) # [batch_size, seq_len, embedding_dim]
        # passing the embeddings to the first LSTM layer
        self.output1 , (hidden1, cell1) = self.lstm1(input_embeddings) # [batch_size, seq_len, hidden_dim1]

        # passing the output of the first LSTM layer to the second LSTM layer
        self.output2 , (hidden2, cell2) = self.lstm2(self.output1) # [batch_size, seq_len, hidden_dim2]
        # adding the two outputs of the LSTM layers
    
        # output = [batch_size, seq_len, vocab_size]
        output = self.linear(self.output2)
        output_softmax = F.log_softmax(output, dim=2)
        # removing the last token from the output as we are pretraing the model 
        output_softmax = output_softmax.permute(0,2,1)[:,:,:-1]

        return output_softmax
    
    def get_weighted_outputs(self, input):
        '''this function returns the weighted outputs of the two LSTM layers'''

        # getting the output embeddings also freezing the parameters of the  lstm layers
        with torch.no_grad():
            input_embeddings = self.embedding(input)
            output1 , (hidden1, cell1) = self.lstm1(input_embeddings)
            output2 , (hidden2, cell2) = self.lstm2(output1)
        # getting the weights for the weighted sum of the two outputs 
        weight1 = nn.Parameter(torch.randn(1))
        weight2 = nn.Parameter(torch.randn(1))
        lambda1 = nn.Parameter(torch.randn(1))
        weighted_output = lambda1(weight1 * output1 + weight2 * output2)

        return weighted_output        
        

In [19]:
# making the datasets like tokenising them 
prediction_raw_datasets={}
prediction_raw_datasets['train'] = [ i.lower().split('|') for i in dataset['train']['tokens']]
prediction_raw_datasets['validation'] = [ i.lower().split('|') for i in dataset['validation']['tokens']]
prediction_raw_datasets['test'] = [ i.lower().split('|') for i in dataset['test']['tokens']]

for k , v in prediction_raw_datasets.items():
    for i in range(len(v)):
        if EXCLUDE_STOPWORDS:
            v[i] = [word for word in v[i] if word not in stop_words]
        for j in range(len(v[i])):
            if v[i][j] not in word_2_idx:
                v[i][j] = '<unk>'

        v[i]= ['<start>'] + v[i] + ['<end>']
        v[i] = [word_2_idx[word] for word in v[i]]
        

In [20]:
# making the datasets with sentence and label
datasets = {'train': [], 'validation': [], 'test': []}
for i in range(len(prediction_raw_datasets['train'])):  
    sentence = torch.LongTensor(prediction_raw_datasets['train'][i])                                        
    datasets['train'].append({'sentence': sentence, 'label': sentence[:-1]})
for i in range(len(prediction_raw_datasets['validation'])):  
    sentence = torch.LongTensor(prediction_raw_datasets['validation'][i])                                        
    datasets['validation'].append({'sentence': sentence, 'label': sentence[1:]})
for i in range(len(prediction_raw_datasets['test'])):
    sentence = torch.LongTensor(prediction_raw_datasets['test'][i])                                        
    datasets['test'].append({'sentence': sentence, 'label': sentence[1:]})

In [23]:
# definig the obejct model
model = ELMo(EMBEDDING_DIM, len(glove), HIDDEN_DIM, EMBEDDING_DIM//2, 1)

In [25]:
model.to(DEVICE)
criterion = nn.NLLLoss()

# define the optimizer 
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
best_loss = 1000000
best_accuracy = 0
def accuracy(output, label):
    output = output.argmax(dim=1)
    return (output == label).float().mean()


for epoch in range(10):
    print('epoch: ', epoch)
    if epoch%3 == 0 and epoch != 0:
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr']/2
    for i in range(len(datasets['test'])):
        optimizer.zero_grad()
        model.zero_grad()
        input = datasets['train'][i]['sentence'].unsqueeze(0)
        label = datasets['train'][i]['label'].unsqueeze(0)
        input = input.to(DEVICE)
        label = label.to(DEVICE)
        output = model.forward(input)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()


        if i%1000 == 0:
            print('loss: ', loss.item())
            print('accuracy: ', accuracy(output, label))
            print('-------------------------------------')

epoch:  0
loss:  12.484819412231445
accuracy:  tensor(0.0800)
-------------------------------------


KeyboardInterrupt: 