In [None]:
# importing the libraries 
import torch 
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import nltk 
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from datasets import load_dataset
import numpy as np
import pandas as pd
import random
from torch import cuda
from pprint import pprint

In [None]:
# Hyperparameters and run-time configuration shared by the whole notebook

EXCLUDE_STOPWORDS = True  # when True, NLTK English stopwords are dropped during tokenisation
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 0.001
EMBEDDING_DIM = 100
HIDDEN_DIM = 100
GLOVE_PATH = 'glove/glove.6B.100d.txt'
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = 'cuda' if cuda.is_available() else 'cpu'



In [None]:
# Load the SST corpus and the pretrained GloVe vectors.
dataset = load_dataset("sst", "default")

glove = {}
# Decode explicitly as UTF-8 so the parse does not depend on the platform's
# default text encoding (the vocabulary contains non-ASCII tokens).
with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines instead of failing on fields[0]
        # first field is the token, the remaining fields are its vector
        glove[fields[0]] = torch.tensor([float(x) for x in fields[1:]])

# create a list of stopwords
stop_words = stopwords.words('english')

# Special tokens: <unk> is the mean of all pretrained vectors, <pad> is all
# zeros, <start>/<end> are random vectors.
glove['<unk>'] = torch.mean(torch.stack(list(glove.values())), dim=0)
glove['<pad>'] = torch.zeros(EMBEDDING_DIM)
glove['<start>'] = torch.rand(EMBEDDING_DIM)
glove['<end>'] = torch.rand(EMBEDDING_DIM)

Found cached dataset sst (/home/turning/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)
100%|██████████| 3/3 [00:00<00:00, 154.50it/s]


In [None]:
# Build the vocabulary lookups and the embedding matrix.
# Indices 0-3 are reserved for the special tokens; every other GloVe word
# follows in the iteration order of the `glove` dictionary.
special_tokens = ['<pad>', '<unk>', '<start>', '<end>']
vocabulary = special_tokens + [word for word in glove if word not in special_tokens]

word_2_idx = {word: index for index, word in enumerate(vocabulary)}
idx_2_word = {index: word for index, word in enumerate(vocabulary)}

# Row i of the matrix is the vector of vocabulary[i]; stored as a float32 tensor.
embedding_matrix = torch.stack([glove[word] for word in vocabulary]).float()

In [None]:
# Getting the datasets ready: every example becomes
# [list of vocabulary indices, label] under its split name.

raw_datasets = {'train': [], 'validation': [], 'test': []}

# A set gives O(1) membership tests; the `stop_words` list itself is untouched.
stop_word_set = set(stop_words) if EXCLUDE_STOPWORDS else set()
unk_idx = word_2_idx['<unk>']

for split in dataset:
    for example in dataset[split]:
        # tokens are stored as one '|'-separated string
        tokens = [word.lower() for word in example['tokens'].split('|')]
        tokens = [word for word in tokens if word not in stop_word_set]
        # out-of-vocabulary words map to <unk>
        numbered_tokens = [word_2_idx.get(word, unk_idx) for word in tokens]

        raw_datasets[split].append([numbered_tokens, example['label']])


In [None]:
# Two task-specific views of the same sentences:
#   dataset_pretrain - language-model pairs (sentence, next-token targets)
#   dataset_sem_anl  - (sentence, label) pairs for sentiment classification
dataset_pretrain = {'train': [], 'validation': [], 'test': []}
dataset_sem_anl = {'train': [], 'validation': [], 'test': []}

start_idx = word_2_idx['<start>']
end_idx = word_2_idx['<end>']

for split in raw_datasets:
    for token_ids, label in raw_datasets[split]:
        # Build a new tensor instead of overwriting the entry in raw_datasets,
        # so this cell can be re-run without wrapping sentences twice.
        sentence = torch.LongTensor([start_idx] + list(token_ids) + [end_idx])
        # Language_model drops its last time step, so output position t must be
        # scored against token t+1: the target is the sentence shifted left by
        # one (previously sentence[:-1], i.e. each position predicted itself).
        dataset_pretrain[split].append({'sentence': sentence, 'label': sentence[1:]})
        dataset_sem_anl[split].append({'sentence': sentence, 'label': label})

In [None]:
class PretrainDataset(Dataset):
    """Map-style dataset over a list of ``{'sentence': ..., 'label': ...}`` records."""

    def __init__(self, data):
        # The records are kept by reference; nothing is copied or converted.
        self.data = data

    def __len__(self):
        """Number of stored records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sentence, label)`` pair stored at position ``idx``."""
        record = self.data[idx]
        return record['sentence'], record['label']
    
# Wrap every split of both tasks in a PretrainDataset.
pretrain_dataset = {
    split: PretrainDataset(dataset_pretrain[split])
    for split in ('train', 'validation', 'test')
}
sman_dataset = {
    split: PretrainDataset(dataset_sem_anl[split])
    for split in ('train', 'validation', 'test')
}

# Sanity check: show the first pretraining example.
print(pretrain_dataset['train'][0])

(tensor([    2,  1141, 10457,  5037,   593,    13,    54,    32, 18516,    31,
           13,   226,   163, 16810,   155,  1417,  5822,  6684,     5,     1,
         1465, 43711,  4415, 26988,     6,     3]), tensor([    2,  1141, 10457,  5037,   593,    13,    54,    32, 18516,    31,
           13,   226,   163, 16810,   155,  1417,  5822,  6684,     5,     1,
         1465, 43711,  4415, 26988,     6]))


In [None]:
def custom_collate(batch):
    """Collate ``(sentence, label)`` pairs of tensors into two padded batches.

    Sentences and labels are each padded by ``pad_sequence`` (default padding
    value) to the longest sequence in the batch and returned batch-first as
    ``(padded_sentences, padded_labels)``.
    """
    pad = torch.nn.utils.rnn.pad_sequence

    sentences, labels = [], []
    for sample in batch:
        sentences.append(sample[0])
        labels.append(sample[1])

    return pad(sentences, batch_first=True), pad(labels, batch_first=True)

def custom_collate_sman(batch):
    """Collate ``(sentence, score)`` pairs for the binary sentiment task.

    Args:
        batch: sequence of ``(sentence, score)`` items, where ``sentence`` is a
            1-D tensor of token indices and ``score`` is a number.

    Returns:
        tuple: ``(padded_sentences, labels)``. ``padded_sentences`` is the
        batch-first result of ``pad_sequence``; ``labels`` is a float tensor of
        shape ``[batch]`` holding 0.0 where ``score <= 0.5`` and 1.0 otherwise.
    """
    sentences = [item[0] for item in batch]

    # Binarise the scores. The result is a float tensor rather than a Python
    # list because the training loop calls ``.to(DEVICE)`` on it and passes it
    # to BCELoss, neither of which accepts a list.
    labels = torch.tensor(
        [0.0 if item[1] <= 0.5 else 1.0 for item in batch],
        dtype=torch.float,
    )

    # Pad sequences to the maximum length in the batch
    padded_sentences = torch.nn.utils.rnn.pad_sequence(sentences, batch_first=True)

    return padded_sentences, labels

# One DataLoader per split for each of the two tasks.
pretrain_loaders = {}
sman_loaders = {}
for split in pretrain_dataset:
    # Shuffle only the training split so every epoch sees differently composed
    # batches; validation and test keep a fixed order.
    shuffle = (split == 'train')
    pretrain_loaders[split] = DataLoader(
        pretrain_dataset[split],
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        collate_fn=custom_collate,
    )
    sman_loaders[split] = DataLoader(
        sman_dataset[split],
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        collate_fn=custom_collate_sman,
    )





In [None]:
# Defining the model that we are going to pretrain
class ELMo(nn.Module):
    '''ELMo-style encoder: two stacked bidirectional LSTMs over frozen pretrained embeddings.

    The outputs of the two LSTM layers are mixed with learnable scalars:
    ``lambda1 * (weight1 * output1 + weight2 * output2)``.
    The module only produces per-token representations; a task-specific layer
    (linear layer, MLP, ...) has to be placed on top of it.

    Args:
        embedding_dim: size of the input word embeddings.
        hidden_dim1: hidden size per direction of the first LSTM.
        hidden_dim2: hidden size per direction of the second LSTM. Must equal
            ``hidden_dim1`` because the two layer outputs are added together.
        batch_size: stored on the instance; not used by ``forward``.
        num_layers: stored on the instance; each LSTM is built with one layer.
        pretrained_embeddings: optional 2-D float tensor of embeddings. When
            omitted, the notebook-level ``embedding_matrix`` is used.

    Raises:
        ValueError: if ``hidden_dim1`` and ``hidden_dim2`` differ.
    '''
    def __init__(self, embedding_dim, hidden_dim1, hidden_dim2, batch_size, num_layers=2,
                 pretrained_embeddings=None):
        super(ELMo, self).__init__()
        if hidden_dim1 != hidden_dim2:
            # Fail at construction time instead of with a shape error inside forward().
            raise ValueError(
                'hidden_dim1 and hidden_dim2 must be equal because the two LSTM outputs are summed, '
                'got {} and {}'.format(hidden_dim1, hidden_dim2)
            )
        if pretrained_embeddings is None:
            # Backward-compatible default: fall back to the notebook global.
            pretrained_embeddings = embedding_matrix

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.batch_size = batch_size
        # freeze=True keeps the pretrained vectors fixed during training
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim1, num_layers=1, batch_first=True, bidirectional=True)
        # the second LSTM consumes the concatenated forward/backward states of the first
        self.lstm2 = nn.LSTM(hidden_dim1*2, hidden_dim2, num_layers=1, batch_first=True, bidirectional=True)
        # learnable scalars used to mix the two layer outputs
        self.weight1 = nn.Parameter(torch.randn(1))
        self.weight2 = nn.Parameter(torch.randn(1))
        self.lambda1 = nn.Parameter(torch.randn(1))

    def forward(self, input):
        '''Encode a batch of token indices.

        Args:
            input: ``[batch_size, seq_len]`` tensor of vocabulary indices.

        Returns:
            ``[batch_size, seq_len, 2 * hidden_dim1]`` tensor: the scaled,
            weighted sum of the two LSTM layer outputs.
        '''
        input_embeddings = self.embedding(input)  # [batch_size, seq_len, embedding_dim]
        # first LSTM layer
        output1, _ = self.lstm1(input_embeddings)  # [batch_size, seq_len, 2 * hidden_dim1]
        # second LSTM layer, fed with the output of the first
        output2, _ = self.lstm2(output1)  # [batch_size, seq_len, 2 * hidden_dim2]

        # learnable mix of the two layers
        weighted_output = self.lambda1 * ((self.weight1 * output1) + (self.weight2 * output2))

        return weighted_output
        

In [None]:
class Language_model(nn.Module):
    '''Language-modelling wrapper: the ELMo encoder followed by a linear projection onto the vocabulary.'''

    def __init__(self, Elmo_model, vocab_size, embedding_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.elmo = Elmo_model
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input):
        # input: [batch_size, seq_len]
        encoded = self.elmo(input)                 # [batch_size, seq_len, embedding_dim]
        logits = self.linear(encoded)              # [batch_size, seq_len, vocab_size]
        log_probs = F.log_softmax(logits, dim=-1)  # normalised over the vocabulary axis
        # Move the vocabulary axis to dim 1 and drop the final time step:
        # result is [batch_size, vocab_size, seq_len-1]
        return log_probs.transpose(1, 2)[..., :-1]
    
class Semantic_analysis(nn.Module):
    '''Sentence classifier: ELMo encoder, mean pooling over the real tokens, linear layer, sigmoid.

    Args:
        Elmo_model: module mapping ``[batch_size, seq_len]`` token indices to
            ``[batch_size, seq_len, embedding_dim]`` representations.
        vocab_size: stored on the instance; not used by ``forward``.
        embedding_dim: feature size of the encoder output.
        num_classes: number of output units of the linear layer.
    '''
    def __init__(self, Elmo_model, vocab_size, embedding_dim, num_classes=2):
        super(Semantic_analysis, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.num_classes = num_classes
        self.elmo = Elmo_model
        self.linear = nn.Linear(self.embedding_dim, self.num_classes)
        # forward() applies self.sigmoid; it was never created before, which
        # made every forward pass fail with AttributeError.
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        '''Return sigmoid scores of shape ``[batch_size, num_classes]``.

        A sentence is taken to end at its first index-0 token (the value the
        batches are padded with); only the positions before it are averaged.
        '''
        # input = [batch_size, seq_len]
        elmo_output = self.elmo(input)  # [batch_size, seq_len, embedding_dim]

        # True for every position that comes before the first 0 in its row
        before_first_pad = (input == 0).long().cumsum(dim=1) == 0  # [batch_size, seq_len]
        # number of real tokens per sentence; clamped so an all-padding row
        # yields a zero vector instead of a NaN from dividing by zero
        lengths = before_first_pad.sum(dim=1, keepdim=True).clamp(min=1)  # [batch_size, 1]

        # mean of the token representations, ignoring everything from the first pad on
        mask = before_first_pad.unsqueeze(-1).to(elmo_output.dtype)
        sentence_embeddings = (elmo_output * mask).sum(dim=1) / lengths  # [batch_size, embedding_dim]

        output = self.linear(sentence_embeddings)  # [batch_size, num_classes]
        output = self.sigmoid(output)
        return output
    



In [None]:
# Shared ELMo encoder. Each direction gets EMBEDDING_DIM // 2 hidden units, so
# the concatenated bidirectional output has EMBEDDING_DIM features, which is
# the input size of the linear layers placed on top of it.
elmo = ELMo(embedding_dim=EMBEDDING_DIM, hidden_dim1=EMBEDDING_DIM//2, hidden_dim2=EMBEDDING_DIM//2, batch_size=BATCH_SIZE)

# NOTE: the former `lm = Language_model()` call was removed. The constructor has
# three required arguments, so the call raised TypeError and stopped a full run;
# `lm` was not used anywhere. The language model is created in the next cell.

In [None]:
# Language model used for pretraining: the shared ELMo encoder plus a
# projection with one output per entry of the `glove` dictionary.
model = Language_model(
    Elmo_model=elmo,
    vocab_size=len(glove),
    embedding_dim=EMBEDDING_DIM,
)

In [None]:
# ---- Pretraining of the language model ----
model.to(DEVICE)

# The collate function pads targets with 0, which is the index of <pad>.
# Those positions carry no information and must not contribute to the loss.
PAD_IDX = word_2_idx['<pad>']
criterion = nn.NLLLoss(ignore_index=PAD_IDX)

# define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
best_loss = 1000000
best_accuracy = 0


def accuracy(output, label, ignore_index=None):
    """Fraction of positions whose arg-max class equals the label.

    Args:
        output: scores with the class axis at dim 1.
        label: target indices, same shape as ``output`` without dim 1.
        ignore_index: optional label value; positions carrying it are left out.
            With the default ``None`` every position is counted.

    Returns:
        0-dim float tensor with the mean accuracy (0 if nothing is left to count).
    """
    correct = output.argmax(dim=1) == label
    if ignore_index is None:
        return correct.float().mean()
    keep = label != ignore_index
    if not keep.any():
        return torch.zeros((), device=label.device)
    return correct[keep].float().mean()


EVAL_EVERY = 15  # run validation (and report the running loss) every this many steps
steps = 0
running_loss = 0

# A single pass over the training data; the EPOCHS constant is not used here.
for epoch in range(1):
    print('epoch: ', epoch)
    # halve the learning rate every third epoch (never triggers with one epoch)
    if epoch % 3 == 0 and epoch != 0:
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] / 2
    for batch_input, batch_label in pretrain_loaders['train']:
        steps += 1
        optimizer.zero_grad()
        batch_input = batch_input.to(DEVICE)
        batch_label = batch_label.to(DEVICE)
        output = model(batch_input)
        loss = criterion(output, batch_label)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
        if steps % EVAL_EVERY == 0:
            model.eval()
            with torch.no_grad():
                # accumulate plain floats so no tensors are kept alive
                val_loss = 0.0
                val_accuracy = 0.0
                for val_input, val_label in pretrain_loaders['validation']:
                    val_input = val_input.to(DEVICE)
                    val_label = val_label.to(DEVICE)
                    val_output = model(val_input)
                    val_loss += criterion(val_output, val_label).item()
                    val_accuracy += accuracy(val_output, val_label, ignore_index=PAD_IDX).item()
                val_loss = val_loss / len(pretrain_loaders['validation'])
                val_accuracy = val_accuracy / len(pretrain_loaders['validation'])
                if val_loss < best_loss:
                    best_loss = val_loss
                    torch.save(model.state_dict(), 'best_loss.pth')
                if val_accuracy > best_accuracy:
                    best_accuracy = val_accuracy
                    torch.save(model.state_dict(), 'best_accuracy.pth')
                # running_loss covers exactly EVAL_EVERY steps, so that is the divisor
                print( 'train loss: ', running_loss/EVAL_EVERY, 'validation loss: ', val_loss, 'validation accuracy: ', val_accuracy)
                running_loss = 0
            model.train()

epoch:  0


: 

: 

In [None]:
# ---- Fine-tuning for binary sentiment classification ----
sem = Semantic_analysis(elmo, vocab_size=len(glove), embedding_dim=EMBEDDING_DIM, num_classes=1)

# Freeze the encoder; only its three scalar mixing weights (and the new linear
# layer of `sem`) remain trainable.
for param in elmo.parameters():
    param.requires_grad = False
elmo.weight1.requires_grad = True
elmo.weight2.requires_grad = True
elmo.lambda1.requires_grad = True
for params in sem.parameters():
    print(params.requires_grad)


def vaccuracy(output, target, threshold=0.5):
    """Computes accuracy for float values between 0 and 1.

    Both ``output`` and ``target`` are binarised at ``threshold`` before being
    compared; the result is the fraction of matching entries along dim 0.
    """
    with torch.no_grad():
        predicted = (output >= threshold).float()
        target = (target >= threshold).float()
        correct = (predicted == target).sum().item()
        total = target.size(0)
        acc = correct / total
    return acc


sem.to(DEVICE)
criterion = nn.BCELoss()

# define the optimizer
optimizer = torch.optim.Adam(sem.parameters(), lr=LEARNING_RATE)
best_loss = 1000000
best_accuracy = 0
steps = 0
EPOCHS= 3
EVAL_EVERY = 15  # run validation (and report the running loss) every this many steps
running_loss = 0
for e in range(EPOCHS):
    print('epoch: ', e)
    # halve the learning rate every third epoch
    if e % 3 == 0 and e != 0:
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] / 2
    for batch_input, batch_label in sman_loaders['train']:
        steps += 1
        optimizer.zero_grad()
        batch_input = batch_input.to(DEVICE)
        # as_tensor accepts both a Python list and a tensor of labels and gives
        # the float tensor on DEVICE that BCELoss needs (a list has no `.to`)
        batch_label = torch.as_tensor(batch_label, dtype=torch.float, device=DEVICE)
        output = sem(batch_input).squeeze(dim=1)

        loss = criterion(output, batch_label)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
        if steps % EVAL_EVERY == 0:
            sem.eval()
            with torch.no_grad():
                # accumulate plain floats so no tensors are kept alive
                val_loss = 0.0
                val_accuracy = 0.0
                for val_input, val_label in sman_loaders['validation']:
                    val_input = val_input.to(DEVICE)
                    val_label = torch.as_tensor(val_label, dtype=torch.float, device=DEVICE)
                    val_output = sem(val_input).squeeze(dim=1)

                    val_loss += criterion(val_output, val_label).item()
                    val_accuracy += vaccuracy(val_output, val_label)

                val_loss = val_loss / len(sman_loaders['validation'])
                val_accuracy = val_accuracy / len(sman_loaders['validation'])
                if val_loss < best_loss:
                    best_loss = val_loss
                    torch.save(sem.state_dict(), 'bl.pth')
                if val_accuracy > best_accuracy:
                    best_accuracy = val_accuracy
                    torch.save(sem.state_dict(), 'ba.pth')
                # running_loss covers exactly EVAL_EVERY steps, so that is the divisor
                print( 'train loss: ', running_loss/EVAL_EVERY, 'validation loss: ', val_loss, 'validation accuracy: ', val_accuracy)
                running_loss = 0
            sem.train()

