In [1]:
import pandas as pd
import numpy as np
import regex as re
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns


In [2]:
# Checking if cuda supported GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device


device(type='cuda')

In [3]:
train_file = pd.read_csv(
    '../data/augmented_training_file/final_train_file2.csv')
final_test_file = pd.read_csv('../data/final_test_file.csv')


In [4]:
data_per_class = Counter(train_file['label'])
print("Number of rows grouped by label are:")
for i, label in enumerate(data_per_class):
    print(
        f"{label} class rows: {data_per_class[label]} ({data_per_class[label] / train_file.shape[0]*100:.3f})% of total train data")


Number of rows grouped by label are:
0 class rows: 175598 (40.240)% of total train data
2 class rows: 112302 (25.735)% of total train data
1 class rows: 148476 (34.025)% of total train data


In [5]:
from sklearn.utils import class_weight  # .compute_class_weight
class_weights = class_weight.compute_class_weight(
    'balanced', np.unique(train_file['label']), train_file['label'])
class_weights = torch.tensor(class_weights, dtype=torch.float)
print(
    f"Class weights to be used for CrossEntropyLoss (increases loss to classes with less data to balance the learning)\n {class_weights}")


Class weights to be used for CrossEntropyLoss (increases loss to classes with less data to balance the learning)
 tensor([0.8284, 0.9797, 1.2952])


1         0
2         0
3         0
4         0
         ..
436371    1
436372    1
436373    1
436374    1
436375    1
Name: label, Length: 436376, dtype: int64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


In [6]:
test_split_size = 0.3  # percentage of test split
train_split, test_split = train_test_split(
    train_file, test_size=test_split_size, stratify=train_file['label'], random_state=9)


In [7]:
# Creating customised Image dataset to be used for dataloader
class CustomDataset(Dataset):
    def __init__(self, data_frame):
        # print(data_frame['encoded_titles_combined'])
        self.data = data_frame
        self.label = data_frame['label']
        self.length=len(data_frame)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        # label_encoding = {"unrelated": 0, "agreed": 1, "disagreed": 2}
        # print(type(self.data[idx]), self.data[idx])
        # print(self.data['title1_en_encoded'][idx])
        # print(self.data['title2_en_encoded'][idx])
        title_1 = self.data['title1_en_encoded'][idx]
        title_1= np.array(title_1[1:-1].split(), dtype=int)
        title_1=torch.tensor(title_1)

        title_2 = self.data['title2_en_encoded'][idx]
        title_2= np.array(title_2[1:-1].split(), dtype=int)
        title_2=torch.tensor(title_2)

        label = torch.tensor(self.label[idx]).long()
        data=(title_1,title_2,label)
        return data
    

In [8]:
train_split_dataset=CustomDataset(train_split.reset_index())
test_split_dataset=CustomDataset(test_split.reset_index())

In [9]:
batch = 10
loaders = {
    'train': DataLoader(train_split_dataset, batch_size=batch, shuffle=True, num_workers=0),
    'test': DataLoader(test_split_dataset, batch_size=batch, shuffle=True, num_workers=0),
}


In [10]:

class LSTM_simple(nn.Module):
    def __init__(self,vocab_size,embed_dimension,lstm_units,hidden_dimension):
        super().__init__()
        self.lstm_block=nn.Sequential(
            nn.Embedding(vocab_size,embed_dimension),
            nn.LSTM(embed_dimension,lstm_units,hidden_dimension,bidirectional=True,batch_first=True),
            
        )
        self.out_layer = nn.Linear(2*hidden_dimension, 3)



    def forward(self, title1,title2):
        title1=title1.to(device=device)
        title2=title2.to(device=device)

        title1_lstm_out,(ht_1,ct_1)= self.lstm_block(title1)
        title2_lstm_out, (ht_2, ct_2) = self.lstm_block(title2)


        out = torch.cat((ht_1[-1, :, :], ht_2[-1, :, :]), dim=1)
        # print(ht_1.size(), ht_2.size(), out.size())
        out=self.out_layer(out)
        # lstm_out, (ht, ct) = self.lstm_layer1(embedded_text)
        # ht=torch.cat((ht[-2, :, :], ht[-1, :, :]), dim=1)
        # print(ht.size())
        # ht=self.full_layer1(ht)
        # print(ht.size())
        # ht=self.RELU(ht)
        # ht = self.full_layer2(ht)
        # ht = self.RELU2(ht)
        # ht=self.out_layer(ht)
        return out

        # return self.full_layer1(ht[-1]).to(device=device)


LSTM_simple_model = LSTM_simple(49491, 256, 128, 128).to(device=device) # 49491 is vocab len
optimizer=optim.SGD(LSTM_simple_model.parameters(),lr=0.01,momentum=0.9)
class_weights=class_weights.to(device=device)
loss_func=nn.CrossEntropyLoss(weight=class_weights)

In [11]:
LSTM_simple_model


LSTM_simple(
  (lstm_block): Sequential(
    (0): Embedding(49491, 256)
    (1): LSTM(256, 128, num_layers=128, batch_first=True, bidirectional=True)
  )
  (out_layer): Linear(in_features=256, out_features=3, bias=True)
)

In [12]:
# return the accuracy of the test_split
def validation(model):
    # Test the model
    model.eval()
    with torch.no_grad():
        for a,b, labels in loaders['test']:
            labels = labels.to(device=device)
            test_output = model(a,b)
            loss = loss_func(test_output, labels)
            pred_y = torch.argmax(test_output, 1).data.squeeze()
            accuracy = (pred_y == labels).sum().item() / float(labels.size(0))
        return accuracy, loss
# validation()


In [13]:
num_epochs = 1
training_loss_list = []
train_accuracy_list = []
validation_loss_list = []
validation_accuracy_list = []
def train(num_epochs, LSTM_simple_model, loaders):
    LSTM_simple_model.train()
    # Train the model
    total_step = len(loaders['train'])

    for epoch in range(num_epochs):

        training_loss = []
        train_accuracy = []
        validation_loss = []
        validation_accuracy = []
    
        for i, (data1,data2, labels) in enumerate(loaders['train']):
            step_acc = 0.0
            # gives batch data, normalize x when iterate train_loader
            b_x = Variable(data1)   # batch x1
            b_x2=Variable(data2)
            b_y = Variable(labels)   # batch y
            b_x = b_x.to(device=device)
            b_x2=b_x2.to(device=device)
            b_y = b_y.to(device=device)

            # clear gradients for this training step
            optimizer.zero_grad()
            # predicted output from the net
            output = LSTM_simple_model(b_x, b_x2)
            pred_y = torch.argmax(output, 1).data.squeeze()

            step_acc = (pred_y == b_y).sum().item() / float(labels.size(0))
            #calc cross entropy loss
            loss = loss_func(output, b_y)
            train_accuracy.append(step_acc)
            training_loss.append(loss.item())
            if(i%50==0):
                val_acc, val_loss = validation(LSTM_simple_model)
                validation_accuracy.append(val_acc)
                validation_loss.append(val_loss)
            # backpropagation, compute gradients
            loss.backward()     # apply gradients
            optimizer.step()
            
        
            if(i%10==0):
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, acc: {:.4f}' .format(epoch + 1, num_epochs, i + 1, total_step, loss.item(), step_acc))
        training_loss_list.append(np.mean(training_loss))
        train_accuracy_list.append(np.mean(train_accuracy))
        validation_loss_list.append(np.mean(validation_loss))
        validation_accuracy_list.append(np.mean(validation_accuracy))


train(num_epochs, LSTM_simple_model, loaders)

In [None]:
torch.save(LSTM_simple_model.state_dict(),'./saved_models/lstm_model_siamese.pt')