# Part 2 - LSTM

## Imports

In [None]:
import torch 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import os
import json
import gensim
import re
import torch.optim as optim
from torchtext.vocab import GloVe, FastText
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, f1_score
from gensim.models import KeyedVectors
from tqdm.notebook import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load processed data

In [6]:
with open('../data_processed/LR_train_tagged.json') as f: 
    LR_train = json.load(f)
    
with open('../data_processed/LR_test_tagged.json') as f:
    LR_test = json.load(f)
    
with open('../data_processed/LR_val_tagged.json') as f:
    LR_val = json.load(f)

with open('../data_processed/NER_train_tagged.json') as f:
    NER_train = json.load(f)

with open('../data_processed/NER_test_tagged.json') as f:
    NER_test = json.load(f)

with open('../data_processed/NER_val_tagged.json') as f:
    NER_val = json.load(f)


## Load vector embedding models

### Word2Vec


In [10]:
w2vmodel = KeyedVectors.load_word2vec_format('../.vector_cache/GoogleNews-vectors-negative300.bin', binary=True)

### GloVe


In [12]:
# glovemodel = GloVe(name='6B', dim=300)
glove_file = '../.vector_cache/glove.6B.300d.txt'
glovemodel = {}
with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        glovemodel[word] = vector


### Fasttext

In [13]:
# fasttextmodel = FastText()
# fasttext_file = './.vector_cache/wiki.en.vec'
# fastmodel = {}
# with open(fasttext_file, 'r', encoding='utf-8') as f:
#     next(f)
#     for line in f:
#         values = line.rstrip().split(' ')
#         word = values[0]
#         vector = np.array([x for x in values[1:] if x], dtype='float32')
#         fastmodel[word] = vector

## Creating and Training the Models

### Create list of unique labels - NER

In [31]:
unique_labels = []
for entry in NER_train:
    labels = entry['labels']
    for label in labels:
        unique_labels.append(label)

label_dict_NER = {}
unique_labels = list(set(unique_labels))

for label_index in range(len(unique_labels)):
    label_dict_NER[unique_labels[label_index]] = label_index

# label_dict_NER['PAD'] = len(unique_labels)
# label_dict['UNK'] = len(unique_labels) + 1

output_dict = {}
for key, value in label_dict_NER.items():
    new_key = key.replace('B_', '').replace('I_', '')
    if new_key not in output_dict:
        output_dict[new_key] = len(output_dict)

label_dict_NER = output_dict
label_dict_NER

{'RESPONDENT': 0,
 'PRECEDENT': 1,
 'CASE_NUMBER': 2,
 'OTHER_PERSON': 3,
 'ORG': 4,
 'PETITIONER': 5,
 'PROVISION': 6,
 'COURT': 7,
 'O': 8,
 'STATUTE': 9,
 'WITNESS': 10,
 'JUDGE': 11,
 'DATE': 12,
 'GPE': 13}

### Create list of unique labels - LR

In [15]:
unique_labels = []
for entry in LR_train:
    labels = entry['labels']
    for label in labels:
        unique_labels.append(label)

label_dict_LR = {}
unique_labels = list(set(unique_labels))

for label_index in range(len(unique_labels)):
    label_dict_LR[unique_labels[label_index]] = label_index

# label_dict_LR['PAD'] = len(unique_labels)
# label_dict['UNK'] = len(unique_labels) + 1
label_dict_LR

{'B': 0, 'I': 1, 'O': 2}

### Verifying LR_train

In [8]:
id = 0
for entry in LR_train:
    if (len(entry['text'].split()) != len(entry['labels'])):
        print(entry['id'])
        print(len(entry['text'].split()))
        print(len(entry['labels']))

### Define NER dataset class

In [16]:
class NER_dataset(Dataset):
    def __init__(self, dataset, embedding="word2vec", padding=True):
        # self.padding_word = "PAD"
        self.padding_word = "O"

        if embedding == "word2vec":
            self.model = w2vmodel
            self.model.add_vector(self.padding_word, np.zeros(300))
        elif embedding == "glove":
            self.model = glovemodel
            self.model[self.padding_word] = np.zeros(300)
        else:
            self.model = fastmodel
            self.model[self.padding_word] = np.zeros(300)
            
        self.data = dataset
        self.input = []
        self.labels = []
        
        for i in range(len(self.data)):
            lst = self.data[i]["labels"]
            for j in range(len(lst)):
                if lst[j].startswith("B_"):
                    lst[j] = lst[j][2:]
                elif lst[j].startswith("I_"):
                    lst[j] = lst[j][2:]
            self.data[i]["labels"] = lst

        if padding:
            for entry in self.data:
                padded_entry = entry["text"].split() + [self.padding_word] * (75 - len(entry["text"].split()))
                self.input.append(padded_entry)
                padded_labels = entry["labels"] + [self.padding_word] * (75 - len(entry["labels"]))
                self.labels.append(padded_labels)
        else:
            for entry in self.data:
                self.input.append(entry["text"].split())
                self.labels.append(entry["labels"])
        self.tag_to_index = label_dict_NER

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        sentence = self.input[idx]
        word_vecs = []
        labels = []
        
        # for word_index in range(len(sentence)):
        #     if sentence[word_index] not in self.model:
        #         word_vecs.append(np.zeros(300))
        #         labels.append(self.tag_to_index["O"])
        #         continue
        #     else:
        #         word_vecs.append(self.model[sentence[word_index]])
        #         labels.append(self.tag_to_index[self.labels[idx][word_index]])

        for word_index in range(len(sentence)):
            word = sentence[word_index]
            if word in self.model:
                word_vecs.append(self.model[word])
            else:
                word_vecs.append(self.model[self.padding_word])
            labels.append(self.tag_to_index[self.labels[idx][word_index]])

        return torch.tensor(word_vecs), torch.tensor(labels)

### Define LR Dataset class

In [17]:
class LR_dataset(Dataset):
    def __init__(self, dataset, embedding="word2vec", padding=True):
        self.padding_word = "PAD"

        if embedding == "word2vec":
            self.model = w2vmodel
            self.model.add_vector(self.padding_word, np.zeros(300))
        elif embedding == "glove":
            self.model = glovemodel
            self.model[self.padding_word] = np.zeros(300)
        elif embedding == "fasttext":
            self.model = fastmodel
            self.model[self.padding_word] = np.zeros(300)
        else:
            raise ValueError("This embedding is not handled.")

        self.data = dataset
        self.input = []
        self.labels = []
        if padding:
            for entry in dataset:
                padded_entry = entry["text"].split() + [self.padding_word] * (78 - len(entry["text"].split()))
                self.input.append(padded_entry)
                padded_labels = entry["labels"] + [self.padding_word] * (78 - len(entry["labels"]))
                self.labels.append(padded_labels)
        else:
            for entry in dataset:
                self.input.append(entry["text"].split())
                self.labels.append(entry["labels"])
        self.tag_to_index = label_dict_LR

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        sentence = self.input[idx]
        word_vecs = []
        labels = []

        # for word_index in range(len(sentence)):
        #     if sentence[word_index] not in self.model:
        #         word_vecs.append(np.zeros(300))
        #         labels.append(self.tag_to_index["O"])
        #         continue
        #     else:
        #         word_vecs.append(self.model[sentence[word_index]])
        #         labels.append(self.tag_to_index[self.labels[idx][word_index]])

        for word_index in range(len(sentence)):
            word = sentence[word_index]
            if word in self.model:
                word_vecs.append(self.model[word])
            else:
                word_vecs.append(self.model[self.padding_word])
            labels.append(self.tag_to_index[self.labels[idx][word_index]])

        return torch.tensor(word_vecs), torch.tensor(labels)

In [18]:
# def train(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):

#     train_losses = []
#     val_losses = []
#     train_f1_scores = []
#     val_f1_scores = []
#     train_accuracy_scores = []
#     val_accuracy_scores = []

#     for epoch in range(num_epochs):
#         train_loss = 0.0
#         val_loss = 0.0
#         all_train_labels = []
#         all_train_preds = []
#         all_val_labels = []
#         all_val_preds = []

#         model.train()
#         for inputs, labels in train_loader:
#             inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
#             outputs, hn = model(inputs)
#             outputs = outputs.reshape(-1, outputs.shape[-1])
#             labels = labels.view(-1)
#             loss = criterion(outputs, labels)
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
#             train_loss += loss.item()

#             _, predicted = torch.max(outputs, 1)
#             all_train_labels.extend(labels.cpu().numpy())
#             all_train_preds.extend(predicted.cpu().numpy())

#         train_accuracy = accuracy_score(all_train_labels, all_train_preds)
#         train_f1 = f1_score(all_train_labels, all_train_preds, average='macro')
#         train_accuracy_scores.append(train_accuracy)
#         train_f1_scores.append(train_f1)

#         model.eval()
#         with torch.no_grad():

#             for inputs, labels in val_loader:
#                 inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
#                 outputs, hn = model(inputs)
#                 outputs = outputs.reshape(-1, outputs.shape[-1])
#                 labels = labels.reshape(-1)
#                 loss = criterion(outputs, labels)
#                 val_loss += loss.item()

#                 _, predicted = torch.max(outputs, 1)
#                 all_val_labels.extend(labels.cpu().numpy())
#                 all_val_preds.extend(predicted.cpu().numpy())

#             val_accuracy = accuracy_score(all_val_labels, all_val_preds)
#             val_f1 = f1_score(all_val_labels, all_val_preds, average='macro')
#             val_accuracy_scores.append(val_accuracy)
#             val_f1_scores.append(val_f1)

#         train_losses.append(train_loss/len(train_loader))
#         val_losses.append(val_loss/len(val_loader))
#         print(f"Epoch {epoch + 1}\n"
#               f"Train loss: {train_losses[-1]}, Val loss: {val_losses[-1]}\n"
#               f"Train accuracy: {train_accuracy}, Val accuracy: {val_accuracy}\n"
#               f"Train F1: {train_f1}, Val F1: {val_f1}")
#         print("=====================================================================================================")

#     # return train_losses, val_losses, train_f1_scores, val_f1_scores, train_accuracy_scores, val_accuracy_scores
        

### Train and evaluate after every epoch

In [19]:
def train(model, train_loader, val_loader, criterion, optimizer, num_classes, num_epochs=10):
    train_losses = []
    val_losses = []
    train_f1_scores = []
    val_f1_scores = []
    train_accuracy_scores = []
    val_accuracy_scores = []

    for epoch in range(num_epochs):
        train_loss = 0.0
        val_loss = 0.0
        all_train_labels = []
        all_train_preds = []
        all_val_labels = []
        all_val_preds = []

        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
            outputs, _ = model(inputs)
            outputs = outputs.reshape(-1, outputs.shape[-1])
            labels = labels.view(-1)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            all_train_labels.extend(labels.cpu().numpy())
            all_train_preds.extend(predicted.cpu().numpy())

        train_correct = sum(np.array(all_train_labels) == np.array(all_train_preds))
        train_accuracy = train_correct / len(all_train_labels)
        train_accuracy_scores.append(train_accuracy)

        train_true_positives = np.zeros(num_classes)
        train_false_positives = np.zeros(num_classes)
        train_false_negatives = np.zeros(num_classes)

        for label, pred in zip(all_train_labels, all_train_preds):
            train_true_positives[label] += int(label == pred)
            for i in range(num_classes):
                if i != label:
                    train_false_positives[i] += int(pred == i)
                    train_false_negatives[i] += int(label == i)

        train_precisions = np.zeros(num_classes)
        train_recalls = np.zeros(num_classes)
        for i in range(num_classes):
            if train_true_positives[i] + train_false_positives[i] > 0:
                train_precisions[i] = train_true_positives[i] / (train_true_positives[i] + train_false_positives[i])
            if train_true_positives[i] + train_false_negatives[i] > 0:
                train_recalls[i] = train_true_positives[i] / (train_true_positives[i] + train_false_negatives[i])

        train_f1 = np.mean([2 * p * r / (p + r) if p + r > 0 else 0 for p, r in zip(train_precisions, train_recalls)])
        train_f1_scores.append(train_f1)

        model.eval()
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
                outputs, _ = model(inputs)
                outputs = outputs.reshape(-1, outputs.shape[-1])
                labels = labels.reshape(-1)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                _, predicted = torch.max(outputs, 1)
                all_val_labels.extend(labels.cpu().numpy())
                all_val_preds.extend(predicted.cpu().numpy())

        val_correct = sum(np.array(all_val_labels) == np.array(all_val_preds))
        val_accuracy = val_correct / len(all_val_labels)
        val_accuracy_scores.append(val_accuracy)

        val_true_positives = np.zeros(num_classes)
        val_false_positives = np.zeros(num_classes)
        val_false_negatives = np.zeros(num_classes)

        for label, pred in zip(all_val_labels, all_val_preds):
            val_true_positives[label] += int(label == pred)
            for i in range(num_classes):
                if i != label:
                    val_false_positives[i] += int(pred == i)
                    val_false_negatives[i] += int(label == i)

        val_precisions = np.zeros(num_classes)
        val_recalls = np.zeros(num_classes)
        for i in range(num_classes):
            if val_true_positives[i] + val_false_positives[i] > 0:
                val_precisions[i] = val_true_positives[i] / (val_true_positives[i] + val_false_positives[i])
            if val_true_positives[i] + val_false_negatives[i] > 0:
                val_recalls[i] = val_true_positives[i] / (val_true_positives[i] + val_false_negatives[i])

        val_f1 = np.mean([2 * p * r / (p + r) if p + r > 0 else 0 for p, r in zip(val_precisions, val_recalls)])
        val_f1_scores.append(val_f1)

        train_losses.append(train_loss / len(train_loader.dataset))
        val_losses.append(val_loss / len(val_loader.dataset))

        print(f"    Epoch {epoch + 1}\n"
              f"    Train loss: {train_losses[-1]}, Val loss: {val_losses[-1]}\n"
              f"    Train accuracy: {train_accuracy}, Val accuracy: {val_accuracy}\n"
              f"    Train F1: {train_f1}, Val F1: {val_f1}")
        print("=====================================================================================================")

    # return train_losses, val_losses, train_accuracy_scores, val_accuracy_scores, train_f1_scores, val_f1_scores

### Create batched and shuffled loaders for **Training, Validation and Testing** - NER

In [20]:
NER_train_loader = DataLoader(NER_dataset(NER_train, "word2vec", padding=True), batch_size=64, shuffle=True)
NER_val_loader = DataLoader(NER_dataset(NER_val, "word2vec", padding=True), batch_size=64, shuffle=True)
NER_test_loader = DataLoader(NER_dataset(NER_test, "word2vec", padding=True), batch_size=64, shuffle=True)



### Create batched and shuffled loaders for **Training, Validation and Testing** - LR

In [21]:
LR_train_loader = DataLoader(LR_dataset(LR_train, "glove", padding=True), batch_size=64, shuffle=True)
LR_val_loader = DataLoader(LR_dataset(LR_val, "glove", padding=True), batch_size=64, shuffle=True)
LR_test_loader = DataLoader(LR_dataset(LR_test, "glove", padding=True), batch_size=64, shuffle=True)

### Modified NN models

In [22]:
class SequentialModel(nn.Module):
    def __init__(self, label_dict, base_model):
        super().__init__()
        self.rnn = base_model
        self.fc = nn.Linear(128, len(label_dict))
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x, hn = self.rnn(x)
        x = self.fc(x)
        x = self.softmax(x)
        
        return x, hn

### Prepare modified model, criterion, optimizer

In [23]:
model = SequentialModel(label_dict_NER, torch.nn.LSTM(300, 128, 1, batch_first=True))
model = model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

### Grid search for tuning of learning rate

In [29]:
lr_iter = 0.002
print("Searching for lr ...")
print(f"\nlr = {lr_iter:.3f}\n")
optimizer = torch.optim.Adam(model.parameters(), lr_iter)
train(model, NER_train_loader, NER_val_loader, criterion, optimizer, len(label_dict_NER), num_epochs=5)

Searching for lr ... 


lr = 0.001



  return torch.tensor(word_vecs), torch.tensor(labels)


    Epoch 1
    Train loss: 0.05153080914320531, Val loss: 0.05315015140899831
    Train accuracy: 0.5265677349627967, Val accuracy: 0.5332485875706214
    Train F1: 0.1065312178309205, Val F1: 0.13451861247853394
    Epoch 2
    Train loss: 0.051402906420700924, Val loss: 0.053127821724293596
    Train accuracy: 0.534073242715218, Val accuracy: 0.5328907721280602
    Train F1: 0.1287373209876349, Val F1: 0.13256847786028375
    Epoch 3
    Train loss: 0.051380481576069226, Val loss: 0.05312525855619355
    Train accuracy: 0.5067514652699838, Val accuracy: 0.49596045197740113
    Train F1: 0.13138672059564407, Val F1: 0.1312779267945173


KeyboardInterrupt: 

### Training the models

In [None]:
train(model, NER_train_loader, NER_val_loader, criterion, optimizer, len(label_dict_NER), num_epochs=5)

Epoch 1
Train loss: 0.05208251868449091, Val loss: 0.05374029716529415
Train accuracy: 0.5714444860123873, Val accuracy: 0.5073822975517891
Train F1: 0.11406214992862855, Val F1: 0.13064175681475834
Epoch 2
Train loss: 0.051970295306080994, Val loss: 0.053719638599514286
Train accuracy: 0.545629130814316, Val accuracy: 0.5539077212806026
Train F1: 0.13046089785504653, Val F1: 0.12804718917792973
Epoch 3
Train loss: 0.05195019670013835, Val loss: 0.053720997720114926
Train accuracy: 0.5254553768134015, Val accuracy: 0.5284463276836158
Train F1: 0.1303692820036663, Val F1: 0.13036614123476492
Epoch 4
Train loss: 0.051937545738786305, Val loss: 0.053708636322937445
Train accuracy: 0.5250047803134222, Val accuracy: 0.5314689265536723
Train F1: 0.13588158682669876, Val F1: 0.13434833544794997
Epoch 5
Train loss: 0.05192759346524919, Val loss: 0.053716085051412635
Train accuracy: 0.5101151432015629, Val accuracy: 0.4848775894538606
Train F1: 0.13766827241793894, Val F1: 0.12773505733142396


In [None]:
model = torch.nn.RNN(300, len(label_dict_LR), 2, batch_first=True)
# model = torch.nn.LSTM(300, len(label_dict_LR), 1, batch_first=True)
# model = torch.nn.GRU(300, len(label_dict_LR), 1, batch_first=True)
# model = SequentialModel(label_dict_LR)
model = model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [None]:
train(model, LR_train_loader, LR_val_loader, criterion, optimizer, num_epochs=5)

Epoch 1
Train loss: 1.502756953239441, Val loss: 1.3631725013256073
Train accuracy: 0.040711496009509254, Val accuracy: 0.049767928907001754
Train F1: 0.07192005568076057, Val F1: 0.08347434607250868
Epoch 2
Train loss: 1.2678397297859192, Val loss: 1.1541132926940918
Train accuracy: 0.06369219448689648, Val accuracy: 0.18200599988679458
Train F1: 0.10194510826826172, Val F1: 0.2025841339466591
Epoch 3
Train loss: 1.078292727470398, Val loss: 0.9914918765425682
Train accuracy: 0.7755136695534046, Val accuracy: 0.9019924152374483
Train F1: 0.4210089303498088, Val F1: 0.46699263247716094
Epoch 4
Train loss: 0.9394924640655518, Val loss: 0.8768957480788231
Train accuracy: 0.9052329201335824, Val accuracy: 0.9088696439689817
Train F1: 0.47050420168262436, Val F1: 0.47501358185201076
Epoch 5
Train loss: 0.8368657901883125, Val loss: 0.7901792228221893
Train accuracy: 0.9101715061979962, Val accuracy: 0.9114450670742061
Train F1: 0.47626451362260286, Val F1: 0.47640940364851114


In [None]:
model.eval()
all_test_labels = []
all_test_preds = []
with torch.no_grad():
    for inputs, labels in NER_test_loader:
        inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
        outputs, hn = model(inputs)
        outputs = outputs.reshape(-1, outputs.shape[-1])
        labels = labels.reshape(-1)

        _, predicted = torch.max(outputs, 1)
        all_test_labels.extend(labels.cpu().numpy())
        all_test_preds.extend(predicted.cpu().numpy())

test_accuracy = accuracy_score(all_test_labels, all_test_preds)
test_f1 = f1_score(all_test_labels, all_test_preds, average='macro')
print(f"Test accuracy: {test_accuracy} Test F1: {test_f1}")

Test accuracy: 0.9632465277777778 Test F1: 0.2976833335124903


In [None]:
model.eval()
all_test_labels = []
all_test_preds = []
with torch.no_grad():
    for inputs, labels in LR_test_loader:
        inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
        outputs, hn = model(inputs)
        outputs = outputs.reshape(-1, outputs.shape[-1])
        labels = labels.reshape(-1)

        _, predicted = torch.max(outputs, 1)
        all_test_labels.extend(labels.cpu().numpy())
        all_test_preds.extend(predicted.cpu().numpy())

test_accuracy = accuracy_score(all_test_labels, all_test_preds)
test_f1 = f1_score(all_test_labels, all_test_preds, average='macro')
print(f"Test accuracy: {test_accuracy} Test F1: {test_f1}")

Test accuracy: 0.9730995641591668 Test F1: 0.4869017166670912


In [None]:
for input, label, sentence in NER_train_loader:
    print(sentence)
    outputs, hn = model(input.to(device).to(torch.float32))
    # print label and prediction
    _, predicted = torch.max(outputs, 2)
    print(label)
    print(predicted)

    print(outputs.shape)
    break
    

('The injuries of Shish Pal (D-3) are as follows: (i) Incised Wounds at (Lt) pareital area of skull at top size 12 cm x 1-1/2 cms x Brain matter deep obliquely & Tailing Backwards. (ii) Multiple Incised Wounds at (Lt) Head Laterally & at Back side. ', 'The learned Counsel for the parties informed that the matter is still not decided by the larger Bench of the Supreme Court. ', 'therefore, she was not entitled to any damages. ', 'Some of those 15 also had injuries on their persons which appeared to be due to lathi charge at village Khabra. ', 'I am unable to subscribe to the Second reasoning given by Piggott J. in AIR 1916 All 51 that if an appeal against a decree can lie in certain circumstances the decree may be regarded as one open to appeal for purposes of Order 43 Rule 1(d). ', 'According to Sri Haranahalli, the complaint as against respondent/accused No. 2 was definitely maintainable and could be proceeded with against him in view of no bar thereto under section 446 of the Compani