# Imports

In [2]:
import torch 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import os
import json
import gensim
import re
import torch.optim as optim
# from torchtext.vocab import GloVe
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, f1_score
from gensim.models import KeyedVectors
from tqdm.notebook import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [70]:
with open('./processed/LR_train_tagged.json') as f: 
    LR_train = json.load(f)
    
with open('./processed/LR_test_tagged.json') as f:
    LR_test = json.load(f)
    
with open('./processed/LR_val_tagged.json') as f:
    LR_val = json.load(f)

with open('./processed/NER_train_tagged.json') as f:
    NER_train = json.load(f)

with open('./processed/NER_test_tagged.json') as f:
    NER_test = json.load(f)

with open('./processed/NER_val_tagged.json') as f:
    NER_val = json.load(f)


In [9]:
w2vmodel = KeyedVectors.load_word2vec_format('./.vector_cache/GoogleNews-vectors-negative300.bin', binary=True)
# glovemodel = GloVe(name='6B', dim=300)

In [77]:
unique_labels = []
for entry in NER_train:
    labels = entry['labels']
    for label in labels:
        unique_labels.append(label)

label_dict = {}
unique_labels = list(set(unique_labels))

for label_index in range(len(unique_labels)):
    label_dict[unique_labels[label_index]] = label_index


label_dict['PAD'] = len(unique_labels)
# label_dict['UNK'] = len(unique_labels) + 1
print(label_dict)

{'B_PETITIONER': 0, 'B_WITNESS': 1, 'I_CASE_NUMBER': 2, 'I_GPE': 3, 'B_JUDGE': 4, 'I_STATUTE': 5, 'B_GPE': 6, 'B_PROVISION': 7, 'B_STATUTE': 8, 'B_CASE_NUMBER': 9, 'I_PETITIONER': 10, 'B_DATE': 11, 'I_PROVISION': 12, 'B_ORG': 13, 'I_WITNESS': 14, 'I_JUDGE': 15, 'B_OTHER_PERSON': 16, 'O': 17, 'I_OTHER_PERSON': 18, 'I_RESPONDENT': 19, 'B_RESPONDENT': 20, 'B_COURT': 21, 'I_COURT': 22, 'I_DATE': 23, 'B_PRECEDENT': 24, 'I_PRECEDENT': 25, 'I_ORG': 26, 'PAD': 27}


In [71]:
maxlen = 0
id = 0
for entry in NER_train:
    if (len(entry['text'].split()) != len(entry['labels'])):
        print(entry['id'])
        print(len(entry['text'].split()))
        print(len(entry['labels']))


In [43]:
class NER_dataset(Dataset):
    def __init__(self, dataset, embedding="word2vec",padding = True):
        self.padding_word = "PAD"
        if embedding == "word2vec":
            self.model = w2vmodel
        elif embedding == "glove":
            self.model = glovemodel
        self.data = dataset
        self.input = []
        self.labels = []
        if padding:
            for entry in dataset:
                padded_entry = entry["text"].split() + [self.padding_word] * (75- len(entry["text"].split()))
                self.input.append(padded_entry)
                padded_labels = entry["labels"] + [self.padding_word] * (75 - len(entry["labels"]))
                self.labels.append(padded_labels)
        else:
            for entry in dataset:
                self.input.append(entry["text"].split())
                self.labels.append(entry["labels"])
        self.tag_to_index = label_dict

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        sentence = self.input[idx]
        word_vecs = []
        labels = []
        for word_index in range(len(sentence)):
            if sentence[word_index] not in self.model:
                word_vecs.append(np.zeros(300))
                labels.append(self.tag_to_index["O"])
                continue
            else:
                word_vecs.append(self.model[sentence[word_index]])
                labels.append(self.tag_to_index[self.labels[idx][word_index]])
            
            

        return torch.tensor(word_vecs), torch.tensor(labels)

In [90]:
NER_train_loader = DataLoader(NER_dataset(NER_train, "word2vec",padding= True), batch_size=256, shuffle=True)
NER_val_loader = DataLoader(NER_dataset(NER_val, "word2vec",padding= True), batch_size=256, shuffle=True)
NER_test_loader = DataLoader(NER_dataset(NER_test, "word2vec",padding= True), batch_size=25, shuffle=True)

In [91]:
def train(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):

    train_losses = []
    val_losses = []
    train_f1_scores = []
    val_f1_scores = []
    train_accuracy_scores = []
    val_accuracy_scores = []

    for epoch in range(num_epochs):
        train_loss = 0.0
        val_loss = 0.0
        all_train_labels = []
        all_train_preds = []
        all_val_labels = []
        all_val_preds = []

        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
            outputs, hn = model(inputs)
            # print("old shape: ", outputs.shape)
            outputs = outputs.reshape(-1, outputs.shape[-1])
            # print("new shape: ", outputs.shape)
            labels = labels.reshape(-1)
            # print("labels shape: ", labels.shape)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            all_train_labels.extend(labels.cpu().numpy())
            all_train_preds.extend(predicted.cpu().numpy())

        train_accuracy = accuracy_score(all_train_labels, all_train_preds)
        train_f1 = f1_score(all_train_labels, all_train_preds, average='macro')
        train_accuracy_scores.append(train_accuracy)
        train_f1_scores.append(train_f1)

        model.eval()
        with torch.no_grad():

            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
                outputs, hn = model(inputs)
                outputs = outputs.reshape(-1, outputs.shape[-1])
                labels = labels.reshape(-1)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                _, predicted = torch.max(outputs, 1)
                all_val_labels.extend(labels.cpu().numpy())
                all_val_preds.extend(predicted.cpu().numpy())

            val_accuracy = accuracy_score(all_val_labels, all_val_preds)
            val_f1 = f1_score(all_val_labels, all_val_preds, average='macro')
            val_accuracy_scores.append(val_accuracy)
            val_f1_scores.append(val_f1)

        train_losses.append(train_loss/len(train_loader))
        val_losses.append(val_loss/len(val_loader))
        print(f"Epoch {epoch + 1}\n"
              f"Train loss: {train_losses[-1]}, Val loss: {val_losses[-1]}\n"
              f"Train accuracy: {train_accuracy}, Val accuracy: {val_accuracy}\n"
              f"Train F1: {train_f1}, Val F1: {val_f1}")
        print("=====================================================================================================")
        

In [94]:
class SequentialModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.rnn = nn.LSTM(300, len(label_dict), batch_first=True)
        self.softmax = nn.Softmax(dim=1)

        self.relu = nn.ReLU()

    def forward(self, x):
        x, hn = self.rnn(x)
        x = self.softmax(x)
        return x, hn

In [99]:
# model = torch.nn.RNN(300, len(label_dict), 2, batch_first=True)
# model = torch.nn.LSTM(300, len(label_dict), 1, batch_first=True)
# model = torch.nn.GRU(300, len(label_dict), 1, batch_first=True)
model = SequentialModel()
model = model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [100]:
train(model, NER_train_loader, NER_val_loader, criterion, optimizer, num_epochs=5)

Epoch 1
Train loss: 3.3267336562275887, Val loss: 3.3243383963902793
Train accuracy: 0.402898116972191, Val accuracy: 0.8316854990583804
Train F1: 0.0526225646518862, Val F1: 0.07830649883329392
Epoch 2
Train loss: 3.3239387944340706, Val loss: 3.3236769437789917
Train accuracy: 0.8427418215072536, Val accuracy: 0.8383333333333334
Train F1: 0.0782981264827448, Val F1: 0.07835295189758706
Epoch 3
Train loss: 3.3234868869185448, Val loss: 3.3232593139012656
Train accuracy: 0.8394679303321279, Val accuracy: 0.8445574387947269
Train F1: 0.08068436855650964, Val F1: 0.09280079798744871
Epoch 4
Train loss: 3.3231963738799095, Val loss: 3.3231091499328613
Train accuracy: 0.8632265037203308, Val accuracy: 0.8744256120527307
Train F1: 0.10376711628339848, Val F1: 0.10728149288169488
Epoch 5
Train loss: 3.3230617344379425, Val loss: 3.3229779402414956
Train accuracy: 0.8775774202934696, Val accuracy: 0.8794632768361582
Train F1: 0.11399479032489397, Val F1: 0.11477097275680814


In [140]:
model.eval()
all_test_labels = []
all_test_preds = []
with torch.no_grad():
    for inputs, labels in NER_test_loader:
        inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
        outputs, hn = model(inputs)
        outputs = outputs.reshape(-1, outputs.shape[-1])
        labels = labels.reshape(-1)

        _, predicted = torch.max(outputs, 1)
        all_test_labels.extend(labels.cpu().numpy())
        all_test_preds.extend(predicted.cpu().numpy())

test_accuracy = accuracy_score(all_test_labels, all_test_preds)
test_f1 = f1_score(all_test_labels, all_test_preds, average='macro')
print(f"Test accuracy: {test_accuracy} Test F1: {test_f1}")

Test accuracy: 0.9632465277777778 Test F1: 0.2976833335124903


In [55]:
for input, label, sentence in NER_train_loader:
    print(sentence)
    outputs, hn = model(input.to(device).to(torch.float32))
    # print label and prediction
    _, predicted = torch.max(outputs, 2)
    print(label)
    print(predicted)

    print(outputs.shape)
    break
    

('The injuries of Shish Pal (D-3) are as follows: (i) Incised Wounds at (Lt) pareital area of skull at top size 12 cm x 1-1/2 cms x Brain matter deep obliquely & Tailing Backwards. (ii) Multiple Incised Wounds at (Lt) Head Laterally & at Back side. ', 'The learned Counsel for the parties informed that the matter is still not decided by the larger Bench of the Supreme Court. ', 'therefore, she was not entitled to any damages. ', 'Some of those 15 also had injuries on their persons which appeared to be due to lathi charge at village Khabra. ', 'I am unable to subscribe to the Second reasoning given by Piggott J. in AIR 1916 All 51 that if an appeal against a decree can lie in certain circumstances the decree may be regarded as one open to appeal for purposes of Order 43 Rule 1(d). ', 'According to Sri Haranahalli, the complaint as against respondent/accused No. 2 was definitely maintainable and could be proceeded with against him in view of no bar thereto under section 446 of the Compani