# Imports

In [31]:
import torch 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import os
import json
import gensim
import re
import torch.optim as optim
from torchtext.vocab import GloVe
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, f1_score
from gensim.models import KeyedVectors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
with open('../processed/LR_train_tagged.json') as f: 
    LR_train = json.load(f)
    
with open('../processed/LR_test_tagged.json') as f:
    LR_test = json.load(f)
    
with open('../processed/LR_val_tagged.json') as f:
    LR_val = json.load(f)

with open('../processed/NER_train_tagged.json') as f:
    NER_train = json.load(f)

with open('../processed/NER_test_tagged.json') as f:
    NER_test = json.load(f)

with open('../processed/NER_val_tagged.json') as f:
    NER_val = json.load(f)


In [3]:
w2vmodel = KeyedVectors.load_word2vec_format('../.vector_cache/GoogleNews-vectors-negative300.bin', binary=True)
glovemodel = GloVe(name='6B', dim=300)

In [8]:
unique_labels = []
for entry in NER_train:
    labels = entry['labels']
    for label in labels:
        unique_labels.append(label)

label_dict = {}
unique_labels = list(set(unique_labels))

for label_index in range(len(unique_labels)):
    label_dict[unique_labels[label_index]] = label_index
    
# print(label_dict)

{'B_PRECEDENT': 0, 'I_PROVISION': 1, 'B_PROVISION': 2, 'B_ORG': 3, 'I_STATUTE': 4, 'B_DATE': 5, 'I_WITNESS': 6, 'I_DATE': 7, 'B_STATUTE': 8, 'I_PRECEDENT': 9, 'I_PETITIONER': 10, 'B_RESPONDENT': 11, 'I_ORG': 12, 'B_CASE_NUMBER': 13, 'I_OTHER_PERSON': 14, 'B_GPE': 15, 'B_COURT': 16, 'O': 17, 'B_WITNESS': 18, 'I_COURT': 19, 'I_CASE_NUMBER': 20, 'I_GPE': 21, 'I_JUDGE': 22, 'I_RESPONDENT': 23, 'B_JUDGE': 24, 'B_PETITIONER': 25, 'B_OTHER_PERSON': 26}


In [505]:
maxlen = 0
id = 0
for entry in NER_train:
    if (maxlen < len(entry['text'].split())):
        maxlen = len(entry['text'].split())
        id = entry['id']
print(maxlen)
print(id)

70
dc09913bba844e3c8d920c9df2970988


In [37]:
class NER_dataset(Dataset):
    def __init__(self, dataset, embedding="word2vec"):
        self.padding_word = "PAD"
        if embedding == "word2vec":
            self.model = w2vmodel
        elif embedding == "glove":
            self.model = glovemodel
        self.data = dataset
        self.input = []
        self.labels = []
        for entry in dataset:
            padded_entry = entry["text"].split() + [self.padding_word] * (75- len(entry["text"].split()))
            self.input.append(padded_entry)
            padded_labels = entry["labels"] + ["O"] * (75 - len(entry["labels"]))
            self.labels.append(padded_labels)

        self.tag_to_index = label_dict

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        sentence = self.input[idx]
        word_vecs = []
        labels = []
        for word_index in range(len(sentence)):
            if sentence[word_index] not in self.model:
                word_vecs.append(np.zeros(300))
                labels.append(self.tag_to_index["O"])
            else:
                word_vecs.append(self.model[sentence[word_index]])
                labels.append(self.tag_to_index[self.labels[idx][word_index]])

        return torch.tensor(word_vecs), torch.tensor(labels)

In [38]:
NER_train_loader = DataLoader(NER_dataset(NER_train, "word2vec"), batch_size=64, shuffle=True)
NER_val_loader = DataLoader(NER_dataset(NER_val, "word2vec"), batch_size=64, shuffle=True)
NER_test_loader = DataLoader(NER_dataset(NER_test, "word2vec"), batch_size=64, shuffle=True)

In [39]:
def train(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):

    train_losses = []
    val_losses = []
    train_f1_scores = []
    val_f1_scores = []
    train_accuracy_scores = []
    val_accuracy_scores = []

    for epoch in range(num_epochs):
        train_loss = 0.0
        val_loss = 0.0
        all_train_labels = []
        all_train_preds = []
        all_val_labels = []
        all_val_preds = []

        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
            outputs, hn = model(inputs)
            outputs = outputs.reshape(-1, outputs.shape[-1])
            labels = labels.reshape(-1)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            all_train_labels.extend(labels.cpu().numpy())
            all_train_preds.extend(predicted.cpu().numpy())

        train_accuracy = accuracy_score(all_train_labels, all_train_preds)
        train_f1 = f1_score(all_train_labels, all_train_preds, average='macro')
        train_accuracy_scores.append(train_accuracy)
        train_f1_scores.append(train_f1)

        model.eval()
        with torch.no_grad():

            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
                outputs, hn = model(inputs)
                outputs = outputs.reshape(-1, outputs.shape[-1])
                labels = labels.reshape(-1)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                _, predicted = torch.max(outputs, 1)
                all_val_labels.extend(labels.cpu().numpy())
                all_val_preds.extend(predicted.cpu().numpy())

            val_accuracy = accuracy_score(all_val_labels, all_val_preds)
            val_f1 = f1_score(all_val_labels, all_val_preds, average='macro')
            val_accuracy_scores.append(val_accuracy)
            val_f1_scores.append(val_f1)

        train_losses.append(train_loss/len(train_loader))
        val_losses.append(val_loss/len(val_loader))
        print(f"Epoch {epoch + 1}\n"
              f"Train loss: {train_losses[-1]} Val loss: {val_losses[-1]}\n"
              f"Train accuracy: {train_accuracy} Val accuracy: {val_accuracy}\n"
              f"Train F1: {train_f1} Val F1: {val_f1}")
        print("=====================================================================================================")
        

In [40]:
# model = torch.nn.RNN(300, 27, 1, batch_first=True)
model = torch.nn.LSTM(300, 27, 1, batch_first=True, bidirectional=True)
# model = torch.nn.GRU(300, 27, 1, batch_first=True, bidirectional=True)
model = model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [41]:
train(model, NER_train_loader, NER_val_loader, criterion, optimizer, num_epochs=10)

Epoch 1
Train loss: 3.728897319899665 Val loss: 3.6773921925088633
Train accuracy: 0.4803125909298749 Val accuracy: 0.5821280602636535
Train F1: 0.013192550859904392 Val F1: 0.016300926947764298
Epoch 2
Train loss: 3.643454010524447 Val loss: 3.6110207930855127
Train accuracy: 0.6081689321195494 Val accuracy: 0.6437099811676082
Train F1: 0.015126597213781916 Val F1: 0.01697252987537214
Epoch 3
Train loss: 3.5818128756114413 Val loss: 3.552329954893693
Train accuracy: 0.6771002203100969 Val accuracy: 0.7168926553672317
Train F1: 0.016131097270643425 Val F1: 0.018641612772841133
Epoch 4
Train loss: 3.530968427658081 Val loss: 3.5069762209187383
Train accuracy: 0.7579099638358897 Val accuracy: 0.7981826741996233
Train F1: 0.017033437198460795 Val F1: 0.01965104014940729
Epoch 5
Train loss: 3.48833039450267 Val loss: 3.4665198222450586
Train accuracy: 0.829320364135179 Val accuracy: 0.8561016949152542
Train F1: 0.018020016625602566 Val F1: 0.020512165403590365
Epoch 6
Train loss: 3.4525413

In [42]:
model.eval()
all_test_labels = []
all_test_preds = []
with torch.no_grad():
    for inputs, labels in NER_test_loader:
        inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
        outputs, hn = model(inputs)
        outputs = outputs.reshape(-1, outputs.shape[-1])
        labels = labels.reshape(-1)

        _, predicted = torch.max(outputs, 1)
        all_test_labels.extend(labels.cpu().numpy())
        all_test_preds.extend(predicted.cpu().numpy())

test_accuracy = accuracy_score(all_test_labels, all_test_preds)
test_f1 = f1_score(all_test_labels, all_test_preds, average='macro')
print(f"Test accuracy: {test_accuracy} Test F1: {test_f1}")

Test accuracy: 0.9339936775553214 Test F1: 0.023070996491301135
