# Imports

In [489]:
import torch 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import os
import json
import gensim
import re
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from gensim.models import KeyedVectors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [499]:
with open('../processed/LR_train_tagged.json') as f: 
    LR_train = json.load(f)
    
with open('../processed/LR_test_tagged.json') as f:
    LR_test = json.load(f)
    
with open('../processed/LR_val_tagged.json') as f:
    LR_val = json.load(f)

with open('../processed/NER_train_tagged.json') as f:
    NER_train = json.load(f)

with open('../processed/NER_test_tagged.json') as f:
    NER_test = json.load(f)

with open('../processed/NER_val_tagged.json') as f:
    NER_val = json.load(f)


In [491]:
w2vmodel = KeyedVectors.load_word2vec_format('../.vector_cache/GoogleNews-vectors-negative300.bin', binary=True)

In [503]:
unique_labels = []
for entry in NER_train:
    labels = entry['labels']
    for label in labels:
        unique_labels.append(label)

label_dict = {}
unique_labels = list(set(unique_labels))

for label_index in range(len(unique_labels)):
    label_dict[unique_labels[label_index]] = label_index
    
print(label_dict)

{'O': 0, 'I_STATUTE': 1, 'B_GPE': 2, 'B_CASE_NUMBER': 3, 'B_COURT': 4, 'I_ORG': 5, 'B_STATUTE': 6, 'I_OTHER_PERSON': 7, 'B_DATE': 8, 'I_DATE': 9, 'B_PROVISION': 10, 'I_RESPONDENT': 11, 'I_JUDGE': 12, 'I_PETITIONER': 13, 'I_COURT': 14, 'B_JUDGE': 15, 'I_PRECEDENT': 16, 'B_RESPONDENT': 17, 'I_GPE': 18, 'I_PROVISION': 19, 'B_PRECEDENT': 20, 'B_WITNESS': 21, 'I_CASE_NUMBER': 22, 'B_ORG': 23, 'I_WITNESS': 24, 'B_OTHER_PERSON': 25, 'B_PETITIONER': 26}


In [505]:
maxlen = 0
id = 0
for entry in NER_train:
    if (maxlen < len(entry['text'].split())):
        maxlen = len(entry['text'].split())
        id = entry['id']
print(maxlen)
print(id)

70
dc09913bba844e3c8d920c9df2970988


In [506]:
class NER_dataset(torch.utils.data.Dataset):
    def __init__(self, dataset, embedding="word2vec"):
        self.padding_word = "PAD"
        if (embedding == "word2vec"):
            self.w2v_model = w2vmodel
        self.data = dataset
        self.input = []
        self.labels = []
        for entry in dataset:
            padded_entry = entry["text"].split(" ") + [self.padding_word] * (75- len(entry["text"].split(" ")))
            self.input.append(padded_entry)
            padded_labels = entry["labels"] + ["O"] * (75 - len(entry["labels"]))
            self.labels.append(padded_labels)

        self.tag_to_index = label_dict

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        sentence = self.input[idx]
        word_vecs = []
        labels = []
        for word_index in range(len(sentence)):
            if sentence[word_index] not in self.w2v_model:
                word_vecs.append(np.zeros(300))
                labels.append(self.tag_to_index["O"])
            else:
                word_vecs.append(self.w2v_model[sentence[word_index]])
                labels.append(self.tag_to_index[self.labels[idx][word_index]])

        return torch.tensor(word_vecs), torch.tensor(labels)

In [507]:
NER_train_loader = torch.utils.data.DataLoader(NER_dataset(NER_train), batch_size=64, shuffle=True)

In [508]:
model = torch.nn.RNN(300, 27, 1, batch_first=True)
model = model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [509]:
def train(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    losser = []
    for epoch in range(num_epochs):
        epoch_loss = 0
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
            outputs, hn = model(inputs)
            outputs = outputs.reshape(-1, outputs.shape[-1])
            labels = labels.reshape(-1)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()/len(labels)
        print(f"Epoch {epoch} loss: {epoch_loss/len(NER_train_loader)}")
        losser.append(epoch_loss)
        

In [510]:
train(model, NER_train_loader, criterion, optimizer, num_epochs=10)

Epoch 0 loss: 0.0006135675053399311
Epoch 1 loss: 0.0005672790131274124
