In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import json
import gensim.downloader as api
from sklearn.metrics import f1_score
import numpy as np
from gensim.models import KeyedVectors
import pickle

In [2]:
def _load_json(path):
    """Return the object decoded from the JSON file at ``path``.

    The file is opened explicitly as UTF-8 so decoding does not depend on the
    platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    NOTE(security): unpickling can execute arbitrary code. Only point this at
    embedding files you produced yourself or otherwise trust.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Pre-processed train / validation / test splits for task 1.
task1_train_data = _load_json('Json Task1/train_processed.json')
task1_val_data = _load_json('Json Task1/val_processed.json')
task1_test_data = _load_json('Json Task1/test_processed.json')

# Pre-processed train / validation / test splits for task 2.
task2_train_data = _load_json('Json Task2/train_processed.json')
task2_val_data = _load_json('Json Task2/val_processed.json')
task2_test_data = _load_json('Json Task2/test_processed.json')

# Pickled embedding tables; Task_data looks words up in them with .get(word, default).
glove_embeddings = _load_pickle('glove_embedding.pkl')
fast_text_embedding = _load_pickle('fast_text_embedding.pkl')

# Task 1 label -> class index: a B_/I_ pair for each of 13 entity types plus 'O' (27 classes).
bio_mapping_task1 = {'B_ORG': 0, 'I_ORG': 1, 'B_RESPONDENT': 2, 'I_RESPONDENT': 3, 'B_JUDGE': 4, 'I_JUDGE': 5,
               'B_STATUTE': 6, 'I_STATUTE': 7, 'B_OTHER_PERSON': 8, 'I_OTHER_PERSON': 9, 'B_COURT': 10, 'I_COURT': 11,
               'B_GPE': 12, 'I_GPE': 13, 'B_PETITIONER': 14, 'I_PETITIONER': 15, 'B_WITNESS': 16, 'I_WITNESS': 17,
               'B_CASE_NUMBER': 18, 'I_CASE_NUMBER': 19, 'B_PRECEDENT': 20, 'I_PRECEDENT': 21, 'B_DATE': 22, 'I_DATE': 23,
               'B_PROVISION': 24, 'I_PROVISION': 25, 'O': 26}
# Task 2 label -> class index (3 classes).
bio_mapping_task2 = {'O' : 0, 'I' : 1, 'B' : 2}

In [3]:
# Load the 'word2vec-google-news-300' model through gensim's downloader API (`api`).
# Later cells read its `.vectors` matrix and its `.index_to_key` vocabulary list.
word2vec_model = api.load('word2vec-google-news-300')


In [4]:

# Pretrained vectors as a float tensor (one row per vocabulary entry).
word2vec = torch.FloatTensor(word2vec_model.vectors)

# Same matrix with one all-zero 300-d row appended at the bottom.
word2vec_u_ = np.concatenate([word2vec, np.zeros((1, 300), dtype=np.float32)])

# Word -> row index, following the order of index_to_key.
word_to_index = dict(zip(word2vec_model.index_to_key, range(len(word2vec_model.index_to_key))))

# The unknown token takes the next free index; that lines up with the appended
# zero row as long as the vocabulary and the vector matrix have the same length.
unknown_token_index = len(word_to_index)
word_to_index['<unk>'] = unknown_token_index

In [11]:

class LSTMModel(nn.Module):
    """Stacked LSTM followed by two linear layers, scoring every time step.

    The model consumes already-embedded inputs; it has no embedding layer.

    Args:
        embedding_dim: Size of each input feature vector (last axis of ``x``).
        output_size: Number of scores produced per time step.
        hidden_size: LSTM hidden-state size (default 128).
        fc_size: Width of the intermediate linear layer (default 64).
        num_layers: Number of stacked LSTM layers (default 2).
    """

    def __init__(self, embedding_dim, output_size, hidden_size=128, fc_size=64, num_layers=2):
        super().__init__()
        # batch_first=True: tensors are laid out as (batch, seq_len, features).
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, fc_size)
        self.fc2 = nn.Linear(fc_size, output_size)

    def forward(self, x):
        """Return per-time-step scores of shape (batch, seq_len, output_size).

        Args:
            x: Embedded input of shape (batch, seq_len, embedding_dim).
        """
        out, _ = self.lstm(x)
        # NOTE: fc1 feeds fc2 directly; there is no activation between them.
        return self.fc2(self.fc1(out))
    
class Task_data(Dataset):
    """Dataset yielding ``(token embeddings, label ids)`` for one sentence per item.

    Args:
        data: Mapping keyed by the string form of the sample index ("0", "1", ...).
            Each value provides 'text' (tokens separated by single spaces) and
            'labels' (a sequence of label strings, presumably one per token).
        bio_index: Mapping from label string to integer class id.
        embedding_type: One of "glove", "word2vec" or "fast_text".

    Raises:
        ValueError: If ``embedding_type`` is not one of the supported names.
    """

    SUPPORTED_EMBEDDINGS = ("glove", "word2vec", "fast_text")
    EMBEDDING_DIM = 300

    def __init__(self, data, bio_index, embedding_type):
        # Fail fast: an unknown name would otherwise produce empty embeddings
        # and only surface later as a shape error inside the model.
        if embedding_type not in self.SUPPORTED_EMBEDDINGS:
            raise ValueError(
                f"embedding_type must be one of {self.SUPPORTED_EMBEDDINGS}, got {embedding_type!r}"
            )
        self.data = data
        self.length = len(self.data)
        self.bio_index = bio_index
        self.embedding_type = embedding_type

    def __len__(self):
        return self.length

    def _embed(self, tokens):
        """Return a float32 array with one embedding row per token."""
        # Shared fallback for out-of-vocabulary words. Safe to share because
        # np.array below copies every row into a fresh matrix.
        zero_vector = np.zeros(self.EMBEDDING_DIM, dtype='float32')
        if self.embedding_type == "glove":
            vectors = [glove_embeddings.get(token, zero_vector) for token in tokens]
        elif self.embedding_type == "word2vec":
            unk_index = word_to_index['<unk>']
            vectors = [word2vec_u_[word_to_index.get(token, unk_index)] for token in tokens]
        elif self.embedding_type == "fast_text":
            vectors = [np.array(fast_text_embedding.get(token, zero_vector), dtype='float32') for token in tokens]
        else:
            # Only reachable if the attribute was changed after construction.
            raise ValueError(f"unsupported embedding_type: {self.embedding_type!r}")
        return np.array(vectors, dtype='float32')

    def __getitem__(self, index):
        """Return ``(embeddings, labels)`` tensors for the sample stored under ``str(index)``."""
        sample = self.data[str(index)]
        sentence_embeddings = self._embed(sample['text'].split(' '))
        output_labels = [self.bio_index[label] for label in sample['labels']]
        return torch.tensor(sentence_embeddings), torch.tensor(output_labels)

In [12]:
def train_model(task, embedding_type, model, optimizer, criterion, device, num_epochs=20, batch_size=1):
    """Train ``model`` on one task and print loss / macro-F1 after every epoch.

    Args:
        task: 1 or 2; selects that task's train and validation splits and its
            label mapping.
        embedding_type: Embedding name forwarded to ``Task_data``.
        model: Module mapping (batch, seq_len, features) inputs to
            (batch, seq_len, n_classes) scores.
        optimizer: Optimizer stepped once per training batch.
        criterion: Loss called separately on every time step; the per-step
            values are summed to give the batch loss.
        device: Device the input and target tensors are moved to.
        num_epochs: Number of passes over the training split.
        batch_size: Batch size for both loaders.
            NOTE(review): no collate_fn is supplied, so batch_size > 1
            presumably needs equal-length sentences within a batch. Confirm.

    Returns:
        Tuple ``(train_losses, train_f1_scores, val_losses, val_f1_scores)``,
        each a list with one entry per epoch.

    Raises:
        ValueError: If ``task`` is neither 1 nor 2.
    """
    # Resolve the splits first so an unsupported task fails here with a clear
    # message instead of later on a None dataloader.
    if task == 1:
        train_data, val_data, bio_mapping = task1_train_data, task1_val_data, bio_mapping_task1
    elif task == 2:
        train_data, val_data, bio_mapping = task2_train_data, task2_val_data, bio_mapping_task2
    else:
        raise ValueError(f"task must be 1 or 2, got {task!r}")

    train_dataloader = DataLoader(Task_data(train_data, bio_mapping, embedding_type), batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(Task_data(val_data, bio_mapping, embedding_type), batch_size=batch_size, shuffle=True)

    def sequence_loss(outputs, targets):
        """Sum of the criterion evaluated independently at each time step."""
        return sum(criterion(outputs[:, step, :], targets[:, step]) for step in range(outputs.size(1)))

    train_losses = []
    val_losses = []
    train_f1_scores = []
    val_f1_scores = []

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        total_train_loss = 0
        all_train_predictions = []
        all_train_targets = []

        for inputs, targets in train_dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = sequence_loss(outputs, targets)

            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

            # Flatten to token level: macro-F1 is computed over every token of the epoch.
            all_train_predictions.extend(outputs.argmax(dim=2).view(-1).cpu().numpy())
            all_train_targets.extend(targets.view(-1).cpu().numpy())

        avg_train_loss = total_train_loss / len(train_dataloader)
        train_losses.append(avg_train_loss)

        train_macro_f1 = f1_score(all_train_targets, all_train_predictions, average='macro')
        train_f1_scores.append(train_macro_f1)
        print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss}, Training Macro F1-Score: {train_macro_f1}")

        # ---- validation pass (no gradient tracking) ----
        model.eval()
        total_val_loss = 0
        all_val_predictions = []
        all_val_targets = []

        with torch.no_grad():
            for val_inputs, val_targets in val_dataloader:
                val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)
                val_outputs = model(val_inputs)

                total_val_loss += sequence_loss(val_outputs, val_targets).item()

                all_val_predictions.extend(val_outputs.argmax(dim=2).view(-1).cpu().numpy())
                all_val_targets.extend(val_targets.view(-1).cpu().numpy())

        avg_val_loss = total_val_loss / len(val_dataloader)
        val_losses.append(avg_val_loss)

        val_macro_f1 = f1_score(all_val_targets, all_val_predictions, average='macro')
        val_f1_scores.append(val_macro_f1)
        print(f"Epoch {epoch + 1},  Validation Loss: {avg_val_loss}, Validation Macro F1-Score: {val_macro_f1}")

    return train_losses, train_f1_scores, val_losses, val_f1_scores

def test_model(task, embedding_type, model, criterion, device, batch_size = 1):
    test_dataloader = None
    if task == 1:
        test_dataloader =  DataLoader(Task_data(task1_test_data, bio_mapping_task1, embedding_type), batch_size=batch_size, shuffle=False)

    elif task == 2:
        test_dataloader =  DataLoader(Task_data(task2_test_data, bio_mapping_task2, embedding_type), batch_size=batch_size, shuffle=False)

    total_test_loss = 0
    all_test_predictions = []
    all_test_targets = []

    with torch.no_grad():
        for test_inputs, test_targets in test_dataloader:
            test_inputs, test_targets = test_inputs.to(device), test_targets.to(device)
            test_outputs = model(test_inputs)

            loss = 0
            for i in range(test_outputs.size(1)):  # Iterate over time steps
                loss += criterion(test_outputs[:, i, :], test_targets[:, i])  

            total_test_loss += loss.item()

            all_test_predictions.extend(test_outputs.argmax(dim=2).view(-1).cpu().numpy())
            all_test_targets.extend(test_targets.view(-1).cpu().numpy())

        avg_test_loss = total_test_loss / len(test_dataloader)
        test_macro_f1 = f1_score(all_test_targets, all_test_predictions, average='macro')
    print(f'Test Loss: {avg_test_loss}, Test Macro F1-Score: {test_macro_f1}')

In [13]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Task 1 with GloVe embeddings: fresh network, loss and optimizer for this run.
model = LSTMModel(embedding_dim=300, output_size=len(bio_mapping_task1)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

train_model(1, "glove", model, optimizer, criterion, device)
test_model(1, "glove", model, criterion, device)

  from .autonotebook import tqdm as notebook_tqdm


Epoch 1, Training Loss: 17.130694535091603, Training Macro F1-Score: 0.12820129134817448
Epoch 1,  Validation Loss: 12.808093923542222, Validation Macro F1-Score: 0.23136399535322597
Epoch 2, Training Loss: 10.39012437543791, Training Macro F1-Score: 0.3214590342947403
Epoch 2,  Validation Loss: 9.989640004081798, Validation Macro F1-Score: 0.3568401825623714
Epoch 3, Training Loss: 8.615191780202105, Training Macro F1-Score: 0.3884207227845056
Epoch 3,  Validation Loss: 8.759110071452096, Validation Macro F1-Score: 0.3897291836480331
Epoch 4, Training Loss: 7.679853583662675, Training Macro F1-Score: 0.42256363502159916


In [None]:
# Task 1 with word2vec embeddings: fresh network, loss and optimizer so nothing
# carries over from an earlier run.
model = LSTMModel(embedding_dim=300, output_size=len(bio_mapping_task1)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
train_model(1, "word2vec", model, optimizer, criterion, device)
test_model(1, "word2vec", model, criterion, device)

In [None]:
# Task 1 with fastText embeddings: fresh network, loss and optimizer so nothing
# carries over from an earlier run.
model = LSTMModel(embedding_dim=300, output_size=len(bio_mapping_task1)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
train_model(1, "fast_text", model, optimizer, criterion, device)
test_model(1, "fast_text", model, criterion, device)

In [None]:
# Task 2 with fastText embeddings.
# Size the output layer from the task-2 label mapping (3 labels) rather than
# reusing task 1's 27 classes. Surplus logits could still be predicted, and
# each would count as an extra zero-F1 class in the macro average.
model = LSTMModel(300, len(bio_mapping_task2)).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
train_model(task = 2, embedding_type = "fast_text", model = model, optimizer=optimizer, criterion=criterion, device=device)
test_model(task = 2, embedding_type = "fast_text", model = model, criterion=criterion, device=device)

In [None]:
# Task 2 with word2vec embeddings.
# Size the output layer from the task-2 label mapping (3 labels) rather than
# reusing task 1's 27 classes. Surplus logits could still be predicted, and
# each would count as an extra zero-F1 class in the macro average.
model = LSTMModel(300, len(bio_mapping_task2)).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
train_model(task = 2, embedding_type = "word2vec", model = model, optimizer=optimizer, criterion=criterion, device=device)
test_model(task = 2, embedding_type = "word2vec", model = model, criterion=criterion, device=device)

In [None]:
# Task 2 with GloVe embeddings.
# Size the output layer from the task-2 label mapping (3 labels) rather than
# reusing task 1's 27 classes. Surplus logits could still be predicted, and
# each would count as an extra zero-F1 class in the macro average.
model = LSTMModel(300, len(bio_mapping_task2)).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

train_model(task = 2, embedding_type = "glove", model = model, optimizer=optimizer, criterion=criterion, device=device)
test_model(task = 2, embedding_type = "glove", model = model, criterion=criterion, device=device)