In [73]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import json
import gensim.downloader as api
from sklearn.metrics import f1_score
import numpy as np

In [74]:
with open('Json Task1/train_processed.json', 'r') as f:
    task1_train_data = json.load(f)

with open('Json Task1/val_processed.json', 'r') as f:
    task1_val_data = json.load(f)

with open('Json Task1/test_processed.json', 'r') as f:
    task1_test_data = json.load(f)

with open('Json Task2/train_processed.json', 'r') as f:
    task2_train_data = json.load(f)

with open('Json Task2/val_processed.json', 'r') as f:
    task2_val_data = json.load(f)

with open('Json Task2/test_processed.json', 'r') as f:
    task2_test_data = json.load(f)

bio_mapping_task1 = {'B_ORG': 0, 'I_ORG': 1, 'B_RESPONDENT': 2, 'I_RESPONDENT': 3, 'B_JUDGE': 4, 'I_JUDGE': 5,
               'B_STATUTE': 6, 'I_STATUTE': 7, 'B_OTHER_PERSON': 8, 'I_OTHER_PERSON': 9, 'B_COURT': 10, 'I_COURT': 11,
               'B_GPE': 12, 'I_GPE': 13, 'B_PETITIONER': 14, 'I_PETITIONER': 15, 'B_WITNESS': 16, 'I_WITNESS': 17,
               'B_CASE_NUMBER': 18, 'I_CASE_NUMBER': 19, 'B_PRECEDENT': 20, 'I_PRECEDENT': 21, 'B_DATE': 22, 'I_DATE': 23,
               'B_PROVISION': 24, 'I_PROVISION': 25, 'O': 26}
bio_mapping_task2 = {'O' : 0, 'I' : 1, 'B' : 2}

In [None]:
word2vec_model = api.load('word2vec-google-news-300')


In [75]:
type(word2vec_model.vectors[0][0])

numpy.float32

In [65]:

word2vec = torch.FloatTensor(word2vec_model.vectors)
# word2vec_u = torch.concatenate((word2vec, torch.zeros((1, 300), dtype = torch.float32)), axis = 0)
# print(word2vec_u.shape)
word2vec_u_ = np.concatenate((word2vec, np.zeros((1, 300), dtype = 'float32')), axis = 0)

# Add an unknown token to the vocabulary
word_to_index = {word: index for index, word in enumerate(word2vec_model.index_to_key)}
word_to_index['<unk>'] = len(word_to_index)

# Example usage
unknown_token_index = word_to_index['<unk>']

In [68]:
input_sequence = task1_train_data[str(0)]['text'].split(' ')
sentence_embeddings = []
sentence_embeddings = [word2vec_u_[word_to_index.get(word, word_to_index['<unk>'])] for word in input_sequence]
print(type(sentence_embeddings), type(sentence_embeddings[0][0]))
print(torch.tensor(sentence_embeddings))

<class 'list'> <class 'numpy.float32'>
tensor([[-0.0747, -0.0840,  0.1045,  ..., -0.0152, -0.0417, -0.0376],
        [ 0.0232, -0.1245,  0.0645,  ...,  0.1445,  0.2217,  0.0481],
        [-0.1738, -0.1846,  0.0199,  ..., -0.1709, -0.0233, -0.2656],
        ...,
        [ 0.1074,  0.0127,  0.3281,  ..., -0.0175, -0.1455, -0.1729],
        [ 0.1060,  0.0204,  0.2617,  ..., -0.3066,  0.1094, -0.1650],
        [ 0.0085,  0.2070,  0.0045,  ..., -0.0080,  0.2451, -0.2383]])


In [72]:
task2_test_data

{'0': {'text': ['Boot',
   'time',
   'is',
   'super',
   'fast',
   ',',
   'around',
   'anywhere',
   'from',
   '35',
   'seconds',
   'to',
   '1',
   'minute',
   '.'],
  'labels': ['B',
   'I',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']},
 '1': {'text': ['tech',
   'support',
   'would',
   'not',
   'fix',
   'the',
   'problem',
   'unless',
   'I',
   'bought',
   'your',
   'plan',
   'for',
   '$',
   '150',
   'plus',
   '.'],
  'labels': ['B',
   'I',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']},
 '2': {'text': ['Set', 'up', 'was', 'easy', '.'],
  'labels': ['B', 'I', 'O', 'O', 'O']},
 '3': {'text': ['Did',
   'not',
   'enjoy',
   'the',
   'new',
   'Windows',
   '8',
   'and',
   'touchscreen',
   'functions',
   '.'],
  'labels': ['O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'B', 'I', 'O']},
 '4': {'text': ['Other',
   'than',
   'not',


In [None]:
def load_glove_embeddings(file_path):
    word_embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                values = line.split()
                word = values[0]
                vector = np.array(values[1:], dtype='float32')
                word_embeddings[word] = vector
            except ValueError as e:
                print(f"Error processing line: {line}\nError: {e}")
    return word_embeddings

glove_file_path = 'glove.840B.300d.txt'  # Adjust the path based on your downloaded file
glove_embeddings = load_glove_embeddings(glove_file_path)

In [13]:
# print(glove_embeddings['courts'].shape)

(300,)


In [69]:

class GRUModel(nn.Module):
    def __init__(self, embedding_dim, output_size):
        super(GRUModel, self).__init__()
        # self.embedding_layer = nn.Embedding.from_pretrained(pretrained_embedding, freeze=True)
        self.gru = nn.GRU(embedding_dim, 128, num_layers=2, batch_first=True)
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, output_size)

    def forward(self, x):
        # x = self.embedding_layer(x)
        out, _ = self.gru(x)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

class Task_data(Dataset):
    def __init__(self, data, bio_index, embedding_type):
        self.data = data
        self.length = len(self.data)
        self.bio_index =  bio_index
        self.embedding_type = embedding_type

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        input_sequence = self.data[str(index)]['text'].split(' ')
        sentence_embeddings = []
        if self.embedding_type == "glove":
            sentence_embeddings = [glove_embeddings.get(word, np.zeros(300, dtype = 'float32')) for word in input_sequence]
        elif self.embedding_type == "word2vec":
            sentence_embeddings = [word2vec_u_[word_to_index.get(word, word_to_index['<unk>'])] for word in input_sequence]

        output_sequence = self.data[str(index)]['labels']
        output_labels = [self.bio_index[word] for word in output_sequence]
        # Map words to indices, use '<unk>' if the word is not in the vocabulary
        # input_indices = [word_to_index.get(word, word_to_index['<unk>']) for word in input_sequence]
        # return torch.tensor(input_indices), torch.tensor(output_labels)
        return torch.tensor(sentence_embeddings), torch.tensor(output_labels)







In [29]:
# sentence = "example sentence"
# sentence_words = sentence.split()
# sentence_embeddings = [glove_embeddings.get(word, np.zeros(50)) for word in sentence_words]


In [30]:
# # definig data_loaders
# batch_size = 1

# task1_train_dataloader = DataLoader(Task_data(task1_train_data, bio_mapping_task1, glove_embeddings), batch_size=batch_size, shuffle=True)
# task1_val_dataloader =  DataLoader(Task_data(task1_val_data, bio_mapping_task1, glove_embeddings), batch_size=batch_size, shuffle=True)
# task1_test_dataloader =  DataLoader(Task_data(task1_test_data, bio_mapping_task1, glove_embeddings), batch_size=batch_size, shuffle=False)



In [42]:
def train_model(task, embedding_type, model, optimizer, criterion, device,  num_epochs = 12, batch_size = 1):
    train_dataloader  = None
    val_dataloader  = None
    test_dataloader = None

    if task == 1:
        train_dataloader = DataLoader(Task_data(task1_train_data, bio_mapping_task1, embedding_type), batch_size=batch_size, shuffle=True)
        val_dataloader =  DataLoader(Task_data(task1_val_data, bio_mapping_task1, embedding_type), batch_size=batch_size, shuffle=True)
    elif task == 2:
        train_dataloader = DataLoader(Task_data(task2_train_data, bio_mapping_task2, embedding_type), batch_size=batch_size, shuffle=True)
        val_dataloader =  DataLoader(Task_data(task2_val_data, bio_mapping_task2, embedding_type), batch_size=batch_size, shuffle=True)


    train_losses = []
    val_losses = []
    train_f1_scores = []
    val_f1_scores = []

    # Training loop
    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        total_train_loss = 0
        all_train_predictions = []
        all_train_targets = []

        for batch_idx, (inputs, targets) in enumerate(train_dataloader):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(inputs)
            
            loss = 0
            for i in range(outputs.size(1)):  # Iterate over time steps
                loss += criterion(outputs[:, i, :], targets[:, i])  # Apply CrossEntropyLoss at each time step
            
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

            all_train_predictions.extend(outputs.argmax(dim=2).view(-1).cpu().numpy())
            all_train_targets.extend(targets.view(-1).cpu().numpy())


        avg_train_loss = total_train_loss / len(train_dataloader)
        train_losses.append(avg_train_loss)

        train_macro_f1 = f1_score(all_train_targets, all_train_predictions, average='macro')
        train_f1_scores.append(train_macro_f1)
        print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss}, Training Macro F1-Score: {train_macro_f1}")

        model.eval()  # Set the model to evaluation mode
        total_val_loss = 0
        all_val_predictions = []
        all_val_targets = []

        with torch.no_grad():
            for val_inputs, val_targets in val_dataloader:
                val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)
                val_outputs = model(val_inputs)

                loss = 0
                for i in range(val_outputs.size(1)):  # Iterate over time steps
                    loss += criterion(val_outputs[:, i, :], val_targets[:, i])  

                total_val_loss += loss.item()

                all_val_predictions.extend(val_outputs.argmax(dim=2).view(-1).cpu().numpy())
                all_val_targets.extend(val_targets.view(-1).cpu().numpy())

            avg_val_loss = total_val_loss / len(val_dataloader)
            val_losses.append(avg_val_loss)

            val_macro_f1 = f1_score(all_val_targets, all_val_predictions, average='macro')
            val_f1_scores.append(val_macro_f1)
        print(f"Epoch {epoch + 1},  Validation Loss: {avg_val_loss}, Validation Macro F1-Score: {val_macro_f1}")

    return train_losses, train_f1_scores, val_losses, val_f1_scores

def test_model(task, embedding_type, model, criterion, device, batch_size = 1):
    test_dataloader = None
    if task == 1:
        test_dataloader =  DataLoader(Task_data(task1_test_data, bio_mapping_task1, embedding_type), batch_size=batch_size, shuffle=False)

    elif task == 2:
        test_dataloader =  DataLoader(Task_data(task2_test_data, bio_mapping_task2, embedding_type), batch_size=batch_size, shuffle=False)

    total_test_loss = 0
    all_test_predictions = []
    all_test_targets = []

    with torch.no_grad():
        for test_inputs, test_targets in test_dataloader:
            test_inputs, test_targets = test_inputs.to(device), test_targets.to(device)
            test_outputs = model(test_inputs)

            loss = 0
            for i in range(test_outputs.size(1)):  # Iterate over time steps
                loss += criterion(test_outputs[:, i, :], test_targets[:, i])  

            total_test_loss += loss.item()

            all_test_predictions.extend(test_outputs.argmax(dim=2).view(-1).cpu().numpy())
            all_test_targets.extend(test_targets.view(-1).cpu().numpy())

        avg_test_loss = total_test_loss / len(test_dataloader)
        test_macro_f1 = f1_score(all_test_targets, all_test_predictions, average='macro')
    print(f'Test Loss: {avg_test_loss}, Test Macro F1-Score: {test_macro_f1}')

In [77]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = GRUModel(300, 27).to(device) 
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

train_model(task = 2, embedding_type = "glove", model = model, optimizer=optimizer, criterion=criterion, device=device)
test_model(task = 2, embedding_type = "glove", model = model, criterion=criterion, device=device)

Epoch 1, Training Loss: 11.18388811049872, Training Macro F1-Score: 0.1489009838022129
Epoch 1,  Validation Loss: 4.641801144978771, Validation Macro F1-Score: 0.4457873729927359
Epoch 2, Training Loss: 4.315453339968427, Training Macro F1-Score: 0.3775279413129583
Epoch 2,  Validation Loss: 3.3646143666005024, Validation Macro F1-Score: 0.763067472837545
Epoch 3, Training Loss: 3.4432431140515716, Training Macro F1-Score: 0.5616301337461245
Epoch 3,  Validation Loss: 3.0000017153345833, Validation Macro F1-Score: 0.7827545231237917
Epoch 4, Training Loss: 3.021364282931779, Training Macro F1-Score: 0.5856331949713223
Epoch 4,  Validation Loss: 2.8197891059327342, Validation Macro F1-Score: 0.8058890899699366
Epoch 5, Training Loss: 2.7385635526016125, Training Macro F1-Score: 0.5980734309837004
Epoch 5,  Validation Loss: 2.817367051153967, Validation Macro F1-Score: 0.8123016551790622
Epoch 6, Training Loss: 2.5119185854144246, Training Macro F1-Score: 0.8216994754156088
Epoch 6,  Val