In [13]:
# !pip3 install gensim

Collecting gensim
  Downloading gensim-4.3.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.1-py3-none-any.whl.metadata (23 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Using cached wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading gensim-4.3.2-cp310-cp310-macosx_11_0_arm64.whl (24.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading smart_open-7.0.1-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl (38 kB)
Installing collected packages: wrapt, smart-open, gensim
Successfully installed gensim-4.3.2 smart-open-7.0.1 wrapt-1.16.0


In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import json
import gensim.downloader as api
from sklearn.metrics import f1_score
import numpy as np



In [18]:
def _load_json(path):
    """Read one JSON file and return the decoded object.

    The file is opened explicitly as UTF-8 so that decoding does not depend on
    the platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Task 1 splits
task1_train_data = _load_json('Json Task1/train_processed.json')
task1_val_data = _load_json('Json Task1/val_processed.json')
task1_test_data = _load_json('Json Task1/test_processed.json')

# Task 2 splits
task2_train_data = _load_json('Json Task2/train_processed.json')
task2_val_data = _load_json('Json Task2/val_processed.json')
task2_test_data = _load_json('Json Task2/test_processed.json')

# BIO tag -> class index for Task 1. Each entity type owns a (B_, I_) pair of
# consecutive indices; 'O' (outside any entity) takes the last index.
bio_mapping_task1 = {'B_ORG': 0, 'I_ORG': 1, 'B_RESPONDENT': 2, 'I_RESPONDENT': 3, 'B_JUDGE': 4, 'I_JUDGE': 5,
               'B_STATUTE': 6, 'I_STATUTE': 7, 'B_OTHER_PERSON': 8, 'I_OTHER_PERSON': 9, 'B_COURT': 10, 'I_COURT': 11,
               'B_GPE': 12, 'I_GPE': 13, 'B_PETITIONER': 14, 'I_PETITIONER': 15, 'B_WITNESS': 16, 'I_WITNESS': 17,
               'B_CASE_NUMBER': 18, 'I_CASE_NUMBER': 19, 'B_PRECEDENT': 20, 'I_PRECEDENT': 21, 'B_DATE': 22, 'I_DATE': 23,
               'B_PROVISION': 24, 'I_PROVISION': 25, 'O': 26}


In [6]:
# Fetch the pretrained Google News word2vec vectors through the gensim downloader.
word2vec_model = api.load('word2vec-google-news-300')

# The vector matrix has one row per vocabulary entry and one column per embedding dimension.
vocab_size = word2vec_model.vectors.shape[0]
embedding_dim = word2vec_model.vectors.shape[1]

In [19]:
# Convert the pretrained word2vec vectors to a float tensor and append one
# all-zero row that serves as the embedding of the unknown token.
word2vec = torch.FloatTensor(word2vec_model.vectors)
# Size the extra row from the matrix itself instead of hard-coding the width.
unk_embedding = torch.zeros((1, word2vec.size(1)))
word2vec_u = torch.cat((word2vec, unk_embedding), dim=0)
print(word2vec_u.shape)

# Word -> row index lookup; '<unk>' takes the next free index, i.e. the row
# appended above.
word_to_index = {word: index for index, word in enumerate(word2vec_model.index_to_key)}
word_to_index['<unk>'] = len(word_to_index)

# Index used for every out-of-vocabulary word.
unknown_token_index = word_to_index['<unk>']

# The unknown-token index must address the appended zero row; anything else
# means the lookup table and the embedding matrix are out of sync.
assert unknown_token_index == word2vec_u.size(0) - 1, "'<unk>' does not point at the appended embedding row"

# BIO tag -> class index for Task 1. This repeats the mapping from the
# data-loading cell so that this cell can be run on its own.
bio_mapping_task1 = {'B_ORG': 0, 'I_ORG': 1, 'B_RESPONDENT': 2, 'I_RESPONDENT': 3, 'B_JUDGE': 4, 'I_JUDGE': 5,
               'B_STATUTE': 6, 'I_STATUTE': 7, 'B_OTHER_PERSON': 8, 'I_OTHER_PERSON': 9, 'B_COURT': 10, 'I_COURT': 11,
               'B_GPE': 12, 'I_GPE': 13, 'B_PETITIONER': 14, 'I_PETITIONER': 15, 'B_WITNESS': 16, 'I_WITNESS': 17,
               'B_CASE_NUMBER': 18, 'I_CASE_NUMBER': 19, 'B_PRECEDENT': 20, 'I_PRECEDENT': 21, 'B_DATE': 22, 'I_DATE': 23,
               'B_PROVISION': 24, 'I_PROVISION': 25, 'O': 26}

torch.Size([3000001, 300])


In [20]:
# Sanity check: '<unk>' was assigned the next free index after the pretrained vocabulary.
print(word_to_index['<unk>'])

3000000


In [21]:

class GRUModel(nn.Module):
    """Per-token classifier: frozen pretrained embeddings -> GRU -> linear layer.

    Args:
        embedding_dim: Width of one embedding vector; used as the GRU input size.
        hidden_dim: Size of the GRU hidden state.
        output_size: Number of scores produced for every token.
        pretrained_embedding: 2-D float tensor handed to
            ``nn.Embedding.from_pretrained``; its rows are looked up by token index.
    """

    def __init__(self, embedding_dim, hidden_dim, output_size, pretrained_embedding):
        super().__init__()
        # freeze=True keeps the pretrained vectors out of gradient updates.
        self.embedding_layer = nn.Embedding.from_pretrained(
            pretrained_embedding,
            freeze=True,
        )
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_size)

    def forward(self, x):
        """Map a batch of token indices to per-token class scores.

        Args:
            x: Integer tensor of token indices, batch dimension first.

        Returns:
            Tensor holding ``output_size`` scores for every position of ``x``.
        """
        embedded = self.embedding_layer(x)
        # Only the per-step hidden states are needed; the final state is discarded.
        hidden_states, _final_state = self.gru(embedded)
        return self.fc(hidden_states)

class Task_data(Dataset):
    """Dataset that yields (token indices, label indices) for one record at a time.

    Args:
        data: Mapping keyed by the record position as a string (``'0'``, ``'1'``, ...).
            Every record provides ``'text'`` (tokens separated by single spaces)
            and ``'labels'`` (a sequence of tag names).
        bio_index: Mapping from tag name to integer class index.
        vocab: Optional mapping from word to embedding row index. It must contain
            the key ``'<unk>'``. When omitted, the notebook-level ``word_to_index``
            is used, so existing two-argument calls keep working.
    """

    def __init__(self, data, bio_index, vocab=None):
        self.data = data
        self.length = len(self.data)
        self.bio_index = bio_index
        # Resolve the vocabulary once instead of reaching for a global on every item.
        self.vocab = word_to_index if vocab is None else vocab
        # Looked up once here rather than once per token in __getitem__.
        self.unk_index = self.vocab['<unk>']

    def __len__(self):
        """Return the number of records."""
        return self.length

    def __getitem__(self, index):
        """Return ``(input_indices, output_labels)`` tensors for record ``index``.

        Words missing from the vocabulary are mapped to the ``'<unk>'`` index.
        A label missing from ``bio_index`` raises ``KeyError``.
        """
        record = self.data[str(index)]
        input_sequence = record['text'].split(' ')
        output_labels = [self.bio_index[label] for label in record['labels']]
        input_indices = [self.vocab.get(word, self.unk_index) for word in input_sequence]
        return torch.tensor(input_indices), torch.tensor(output_labels)


# Shuffled loader over the Task 1 training split that yields one record per batch.
dataset = Task_data(data=task1_train_data, bio_index=bio_mapping_task1)
dataloader = DataLoader(dataset, shuffle=True, batch_size=1)





In [22]:
# Defining data loaders.
# Task_data returns unpadded sequences, so records of different lengths cannot be
# stacked by the default collate function; keep the batch size at 1 unless a
# padding collate_fn is added.
batch_size = 1

# Only the training split is shuffled. Evaluation metrics are aggregated over the
# whole split, so shuffling validation/test data adds nothing and would make the
# evaluation order depend on the random state.
task1_train_dataloader = DataLoader(Task_data(task1_train_data, bio_mapping_task1), batch_size=batch_size, shuffle=True)
task1_val_dataloader = DataLoader(Task_data(task1_val_data, bio_mapping_task1), batch_size=batch_size, shuffle=False)
task1_test_dataloader = DataLoader(Task_data(task1_test_data, bio_mapping_task1), batch_size=batch_size, shuffle=False)

# TODO: Task 2 loaders. Task_data requires a label mapping as its second argument,
# so the calls below need one before they can be enabled.
# task2_train_dataloader =  DataLoader(Task_data(task2_train_data), batch_size=batch_size, shuffle=True)
# task2_val_dataloader = DataLoader(Task_data(task2_val_data), batch_size=batch_size, shuffle=True)
# task2_test_dataloader = DataLoader(Task_data(task2_test_data), batch_size=batch_size, shuffle=False)



In [None]:
# Preview the first few validation records instead of rendering the whole split
# as cell output.
dict(list(task1_val_data.items())[:3])

In [24]:
# Hyperparameters for the Task 1 tagger.
SEED = 42            # fixes weight initialisation and later shuffling order
HIDDEN_DIM = 150     # GRU hidden state size
LEARNING_RATE = 0.001

# Seed before building the model so that a fresh run is reproducible.
torch.manual_seed(SEED)

# One output score per BIO tag; derive the count from the mapping so the model
# and the label set cannot drift apart.
model = GRUModel(embedding_dim, HIDDEN_DIM, len(bio_mapping_task1), word2vec_u)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

# # Training loop
# for epoch in range(2):
#     for batch_idx, (inputs, targets) in enumerate(dataloader):
#         optimizer.zero_grad()
        
#         # Forward pass
#         outputs = model(inputs)
        
#         # Calculate loss at each time step
#         loss = 0
#         for i in range(outputs.size(1)):  # Iterate over time steps
#             loss += criterion(outputs[:, i, :], targets[:, i])  # Apply CrossEntropyLoss at each time step
        
#         # Backward pass
#         loss.backward()
#         optimizer.step()

#         # print(f"Epoch {epoch + 1}, Batch {batch_idx + 1}/{len(dataloader)}, Loss: {loss.item()}")

In [25]:

def _run_epoch(model, dataloader, criterion, optimizer=None):
    """Run one full pass over ``dataloader`` and collect loss and predictions.

    Args:
        model: Module mapping a batch of token indices to scores laid out as
            (batch, time step, class).
        dataloader: Iterable of ``(inputs, targets)`` batches; must support ``len``.
        criterion: Loss callable. The rescaling below assumes the unweighted,
            mean-reduced ``nn.CrossEntropyLoss()`` configured in this notebook.
        optimizer: When given, the pass trains the model (train mode, backward
            pass, optimizer step). When ``None``, the pass is evaluation only
            (eval mode, gradients disabled).

    Returns:
        Tuple ``(average loss per batch, flat list of predicted class indices,
        flat list of target class indices)``.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is equivalent to eval()

    total_loss = 0.0
    predictions = []
    references = []

    with torch.set_grad_enabled(training):
        for inputs, targets in dataloader:
            if training:
                optimizer.zero_grad()

            outputs = model(inputs)

            # CrossEntropyLoss expects the class dimension second, hence the
            # transpose. The mean over all tokens times the number of time steps
            # equals the sum over time steps of the per-step batch mean, which is
            # what a step-by-step Python loop would accumulate.
            loss = criterion(outputs.transpose(1, 2), targets) * outputs.size(1)

            if training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            predictions.extend(outputs.argmax(dim=2).view(-1).tolist())
            references.extend(targets.view(-1).tolist())

    return total_loss / len(dataloader), predictions, references


# Lists to store training and validation losses, and F1-scores
train_losses = []
val_losses = []
train_f1_scores = []
val_f1_scores = []

NUM_EPOCHS = 20

# Training loop: one training pass followed by one validation pass per epoch.
for epoch in range(NUM_EPOCHS):
    avg_train_loss, all_train_predictions, all_train_targets = _run_epoch(
        model, task1_train_dataloader, criterion, optimizer
    )
    train_losses.append(avg_train_loss)

    train_macro_f1 = f1_score(all_train_targets, all_train_predictions, average='macro')
    train_f1_scores.append(train_macro_f1)
    print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss}, Training Macro F1-Score: {train_macro_f1}")

    avg_val_loss, all_val_predictions, all_val_targets = _run_epoch(
        model, task1_val_dataloader, criterion
    )
    val_losses.append(avg_val_loss)

    val_macro_f1 = f1_score(all_val_targets, all_val_predictions, average='macro')
    val_f1_scores.append(val_macro_f1)
    print(f"Epoch {epoch + 1},  Validation Loss: {avg_val_loss}, Validation Macro F1-Score: {val_macro_f1}")

Epoch 1, Training Loss: 11.122886175142451, Training Macro F1-Score: 0.3805550984131343
Epoch 1,  Validation Loss: 9.105612883163836, Validation Macro F1-Score: 0.4765792481429911
Epoch 2, Training Loss: 8.024609900388613, Training Macro F1-Score: 0.5350606850431583
Epoch 2,  Validation Loss: 8.55320052684154, Validation Macro F1-Score: 0.5281750765124165
Epoch 3, Training Loss: 6.893542225461287, Training Macro F1-Score: 0.594499112830175
Epoch 3,  Validation Loss: 8.316932249131371, Validation Macro F1-Score: 0.5320429729846511
Epoch 4, Training Loss: 5.94236874133967, Training Macro F1-Score: 0.6429994264489904
Epoch 4,  Validation Loss: 8.391348448453188, Validation Macro F1-Score: 0.5431171053987073
Epoch 5, Training Loss: 5.0368401636362945, Training Macro F1-Score: 0.6930786833000134
Epoch 5,  Validation Loss: 8.770328096085855, Validation Macro F1-Score: 0.5397007484727813
Epoch 6, Training Loss: 4.266086833117684, Training Macro F1-Score: 0.7366900130937079
Epoch 6,  Validatio