### NER MODEL

In [1]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable


In [None]:
from typing import Tuple, List, Dict, Any

import pandas as pd
import random
import re
import os
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

from collections import Counter

from gensim.models.keyedvectors import load_word2vec_format

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.tensorboard import SummaryWriter
from torch.jit import RecursiveScriptModule

SEED = 2222

random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
# Load embeddings model to use in the assignment
w2v_model = load_word2vec_format("./GoogleNews-vectors-negative300.bin.gz", binary = True)

In [None]:
def load_data(file_path: str) -> Tuple[List[List[str]], List[int]]:
    """
    Load data from a specified file path, extract texts and targets, and tokenize the texts using the tokenize_tweet function.

    Parameters:
    file_path (str): The path to the dataset file.

    Returns:
    Tuple[List[str], List[int]]: Lists of texts and corresponding targets.
    """
    df = pd.read_csv(file_path)
    
    sentences = []
    sentence_labels = []
    current_sentence = []
    current_labels = []
    current_sent_idx = None

    for _, row in df.iterrows():
        word = row['gold_token']
        label = row['gold_label']
        sent_idx = row['sent_idx']

        if sent_idx != current_sent_idx:
            if current_sentence:
                sentences.append(current_sentence)
                sentence_labels.append(current_labels)
            current_sentence = []
            current_labels = []
            current_sent_idx = sent_idx

        current_sentence.append(word)
        current_labels.append(label)

    # Add the last sentence if it is not empty
    if current_sentence:
        sentences.append(current_sentence)
        sentence_labels.append(current_labels)

    return sentences, sentence_labels

In [5]:
# Load datasets
tr_texts, tr_targets = load_data('finer_ord_train.csv')
vl_texts, vl_targets = load_data('finer_ord_validation.csv')
ts_texts, ts_targets = load_data('finer_ord_test.csv')

training_data = pd.DataFrame({'Tweets': tr_texts, 'Labels': tr_targets})
training_data[:4]

Unnamed: 0,Tweets,Labels
0,"[Kenyan, Firms, Eye, Deals, During, Obama, Sum...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[By, Neville, Otuki, Kenya, 's, business, lead...","[0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[Industrialists, ,, entrepreneurs, and, banker...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[More, on, This, Kenya, :, Mombasa, Road, ,, U...","[0, 0, 0, 3, 0, 3, 4, 0, 3, 4, 0, 3, 4, 4, 4, ..."


In [None]:
class NERDataset(Dataset):
    """
    A PyTorch Dataset for the dataset.

    Attributes:
        texts (List[List[str]]): List of se tokens.
        targets (List[List[int]]): List of target labels.
    """

    def __init__(self,
                 texts: List[List[str]],
                 targets: List[List[int]]
                 ):
        """
        Initializes the NERDataset with the given file path.

        Args:
            texts (List[List[str]]): List of tweets tokens.
            targets (List[List[int]]): List of target labels.
        """
        self.texts = texts
        self.targets = targets

    def __len__(self) -> int:
        """Returns the length of the dataset."""
        return len(self.targets)

    def __getitem__(self, idx: int) -> Tuple[List[List[str]], List[int]]:
        """
        Returns the embedded tensor and target for the text at the specified index.

        Args:
            idx (int): Index of the item.

        Returns:
            Tuple[List[str], List[int]]: A tuple containing the texts and the target label for idx.
        """
        return self.texts[idx], self.targets[idx]

In [7]:
# Create datasets for models
tr_dataset = NERDataset(tr_texts, tr_targets)
vl_dataset = NERDataset(vl_texts, vl_targets)
ts_dataset = NERDataset(ts_texts, ts_targets)

In [None]:
def word2idx(embedding_model: Any, sentence: List[str], labels:List[int]) -> torch.Tensor:
    """
    Converts a sentence to a list of word indices based on an embedding model.

    This function iterates through each word in the sentence and retrieves its corresponding index
    from the embedding model's vocabulary. If a word is not present in the model's vocabulary,
    it is skipped.

    Args:
        embedding_model (Any): The embedding model with a 'key_to_index' attribute, which maps words to their indices.
        sentence (List[str]): A list of words representing the sentence.

    Returns:
        torch.Tensor: A tensor of word indices corresponding to the words in the tweet.
    """
    index = []
    labels_out = [] # We should eliminate labels from the words that do not exist in w2v
    for word, label in zip(sentence, labels):
        if word in embedding_model.key_to_index:
            index.append(embedding_model.key_to_index.get(word))
            labels_out.append(label)
    
    return torch.tensor(index, dtype=torch.long), torch.tensor(labels_out, dtype=torch.long)

In [9]:
def calculate_accuracy(model: torch.nn.Module, dataloader: DataLoader, threshold: float = 0.5, device: str = 'cpu') -> float:
    """
    Calculate the accuracy of a PyTorch model given a DataLoader.

    The function moves the model to the specified device, sets it to evaluation mode, and computes
    the accuracy by comparing the model's predictions against the true labels. The predictions are
    determined based on a specified threshold.

    Args:
        model (torch.nn.Module): The PyTorch model to evaluate.
        dataloader (DataLoader): The DataLoader containing the dataset to evaluate against.
        threshold (float, optional): Probability threshold to predict a sample as positive. Defaults to 0.5.
        device (str, optional): Device to which the model and data are moved ('cpu' or 'cuda'). Defaults to 'cpu'.

    Returns:
        float: The accuracy of the model on the given dataset.
    """
    model.to(device)
    model.eval()
    correct_predictions = 0
    total_tokens = 0

    with torch.no_grad():
        for features, labels, lenghts in dataloader:
            features = features.to(device)
            labels = labels.to(device)
            output = model(features, torch.Tensor(lenghts))

            # output shape: (batch_size, seq_len, num_classes)
            # predictions shape: (batch_size, seq_len)
            predictions = torch.argmax(torch.softmax(output, dim=2), dim=2)

            # mask for valid tokens (label != -1)
            mask = labels != -1

            correct_predictions += (predictions[mask] == labels[mask]).sum().item()
            total_tokens += mask.sum().item()

    accuracy = correct_predictions / total_tokens if total_tokens > 0 else 0.0
    return accuracy

In [None]:
def collate_fn(batch: List[Tuple[List[str], int]]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Prepares and returns a batch for training/testing in a torch model.

    This function sorts the batch by the length of the text sequences in descending order,
    tokenizes the text using a pre-defined word-to-index mapping, pads the sequences to have
    uniform length, and converts labels to tensor.

    Args:
        batch (List[Tuple[List[str], int]]): A list of tuples, where each tuple contains a
                                             list of words (representing a text) and an integer label.

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: A tuple containing three elements:
            - texts_padded (torch.Tensor): A tensor of padded word indices of the text.
            - labels (torch.Tensor): A tensor of labels.
            - lengths (torch.Tensor): A tensor representing the lengths of each text sequence.
    """
    # Sort the batch by the length of text sequences in descending order
    sorted_batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)

    # Unzip texts and labels from the sorted batch
    texts, labels = [text for text, label in sorted_batch], [label for text, label in sorted_batch]

    # Convert texts to indices using the word2idx function and w2v_model
    texts_indx: List[torch.Tensor] = []
    labels_indx: List[torch.Tensor] = []
    for sentence, label in zip(texts, labels):
        text_indx, label_indx = word2idx(w2v_model, sentence, label)
        texts_indx.append(text_indx)
        labels_indx.append(label_indx)

    # Calculate the lengths of each element of texts_indx.
    # The minimum length shall be 1, in order to avoid later problems when training the RNN
    lengths: List[torch.Tensor] = [torch.tensor(len(text_indx)) for text_indx in texts_indx]
    
    # Pad the text sequences to have uniform length
    texts_padded: torch.Tensor = torch.nn.utils.rnn.pad_sequence(texts_indx, batch_first=True)
    labels_padded: torch.Tensor = torch.nn.utils.rnn.pad_sequence(labels_indx, batch_first=True, padding_value=-1)

    return texts_padded, labels_padded, lengths

def collate_fn_pre_padding(batch: List[Tuple[List[str], int]]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    sorted_batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)

    texts, labels = [text for text, label in sorted_batch], [label for text, label in sorted_batch]

    texts_indx: List[torch.Tensor] = []
    labels_indx: List[torch.Tensor] = []
    for sentence, label in zip(texts, labels):
        text_indx, label_indx = word2idx(w2v_model, sentence, label)
        texts_indx.append(text_indx)
        labels_indx.append(label_indx)

    max_len = max(len(t) for t in texts_indx)

    texts_padded = []
    labels_padded = []
    lengths = []

    for text, label in zip(texts_indx, labels_indx):
        pad_len = max_len - len(text)
        texts_padded.append(F.pad(text, (pad_len, 0), value=0))        
        labels_padded.append(F.pad(label, (pad_len, 0), value=-1))     
        lengths.append(torch.tensor(len(text)))

    return torch.stack(texts_padded), torch.stack(labels_padded), torch.stack(lengths)


In [11]:
torch.cuda.empty_cache()
embedding_weights = torch.Tensor(w2v_model.vectors).to("cpu")

In [22]:
# Define configuration for MLP Classifier training
batch_size: int = 64
epochs: int = 100
print_every: int = 5
patience: int = 5
learning_rate: float = 0.001
hidden_dims: List[int] = 64
num_layers: int = 3

# Check if GPU is available and move the model to GPU
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
device = "cuda"

In [None]:
def train_torch_model(model, train_dataloader, val_dataloader, criterion, optimizer, epochs, print_every, patience, device):
    best_val_loss = float('inf')
    epochs_without_improvement = 0

    train_accuracies: Dict[int, float] = {}
    val_accuracies: Dict[int, float] = {}

    model.to(device)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        torch.cuda.empty_cache()
        for i, (x_batch, y_batch, lengths) in enumerate(train_dataloader):
            
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            lengths = torch.Tensor(lengths).to(device)  

            optimizer.zero_grad()

            # Forward pass
            logits = model(x_batch, lengths)

            logits = logits.view(-1, logits.shape[-1])
            y_batch = y_batch.view(-1).long()

            loss = criterion(logits, y_batch)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for x_batch, y_batch, lengths in val_dataloader:
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                lengths = torch.Tensor(lengths).to(device) 

                logits = model(x_batch, lengths)

                logits = logits.view(-1, logits.shape[-1])
                y_batch = y_batch.view(-1).long()

                loss = criterion(logits, y_batch)
                val_loss += loss.item()

        if epoch % print_every == 0 or epoch == epochs - 1:
            acc_t = calculate_accuracy(model, train_dataloader, device=device)
            acc_v = calculate_accuracy(model, val_dataloader, device=device)

            train_accuracies[epoch] = acc_t
            val_accuracies[epoch] = acc_v

            avg_train_loss = train_loss / len(train_dataloader)
            avg_val_loss = val_loss / len(val_dataloader)

            print(f"Epoch {epoch+1}/{epochs}: ")
            print(f"Accuracy: train {acc_t} - test {acc_v}")
            print(f"Loss: train {avg_train_loss} - test: {avg_val_loss}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print(f'Early stopping triggered at epoch {epoch + 1}')
            break

    return train_accuracies, val_accuracies

In [None]:
class NER(nn.Module):
    """
    A BiLSTM-based NER model with dropout, using pre-trained embeddings.

    Attributes:
        embedding (nn.Embedding): Pre-trained embedding layer.
        rnn (nn.LSTM): Bidirectional LSTM layer.
        dropout (nn.Dropout): Dropout for regularization.
        fc (nn.Linear): Final classification layer.
    """

    def __init__(self, embedding_weights: torch.Tensor, hidden_dim: int, num_layers: int):
        """
        Args:
            embedding_weights (torch.Tensor): Pre-trained embeddings.
            hidden_dim (int): Hidden size of LSTM.
            num_layers (int): Number of LSTM layers.
        """
        super().__init__()

        embedding_dim = embedding_weights.shape[1]

        self.embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=False)

        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )

        self.dropout = nn.Dropout(0.2)

        # Output layer: bidirectional LSTM → hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, 7)

    def forward(self, x: torch.Tensor, text_lengths: torch.Tensor) -> torch.Tensor:

        embedded: torch.Tensor = self.embedding(x)

        text_lengths = text_lengths.clamp(min=1).cpu()

        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths, batch_first=True, enforce_sorted=False
        )

        packed_output, _ = self.rnn(packed_embedded)

        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)

        output = self.dropout(output)
        
        logits = self.fc(output)  # [batch_size, seq_len, num_classes]

        return logits


In [25]:
# Instantiate the RNN classifier model
ner_model: NER = NER(embedding_weights=embedding_weights, hidden_dim=hidden_dims, num_layers=num_layers)

In [None]:
# Create data loaders with specified batch_size, shuffle the training dataloader
ner_train_dataloader: DataLoader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, pin_memory=False, drop_last=True)
ner_val_dataloader: DataLoader = DataLoader(vl_dataset, batch_size=batch_size, collate_fn=collate_fn, pin_memory=False, drop_last=True)
ner_test_dataloader: DataLoader = DataLoader(ts_dataset, batch_size=batch_size, collate_fn=collate_fn, pin_memory=False, drop_last=True)

In [None]:
class_counts = Counter()

for x_batch, y_batch, lengths in ner_train_dataloader:
    y_batch = y_batch.to(device)
    
    class_counts.update(y_batch.cpu().numpy().flatten())

total_samples = sum(class_counts.values())

class_weights = {label: total_samples / (len(class_counts) * count) for label, count in class_counts.items()}

weights = torch.tensor([class_weights[label] for label in range(len(class_counts)-1)], dtype=torch.float32, device=device)

ner_criterion = torch.nn.CrossEntropyLoss(weight=weights, ignore_index=-1)
ner_optimizer = optim.Adam(ner_model.parameters(), lr=learning_rate)

In [28]:
# Train RNN classifier model
name: str = f"rnn_model_hd{hidden_dims}_nl{num_layers}_batch{batch_size}_epochs{epochs}_dropout2_bidirectional"
writer: SummaryWriter = SummaryWriter(f"runs/{name}")
ner_train_accuracies, ner_val_accuracies = train_torch_model(ner_model, ner_train_dataloader,
                                                             ner_val_dataloader, ner_criterion,
                                                             ner_optimizer, epochs, print_every,
                                                             patience=patience, device=device)
 # create folder if it does not exist
if not os.path.isdir("models"):
    os.makedirs("models")

# save scripted model
model_scripted: RecursiveScriptModule = torch.jit.script(ner_model.cpu())
model_scripted.save(f"models/{name}.pt")
writer.close()

Epoch 1/100: 
Accuracy: train 0.669922724383635 - test 0.602393068353734
Loss: train 1.690691430568695 - test: 1.4431642691294353
Epoch 6/100: 
Accuracy: train 0.9769061583577713 - test 0.9442992710768807
Loss: train 0.07750662580132485 - test: 0.6578282341361046
Early stopping triggered at epoch 10


In [29]:
# Print accuracy for training, validation and test datasets
ner_accuracy_train = calculate_accuracy(ner_model, ner_train_dataloader, device=device)
ner_accuracy_val = calculate_accuracy(ner_model, ner_val_dataloader, device=device)
ner_accuracy_test =calculate_accuracy(ner_model, ner_test_dataloader, device=device)

print(f"NER Model - Training Accuracy: {ner_accuracy_train}")
print(f"NER Model - Validation Accuracy: {ner_accuracy_val}")
print(f"NER Model - Test Accuracy: {ner_accuracy_test}")

NER Model - Training Accuracy: 0.9923367817815316
NER Model - Validation Accuracy: 0.9579150048136432
NER Model - Test Accuracy: 0.9560214375788146


In [None]:
def compute_metrics(y_true, y_pred, labels):
    """
    Calcula las métricas de error para cada clase.
    
    Args:
        y_true (Tensor): Las etiquetas verdaderas (ground truth).
        y_pred (Tensor): Las etiquetas predichas.
        labels (list): Lista de clases (etiquetas).
    
    Returns:
        pd.DataFrame: Un DataFrame con las métricas de precisión, recall, f1-score por clase.
    """
    y_true = y_true.cpu().numpy() if torch.is_tensor(y_true) else y_true
    y_pred = y_pred.cpu().numpy() if torch.is_tensor(y_pred) else y_pred

    report = classification_report(y_true, y_pred, labels=labels, output_dict=True)
    
    report_df = pd.DataFrame(report).transpose()
    
    return report_df

def evaluate_model(model, dataloader, device, labels):
    """
    Evalúa el modelo en un conjunto de datos y calcula las métricas.
    
    Args:
        model: El modelo de PyTorch.
        dataloader: El dataloader que proporciona los datos.
        device: El dispositivo (CPU o GPU).
        labels: Las clases posibles.
    
    Returns:
        pd.DataFrame: Las métricas de precisión, recall, F1, etc. en formato de tabla.
    """
    model.eval()
    y_true = []
    y_pred = []
    
    with torch.no_grad():
        for x_batch, y_batch, lengths in dataloader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            # Forward pass
            logits = model(x_batch, torch.Tensor(lengths).to(device))

            logits = logits.view(-1, logits.shape[-1])  # [batch_size * seq_len, num_classes]
            y_batch = y_batch.view(-1).long()           # [batch_size * seq_len]

            _, predicted = torch.max(logits, 1)
            
            y_true.extend(y_batch.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    metrics_df = compute_metrics(y_true, y_pred, labels)
    return metrics_df


In [31]:
train_metrics = evaluate_model(ner_model, ner_train_dataloader, device, list(range(7)))
val_metrics = evaluate_model(ner_model, ner_val_dataloader, device, list(range(7)))
test_metrics = evaluate_model(ner_model, ner_test_dataloader, device, list(range(7)))
print(train_metrics)
print(val_metrics)
print(test_metrics)

              precision    recall  f1-score  support
0              0.999797  0.991787  0.995776  54547.0
1              0.974779  0.998708  0.986599    774.0
2              0.993802  1.000000  0.996891    481.0
3              0.930140  0.998928  0.963307    933.0
4              0.001922  1.000000  0.003837    199.0
5              0.906853  0.995760  0.949230   1887.0
6              0.906994  0.999180  0.950858   1220.0
micro avg      0.364800  0.992355  0.533485  60041.0
macro avg      0.816327  0.997766  0.835214  60041.0
weighted avg   0.990230  0.992355  0.989499  60041.0
              precision    recall  f1-score  support
0              0.990174  0.977166  0.983627   6394.0
1              0.845588  0.851852  0.848708    135.0
2              0.887500  0.959459  0.922078     74.0
3              0.800000  0.850575  0.824513    174.0
4              0.003617  0.844828  0.007202     58.0
5              0.693182  0.756198  0.723320    242.0
6              0.712264  0.778351  0.743842   