<a href="https://colab.research.google.com/github/KayvanShah1/usc-csci-544-assignments-hw/blob/main/hw4/CSCI544_HW4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

## Installation

In [1]:
!pip install transformers datasets accelerate
!pip install ipython-autotime

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86

## Imports

In [2]:
import os
import shutil
from typing import List, Tuple, Dict

import itertools
from collections import Counter

from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

import csv
import numpy as np
import pandas as pd

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset
import transformers

from dataclasses import dataclass

%load_ext autotime

time: 408 µs (started: 2023-11-10 11:16:28 +00:00)


# Config

In [3]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Set the current working directory
try:
    os.chdir("/content/drive/MyDrive/Colab Notebooks/CSCI544/HW4")
except:
    pass


class PathConfig:
    # Get the current dir
    CURRENT_DIR = os.getcwd()

    # Glove embedding path
    GLOVE_100d_File = os.path.join(CURRENT_DIR, "glove.6B.100d.txt")


class DatasetConfig:
    # General Info
    name = "conll2003"

    # Processing
    cols_to_drop =  ["id", "pos_tags", "chunk_tags"]
    rename_cols = {"ner_tags": "labels"}

    # Preprocessing
    THRESHOLD = 3
    PAD_TOKEN = "<pad>"
    UNKNOWN_TOKEN = "<unk>"
    embedding_size = 100

    # NER Tags list and converter dictionaries
    ner_tag2idx = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
    ner_idx2tag = {v: k for k, v in ner_tag2idx.items()}

    NUM_NER_TAGS = len(ner_tag2idx)
    SPECIAL_TOKEN_TAG = -100

time: 190 ms (started: 2023-11-10 11:16:28 +00:00)


# Helper Functions & Support Scripts

## Accelarator Configuration

In [4]:
def get_device():
    if torch.cuda.is_available():
        # Check if GPU is available
        return torch.device("cuda")
    else:
        # Use CPU if no GPU or TPU is available
        return torch.device("cpu")

device = get_device()
device

device(type='cuda')

time: 91.9 ms (started: 2023-11-10 11:16:29 +00:00)


## CoNLL evaluation functions

In [5]:
%%bash
if [ ! -f conlleval.py ]; then
    echo "Downloading conlleval.py ..."
    wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
else
    echo "File conlleval.py already exists"
fi

File conlleval.py already exists
time: 179 ms (started: 2023-11-10 11:16:29 +00:00)


In [6]:
from conlleval import evaluate

time: 330 ms (started: 2023-11-10 11:16:29 +00:00)


## Helper functions

In [7]:
# Load glove embeddings to dictionary
def load_glove_embeddings(path):
    """
    glove_emb_dict = load_glove_embeddings(PathConfig.GLOVE_100d_File)
    """
    embeddings = pd.read_csv(
        PathConfig.GLOVE_100d_File, sep=" ", quoting=csv.QUOTE_NONE, header=None, index_col=0
    )
    embeddings = {key: val.values for key, val in embeddings.T.items()}

    # Generate word-to-index mapper
    word2idx = {word: index for index, word in enumerate(embeddings.keys(), start=2)}

    # Add Special token vectors
    embeddings[DatasetConfig.UNKNOWN_TOKEN] = np.mean(np.vstack(list(embeddings.values())), axis=0)
    embeddings[DatasetConfig.PAD_TOKEN] = np.zeros(DatasetConfig.embedding_size, dtype="float64")

    # Add Special token keys to word-to-index mapper
    word2idx[DatasetConfig.PAD_TOKEN] = 0
    word2idx[DatasetConfig.UNKNOWN_TOKEN] = 1

    return word2idx, embeddings

time: 1.06 ms (started: 2023-11-10 11:16:29 +00:00)


# Download Glove Embeddings

In [8]:
%%bash
if [ ! -f glove.6B.zip ]; then
    echo "Downloading glove.6B.zip..."
    wget http://nlp.stanford.edu/data/glove.6B.zip -y
    unzip -o glove.6B.zip
else
    echo "File glove.6B.zip already exists"
fi

File glove.6B.zip already exists
time: 7.66 ms (started: 2023-11-10 11:16:29 +00:00)


In [9]:
glove_word2idx, glove_emb_dict = load_glove_embeddings(PathConfig.GLOVE_100d_File)

time: 26 s (started: 2023-11-10 11:16:29 +00:00)


# Dataset Preparation

## Process Data

In [10]:
dataset = load_dataset("conll2003")
dataset = dataset.remove_columns(DatasetConfig.cols_to_drop)
for old_name, new_name in DatasetConfig.rename_cols.items():
    dataset = dataset.rename_column(old_name, new_name)

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

time: 7.64 s (started: 2023-11-10 11:16:55 +00:00)


In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 3453
    })
})

time: 1.96 ms (started: 2023-11-10 11:17:03 +00:00)


## EDA

In [12]:
df = pd.DataFrame(dataset["train"])
df.head()

Unnamed: 0,tokens,labels
0,"[EU, rejects, German, call, to, boycott, Briti...","[3, 0, 7, 0, 0, 0, 7, 0, 0]"
1,"[Peter, Blackburn]","[1, 2]"
2,"[BRUSSELS, 1996-08-22]","[5, 0]"
3,"[The, European, Commission, said, on, Thursday...","[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, ..."
4,"[Germany, 's, representative, to, the, Europea...","[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, ..."


time: 1.16 s (started: 2023-11-10 11:17:03 +00:00)


## Word to index mapper

In [13]:
def generate_word_indexing(dataset, threshold):
    # Count occurences of the words using itertools and Counter
    word_frequency = Counter(itertools.chain(*dataset))

    # Discard words with frequency below threshold
    word_frequency = {
        word: freq
        for word, freq in word_frequency.items()
        if freq >= threshold
    }

    # Generate indexes
    word2idx = {
        word: index
        for index, word in enumerate(word_frequency.keys(), start=2)
    }

    # Add special tokens
    word2idx[DatasetConfig.PAD_TOKEN] = 0
    word2idx[DatasetConfig.UNKNOWN_TOKEN] = 1

    return word2idx

word2idx = generate_word_indexing(dataset['train']['tokens'], threshold=DatasetConfig.THRESHOLD)

time: 602 ms (started: 2023-11-10 11:17:04 +00:00)


## Create GloVe Embeddings Matrix

In [14]:
def create_glove_embedding_matrix(word2idx, glove_emb_dict, embedding_dim):
    embedding_matrix = np.zeros((len(word2idx), embedding_dim))

    for word, idx in word2idx.items():
        if word in glove_emb_dict:
            embedding_matrix[idx] = glove_emb_dict[word]
        else:
            embedding_matrix[idx] = glove_emb_dict[DatasetConfig.UNKNOWN_TOKEN]

    return embedding_matrix

glove_embedding_matrix = create_glove_embedding_matrix(glove_word2idx, glove_emb_dict, DatasetConfig.embedding_size)

time: 794 ms (started: 2023-11-10 11:17:05 +00:00)


# Create a Pytorch dataset

In [15]:
@dataclass
class DatasetItem:
    embeddings: torch.Tensor
    targets: torch.Tensor
    original_length: int


class NERDatasetCustom(Dataset):
    def __init__(self, dataset, split, tokenizer, embedding_type="custom"):
        self.name = DatasetConfig.name
        self.dataset = dataset[split]
        self.tokenizer = tokenizer

        # Options: [custom, glove, transformer]
        self.embedding_type = embedding_type

    def __len__(self):
        return self.dataset.num_rows

    def tokenize(self, tokens):
        """
        Code to convert all tokens to their respective indexes
        """
        if self.embedding_type == "glove":
            return [
                self.tokenizer.get(token.lower(), self.tokenizer[DatasetConfig.UNKNOWN_TOKEN])
                for token in tokens
            ]
        return [
            self.tokenizer.get(token, self.tokenizer[DatasetConfig.UNKNOWN_TOKEN])
            for token in tokens
        ]

    def __getitem__(self, idx):
        if idx >= self.__len__():
            raise IndexError

        item = self.dataset[idx]

        item["input_ids"] = self.tokenize(item["tokens"])

        embeddings = item["input_ids"]
        targets = item["labels"]
        seq_len = len(targets)

        return DatasetItem(
            torch.tensor(embeddings, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long),
            seq_len
        )

time: 6.39 ms (started: 2023-11-10 11:17:06 +00:00)


In [16]:
def collate_fn(data: DatasetItem, tokenizer: dict):
    embeddings, targets, og_len = [], [], []

    for item in data:
        embeddings.append(item.embeddings)
        targets.append(item.targets)
        og_len.append(item.original_length)

    # Pad the embeddings sequence
    embeddings = nn.utils.rnn.pad_sequence(
        embeddings, batch_first=True, padding_value=tokenizer[DatasetConfig.PAD_TOKEN]
    )
    # Pad the targets sequence
    targets = nn.utils.rnn.pad_sequence(
        targets, batch_first=True, padding_value=DatasetConfig.SPECIAL_TOKEN_TAG
    )

    return {"embeddings": embeddings, "targets": targets, "original_length": og_len}

time: 762 µs (started: 2023-11-10 11:17:06 +00:00)


# Training & Evaluation loop

In [17]:
def train_and_evaluate(
    model,
    train_data_loader, valid_data_loader,
    optimizer, loss_fn,
    device,
    num_epochs,
    checkpoint=False,
    path="model.pt",
    early_stopping_patience=5
):
    """
    Trains and evaluates the model.

    Args:
        model (nn.Module): The neural network model.
        train_data_loader (DataLoader): The DataLoader for training data.
        valid_data_loader (DataLoader): The DataLoader for validation data.
        optimizer (torch.optim): The optimizer for updating model weights.
        loss_fn: The loss function.
        device (torch.device): The device to perform computations.
        num_epochs (int): The number of epochs.
        checkpoint (bool, optional): Whether to save model checkpoints.
        path (str, optional): The path to save the model.
        early_stopping_patience (int, optional): Number of epochs to wait before early stopping.

    Returns:
        nn.Module: The best model.

    """
    # Create directory for saving checkpoint model states
    if checkpoint:
        dirname = path.split(".")[0]
        checkpoint_path = os.path.join(dirname)
        if os.path.exists(checkpoint_path):
            shutil.rmtree(checkpoint_path)
        os.makedirs(dirname)

    best_loss = float('inf')
    no_improvement_count = 0
    best_model = None

    for epoch in range(num_epochs):
        # Train Step
        model.train()
        train_loss = 0.0

        progress_bar = tqdm(train_data_loader, desc=f'Epoch {epoch+1}/{num_epochs}')

        for batch in progress_bar:
            embeddings = batch['embeddings'].to(device, dtype=torch.long, non_blocking=True)
            labels = batch['targets'].to(device, dtype=torch.long, non_blocking=True)
            seq_lengths = batch["original_length"]

            optimizer.zero_grad()

            outputs = model(embeddings, seq_lengths)

            outputs = outputs.view(-1, outputs.shape[-1])
            labels = labels.view(-1)
            loss = loss_fn(outputs, labels)

            loss.backward()
            # Apply gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            optimizer.step()

            train_loss += loss.item() * embeddings.size(1)

        train_loss /= len(train_data_loader.dataset)

        # Validation Step
        model.eval()
        valid_loss = 0.0

        with torch.no_grad():
            for batch in valid_data_loader:
                embeddings = batch['embeddings'].to(device, dtype=torch.long, non_blocking=True)
                labels = batch['targets'].to(device, dtype=torch.long, non_blocking=True)
                seq_lengths = batch["original_length"]

                outputs = model(embeddings, seq_lengths)

                outputs = outputs.view(-1, outputs.shape[-1])
                labels = labels.view(-1)
                loss = loss_fn(outputs, labels)

                valid_loss += loss.item() * embeddings.size(1)

            valid_loss /= len(valid_data_loader.dataset)

        epoch_log = (
            f"Train Loss : {round(train_loss, 4)},"
            f" Validation Loss: {round(valid_loss, 4)}"
        )
        print(epoch_log)

        # Check for improvement in validation loss
        if valid_loss < best_loss:
            # Save checkpoint if needed
            if checkpoint:
                cp = os.path.join(checkpoint_path, f"{dirname}_epoch{epoch}_loss{valid_loss:.4f}.pt")
                torch.save(model.state_dict(), cp)
                print(f"Validation loss improved from {best_loss:.4f}--->{valid_loss:.4f}")
                print(f"Saved Checkpoint to '{cp}'")

            best_loss = valid_loss
            best_model = model
            no_improvement_count = 0
        else:
            no_improvement_count += 1

            # Early stopping condition
            if no_improvement_count >= early_stopping_patience:
                print(f"No improvement for {early_stopping_patience} epochs. Stopping early.")
                break

    if checkpoint:
        # Save the best model
        best_model_path = os.path.join(checkpoint_path, f"{dirname}-best.pt")
        torch.save(best_model.state_dict(), best_model_path)
        print(f"Saved best model to '{os.path.relpath(best_model_path)}'")

    # Save current model
    torch.save(model.state_dict(), path)

    return best_model


def evaluate_model(model, data_loader, device):
    all_preds = []
    all_labels = []

    model.eval()

    with torch.no_grad():
        for batch in tqdm(data_loader):
            embeddings = batch['embeddings'].to(device, dtype=torch.long, non_blocking=True)
            labels = batch['targets'].to(device, dtype=torch.long, non_blocking=True)
            seq_lengths = batch["original_length"]

            outputs = model(embeddings, seq_lengths)

            preds = torch.argmax(outputs, dim=2)

            preds = preds.detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()

            for pred, label, length in zip(preds, labels, seq_lengths):
                pred = [DatasetConfig.ner_idx2tag.get(p, 'O') for p in pred[:length]]
                label = [DatasetConfig.ner_idx2tag.get(l, 'O') for l in label[:length]]
                all_preds.append(pred)
                all_labels.append(label)

    # Evaluate using conlleval
    precision, recall, f1 = evaluate(
        itertools.chain(*all_labels), itertools.chain(*all_preds)
    )

    return precision, recall, f1

time: 6.37 ms (started: 2023-11-10 11:17:06 +00:00)


# Training Config

In [18]:
TRAIN_BATCH_SIZE = 256
VALID_BATCH_SIZE = 64
TEST_BATCH_SIZE = 32
NUM_EPOCHS = 5

time: 488 µs (started: 2023-11-10 11:17:06 +00:00)


# Bidirectional LSTM model

In [19]:
class BiLSTM(nn.Module):
    def __init__(
        self, vocab_size, embedding_dim, num_tags,
        hidden_size, num_layers, lstm_output_size, dropout_val,
        embeddings_matrix = None

    ):
        """
        Recurrent Neural Network (RNN) model for sequence data processing.

        Args:
            vocab_size (int): Size of vocabulary
            embedding_dim (int): Dimension of the input features.
            num_tags (int): Number of output classes.
            hidden_size (int): Number of units in the hidden layers.
            num_layers (int): Number of recurrent layers.
            lstm_output_size (int): Size of the output from the LSTM layer.
            dropout_val (float): Dropout probability.
            embeddings_matrix (np.array): Pretrained embeddings matrix. Default is None

        """
        super(BiLSTM, self).__init__()

        # Model Attributes
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Model Layer Definition
        if embeddings_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(
                torch.from_numpy(embeddings_matrix).float(),
                freeze=True
            )
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(
            embedding_dim, hidden_size, num_layers, batch_first=True, bidirectional=True
        )

        self.fc = nn.Linear(hidden_size * 2, lstm_output_size)
        self.dropout = nn.Dropout(dropout_val)
        self.elu = nn.ELU(alpha=0.01)
        self.classifier = nn.Linear(lstm_output_size, num_tags)

    def init_hidden(self, batch_size):
        hidden = (
            torch.zeros(self.num_layers * 2, batch_size, self.hidden_size).to(device),
            torch.zeros(self.num_layers * 2, batch_size, self.hidden_size).to(device)
        )
        return hidden

    def forward(self, x, seq_len):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)

        # Embedding Layer
        embeds = self.embedding(x).float()

        # LSTM layer
        packed_embeds = nn.utils.rnn.pack_padded_sequence(
            embeds, seq_len, batch_first=True, enforce_sorted=False
        )
        out, _ = self.lstm(packed_embeds, hidden)
        out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

        # Apply fully connected layer for final prediction
        out = self.dropout(out)
        out = self.fc(out)
        out = self.elu(out)
        out = self.classifier(out)

        return out

time: 1.41 ms (started: 2023-11-10 11:17:06 +00:00)


# Bidirectional LSTM model + Custom Embeddings

In [20]:
train_dataset = NERDatasetCustom(
    dataset = dataset,
    split='train',
    tokenizer = word2idx,
    embedding_type="default",
)

valid_dataset = NERDatasetCustom(
    dataset = dataset,
    split='validation',
    tokenizer = word2idx,
    embedding_type="default",
)

test_dataset = NERDatasetCustom(
    dataset = dataset,
    split='test',
    tokenizer = word2idx,
    embedding_type="default",
)

train_data_loader = DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    drop_last=True,
    shuffle=True,
    collate_fn=lambda x: collate_fn(x, word2idx)
)

valid_data_loader = DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    drop_last=False,
    shuffle=True,
    collate_fn=lambda x: collate_fn(x, word2idx)
)

test_data_loader = DataLoader(
    test_dataset,
    batch_size=TEST_BATCH_SIZE,
    drop_last=False,
    shuffle=False,
    collate_fn=lambda x: collate_fn(x, word2idx)
)

time: 1.47 ms (started: 2023-11-10 11:17:06 +00:00)


In [21]:
vocab_size = len(word2idx)
embedding_dim = 100
hidden_size = 256
output_size = 128
num_layers = 1
dropout_val = 0.33
num_tags = DatasetConfig.NUM_NER_TAGS

net = BiLSTM(
    vocab_size, embedding_dim, num_tags,
    hidden_size, num_layers, output_size, dropout_val
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=DatasetConfig.SPECIAL_TOKEN_TAG).to(device)
# optimizer = optim.Adam(net.parameters(), lr=0.001)
# optimizer = optim.SGD(net.parameters(), lr=0.001)
optimizer = optim.AdamW(net.parameters(), lr=0.001)
# scheduler = torch.optim.lr_scheduler.()

best_model = train_and_evaluate(
    model=net,
    train_data_loader=train_data_loader,
    valid_data_loader=valid_data_loader,
    optimizer=optimizer,
    loss_fn=criterion,
    device=device,
    num_epochs=50,
    checkpoint=True,
    path="bilstm_custom_embeddings_v7.pt",
    early_stopping_patience=15
)

Epoch 1/50: 100%|██████████| 54/54 [00:07<00:00,  7.43it/s]


Train Loss : 0.1927, Validation Loss: 0.5405
Validation loss improved from inf--->0.5405
Saved Checkpoint to 'bilstm_custom_embeddings_v7/bilstm_custom_embeddings_v7_epoch0_loss0.5405.pt'


Epoch 2/50: 100%|██████████| 54/54 [00:04<00:00, 11.41it/s]


Train Loss : 0.1216, Validation Loss: 0.4226
Validation loss improved from 0.5405--->0.4226
Saved Checkpoint to 'bilstm_custom_embeddings_v7/bilstm_custom_embeddings_v7_epoch1_loss0.4226.pt'


Epoch 3/50: 100%|██████████| 54/54 [00:04<00:00, 11.80it/s]


Train Loss : 0.0915, Validation Loss: 0.3165
Validation loss improved from 0.4226--->0.3165
Saved Checkpoint to 'bilstm_custom_embeddings_v7/bilstm_custom_embeddings_v7_epoch2_loss0.3165.pt'


Epoch 4/50: 100%|██████████| 54/54 [00:05<00:00,  9.00it/s]


Train Loss : 0.0654, Validation Loss: 0.2354
Validation loss improved from 0.3165--->0.2354
Saved Checkpoint to 'bilstm_custom_embeddings_v7/bilstm_custom_embeddings_v7_epoch3_loss0.2354.pt'


Epoch 5/50: 100%|██████████| 54/54 [00:04<00:00, 11.79it/s]


Train Loss : 0.0479, Validation Loss: 0.1927
Validation loss improved from 0.2354--->0.1927
Saved Checkpoint to 'bilstm_custom_embeddings_v7/bilstm_custom_embeddings_v7_epoch4_loss0.1927.pt'


Epoch 6/50: 100%|██████████| 54/54 [00:05<00:00,  9.25it/s]


Train Loss : 0.0362, Validation Loss: 0.1682
Validation loss improved from 0.1927--->0.1682
Saved Checkpoint to 'bilstm_custom_embeddings_v7/bilstm_custom_embeddings_v7_epoch5_loss0.1682.pt'


Epoch 7/50: 100%|██████████| 54/54 [00:04<00:00, 11.42it/s]


Train Loss : 0.0283, Validation Loss: 0.1525
Validation loss improved from 0.1682--->0.1525
Saved Checkpoint to 'bilstm_custom_embeddings_v7/bilstm_custom_embeddings_v7_epoch6_loss0.1525.pt'


Epoch 8/50: 100%|██████████| 54/54 [00:04<00:00, 11.59it/s]


Train Loss : 0.0222, Validation Loss: 0.1517
Validation loss improved from 0.1525--->0.1517
Saved Checkpoint to 'bilstm_custom_embeddings_v7/bilstm_custom_embeddings_v7_epoch7_loss0.1517.pt'


Epoch 9/50: 100%|██████████| 54/54 [00:07<00:00,  7.59it/s]


Train Loss : 0.0182, Validation Loss: 0.1475
Validation loss improved from 0.1517--->0.1475
Saved Checkpoint to 'bilstm_custom_embeddings_v7/bilstm_custom_embeddings_v7_epoch8_loss0.1475.pt'


Epoch 10/50: 100%|██████████| 54/54 [00:04<00:00, 11.64it/s]


Train Loss : 0.0145, Validation Loss: 0.1437
Validation loss improved from 0.1475--->0.1437
Saved Checkpoint to 'bilstm_custom_embeddings_v7/bilstm_custom_embeddings_v7_epoch9_loss0.1437.pt'


Epoch 11/50: 100%|██████████| 54/54 [00:06<00:00,  8.76it/s]


Train Loss : 0.0122, Validation Loss: 0.1497


Epoch 12/50: 100%|██████████| 54/54 [00:04<00:00, 11.31it/s]


Train Loss : 0.0097, Validation Loss: 0.155


Epoch 13/50: 100%|██████████| 54/54 [00:04<00:00, 11.15it/s]


Train Loss : 0.0082, Validation Loss: 0.1596


Epoch 14/50: 100%|██████████| 54/54 [00:05<00:00,  9.49it/s]


Train Loss : 0.0063, Validation Loss: 0.1621


Epoch 15/50: 100%|██████████| 54/54 [00:04<00:00, 11.74it/s]


Train Loss : 0.0055, Validation Loss: 0.1753


Epoch 16/50: 100%|██████████| 54/54 [00:05<00:00,  9.25it/s]


Train Loss : 0.0048, Validation Loss: 0.1847


Epoch 17/50: 100%|██████████| 54/54 [00:04<00:00, 11.54it/s]


Train Loss : 0.004, Validation Loss: 0.1963


Epoch 18/50: 100%|██████████| 54/54 [00:04<00:00, 11.94it/s]


Train Loss : 0.0034, Validation Loss: 0.2025


Epoch 19/50: 100%|██████████| 54/54 [00:05<00:00,  9.23it/s]


Train Loss : 0.0029, Validation Loss: 0.2086


Epoch 20/50: 100%|██████████| 54/54 [00:04<00:00, 11.94it/s]


Train Loss : 0.0024, Validation Loss: 0.2274


Epoch 21/50: 100%|██████████| 54/54 [00:05<00:00,  9.65it/s]


Train Loss : 0.0022, Validation Loss: 0.2195


Epoch 22/50: 100%|██████████| 54/54 [00:04<00:00, 11.08it/s]


Train Loss : 0.0021, Validation Loss: 0.2279


Epoch 23/50: 100%|██████████| 54/54 [00:04<00:00, 11.91it/s]


Train Loss : 0.0019, Validation Loss: 0.2432


Epoch 24/50: 100%|██████████| 54/54 [00:05<00:00,  9.10it/s]


Train Loss : 0.0018, Validation Loss: 0.2382


Epoch 25/50: 100%|██████████| 54/54 [00:04<00:00, 11.20it/s]


Train Loss : 0.0016, Validation Loss: 0.2442
No improvement for 15 epochs. Stopping early.
Saved best model to 'bilstm_custom_embeddings_v7/bilstm_custom_embeddings_v7-best.pt'
time: 2min 43s (started: 2023-11-10 11:17:06 +00:00)


Evaluate model on Validation set

In [22]:
precision, recall, f1 = evaluate_model(best_model, valid_data_loader, device)
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

100%|██████████| 51/51 [00:00<00:00, 55.44it/s]


processed 51362 tokens with 5942 phrases; found: 5609 phrases; correct: 4468.
accuracy:  77.97%; (non-O)
accuracy:  95.55%; precision:  79.66%; recall:  75.19%; FB1:  77.36
              LOC: precision:  87.41%; recall:  83.94%; FB1:  85.64  1764
             MISC: precision:  74.81%; recall:  74.73%; FB1:  74.77  921
              ORG: precision:  73.18%; recall:  67.34%; FB1:  70.14  1234
              PER: precision:  78.93%; recall:  72.42%; FB1:  75.54  1690
Precision: 79.66, Recall: 75.19, F1 Score: 77.36
time: 1.03 s (started: 2023-11-10 11:19:49 +00:00)


Evaluate model on Test set

In [23]:
precision, recall, f1 = evaluate_model(best_model, test_data_loader, device)
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

100%|██████████| 108/108 [00:01<00:00, 70.05it/s]


processed 46435 tokens with 5648 phrases; found: 5274 phrases; correct: 3732.
accuracy:  70.46%; (non-O)
accuracy:  93.39%; precision:  70.76%; recall:  66.08%; FB1:  68.34
              LOC: precision:  81.46%; recall:  76.68%; FB1:  79.00  1570
             MISC: precision:  61.89%; recall:  63.39%; FB1:  62.63  719
              ORG: precision:  65.31%; recall:  59.96%; FB1:  62.52  1525
              PER: precision:  69.32%; recall:  62.59%; FB1:  65.78  1460
Precision: 70.76, Recall: 66.08, F1 Score: 68.34
time: 1.71 s (started: 2023-11-10 11:19:50 +00:00)


# Bidirectional LSTM Model + Glove Embeddings

In [24]:
train_dataset = NERDatasetCustom(
    dataset = dataset,
    split='train',
    tokenizer = glove_word2idx,
    embedding_type="glove",
)

valid_dataset = NERDatasetCustom(
    dataset = dataset,
    split='validation',
    tokenizer = glove_word2idx,
    embedding_type="glove",
)

test_dataset = NERDatasetCustom(
    dataset = dataset,
    split='test',
    tokenizer = glove_word2idx,
    embedding_type="glove",
)

train_data_loader = DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    drop_last=True,
    shuffle=True,
    collate_fn=lambda x: collate_fn(x, glove_word2idx)
)

valid_data_loader = DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    drop_last=False,
    shuffle=True,
    collate_fn=lambda x: collate_fn(x, glove_word2idx)
)

test_data_loader = DataLoader(
    test_dataset,
    batch_size=TEST_BATCH_SIZE,
    drop_last=False,
    shuffle=False,
    collate_fn=lambda x: collate_fn(x, glove_word2idx)
)

time: 1.98 ms (started: 2023-11-10 11:19:52 +00:00)


In [26]:
vocab_size = len(glove_word2idx)
embedding_dim = 100
hidden_size = 256
output_size = 128
num_layers = 1
dropout_val = 0.33
num_tags = DatasetConfig.NUM_NER_TAGS

net_with_glove = BiLSTM(
    vocab_size, embedding_dim, num_tags,
    hidden_size, num_layers, output_size, dropout_val,
    glove_embedding_matrix
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=DatasetConfig.SPECIAL_TOKEN_TAG).to(device)
# optimizer = optim.Adam(net_with_glove.parameters(), lr=0.001)
# optimizer = optim.SGD(net_with_glove.parameters(), lr=0.001)
optimizer = optim.AdamW(net_with_glove.parameters(), lr=0.001)

best_model_glove = train_and_evaluate(
    model=net_with_glove,
    train_data_loader=train_data_loader,
    valid_data_loader=valid_data_loader,
    optimizer=optimizer,
    loss_fn=criterion,
    device=device,
    num_epochs=50,
    checkpoint=True,
    path="bilstm_glove_embeddings_v1.pt",
    early_stopping_patience=15
)

Epoch 1/50: 100%|██████████| 54/54 [00:08<00:00,  6.27it/s]


Train Loss : 0.1741, Validation Loss: 0.4599
Validation loss improved from inf--->0.4599
Saved Checkpoint to 'bilstm_glove_embeddings_v1/bilstm_glove_embeddings_v1_epoch0_loss0.4599.pt'


Epoch 2/50: 100%|██████████| 54/54 [00:10<00:00,  5.30it/s]


Train Loss : 0.0849, Validation Loss: 0.2469
Validation loss improved from 0.4599--->0.2469
Saved Checkpoint to 'bilstm_glove_embeddings_v1/bilstm_glove_embeddings_v1_epoch1_loss0.2469.pt'


Epoch 3/50: 100%|██████████| 54/54 [00:04<00:00, 11.14it/s]


Train Loss : 0.0502, Validation Loss: 0.1693
Validation loss improved from 0.2469--->0.1693
Saved Checkpoint to 'bilstm_glove_embeddings_v1/bilstm_glove_embeddings_v1_epoch2_loss0.1693.pt'


Epoch 4/50: 100%|██████████| 54/54 [00:06<00:00,  7.78it/s]


Train Loss : 0.0368, Validation Loss: 0.131
Validation loss improved from 0.1693--->0.1310
Saved Checkpoint to 'bilstm_glove_embeddings_v1/bilstm_glove_embeddings_v1_epoch3_loss0.1310.pt'


Epoch 5/50: 100%|██████████| 54/54 [00:04<00:00, 11.03it/s]


Train Loss : 0.0305, Validation Loss: 0.1131
Validation loss improved from 0.1310--->0.1131
Saved Checkpoint to 'bilstm_glove_embeddings_v1/bilstm_glove_embeddings_v1_epoch4_loss0.1131.pt'


Epoch 6/50: 100%|██████████| 54/54 [00:07<00:00,  7.35it/s]


Train Loss : 0.026, Validation Loss: 0.1014
Validation loss improved from 0.1131--->0.1014
Saved Checkpoint to 'bilstm_glove_embeddings_v1/bilstm_glove_embeddings_v1_epoch5_loss0.1014.pt'


Epoch 7/50: 100%|██████████| 54/54 [00:04<00:00, 10.96it/s]


Train Loss : 0.0235, Validation Loss: 0.0939
Validation loss improved from 0.1014--->0.0939
Saved Checkpoint to 'bilstm_glove_embeddings_v1/bilstm_glove_embeddings_v1_epoch6_loss0.0939.pt'


Epoch 8/50: 100%|██████████| 54/54 [00:07<00:00,  7.55it/s]


Train Loss : 0.0207, Validation Loss: 0.092
Validation loss improved from 0.0939--->0.0920
Saved Checkpoint to 'bilstm_glove_embeddings_v1/bilstm_glove_embeddings_v1_epoch7_loss0.0920.pt'


Epoch 9/50: 100%|██████████| 54/54 [00:04<00:00, 11.34it/s]


Train Loss : 0.0188, Validation Loss: 0.0832
Validation loss improved from 0.0920--->0.0832
Saved Checkpoint to 'bilstm_glove_embeddings_v1/bilstm_glove_embeddings_v1_epoch8_loss0.0832.pt'


Epoch 10/50: 100%|██████████| 54/54 [00:07<00:00,  7.20it/s]


Train Loss : 0.0171, Validation Loss: 0.0826
Validation loss improved from 0.0832--->0.0826
Saved Checkpoint to 'bilstm_glove_embeddings_v1/bilstm_glove_embeddings_v1_epoch9_loss0.0826.pt'


Epoch 11/50: 100%|██████████| 54/54 [00:04<00:00, 11.46it/s]


Train Loss : 0.0159, Validation Loss: 0.0798
Validation loss improved from 0.0826--->0.0798
Saved Checkpoint to 'bilstm_glove_embeddings_v1/bilstm_glove_embeddings_v1_epoch10_loss0.0798.pt'


Epoch 12/50: 100%|██████████| 54/54 [00:07<00:00,  7.17it/s]


Train Loss : 0.0143, Validation Loss: 0.0828


Epoch 13/50: 100%|██████████| 54/54 [00:04<00:00, 11.04it/s]


Train Loss : 0.0132, Validation Loss: 0.08


Epoch 14/50: 100%|██████████| 54/54 [00:06<00:00,  8.65it/s]


Train Loss : 0.0119, Validation Loss: 0.0801


Epoch 15/50: 100%|██████████| 54/54 [00:05<00:00,  9.39it/s]


Train Loss : 0.0105, Validation Loss: 0.0862


Epoch 16/50: 100%|██████████| 54/54 [00:05<00:00,  9.93it/s]


Train Loss : 0.0096, Validation Loss: 0.0804


Epoch 17/50: 100%|██████████| 54/54 [00:05<00:00, 10.63it/s]


Train Loss : 0.0088, Validation Loss: 0.0859


Epoch 18/50: 100%|██████████| 54/54 [00:04<00:00, 11.66it/s]


Train Loss : 0.0079, Validation Loss: 0.0863


Epoch 19/50: 100%|██████████| 54/54 [00:06<00:00,  8.53it/s]


Train Loss : 0.0072, Validation Loss: 0.0879


Epoch 20/50: 100%|██████████| 54/54 [00:04<00:00, 11.40it/s]


Train Loss : 0.0065, Validation Loss: 0.0877


Epoch 21/50: 100%|██████████| 54/54 [00:05<00:00,  9.70it/s]


Train Loss : 0.0054, Validation Loss: 0.0937


Epoch 22/50: 100%|██████████| 54/54 [00:05<00:00, 10.63it/s]


Train Loss : 0.0049, Validation Loss: 0.0926


Epoch 23/50: 100%|██████████| 54/54 [00:04<00:00, 11.74it/s]


Train Loss : 0.0043, Validation Loss: 0.0987


Epoch 24/50: 100%|██████████| 54/54 [00:06<00:00,  8.49it/s]


Train Loss : 0.0039, Validation Loss: 0.1072


Epoch 25/50: 100%|██████████| 54/54 [00:04<00:00, 11.40it/s]


Train Loss : 0.0034, Validation Loss: 0.1063


Epoch 26/50: 100%|██████████| 54/54 [00:05<00:00,  9.59it/s]


Train Loss : 0.0033, Validation Loss: 0.1073
No improvement for 15 epochs. Stopping early.
Saved best model to 'bilstm_glove_embeddings_v1/bilstm_glove_embeddings_v1-best.pt'
time: 3min 9s (started: 2023-11-10 11:21:08 +00:00)


Evaluate model on Validation set

In [27]:
precision, recall, f1 = evaluate_model(best_model_glove, valid_data_loader, device)
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

100%|██████████| 51/51 [00:00<00:00, 51.37it/s]


processed 51362 tokens with 5942 phrases; found: 5859 phrases; correct: 5209.
accuracy:  87.46%; (non-O)
accuracy:  97.47%; precision:  88.91%; recall:  87.66%; FB1:  88.28
              LOC: precision:  92.62%; recall:  92.22%; FB1:  92.42  1829
             MISC: precision:  82.17%; recall:  77.98%; FB1:  80.02  875
              ORG: precision:  81.52%; recall:  78.60%; FB1:  80.03  1293
              PER: precision:  93.56%; recall:  94.57%; FB1:  94.06  1862
Precision: 88.91, Recall: 87.66, F1 Score: 88.28
time: 1.39 s (started: 2023-11-10 11:24:26 +00:00)


Evaluate model on Test set

In [28]:
precision, recall, f1 = evaluate_model(best_model_glove, test_data_loader, device)
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

100%|██████████| 108/108 [00:02<00:00, 52.93it/s]


processed 46435 tokens with 5648 phrases; found: 5642 phrases; correct: 4684.
accuracy:  84.65%; (non-O)
accuracy:  96.50%; precision:  83.02%; recall:  82.93%; FB1:  82.98
              LOC: precision:  87.12%; recall:  88.37%; FB1:  87.74  1692
             MISC: precision:  69.07%; recall:  69.66%; FB1:  69.36  708
              ORG: precision:  77.72%; recall:  76.88%; FB1:  77.30  1643
              PER: precision:  90.31%; recall:  89.30%; FB1:  89.80  1599
Precision: 83.02, Recall: 82.93, F1 Score: 82.98
time: 2.22 s (started: 2023-11-10 11:24:31 +00:00)


# References

1. https://huggingface.co/datasets/conll2003
2. https://huggingface.co/docs/datasets/installation
3. https://huggingface.co/docs/transformers/installation
4. https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python
5. https://stackoverflow.com/a/52070223/12639940
6. https://github.com/sighsmile/conlleval
7. https://nlp.stanford.edu/data/glove.6B.zip
8. https://stats.stackexchange.com/questions/248715/selection-of-values-for-padding-tokens-in-sentence-classification-with-word-embe
