<a href="https://colab.research.google.com/github/KayvanShah1/usc-csci-544-assignments-hw/blob/main/hw4/CSCI544_HW4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

## Installation

In [1]:
!pip install transformers datasets accelerate
!pip install ipython-autotime

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86

## Imports

In [2]:
import os
import shutil
from typing import List, Tuple, Dict

import itertools
from collections import Counter

from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

import csv
import numpy as np
import pandas as pd

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset
import transformers

from dataclasses import dataclass

%load_ext autotime

time: 377 µs (started: 2023-11-10 07:46:59 +00:00)


# Config

In [3]:
# Set the current working directory
try:
    os.chdir("/content/drive/MyDrive/Colab Notebooks/CSCI544/HW4")
except:
    pass


class PathConfig:
    # Get the current dir
    CURRENT_DIR = os.getcwd()

    # Glove embedding path
    GLOVE_100d_File = os.path.join(CURRENT_DIR, "glove.6B.100d.txt")


class DatasetConfig:
    # General Info
    name = "conll2003"

    # Processing
    cols_to_drop =  ["id", "pos_tags", "chunk_tags"]
    rename_cols = {"ner_tags": "labels"}

    # Preprocessing
    THRESHOLD = 3
    PAD_TOKEN = "<pad>"
    UNKNOWN_TOKEN = "<unk>"
    embedding_size = 100

    # NER Tags list and converter dictionaries
    ner_tag2idx = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
    ner_idx2tag = {v: k for k, v in ner_tag2idx.items()}

    NUM_NER_TAGS = len(ner_tag2idx)
    SPECIAL_TOKEN_TAG = -100

time: 422 ms (started: 2023-11-10 07:46:59 +00:00)


# Helper Functions & Support Scripts

## Accelarator Configuration

In [4]:
def get_device():
    if torch.cuda.is_available():
        # Check if GPU is available
        return torch.device("cuda")
    else:
        # Use CPU if no GPU or TPU is available
        return torch.device("cpu")

device = get_device()
device

device(type='cuda')

time: 93.7 ms (started: 2023-11-10 07:46:59 +00:00)


## CoNLL evaluation functions

In [5]:
%%bash
if [ ! -f conlleval.py ]; then
    echo "Downloading conlleval.py ..."
    wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
else
    echo "File conlleval.py already exists"
fi

File conlleval.py already exists
time: 341 ms (started: 2023-11-10 07:46:59 +00:00)


In [6]:
from conlleval import evaluate

time: 1.5 s (started: 2023-11-10 07:47:00 +00:00)


## Helper functions

In [7]:
# Load glove embeddings to dictionary
def load_glove_embeddings(path):
    """
    pretrained_model = load_glove_embeddings(PathConfig.GLOVE_100d_File)
    """
    embeddings = pd.read_csv(
        PathConfig.GLOVE_100d_File, sep=" ", quoting=csv.QUOTE_NONE, header=None, index_col=0
    )
    embeddings = {key: val.values for key, val in embeddings.T.items()}

    # Add Special token vectors
    embeddings[DatasetConfig.UNKNOWN_TOKEN] = np.full(DatasetConfig.embedding_size, -1)
    embeddings[DatasetConfig.PAD_TOKEN] = np.zeros(DatasetConfig.embedding_size)

    return embeddings

time: 492 µs (started: 2023-11-10 07:47:01 +00:00)


# Download Glove Embeddings

In [8]:
%%bash
if [ ! -f glove.6B.zip ]; then
    echo "Downloading glove.6B.zip..."
    wget http://nlp.stanford.edu/data/glove.6B.zip -y
    unzip -o glove.6B.zip
else
    echo "File glove.6B.zip already exists"
fi

File glove.6B.zip already exists
time: 8.04 ms (started: 2023-11-10 07:47:01 +00:00)


In [9]:
pretrained_model = load_glove_embeddings(PathConfig.GLOVE_100d_File)

time: 23.5 s (started: 2023-11-10 07:47:01 +00:00)


# Dataset Preparation

## Process Data

In [10]:
dataset = load_dataset("conll2003")
dataset = dataset.remove_columns(DatasetConfig.cols_to_drop)
for old_name, new_name in DatasetConfig.rename_cols.items():
    dataset = dataset.rename_column(old_name, new_name)

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

time: 12.8 s (started: 2023-11-10 07:47:25 +00:00)


In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 3453
    })
})

time: 3.57 ms (started: 2023-11-10 07:47:37 +00:00)


## EDA

In [12]:
df = pd.DataFrame(dataset["train"])
df.head()

Unnamed: 0,tokens,labels
0,"[EU, rejects, German, call, to, boycott, Briti...","[3, 0, 7, 0, 0, 0, 7, 0, 0]"
1,"[Peter, Blackburn]","[1, 2]"
2,"[BRUSSELS, 1996-08-22]","[5, 0]"
3,"[The, European, Commission, said, on, Thursday...","[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, ..."
4,"[Germany, 's, representative, to, the, Europea...","[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, ..."


time: 886 ms (started: 2023-11-10 07:47:37 +00:00)


## Word to index mapper

In [13]:
# # Count occurences of the words using itertools and Counter
# word_frequency = Counter(itertools.chain(*dataset['train']['tokens']))

# # Discard words with frequency below THRESHOLD
# word_frequency = {
#     word: freq
#     for word, freq in word_frequency.items()
#     if freq >= DatasetConfig.THRESHOLD
# }

# # Generate indexes
# word2idx = {
#     word: index
#     for index, word in enumerate(word_frequency.keys(), start=2)
# }

# # Add special tokens
# word2idx[DatasetConfig.PAD_TOKEN] = 0
# word2idx[DatasetConfig.UNKNOWN_TOKEN] = 1

def generate_word_indexing(dataset, threshold):
    # Count occurences of the words using itertools and Counter
    word_frequency = Counter(itertools.chain(*dataset))

    # Discard words with frequency below threshold
    word_frequency = {
        word: freq
        for word, freq in word_frequency.items()
        if freq >= threshold
    }

    # Generate indexes
    word2idx = {
        word: index
        for index, word in enumerate(word_frequency.keys(), start=2)
    }

    # Add special tokens
    word2idx[DatasetConfig.PAD_TOKEN] = 0
    word2idx[DatasetConfig.UNKNOWN_TOKEN] = 1

    return word2idx

word2idx = generate_word_indexing(dataset['train']['tokens'], threshold=DatasetConfig.THRESHOLD)

time: 251 ms (started: 2023-11-10 07:47:38 +00:00)


# Create a Pytorch dataset

In [14]:
@dataclass
class DatasetItem:
    embeddings: torch.Tensor
    targets: torch.Tensor
    original_length: int


class NERDatasetCustom(Dataset):
    def __init__(self, dataset, split, tokenizer, embedding_type="custom"):
        self.name = DatasetConfig.name
        self.dataset = dataset[split]
        self.tokenizer = tokenizer

        # Options: [custom, glove, transformer]
        self.embedding_type = embedding_type

    def __len__(self):
        return self.dataset.num_rows

    def tokenize(self, tokens):
        """
        Code to convert all tokens to their respective indexes
        """
        return [
            self.tokenizer.get(token, self.tokenizer[DatasetConfig.UNKNOWN_TOKEN])
            for token in tokens
        ]

    def __getitem__(self, idx):
        if idx >= self.__len__():
            raise IndexError

        item = self.dataset[idx]

        item["input_ids"] = self.tokenize(item["tokens"])

        embeddings = item["input_ids"]
        targets = item["labels"]
        seq_len = len(targets)

        return DatasetItem(
            torch.tensor(embeddings, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long),
            seq_len
        )

time: 1.56 ms (started: 2023-11-10 07:47:39 +00:00)


In [15]:
def collate_fn(data: DatasetItem):
    embeddings, targets, og_len = [], [], []

    for item in data:
        embeddings.append(item.embeddings)
        targets.append(item.targets)
        og_len.append(item.original_length)

    # Pad the embeddings sequence
    embeddings = nn.utils.rnn.pad_sequence(
        embeddings, batch_first=True, padding_value=word2idx[DatasetConfig.PAD_TOKEN]
    )
    targets = nn.utils.rnn.pad_sequence(
        targets, batch_first=True, padding_value=DatasetConfig.SPECIAL_TOKEN_TAG
    )

    return {"embeddings": embeddings, "targets": targets, "original_length": og_len}

time: 495 µs (started: 2023-11-10 07:47:39 +00:00)


# Training & Evaluation loop

In [20]:
def train_and_evaluate(
    model,
    train_data_loader, valid_data_loader,
    optimizer, loss_fn,
    device,
    num_epochs,
    checkpoint=False,
    path="model.pt",
    early_stopping_patience=5
):
    """
    Trains and evaluates the model.

    Args:
        model (nn.Module): The neural network model.
        train_data_loader (DataLoader): The DataLoader for training data.
        valid_data_loader (DataLoader): The DataLoader for validation data.
        optimizer (torch.optim): The optimizer for updating model weights.
        loss_fn: The loss function.
        device (torch.device): The device to perform computations.
        num_epochs (int): The number of epochs.
        checkpoint (bool, optional): Whether to save model checkpoints.
        path (str, optional): The path to save the model.
        early_stopping_patience (int, optional): Number of epochs to wait before early stopping.

    Returns:
        nn.Module: The best model.

    """
    # Create directory for saving checkpoint model states
    if checkpoint:
        dirname = path.split(".")[0]
        checkpoint_path = os.path.join(dirname)
        if os.path.exists(checkpoint_path):
            shutil.rmtree(checkpoint_path)
        os.makedirs(dirname)

    best_loss = float('inf')
    no_improvement_count = 0
    best_model = None

    for epoch in range(num_epochs):
        # Train Step
        model.train()
        train_loss = 0.0

        progress_bar = tqdm(train_data_loader, desc=f'Epoch {epoch+1}/{num_epochs}')

        for batch in progress_bar:
            embeddings = batch['embeddings'].to(device, dtype=torch.long, non_blocking=True)
            labels = batch['targets'].to(device, dtype=torch.long, non_blocking=True)
            seq_lengths = batch["original_length"]

            optimizer.zero_grad()

            outputs = model(embeddings, seq_lengths)

            outputs = outputs.view(-1, outputs.shape[-1])
            labels = labels.view(-1)
            loss = loss_fn(outputs, labels)

            loss.backward()
            # Apply gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            optimizer.step()

            train_loss += loss.item() * embeddings.size(1)

        train_loss /= len(train_data_loader.dataset)

        # Validation Step
        model.eval()
        valid_loss = 0.0

        with torch.no_grad():
            for batch in valid_data_loader:
                embeddings = batch['embeddings'].to(device, dtype=torch.long, non_blocking=True)
                labels = batch['targets'].to(device, dtype=torch.long, non_blocking=True)
                seq_lengths = batch["original_length"]

                outputs = model(embeddings, seq_lengths)

                outputs = outputs.view(-1, outputs.shape[-1])
                labels = labels.view(-1)
                loss = loss_fn(outputs, labels)

                valid_loss += loss.item() * embeddings.size(1)

            valid_loss /= len(valid_data_loader.dataset)

        epoch_log = (
            f"Train Loss : {round(train_loss, 4)},"
            f" Validation Loss: {round(valid_loss, 4)}"
        )
        print(epoch_log)

        # Check for improvement in validation loss
        if valid_loss < best_loss:
            # Save checkpoint if needed
            if checkpoint:
                cp = os.path.join(checkpoint_path, f"{dirname}_epoch{epoch}_loss{valid_loss:.4f}.pt")
                torch.save(model.state_dict(), cp)
                print(f"Validation loss improved from {best_loss:.4f}--->{valid_loss:.4f}")
                print(f"Saved Checkpoint to '{cp}'")

            best_loss = valid_loss
            best_model = model
            no_improvement_count = 0
        else:
            no_improvement_count += 1

            # Early stopping condition
            if no_improvement_count >= early_stopping_patience:
                print(f"No improvement for {early_stopping_patience} epochs. Stopping early.")
                break

    if checkpoint:
        # Save the best model
        best_model_path = os.path.join(checkpoint_path, f"{dirname}-best.pt")
        torch.save(best_model.state_dict(), best_model_path)
        print(f"Saved best model to '{os.path.relpath(best_model_path)}'")

    # Save current model
    torch.save(model.state_dict(), path)

    return best_model


def evaluate_model(model, data_loader):
    all_preds = []
    all_labels = []

    model.eval()

    with torch.no_grad():
        for batch in tqdm(data_loader):
            embeddings = batch['embeddings'].to(device, dtype=torch.long, non_blocking=True)
            labels = batch['targets'].to(device, dtype=torch.long, non_blocking=True)
            seq_lengths = batch["original_length"]

            outputs = model(embeddings, seq_lengths)

            preds = torch.argmax(outputs, dim=2)

            preds = preds.detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()

            for pred, label, length in zip(preds, labels, seq_lengths):
                pred = [DatasetConfig.ner_idx2tag.get(p, 'O') for p in pred[:length]]
                label = [DatasetConfig.ner_idx2tag.get(l, 'O') for l in label[:length]]
                all_preds.append(pred)
                all_labels.append(label)

    # Evaluate using conlleval
    precision, recall, f1 = evaluate(
        itertools.chain(*all_labels), itertools.chain(*all_preds)
    )

    return precision, recall, f1

time: 4.85 ms (started: 2023-11-10 07:48:38 +00:00)


# Training Config

In [21]:
TRAIN_BATCH_SIZE = 256
VALID_BATCH_SIZE = 64
TEST_BATCH_SIZE = 32
NUM_EPOCHS = 5

time: 710 µs (started: 2023-11-10 07:48:42 +00:00)


# Bidirectional LSTM model

In [18]:
class BiLSTM(nn.Module):
    def __init__(
        self, vocab_size, embedding_dim, num_tags,
        hidden_size, num_layers, lstm_output_size, dropout_val
    ):
        """
        Recurrent Neural Network (RNN) model for sequence data processing.

        Args:
            vocab_size (int): Size of vocabulary
            embedding_dim (int): Dimension of the input features.
            num_tags (int): Number of output classes.
            hidden_size (int): Number of units in the hidden layers.
            num_layers (int): Number of recurrent layers.
            lstm_output_size (int): Size of the output from the LSTM layer.
            dropout_val (float): Dropout probability.

        """
        super(BiLSTM, self).__init__()

        # Model Attributes
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Model Layer Definition
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(
            embedding_dim, hidden_size, num_layers, batch_first=True, bidirectional=True
        )

        self.fc = nn.Linear(hidden_size * 2, lstm_output_size)
        self.dropout = nn.Dropout(dropout_val)
        self.elu = nn.ELU(alpha=0.01)
        self.classifier = nn.Linear(lstm_output_size, num_tags)

    def init_hidden(self, batch_size):
        hidden = (
            torch.zeros(self.num_layers * 2, batch_size, self.hidden_size).to(device),
            torch.zeros(self.num_layers * 2, batch_size, self.hidden_size).to(device)
        )
        return hidden

    def forward(self, x, seq_len):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)

        # Embedding Layer
        embeds = self.embedding(x).float()

        # LSTM layer
        packed_embeds = nn.utils.rnn.pack_padded_sequence(
            embeds, seq_len, batch_first=True, enforce_sorted=False
        )
        out, _ = self.lstm(packed_embeds, hidden)
        out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

        # Apply fully connected layer for final prediction
        out = self.dropout(out)
        out = self.fc(out)
        out = self.elu(out)
        out = self.classifier(out)

        return out

time: 2.17 ms (started: 2023-11-10 07:47:39 +00:00)


In [22]:
train_dataset = NERDatasetCustom(
    dataset = dataset,
    split='train',
    tokenizer = word2idx,
    embedding_type="default",
)

valid_dataset = NERDatasetCustom(
    dataset = dataset,
    split='validation',
    tokenizer = word2idx,
    embedding_type="default",
)

test_dataset = NERDatasetCustom(
    dataset = dataset,
    split='test',
    tokenizer = word2idx,
    embedding_type="default",
)

train_data_loader = DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    drop_last=True,
    shuffle=True,
    collate_fn=collate_fn
)

valid_data_loader = DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    drop_last=False,
    shuffle=True,
    collate_fn=collate_fn
)

test_data_loader = DataLoader(
    test_dataset,
    batch_size=TEST_BATCH_SIZE,
    drop_last=False,
    shuffle=False,
    collate_fn=collate_fn
)

time: 1.21 ms (started: 2023-11-10 07:49:05 +00:00)


In [23]:
vocab_size = len(word2idx)
embedding_dim = 100
hidden_size = 256
output_size = 128
num_layers = 1
dropout_val = 0.33
num_tags = DatasetConfig.NUM_NER_TAGS

net = BiLSTM(
    vocab_size, embedding_dim, num_tags,
    hidden_size, num_layers, output_size, dropout_val
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=DatasetConfig.SPECIAL_TOKEN_TAG).to(device)
# optimizer = optim.Adam(net.parameters(), lr=0.001)
# optimizer = optim.SGD(net.parameters(), lr=0.001)
optimizer = optim.AdamW(net.parameters(), lr=0.001)
# scheduler = torch.optim.lr_scheduler.()

best_model = train_and_evaluate(
    model=net,
    train_data_loader=train_data_loader,
    valid_data_loader=valid_data_loader,
    optimizer=optimizer,
    loss_fn=criterion,
    device=device,
    num_epochs=50,
    checkpoint=True,
    path="bilstm_custom_embeddings_v5.pt",
    early_stopping_patience=15
)

Epoch 1/50: 100%|██████████| 54/54 [00:05<00:00, 10.11it/s]


Train Loss : 0.1883, Validation Loss: 0.5411
Validation loss improved from inf--->0.5411
Saved Checkpoint to 'bilstm_custom_embeddings_v5/bilstm_custom_embeddings_v5_epoch0_loss0.5411.pt'


Epoch 2/50: 100%|██████████| 54/54 [00:03<00:00, 16.35it/s]


Train Loss : 0.1216, Validation Loss: 0.4342
Validation loss improved from 0.5411--->0.4342
Saved Checkpoint to 'bilstm_custom_embeddings_v5/bilstm_custom_embeddings_v5_epoch1_loss0.4342.pt'


Epoch 3/50: 100%|██████████| 54/54 [00:03<00:00, 16.00it/s]


Train Loss : 0.0932, Validation Loss: 0.3317
Validation loss improved from 0.4342--->0.3317
Saved Checkpoint to 'bilstm_custom_embeddings_v5/bilstm_custom_embeddings_v5_epoch2_loss0.3317.pt'


Epoch 4/50: 100%|██████████| 54/54 [00:04<00:00, 11.91it/s]


Train Loss : 0.0673, Validation Loss: 0.2493
Validation loss improved from 0.3317--->0.2493
Saved Checkpoint to 'bilstm_custom_embeddings_v5/bilstm_custom_embeddings_v5_epoch3_loss0.2493.pt'


Epoch 5/50: 100%|██████████| 54/54 [00:03<00:00, 16.15it/s]


Train Loss : 0.0468, Validation Loss: 0.2004
Validation loss improved from 0.2493--->0.2004
Saved Checkpoint to 'bilstm_custom_embeddings_v5/bilstm_custom_embeddings_v5_epoch4_loss0.2004.pt'


Epoch 6/50: 100%|██████████| 54/54 [00:03<00:00, 16.01it/s]


Train Loss : 0.036, Validation Loss: 0.1733
Validation loss improved from 0.2004--->0.1733
Saved Checkpoint to 'bilstm_custom_embeddings_v5/bilstm_custom_embeddings_v5_epoch5_loss0.1733.pt'


Epoch 7/50: 100%|██████████| 54/54 [00:04<00:00, 13.30it/s]


Train Loss : 0.0278, Validation Loss: 0.1607
Validation loss improved from 0.1733--->0.1607
Saved Checkpoint to 'bilstm_custom_embeddings_v5/bilstm_custom_embeddings_v5_epoch6_loss0.1607.pt'


Epoch 8/50: 100%|██████████| 54/54 [00:03<00:00, 13.82it/s]


Train Loss : 0.0222, Validation Loss: 0.1496
Validation loss improved from 0.1607--->0.1496
Saved Checkpoint to 'bilstm_custom_embeddings_v5/bilstm_custom_embeddings_v5_epoch7_loss0.1496.pt'


Epoch 9/50: 100%|██████████| 54/54 [00:03<00:00, 16.18it/s]


Train Loss : 0.018, Validation Loss: 0.1423
Validation loss improved from 0.1496--->0.1423
Saved Checkpoint to 'bilstm_custom_embeddings_v5/bilstm_custom_embeddings_v5_epoch8_loss0.1423.pt'


Epoch 10/50: 100%|██████████| 54/54 [00:03<00:00, 16.20it/s]


Train Loss : 0.0143, Validation Loss: 0.1479


Epoch 11/50: 100%|██████████| 54/54 [00:04<00:00, 11.63it/s]


Train Loss : 0.0118, Validation Loss: 0.1502


Epoch 12/50: 100%|██████████| 54/54 [00:03<00:00, 16.10it/s]


Train Loss : 0.0097, Validation Loss: 0.1572


Epoch 13/50: 100%|██████████| 54/54 [00:03<00:00, 16.08it/s]


Train Loss : 0.008, Validation Loss: 0.1577


Epoch 14/50: 100%|██████████| 54/54 [00:03<00:00, 13.70it/s]


Train Loss : 0.0064, Validation Loss: 0.1801


Epoch 15/50: 100%|██████████| 54/54 [00:04<00:00, 13.41it/s]


Train Loss : 0.0054, Validation Loss: 0.177


Epoch 16/50: 100%|██████████| 54/54 [00:03<00:00, 16.35it/s]


Train Loss : 0.0045, Validation Loss: 0.1892


Epoch 17/50: 100%|██████████| 54/54 [00:03<00:00, 16.24it/s]


Train Loss : 0.0038, Validation Loss: 0.1939


Epoch 18/50: 100%|██████████| 54/54 [00:04<00:00, 11.05it/s]


Train Loss : 0.0034, Validation Loss: 0.1968


Epoch 19/50: 100%|██████████| 54/54 [00:03<00:00, 16.24it/s]


Train Loss : 0.003, Validation Loss: 0.2068


Epoch 20/50: 100%|██████████| 54/54 [00:03<00:00, 15.99it/s]


Train Loss : 0.0025, Validation Loss: 0.224


Epoch 21/50: 100%|██████████| 54/54 [00:03<00:00, 14.15it/s]


Train Loss : 0.0022, Validation Loss: 0.2224


Epoch 22/50: 100%|██████████| 54/54 [00:04<00:00, 13.28it/s]


Train Loss : 0.002, Validation Loss: 0.2254


Epoch 23/50: 100%|██████████| 54/54 [00:03<00:00, 16.12it/s]


Train Loss : 0.0018, Validation Loss: 0.2365


Epoch 24/50: 100%|██████████| 54/54 [00:03<00:00, 16.32it/s]


Train Loss : 0.0016, Validation Loss: 0.2475
No improvement for 15 epochs. Stopping early.
Saved best model to 'bilstm_custom_embeddings_v5/bilstm_custom_embeddings_v5-best.pt'
time: 1min 55s (started: 2023-11-10 07:49:08 +00:00)


In [24]:
precision, recall, f1 = evaluate_model(best_model, valid_data_loader)
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

100%|██████████| 51/51 [00:00<00:00, 76.14it/s]


processed 51362 tokens with 5942 phrases; found: 5708 phrases; correct: 4511.
accuracy:  78.57%; (non-O)
accuracy:  95.47%; precision:  79.03%; recall:  75.92%; FB1:  77.44
              LOC: precision:  84.98%; recall:  82.85%; FB1:  83.90  1791
             MISC: precision:  83.84%; recall:  71.48%; FB1:  77.17  786
              ORG: precision:  69.89%; recall:  68.90%; FB1:  69.40  1322
              PER: precision:  77.72%; recall:  76.33%; FB1:  77.02  1809
Precision: 79.03, Recall: 75.92, F1 Score: 77.44
time: 774 ms (started: 2023-11-10 07:51:11 +00:00)


In [25]:
precision, recall, f1 = evaluate_model(best_model, test_data_loader)
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

100%|██████████| 108/108 [00:01<00:00, 99.48it/s]


processed 46435 tokens with 5648 phrases; found: 5353 phrases; correct: 3837.
accuracy:  72.65%; (non-O)
accuracy:  93.66%; precision:  71.68%; recall:  67.94%; FB1:  69.76
              LOC: precision:  79.67%; recall:  74.94%; FB1:  77.23  1569
             MISC: precision:  67.86%; recall:  59.26%; FB1:  63.27  613
              ORG: precision:  64.91%; recall:  63.15%; FB1:  64.02  1616
              PER: precision:  72.15%; recall:  69.39%; FB1:  70.74  1555
Precision: 71.68, Recall: 67.94, F1 Score: 69.76
time: 1.25 s (started: 2023-11-10 07:51:18 +00:00)


# References

1. https://huggingface.co/datasets/conll2003
2. https://huggingface.co/docs/datasets/installation
3. https://huggingface.co/docs/transformers/installation
4. https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python
5. https://stackoverflow.com/a/52070223/12639940
6. https://github.com/sighsmile/conlleval
7. https://nlp.stanford.edu/data/glove.6B.zip
8. https://stats.stackexchange.com/questions/248715/selection-of-values-for-padding-tokens-in-sentence-classification-with-word-embe
