# train_models.ipynb
In this notebook we train perceptron and 2 deep learning models.

In [1]:
# IMPORTS
import os
import pickle
from collections import defaultdict, Counter
import numpy as np
import pandas as pd

# PyTorch
import torch
import torch.optim as optim
import torch.nn as nn
from torch import nn  # kept for completeness; can be removed if only using one
from torch.utils.data import DataLoader, Dataset, TensorDataset

# Scikit-learn
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Hugging Face Transformers & datasets
from datasets import Dataset, Features, Sequence, Value, ClassLabel
from transformers import (
    BertTokenizerFast,
    BertForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
)

# skseq (as given to us in the course)
import skseq
from skseq.sequences import sequence
from skseq.sequences.sequence import Sequence
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences.label_dictionary import LabelDictionary
from skseq.sequences.extended_feature import ExtendedFeatures
from skseq.sequences.id_feature import IDFeatures
from skseq.readers.pos_corpus import PostagCorpus
from skseq.sequences import structured_perceptron as spc


from utils.utils import BiLSTM_NER, group_sentences, build_vocab, load_data


## 1. Perceptron training

In [5]:
# Load dataset
train = pd.read_csv("data/train_data_ner.csv", keep_default_na=False)

# Initialise label dictionaries
word_dict = LabelDictionary()
tag_dict = LabelDictionary()

for word in train['words'].unique():
    if word not in word_dict:
        word_dict.add(word)
        
for tag in train['tags'].unique():
    if tag not in tag_dict:
        tag_dict.add(tag)
        
# Initialize SequenceList
train_seq = SequenceList(word_dict, tag_dict)

# Group by sentence_id
for _, group in train.groupby("sentence_id"):
    words = list(group["words"])
    tags = list(group["tags"])
    train_seq.add_sequence(words, tags, word_dict,tag_dict )

# printing an example
print(train_seq[1].to_words(train_seq))


Helicopter/O gunships/O Saturday/B-tim pounded/O militant/O hideouts/O in/O the/O Orakzai/B-geo tribal/O region/O ,/O where/O many/O Taliban/B-org militants/O are/O believed/O to/O have/O fled/O to/O avoid/O an/O earlier/O military/O offensive/O in/O nearby/O South/B-geo Waziristan/I-geo ./O 


In [6]:
print(train_seq[0].to_words(train_seq))


Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O 


In [7]:
len(train_seq)

38366

In [8]:
# %%time

# feature_mapper = IDFeatures(train_seq)
# feature_mapper.build_features()
# sp1 = spc.StructuredPerceptron(word_dict, tag_dict, feature_mapper)
# num_epochs = 1
# sp.fit(feature_mapper.dataset, num_epochs)

In [9]:
%%time
extended_features = ExtendedFeatures(train_seq)
extended_features.build_features()
sp = spc.StructuredPerceptron(word_dict, tag_dict, extended_features)
num_epochs = 5 #2
sp.fit(extended_features.dataset, num_epochs)

Epoch: 0 Accuracy: 0.932807
Epoch: 1 Accuracy: 0.945732
Epoch: 2 Accuracy: 0.948930
Epoch: 3 Accuracy: 0.951333
Epoch: 4 Accuracy: 0.952946
CPU times: user 10min 36s, sys: 18.8 s, total: 10min 55s
Wall time: 10min 39s


In [10]:
sp.save_model("fitted_models/model1/parameters6")
print("Trained & saved.")


## 2. Deep learning model 1 (BiLSTM) training

In [15]:

# Load data
train_df = pd.read_csv("data/train_data_ner.csv")
train_sentences = group_sentences(train_df)  # returns list of (words, tags) tuples

# Create vocabularies
word2idx = {"<PAD>": 0, "<UNK>": 1}
tag2idx = {}
idx2tag = {}

for words, tags in train_sentences:  # Correct unpacking here
    for word, tag in zip(words, tags):
        if word not in word2idx:
            word2idx[word] = len(word2idx)
        if tag not in tag2idx:
            tag2idx[tag] = len(tag2idx)

idx2tag = {v: k for k, v in tag2idx.items()}

# Encode sentences
max_len = max(len(words) for words, tags in train_sentences)

X, y = [], []

for words, tags in train_sentences:
    x_seq = [word2idx.get(word, word2idx["<UNK>"]) for word in words]
    y_seq = [tag2idx[tag] for tag in tags]

    # Padding
    x_seq += [word2idx["<PAD>"]] * (max_len - len(x_seq))
    y_seq += [0] * (max_len - len(y_seq))  # 0 for <PAD> label

    X.append(x_seq)
    y.append(y_seq)

X = torch.tensor(X, dtype=torch.long)
y = torch.tensor(y, dtype=torch.long)

dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTM_NER(vocab_size=len(word2idx), tagset_size=len(tag2idx)).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
model.train()
for epoch in range(5):
    total_loss = 0
    for batch_x, batch_y in loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        logits = model(batch_x)
        logits = logits.view(-1, logits.shape[-1])
        batch_y = batch_y.view(-1)

        loss = loss_fn(logits, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1} Loss: {total_loss:.4f}")

# Save model and mappings
os.makedirs("fitted_models/model2", exist_ok=True)
torch.save(model.state_dict(), "fitted_models/model2/lstm_model.pt")

with open("fitted_models/model2/idx_mappings.pkl", "wb") as f:
    pickle.dump({
        "word2idx": word2idx,
        "tag2idx": tag2idx,
        "idx2tag": idx2tag
    }, f)

print("BiLSTM model saved.")


Epoch 1 Loss: 135.7674
Epoch 2 Loss: 43.7626
Epoch 3 Loss: 31.2912
Epoch 4 Loss: 24.8312
Epoch 5 Loss: 20.5942
BiLSTM model saved.


## 3. Deep learning model 2 (BERT) training

In [2]:
import transformers
from datasets import Sequence, Value, Features

os.environ["USE_TF"] = "0"

# Load and preprocess data
train_df, test_df, _ = load_data('data/train_data_ner.csv', 'data/test_data_ner.csv', 'data/tiny_test.csv')
train_data = group_sentences(train_df)
test_data = group_sentences(test_df)

#  Build vocabulary for tags only 
_, tag2idx = build_vocab(train_data)
idx2tag = {i: t for t, i in tag2idx.items()}

def to_hf_format(data, tag2idx):
    hf_data = []
    for words, tags in data:
        tokens = list(words)  # list of str
        label_ids = [tag2idx[tag] for tag in tags]  # list of int
        hf_data.append({
            "tokens": tokens,
            "labels": label_ids
        })
    return hf_data

hf_train_data = to_hf_format(train_data, tag2idx)



features = Features({
    "tokens": Sequence(Value("string")),
    "labels": Sequence(Value("int64"))
})

train_dataset = Dataset.from_list(to_hf_format(train_data, tag2idx), features=features)
test_dataset = Dataset.from_list(to_hf_format(test_data, tag2idx), features=features)



# Tokenizer and align labels
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

label_all_tokens = True

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding='max_length', max_length=100)
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

# Define model and training arguments
#model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(tag2idx))

#the below is to use apple silicon metal and speed up the process 

# Select MPS if available
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS (Apple GPU)")
else:
    device = torch.device("cpu")
    print("MPS not available, using CPU")

# Load model and move it to the selected device
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(tag2idx)).to(device)


args = TrainingArguments(
    output_dir="bert_ner_output",
    evaluation_strategy="epoch",       # <== must match save_strategy
    save_strategy="epoch",             # <== must match evaluation_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

# Collator to pad inputs and labels correctly during training
data_collator = DataCollatorForTokenClassification(tokenizer)


# Collator to pad inputs and labels correctly during training
data_collator = DataCollatorForTokenClassification(tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

#  Train and save
trainer.train()
model.save_pretrained("fitted_models/model3")
tokenizer.save_pretrained("fitted_models/model3")




Map:   0%|          | 0/38366 [00:00<?, ? examples/s]

Map:   0%|          | 0/38367 [00:00<?, ? examples/s]

Using MPS (Apple GPU)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1386,0.402591
2,0.0672,0.516475
3,0.0471,0.536799




('fitted_models/model3/tokenizer_config.json',
 'fitted_models/model3/special_tokens_map.json',
 'fitted_models/model3/vocab.txt',
 'fitted_models/model3/added_tokens.json',
 'fitted_models/model3/tokenizer.json')