# IMPORTS & CONFIG

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset
import pickle
import numpy as np
from sklearn.metrics import classification_report

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device =", DEVICE)


Device = cuda


# LOAD EMBEDDINGS FROM PART 1

In [2]:
emb = np.load("/kaggle/input/custom-word2vec-output/embeddings.npy")  
with open("/kaggle/input/custom-word2vec-output/word_to_idx.pkl", "rb") as f:
    word_to_idx = pickle.load(f)

orig_vocab_size, EMBED_DIM = emb.shape
print("Original vocab:", orig_vocab_size, "Embedding dim:", EMBED_DIM)


PAD_ID = 0
UNK_ID = 1

new_emb = np.zeros((orig_vocab_size + 2, EMBED_DIM), dtype=np.float32)
new_emb[2:] = emb                              
new_emb[UNK_ID] = np.random.uniform(-0.01,0.01,EMBED_DIM)  

# shift word_to_idx by +2
new_w2i = {"<PAD>":0, "<UNK>":1}
for w, i in word_to_idx.items():
    new_w2i[w] = i + 2

word_to_idx = new_w2i
embedding_matrix = torch.tensor(new_emb, dtype=torch.float32)
vocab_size = embedding_matrix.shape[0]

print("Final vocab with PAD/UNK:", vocab_size)

Original vocab: 14068 Embedding dim: 100
Final vocab with PAD/UNK: 14070


# DATA PREPARATION

In [3]:
dataset = load_dataset("lhoestq/conll2003")

train_split = dataset["train"]
val_split = dataset["validation"]
test_split = dataset["test"]


dataset_infos.json: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/281k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

# WINDOW DATASET

In [4]:
class WindowNERDataset(Dataset):
    def __init__(self, split, word_to_idx, window_size):
        self.window_size = window_size
        self.pad = word_to_idx["<PAD>"]
        self.unk = word_to_idx["<UNK>"]
        self.samples = []

        for entry in split:
            tokens = entry["tokens"]
            labels = entry["ner_tags"]

            idxs = [word_to_idx.get(t.lower(), self.unk) for t in tokens]

            padded = [self.pad]*window_size + idxs + [self.pad]*window_size

            for i in range(len(tokens)):
                window = padded[i:i + 2*window_size + 1]
                self.samples.append((window, labels[i]))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        w, y = self.samples[idx]
        return torch.tensor(w, dtype=torch.long), torch.tensor(y, dtype=torch.long)


# FEED FORWARD TAGGER

In [5]:
class FFNTagger(nn.Module):
    def __init__(self, embedding_matrix, window, hidden=256, hidden2=128, num_classes=9):
        super().__init__()

        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix,
            freeze=True      
        )

        input_dim = (2*window + 1) * embedding_matrix.size(1)

        self.ffn = nn.Sequential(
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(hidden, hidden2),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(hidden2, num_classes)
        )

    def forward(self, x):
        emb = self.embedding(x)                      # (B, window, D)
        concat = emb.view(emb.size(0), -1)           # (B, window*D)
        return self.ffn(concat)                      # (B, num_classes)


# BUILD DATASETS & LOADERS

In [6]:
WINDOW = 2
HIDDEN_DIM = 256
NUM_CLASSES = 9

BATCH_SIZE = 128
EPOCHS = 60        
LR = 3e-3          

train_ds = WindowNERDataset(train_split, word_to_idx, WINDOW)
val_ds   = WindowNERDataset(val_split, word_to_idx, WINDOW)
test_ds  = WindowNERDataset(test_split,  word_to_idx, WINDOW)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE)

print("Train samples =", len(train_ds))
print("Val samples   =", len(val_ds))
print("Test samples  =", len(test_ds))


Train samples = 203621
Val samples   = 51362
Test samples  = 46435


# TRAINING & EVALUATION

In [7]:
def evaluate(model, loader, loss_fn):
    model.eval()
    total_loss = 0
    y_true, y_pred = [], []

    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            logits = model(x)
            loss = loss_fn(logits, y)
            total_loss += loss.item()

            preds = logits.argmax(1)
            y_true += y.cpu().tolist()
            y_pred += preds.cpu().tolist()

    avg_loss = total_loss / len(loader)
    report = classification_report(y_true, y_pred, digits=4)
    return avg_loss, report


# TRAINING LOOP

In [8]:
model = FFNTagger(embedding_matrix, WINDOW, HIDDEN_DIM, NUM_CLASSES).to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.CrossEntropyLoss()


scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,       
    patience=3,       
    verbose=True
)

best_val_loss = float("inf")


patience_limit = 6
patience_counter = 0

for epoch in range(1, EPOCHS+1):

    model.train()
    total_train = 0

    for x, y in train_loader:
        x, y = x.to(DEVICE), y.to(DEVICE)

        optimizer.zero_grad()
        logits = model(x)
        loss = loss_fn(logits, y)
        loss.backward()
        optimizer.step()

        total_train += loss.item()

    avg_train_loss = total_train / len(train_loader)
    avg_val_loss, val_report = evaluate(model, val_loader, loss_fn)

    
    scheduler.step(avg_val_loss)

    print(f"\nEpoch {epoch}/{EPOCHS}")
    print(f"Train Loss = {avg_train_loss:.4f}")
    print(f"Val Loss   = {avg_val_loss:.4f}")
    print("Validation Report:")
    print(val_report)

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0                     
        torch.save(model.state_dict(), "best_ffn_model.pt")
        print(" Saved best model!")
    else:
        patience_counter += 1
        print(f"No improvement ({patience_counter}/{patience_limit})")

    # --- Early stopping break ---
    if patience_counter >= patience_limit:
        print("Early stopping triggered")
        break


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 1/60
Train Loss = 0.4713
Val Loss   = 0.3406
Validation Report:
              precision    recall  f1-score   support

           0     0.9116    0.9949    0.9515     42759
           1     0.8496    0.6412    0.7308      1842
           2     0.9005    0.6924    0.7829      1307
           3     0.6760    0.4124    0.5123      1341
           4     0.8636    0.0253    0.0492       751
           5     0.8373    0.6527    0.7336      1837
           6     0.8710    0.1051    0.1875       257
           7     0.0000    0.0000    0.0000       922
           8     0.0000    0.0000    0.0000       346

    accuracy                         0.9039     51362
   macro avg     0.6566    0.3915    0.4386     51362
weighted avg     0.8769    0.9039    0.8795     51362

 Saved best model!


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 2/60
Train Loss = 0.3906
Val Loss   = 0.3124
Validation Report:
              precision    recall  f1-score   support

           0     0.9159    0.9965    0.9545     42759
           1     0.8806    0.6688    0.7603      1842
           2     0.9159    0.7253    0.8096      1307
           3     0.8518    0.3729    0.5187      1341
           4     0.8889    0.0852    0.1555       751
           5     0.8303    0.7246    0.7738      1837
           6     0.8824    0.2335    0.3692       257
           7     0.9200    0.0748    0.1384       922
           8     0.0000    0.0000    0.0000       346

    accuracy                         0.9115     51362
   macro avg     0.7873    0.4313    0.4978     51362
weighted avg     0.9032    0.9115    0.8903     51362

 Saved best model!

Epoch 3/60
Train Loss = 0.3645
Val Loss   = 0.2854
Validation Report:
              precision    recall  f1-score   support

           0     0.9263    0.9947    0.9593     42759
           1     0.8578  

# FINAL TEST EVALUATION

In [9]:
print("\nLoading best model...")
model.load_state_dict(torch.load("best_ffn_model.pt"))
model.to(DEVICE)

test_loss, test_report = evaluate(model, test_loader, loss_fn)

print("\n======== FINAL TEST RESULTS ========")
print("Test Loss =", test_loss)
print(test_report)
print("====================================")



Loading best model...

Test Loss = 0.2588199995504307
              precision    recall  f1-score   support

           0     0.9497    0.9877    0.9683     38323
           1     0.7897    0.8312    0.8099      1617
           2     0.8669    0.8227    0.8442      1156
           3     0.7715    0.5611    0.6497      1661
           4     0.7602    0.4707    0.5814       835
           5     0.8550    0.7494    0.7987      1668
           6     0.8489    0.4591    0.5960       257
           7     0.7343    0.3504    0.4744       702
           8     0.7563    0.4167    0.5373       216

    accuracy                         0.9298     46435
   macro avg     0.8147    0.6277    0.6955     46435
weighted avg     0.9241    0.9298    0.9237     46435

