### Package Installation

In [None]:
pip install datasets seqeval scikit-learn



In [None]:
%pip install transformers==4.44.2



In [None]:
%pip install -U wandb



In [None]:
%pip install --upgrade datasets



### Libraries

In [None]:
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
import torch
from seqeval.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score
from google.colab import userdata
import wandb
import os

### Loading Data & Preprocessing

In [None]:

# Load the PLOD-CW-25 dataset by its Hugging Face Hub id; the bare name on the
# last line displays the resulting splits and their columns.
dataset = load_dataset("surrey-nlp/PLOD-CW-25")
dataset

README.md:   0%|          | 0.00/268 [00:00<?, ?B/s]

PLOD-CW-25-Train.parquet:   0%|          | 0.00/343k [00:00<?, ?B/s]

PLOD-CW-25-Test.parquet:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

PLOD-CW-25-Val.parquet:   0%|          | 0.00/35.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/250 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/150 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 250
    })
    validation: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 150
    })
})

In [None]:
# Collect every distinct tag string that occurs in the training split.
# The tags are sorted because the iteration order of a set of strings can
# differ between interpreter sessions (string hash randomisation); a sorted
# list keeps the label -> id mapping built from it identical on every run.
label_list = sorted({tag for instance in dataset["train"]["ner_tags"] for tag in instance})
print(label_list)
num_labels = len(label_list)
num_labels

['B-LF', 'B-AC', 'O', 'I-LF']


4

In [None]:
# Two-way lookup tables between tag strings and integer ids; an id is the
# position of the tag inside label_list.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}


def encode_labels(example):
    """Replace the string tags in ``example["ner_tags"]`` with integer ids.

    The example is updated in place and also returned, which is the form
    ``Dataset.map`` is called with below.
    """
    tag_ids = []
    for tag in example["ner_tags"]:
        tag_ids.append(label2id[tag])
    example["ner_tags"] = tag_ids
    return example


# Apply the encoding to every split of the dataset.
encoded_dataset = dataset.map(encode_labels)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

### Load tokenizer and model

In [None]:
# Name of the pretrained checkpoint; the same name is used for the tokenizer
# here and for the model created in a later cell.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [None]:
# Create the token-classification model from the pretrained checkpoint.
# The label maps are passed at construction time so the model config carries
# the tag names from the start instead of being patched afterwards.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Store the id <-> tag lookup tables on the model config.
model.config.id2label = id2label
model.config.label2id = label2id

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align tag ids to sub-tokens.

    Args:
        examples: batch with a "tokens" entry (one list of words per sentence)
            and a "ner_tags" entry (one list of tag ids per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. For each sentence,
        the first sub-token of every word carries that word's tag id; every
        other position (special tokens and continuation sub-tokens) is -100.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_idx):
            # Only a position that opens a new word receives a real label.
            starts_new_word = word_idx is not None and word_idx != last_word
            row.append(word_labels[word_idx] if starts_new_word else -100)
            last_word = word_idx
        aligned.append(row)

    tokenized_inputs["labels"] = aligned
    return tokenized_inputs

In [None]:
# Run tokenization and label alignment over every split, one batch of
# examples per call.
tokenized_datasets = encoded_dataset.map(tokenize_and_align_labels, batched=True)

# Collator used by the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(p):
    """Score one evaluation pass with seqeval.

    Args:
        p: pair ``(predictions, labels)``; the arg-max over axis 2 of
            ``predictions`` is taken as the predicted label id.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", computed on tag
        strings after dropping every position whose gold label is -100.
    """
    logits, gold_ids = p
    pred_ids = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions that carry a real gold label.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    scorers = {
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
        "accuracy": accuracy_score,
    }
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers.items()}

### Configuring Training Parameters

In [None]:

# Hyper-parameters for fine-tuning. Evaluation and checkpointing both happen
# once per epoch, and metrics are reported to Weights & Biases.
# `eval_strategy` is used in place of `evaluation_strategy`, which is
# deprecated in the transformers release pinned by this notebook.
training_args = TrainingArguments(
    output_dir="./distilbert-ner",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="wandb"
)




In [None]:
# Wire the model, arguments, data and metric function together. Training uses
# the "train" split; per-epoch evaluation uses the "validation" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

### Training the Model

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.283131,0.721913,0.797297,0.757735,0.899567
2,No log,0.277364,0.725053,0.839066,0.777904,0.902398
3,No log,0.268461,0.746085,0.81941,0.78103,0.904397
4,0.230900,0.272559,0.743011,0.848894,0.792431,0.906229
5,0.230900,0.281814,0.749179,0.840295,0.792125,0.908228
6,0.230900,0.284578,0.738147,0.841523,0.786452,0.907728


TrainOutput(global_step=750, training_loss=0.19600855763753255, metrics={'train_runtime': 213.8266, 'train_samples_per_second': 56.12, 'train_steps_per_second': 3.508, 'total_flos': 483176719049088.0, 'train_loss': 0.19600855763753255, 'epoch': 6.0})

In [None]:
# wandb.finish()

0,1
eval/accuracy,▁▃▅▆██
eval/f1,▁▅▆██▇
eval/loss,▇▅▁▃▇█
eval/precision,▁▂▇▆█▅
eval/recall,▁▇▄█▇▇
eval/runtime,▁▁█▃▂▃
eval/samples_per_second,█▇▁▆▇▆
eval/steps_per_second,█▇▁▆▇▆
train/epoch,▁▂▄▅▅▇██
train/global_step,▁▂▄▅▅▇██

0,1
eval/accuracy,0.90773
eval/f1,0.78645
eval/loss,0.28458
eval/precision,0.73815
eval/recall,0.84152
eval/runtime,0.6867
eval/samples_per_second,218.432
eval/steps_per_second,14.562
total_flos,483176719049088.0
train/epoch,6.0


### Evaluating Model Performance on Test Data

In [None]:
# wandb.init()

In [None]:
# Run the fine-tuned model over the held-out test split.
print("\nEvaluating on test set...")
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])

# Most likely label id at every position.
pred_ids = np.argmax(predictions, axis=2)

# Turn ids into tag strings, keeping only positions with a real gold label
# (-100 marks positions that are ignored).
kept_pairs = [
    [(id2label[l], id2label[p]) for p, l in zip(prediction, label) if l != -100]
    for prediction, label in zip(pred_ids, labels)
]
true_labels = [[gold for gold, _ in row] for row in kept_pairs]
true_predictions = [[pred for _, pred in row] for row in kept_pairs]

print("\nClassification Report on Test Set:\n")
print(classification_report(true_labels, true_predictions))

# Overall scores on the test split.
precision = precision_score(true_labels, true_predictions)
recall = recall_score(true_labels, true_predictions)
f1 = f1_score(true_labels, true_predictions)
accuracy = accuracy_score(true_labels, true_predictions)

print(f"\nFinal Test Metrics:")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"Accuracy : {accuracy:.4f}")

# # Optionally, preview sample predictions
# for i in range(3):
#     tokens = tokenized_datasets["test"][i]["tokens"]
#     labels_ = tokenized_datasets["test"][i]["labels"]
#     preds = pred_ids[i]

#     print(f"\nSample {i+1}:")
#     for token, label_id, pred_id in zip(tokens, labels_, preds):
#         if label_id != -100:
#             print(f"{token:15}  True: {id2label[label_id]:5}  Pred: {id2label[pred_id]:5}")



Evaluating on test set...



Classification Report on Test Set:

              precision    recall  f1-score   support

          AC       0.84      0.93      0.88       797
          LF       0.68      0.81      0.74       482

   micro avg       0.78      0.89      0.83      1279
   macro avg       0.76      0.87      0.81      1279
weighted avg       0.78      0.89      0.83      1279


Final Test Metrics:
Precision: 0.7772
Recall   : 0.8866
F1-score : 0.8283
Accuracy : 0.9339


In [None]:
# wandb.finish()

0,1
test/accuracy,▁
test/f1,▁
test/loss,▁
test/precision,▁
test/recall,▁
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁

0,1
test/accuracy,0.9339
test/f1,0.82834
test/loss,0.18545
test/precision,0.77724
test/recall,0.88663
test/runtime,1.2676
test/samples_per_second,197.23
test/steps_per_second,12.623
