## 1. Import Libraries

Import all required libraries for data loading, preprocessing, model training, and evaluation.

In [1]:
import os
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import classification_report, f1_score

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


## 2. Load Labeled Data (CoNLL Format)

Load the manually labeled NER data in CoNLL format. Each token and its label are read and grouped into sentences/messages.

In [3]:
# Relative path to the labeled CoNLL sample; it is resolved against the kernel's working directory.
conll_path = "../data/labeled/ner_labeled_sample.conll"

def read_conll(path):
    """Parse a CoNLL-style file into a list of labeled sentences.

    Every non-blank line is split on whitespace: column 0 is the token and
    column 1 is its label. A blank (or whitespace-only) line ends the current
    sentence; a final sentence without a trailing blank line is still kept.

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark is
    dropped. With plain ``utf-8`` the BOM survives ``str.strip()`` and ends up
    glued to the first token of the first sentence. Files without a BOM are
    read exactly as before.

    Args:
        path: Location of the CoNLL file.

    Returns:
        A list of ``{"tokens": [...], "ner_tags": [...]}`` dicts, one per
        sentence, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than two columns.
    """
    sentences = []
    tokens, labels = [], []
    with open(path, encoding="utf-8-sig") as f:
        for line in f:
            columns = line.split()
            if not columns:
                # Sentence boundary; consecutive blank lines must not emit empty sentences.
                if tokens:
                    sentences.append({"tokens": tokens, "ner_tags": labels})
                    tokens, labels = [], []
                continue
            # Read both columns before appending so the two lists never get out of step.
            token, label = columns[0], columns[1]
            tokens.append(token)
            labels.append(label)
    # Flush the last sentence when the file does not end with a blank line.
    if tokens:
        sentences.append({"tokens": tokens, "ner_tags": labels})
    return sentences

# Parse the labeled file into per-sentence dicts, then build a Dataset from that list.
data = read_conll(conll_path)
dataset = Dataset.from_list(data)

## 3. Map Labels to IDs

Define the list of entity labels and map them to integer IDs, which are required for model training.

In [4]:
# Tag inventory: a tag's position in this list is its integer class ID.
label_list = ["O", "B-Product", "I-Product", "B-LOC", "I-LOC", "B-PRICE", "I-PRICE"]
label_to_id = dict(zip(label_list, range(len(label_list))))
id_to_label = dict(enumerate(label_list))

def encode_labels(example):
    """Add integer class IDs for an example's string tags.

    Reads ``example["ner_tags"]``, stores the corresponding IDs under
    ``example["labels"]`` and returns the same (mutated) example. A tag that is
    not a key of ``label_to_id`` raises ``KeyError``.
    """
    tag_ids = []
    for tag in example["ner_tags"]:
        tag_ids.append(label_to_id[tag])
    example["labels"] = tag_ids
    return example

# Run encode_labels over every example so each one gains an integer "labels" field.
dataset = dataset.map(encode_labels)

Map: 100%|██████████| 39/39 [00:00<00:00, 135.79 examples/s]


## 4. Tokenize the Data

Tokenize the sentences using the model's tokenizer and align the entity labels with the resulting tokens.

In [5]:
# Checkpoint name; the same variable is passed to the model loader further down.
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and align its word-level labels to sub-word tokens.

    Args:
        example: Mapping with ``"tokens"`` (list of words) and ``"labels"``
            (one integer class ID per word).

    Returns:
        The tokenizer output with an added ``"labels"`` list holding one entry
        per produced token:

        * positions whose ``word_ids()`` entry is ``None`` -> ``-100``
        * the first sub-word of a word -> that word's label ID
        * every further sub-word of the same word -> ``-100``

    Only the first piece of a word carries its label. Copying the label onto
    every piece would turn one ``B-X`` word into several consecutive ``B-X``
    tokens, which the entity-level seqeval metrics count as separate entities,
    and would weight long words more heavily during training. ``-100`` is the
    value the evaluation cell filters out and the default ignore index of
    PyTorch's cross-entropy loss.
    """
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_labels = example["labels"]
    labels = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None or word_idx == previous_word_idx:
            # No source word, or a continuation piece of the previous word: mask it out.
            labels.append(-100)
        else:
            # First piece of a new word keeps the word's label.
            labels.append(word_labels[word_idx])
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# batched=False: the function receives one example at a time, which is what tokenize_and_align_labels expects.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

Map: 100%|██████████| 39/39 [00:01<00:00, 33.56 examples/s]


## 5. Split Data into Training and Validation Sets

Split the dataset into training and validation sets to evaluate model performance during training.

In [6]:
# Hold out 20% of the examples for validation; the fixed seed keeps the split reproducible.
split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = split["train"], split["test"]

## 6. Load Pre-trained Model

Load the pre-trained XLM-RoBERTa checkpoint (`xlm-roberta-base`) with a token-classification head, passing the number of labels and the label/ID mappings defined above.

In [None]:
# NOTE(review): the import cell's output reports that no PyTorch/TensorFlow/Flax
# backend was found; this call needs one of them installed - confirm before running.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),  # one class per entry in label_list
    id2label=id_to_label,
    label2id=label_to_id,
)

## 7. Set Up Training Arguments

Configure the training parameters, such as learning rate, batch size, number of epochs, and evaluation strategy.

In [8]:
training_args = TrainingArguments(
    # Output locations and checkpoint retention.
    output_dir="./results",
    logging_dir="./logs",
    save_total_limit=2,
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation and logging cadence.
    # NOTE(review): newer transformers releases call this argument eval_strategy -
    # confirm against the installed version.
    evaluation_strategy="epoch",
    logging_steps=10,
)



## 8. Train the Model

Initialize the Hugging Face Trainer and start fine-tuning the model on the labeled NER dataset.

In [None]:
# Collator that assembles the variable-length tokenized examples into batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    # What to train and how.
    model=model,
    args=training_args,
    # Text-side helpers.
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Data splits.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start fine-tuning; evaluation cadence comes from training_args.
trainer.train()

## 9. Evaluate the Model

Evaluate the fine-tuned model on the validation set and print classification metrics such as F1-score.

In [None]:
# Run the model over the validation split and take the highest-scoring class per token.
predictions, labels, _ = trainer.predict(eval_dataset)
preds = np.argmax(predictions, axis=2)

# Convert IDs back to tag strings, skipping every position whose gold label is -100.
true_labels = [
    [id_to_label[gold_id] for gold_id in gold_row if gold_id != -100]
    for gold_row in labels
]
pred_labels = [
    [
        id_to_label[pred_id]
        for (pred_id, gold_id) in zip(pred_row, gold_row)
        if gold_id != -100
    ]
    for pred_row, gold_row in zip(preds, labels)
]

print(classification_report(true_labels, pred_labels))
print("F1-score:", f1_score(true_labels, pred_labels))

## 10. Save the Fine-tuned Model

Save the trained NER model for future inference or further fine-tuning.

In [None]:
# Save the trainer's model under ./finetuned-ner-model (relative to the working directory).
trainer.save_model("./finetuned-ner-model")