## **Problem 7: Transformers**

# Question 1:

# Part 1.

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
from seqeval.metrics import classification_report, f1_score, accuracy_score
import torch
import numpy as np
import random
import warnings
warnings.filterwarnings("ignore")

def load_clean_ner_data(tokens_path, labels_path):
    """Read parallel token/label files into aligned sentence and tag lists.

    Line ``i`` of ``tokens_path`` is paired with line ``i`` of
    ``labels_path`` and both are split on whitespace. A pair is kept only
    when it is non-empty and both sides contain the same number of items;
    every other pair is silently skipped. If the files differ in length,
    the surplus lines of the longer file are ignored.

    Args:
        tokens_path: Path to a UTF-8 text file with one sentence per line.
        labels_path: Path to a UTF-8 text file with one tag sequence per line.

    Returns:
        A ``(sentences, labels)`` tuple of two equally long lists. Each
        sentence is a list of token strings; each label sequence is the
        matching list of tags, lower-cased and with ``_`` replaced by ``-``.
    """
    with open(tokens_path, "r", encoding="utf-8") as token_file, \
            open(labels_path, "r", encoding="utf-8") as label_file:
        token_lines = token_file.readlines()
        label_lines = label_file.readlines()

    sentences = []
    labels = []
    for raw_tokens, raw_tags in zip(token_lines, label_lines):
        # str.split() with no arguments already discards surrounding
        # whitespace, so the pieces need no further stripping.
        words = raw_tokens.split()
        tags = raw_tags.split()

        # Skip blank lines and pairs whose token/tag counts disagree.
        if not words or len(words) != len(tags):
            continue

        sentences.append(words)
        labels.append([tag.lower().replace("_", "-") for tag in tags])

    return sentences, labels

# Load the ARMAN and PEYMA corpora and pool them into one collection.
arman_sent, arman_lab = load_clean_ner_data("arman-tokens.txt", "arman-labels.txt")
peyma_sent, peyma_lab = load_clean_ner_data("peyma-tokens.txt", "peyma-labels.txt")
all_sentences = arman_sent + peyma_sent
all_labels = arman_lab + peyma_lab

# Shuffle the (sentence, tags) pairs with a fixed seed so the split is
# reproducible, then take the first 80% for training and the rest for testing.
random.seed(42)
combined = list(zip(all_sentences, all_labels))
random.shuffle(combined)
split_idx = int(0.8 * len(combined))
train_pairs, test_pairs = combined[:split_idx], combined[split_idx:]
train_sentences, train_labels = zip(*train_pairs)
test_sentences, test_labels = zip(*test_pairs)

# Build the tag vocabulary. Train and test together hold exactly the
# sequences in all_labels, so the set can be collected from it directly.
# NOTE(review): the printed label list contains both "per" and "pers";
# presumably the two corpora name the person class differently — confirm
# whether these should be merged into one class.
all_labels_unique = sorted({tag for seq in all_labels for tag in seq})
label2id = dict(zip(all_labels_unique, range(len(all_labels_unique))))
id2label = dict(enumerate(all_labels_unique))
print("Total Labels:", all_labels_unique)

# Load the pre-trained Persian BERT checkpoint and its tokenizer. The
# token-classification head is sized to the tag vocabulary, and both label
# maps are stored in the model config.
model_name = "HooshvareLab/bert-fa-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label
# Use the GPU when one is present, otherwise stay on the CPU: an
# unconditional .to("cuda") raises on machines without a CUDA device.
).to("cuda" if torch.cuda.is_available() else "cpu")

def tokenize_and_align_labels(example):
    """Tokenize one pre-split sentence and align its tags to the sub-tokens.

    Args:
        example: Mapping with a ``"tokens"`` list of words and a parallel
            ``"ner_tags"`` list of tag strings.

    Returns:
        The tokenizer output, padded/truncated to 128 positions, with an
        added ``"labels"`` list. The first sub-token of each word carries
        the word's label id (looked up in ``label2id``); positions with no
        source word and the remaining sub-tokens of a word carry ``-100``.
    """
    encoding = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    word_tags = example["ner_tags"]
    aligned = []
    last_word = None
    for word in encoding.word_ids():
        # Only the first piece of a real word receives that word's label.
        starts_new_word = word is not None and word != last_word
        aligned.append(label2id[word_tags[word]] if starts_new_word else -100)
        last_word = word

    encoding["labels"] = aligned
    return encoding

def _build_split(sentences, tags):
    """Wrap one split as a Hugging Face dataset and encode every example."""
    raw = Dataset.from_dict({"tokens": sentences, "ner_tags": tags})
    # The raw text columns are dropped once the encoded features exist.
    return raw.map(tokenize_and_align_labels, remove_columns=["tokens", "ner_tags"])


# Encode both splits for the model.
train_dataset = _build_split(train_sentences, train_labels)
test_dataset = _build_split(test_sentences, test_labels)

# Fine-tuning configuration: 3 epochs, batches of 8 per device, and the
# training loss logged every 500 steps.
training_args = TrainingArguments(
    output_dir="./bert-ner-fa",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=500,
    logging_dir="./logs",
)

# Assemble the trainer and run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)
trainer.train()

# Predict on the test set and take the highest-scoring class at each position.
predictions, labels, _ = trainer.predict(test_dataset)
pred_tags = np.argmax(predictions, axis=2)


def _to_seqeval_tag(tag):
    """Return ``tag`` with its IOB prefix upper-cased.

    ``"b-per"`` becomes ``"B-per"`` and ``"o"`` becomes ``"O"``; the entity
    type after the first hyphen is left untouched. Tags whose prefix is
    already upper-case are returned unchanged.
    """
    prefix, sep, entity = tag.partition("-")
    return prefix.upper() + sep + entity


# seqeval only recognises upper-case "B"/"I"/"O" prefixes. With lower-case
# tags it scores every "o" token as an entity of type "_" and ignores "b-"
# boundaries, which distorts the entity-level precision/recall/F1. Convert
# the tags before scoring. Positions labelled -100 (padding, special tokens
# and non-first sub-tokens) are excluded from both sequences.
true_labels = [
    [_to_seqeval_tag(id2label[l]) for l in label if l != -100]
    for label in labels
]
pred_labels = [
    [_to_seqeval_tag(id2label[p]) for p, l in zip(pred_seq, label_seq) if l != -100]
    for pred_seq, label_seq in zip(pred_tags, labels)
]

# Compute evaluation metrics, reported as percentages.
accuracy = 100 * accuracy_score(true_labels, pred_labels)
f1 = 100 * f1_score(true_labels, pred_labels, average="weighted")

print(f"\nAccuracy: {accuracy:.4f}%")
print(f"F1_Score: {f1:.4f}%")
print(classification_report(true_labels, pred_labels))

Total Labels: ['b-dat', 'b-event', 'b-fac', 'b-loc', 'b-mon', 'b-org', 'b-pct', 'b-per', 'b-pers', 'b-pro', 'b-tim', 'i-dat', 'i-event', 'i-fac', 'i-loc', 'i-mon', 'i-org', 'i-pct', 'i-per', 'i-pers', 'i-pro', 'i-tim', 'o']


Some weights of BertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 26417/26417 [00:13<00:00, 1890.10 examples/s]
Map: 100%|██████████| 6605/6605 [00:03<00:00, 2080.22 examples/s]


Step,Training Loss
500,0.1967
1000,0.1201
1500,0.0898
2000,0.0796
2500,0.0728
3000,0.0631
3500,0.0471
4000,0.0305
4500,0.0289
5000,0.0264



Accuracy: 99.2649%
F1_Score: 94.4127%
              precision    recall  f1-score   support

           _       0.94      0.95      0.94     10390
         dat       0.80      0.81      0.81       357
       event       0.93      0.96      0.94       396
         fac       0.96      0.99      0.97       281
         loc       0.96      0.96      0.96      3238
         mon       0.94      0.92      0.93       112
         org       0.95      0.96      0.95      3939
         pct       0.87      0.85      0.86        71
         per       0.93      0.86      0.89       925
        pers       0.94      0.99      0.97      1855
         pro       0.94      0.99      0.96       417
         tim       0.58      0.77      0.66        53

   micro avg       0.94      0.95      0.94     22034
   macro avg       0.89      0.92      0.90     22034
weighted avg       0.94      0.95      0.94     22034

