### Prepare Dataset

In [1]:
def load_pos_data(path):
    """Parse a one-token-per-line POS corpus into sentence records.

    Each non-blank line is expected to hold exactly two whitespace-separated
    fields, ``word tag``. A blank (or whitespace-only) line closes the current
    sentence. Lines with any other number of fields are ignored.

    Args:
        path: Location of the UTF-8 encoded corpus file.

    Returns:
        A list of ``{"tokens": [...], "tags": [...]}`` dicts, one per sentence,
        in file order. Sentences without any valid line are not emitted.
    """
    sentences = []
    words, labels = [], []

    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.split()

            if not fields:
                # Sentence boundary: flush whatever has been collected so far.
                if words:
                    sentences.append({"tokens": words, "tags": labels})
                    words, labels = [], []
                continue

            if len(fields) != 2:
                # Not a "word tag" pair; skip the line entirely.
                continue

            words.append(fields[0])
            labels.append(fields[1])

    # The file may end without a trailing blank line.
    if words:
        sentences.append({"tokens": words, "tags": labels})

    return sentences


In [2]:
# Sanity check: parse the corpus and show the first two sentence records.
print(load_pos_data("sinhala_pos.txt")[0:2])

[{'tokens': ['ඊශ්රායල්', 'මිසයිල', 'ප්රහාර', 'වලින්', 'පලස්තීනුවෝ', '4', 'ක්', 'මිය', 'යති', '.'], 'tags': ['NNP', 'NNJ', 'NNC', 'CM', 'NNP', 'NUM', 'RP', 'RRPCV', 'VFM', 'FS']}, {'tokens': ['ගාසා', 'තීරයේදී', '.'], 'tags': ['NNP', 'NNP', 'FS']}]


In [3]:
from datasets import Dataset, DatasetDict
import random

# Parse the corpus once; `data` is reused below to build the tag vocabulary.
data = load_pos_data("sinhala_pos.txt")

# Hold out 20% of the sentences for evaluation. The split is seeded so that a
# fresh "Restart & Run All" produces the same train/test partition, which keeps
# the metrics reported further down comparable between runs.
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.2, seed=42)


  from .autonotebook import tqdm as notebook_tqdm


### Tag-to-ID Mapping

In [4]:
# Every distinct tag that occurs anywhere in the corpus (train and test alike).
unique_tags = {tag for example in data for tag in example["tags"]}

# Ids follow the alphabetical order of the tag names, so the mapping is stable
# for a given corpus.
tag2id = {name: index for index, name in enumerate(sorted(unique_tags))}
label_list = list(tag2id)

# Inverse lookup: id -> tag name.
id2tag = {index: name for name, index in tag2id.items()}


### Tokenize and Align Labels

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the "xlm-roberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def tokenize_and_align_labels(example):
    """Tokenize one pre-split sentence and attach a label id to each sub-token.

    Args:
        example: A record with "tokens" (list of words) and "tags" (one tag
            name per word).

    Returns:
        The tokenizer output with an added "labels" list, one entry per
        sub-token. The first sub-token of each word carries that word's tag id
        (looked up in ``tag2id``); every other position is set to -100, the
        value that ``compute_metrics`` filters out.
    """
    encoding = tokenizer(
        example["tokens"],
        is_split_into_words=True,   # input is already a list of words
        truncation=True,
        padding="max_length",       # pad up to max_length
        max_length=256,
        return_tensors=None,        # keep plain Python lists at this stage
    )

    aligned = []
    last_word = None
    for current_word in encoding.word_ids():
        # A word id of None, or one equal to the previous position's id
        # (i.e. a continuation sub-token), gets the ignore value.
        if current_word is None or current_word == last_word:
            aligned.append(-100)
        else:
            aligned.append(tag2id[example["tags"][current_word]])
        last_word = current_word

    encoding["labels"] = aligned
    return encoding

# Tokenize and label-align every example; batched=False means the function
# receives one example per call.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)


Map: 100%|██████████| 9040/9040 [00:07<00:00, 1172.91 examples/s]
Map: 100%|██████████| 2261/2261 [00:01<00:00, 1363.66 examples/s]


### Define the Model

In [6]:
from transformers import AutoModelForTokenClassification

# Token-classification model built from the "xlm-roberta-base" checkpoint with
# one output class per tag in tag2id; id2label/label2id pass the tag names to
# the model.
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id,
    # NOTE(review): presumably this restricts loading to files already on disk,
    # so the cell would fail on a machine where the checkpoint has not been
    # downloaded yet. The tokenizer load above does not set this flag — confirm
    # whether the difference is intentional.
    local_files_only=True
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Train the Model

In [7]:
from transformers import TrainingArguments, Trainer
import numpy as np
# from seqeval.metrics import classification_report, accuracy_score, f1_score
from sklearn.metrics import accuracy_score, f1_score, classification_report

def compute_metrics(p):
    """Compute token-level accuracy and weighted F1 for an evaluation pass.

    Args:
        p: Evaluation output exposing ``predictions`` (scores whose class axis
            is axis 2) and ``label_ids`` (gold tag ids, where -100 marks
            positions that must not be scored).

    Returns:
        dict with "accuracy" and "f1" (F1 averaged over tags, weighted by
        support).

    Side effects:
        Prints a per-tag classification report to stdout on every call.
    """
    pred_ids = np.argmax(p.predictions, axis=2)
    gold_ids = np.asarray(p.label_ids)

    # Keep only positions that carry a real label. Boolean indexing flattens
    # both arrays in the same row-major order, so gold and predicted ids stay
    # aligned position by position.
    keep = gold_ids != -100
    true_labels = [id2tag[int(tag_id)] for tag_id in gold_ids[keep]]
    true_preds = [id2tag[int(tag_id)] for tag_id in pred_ids[keep]]

    # zero_division=0 reports 0.0 for tags that were never predicted (or never
    # occur in the gold labels) without emitting an UndefinedMetricWarning for
    # each of them; the numeric result is the same as with the default.
    print(classification_report(true_labels, true_preds, digits=4, zero_division=0))

    # The full report is printed rather than returned: the returned dict is
    # kept to two scalar metrics.
    return {
        "accuracy": accuracy_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds, average="weighted", zero_division=0),
    }

# Hyper-parameters for a single fine-tuning epoch, with an evaluation pass at
# the end of each epoch.
training_args = TrainingArguments(
    output_dir="./pos-xlm-r",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement and avoids the FutureWarning raised at construction time.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7057,0.343603,0.905514,0.904922


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

         ABB     0.9433    0.9765    0.9596       426
         AUX     0.9228    0.9567    0.9394       300
          CC     0.9705    0.9578    0.9641       687
          CM     0.9429    0.9516    0.9472       434
         DET     0.9556    0.9597    0.9576      1165
         FRW     0.0000    0.0000    0.0000         1
          FS     0.9991    1.0000    0.9996      2252
         JCV     0.6689    0.6527    0.6607       622
          JJ     0.8517    0.8139    0.8324      3557
         NCV     0.7161    0.8026    0.7569       927
         NDT     0.0000    0.0000    0.0000        14
         NIP     0.9728    0.9603    0.9665       857
          NN     0.0000    0.0000    0.0000         1
         NNC     0.9120    0.8966    0.9042     12213
         NNJ     0.6509    0.7770    0.7084      1296
         NNP     0.9302    0.9212    0.9257      5063
         NNp     0.0000    0.0000    0.0000         1
         NUM     0.9392    

TrainOutput(global_step=565, training_loss=0.6697956051446695, metrics={'train_runtime': 4894.5893, 'train_samples_per_second': 1.847, 'train_steps_per_second': 0.115, 'total_flos': 1181488456581120.0, 'train_loss': 0.6697956051446695, 'epoch': 1.0})

### Evaluate

In [8]:
# Evaluate on the Trainer's eval_dataset (the "test" split). compute_metrics
# prints the per-tag report; the returned metrics dict is shown as cell output.
trainer.evaluate()


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

         ABB     0.9433    0.9765    0.9596       426
         AUX     0.9228    0.9567    0.9394       300
          CC     0.9705    0.9578    0.9641       687
          CM     0.9429    0.9516    0.9472       434
         DET     0.9556    0.9597    0.9576      1165
         FRW     0.0000    0.0000    0.0000         1
          FS     0.9991    1.0000    0.9996      2252
         JCV     0.6689    0.6527    0.6607       622
          JJ     0.8517    0.8139    0.8324      3557
         NCV     0.7161    0.8026    0.7569       927
         NDT     0.0000    0.0000    0.0000        14
         NIP     0.9728    0.9603    0.9665       857
          NN     0.0000    0.0000    0.0000         1
         NNC     0.9120    0.8966    0.9042     12213
         NNJ     0.6509    0.7770    0.7084      1296
         NNP     0.9302    0.9212    0.9257      5063
         NNp     0.0000    0.0000    0.0000         1
         NUM     0.9392    

{'eval_loss': 0.3436029255390167,
 'eval_accuracy': 0.90551371868123,
 'eval_f1': 0.9049217239183525,
 'eval_runtime': 112.1416,
 'eval_samples_per_second': 20.162,
 'eval_steps_per_second': 1.266,
 'epoch': 1.0}

In [9]:
from sklearn.metrics import classification_report

# Run the model over the test split and take the highest-scoring tag id
# at every position.
predictions_output = trainer.predict(tokenized_dataset["test"])
labels = predictions_output.label_ids
preds = predictions_output.predictions.argmax(-1)

# Flatten to parallel lists of gold / predicted ids, dropping every position
# whose gold label is the ignore value -100.
scored_pairs = [
    (label, pred)
    for pred_seq, label_seq in zip(preds, labels)
    for pred, label in zip(pred_seq, label_seq)
    if label != -100
]
true_labels = [label for label, _ in scored_pairs]
predicted_labels = [pred for _, pred in scored_pairs]

# Restrict the report to ids that actually occur, and name them via id2tag so
# `labels` and `target_names` line up one-to-one.
used_label_ids = sorted(set(true_labels).union(predicted_labels))
target_names = [id2tag[label_id] for label_id in used_label_ids]

print(
    classification_report(
        true_labels,
        predicted_labels,
        labels=used_label_ids,
        target_names=target_names,
        zero_division=0,  # report 0.0 instead of warning on undefined scores
    )
)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

         ABB     0.9433    0.9765    0.9596       426
         AUX     0.9228    0.9567    0.9394       300
          CC     0.9705    0.9578    0.9641       687
          CM     0.9429    0.9516    0.9472       434
         DET     0.9556    0.9597    0.9576      1165
         FRW     0.0000    0.0000    0.0000         1
          FS     0.9991    1.0000    0.9996      2252
         JCV     0.6689    0.6527    0.6607       622
          JJ     0.8517    0.8139    0.8324      3557
         NCV     0.7161    0.8026    0.7569       927
         NDT     0.0000    0.0000    0.0000        14
         NIP     0.9728    0.9603    0.9665       857
          NN     0.0000    0.0000    0.0000         1
         NNC     0.9120    0.8966    0.9042     12213
         NNJ     0.6509    0.7770    0.7084      1296
         NNP     0.9302    0.9212    0.9257      5063
         NNp     0.0000    0.0000    0.0000         1
         NUM     0.9392    

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Step 1: Overall metrics as reported by the Trainer
eval_results = trainer.evaluate()
print("🔍 Evaluation Metrics from trainer.evaluate():")
for metric_name, metric_value in eval_results.items():
    print(f"{metric_name}: {metric_value:.4f}")

# Step 2: Raw predictions and gold label ids for the test split
predictions_output = trainer.predict(tokenized_dataset["test"])
predictions = predictions_output.predictions
label_ids = predictions_output.label_ids

# Step 3: Highest-scoring tag id at each position
pred_labels = np.argmax(predictions, axis=2)

# Step 4: Convert ids to tag names, sentence by sentence.
# `true_tags` / `predicted_tags` are the flattened, aligned lists over all
# sentences; `mismatches` keeps (sentence index, gold tags, predicted tags)
# for every sentence with at least one wrong tag.
true_tags = []
predicted_tags = []
mismatches = []

for sent_idx, (gold_row, pred_row) in enumerate(zip(label_ids, pred_labels)):
    # Only positions with a real gold label (not -100) are scored.
    scored = [
        (id2tag[gold_id], id2tag[guess_id])
        for gold_id, guess_id in zip(gold_row, pred_row)
        if gold_id != -100
    ]
    true_sent = [gold for gold, _ in scored]
    pred_sent = [guess for _, guess in scored]

    true_tags.extend(true_sent)
    predicted_tags.extend(pred_sent)

    if true_sent != pred_sent:
        mismatches.append((sent_idx, true_sent, pred_sent))

# Step 5: Per-tag report over every tag seen in gold or predicted output
unique_tags = sorted(set(true_tags).union(predicted_tags))
print("\n📊 Classification Report:")
print(classification_report(true_tags, predicted_tags, labels=unique_tags, zero_division=0))

# Step 6: Confusion matrix; row/column k corresponds to unique_tags[k]
print("\n📉 Confusion Matrix (label indices):")
tag2id_filtered = {name: position for position, name in enumerate(unique_tags)}
y_true_ids = [tag2id_filtered[name] for name in true_tags]
y_pred_ids = [tag2id_filtered[name] for name in predicted_tags]
cm = confusion_matrix(y_true_ids, y_pred_ids)
print(cm)

# Step 7: Show the first few sentences that contain a tagging error
print(f"\n🔎 Total mismatches found: {len(mismatches)}")
for idx, true_sent, pred_sent in mismatches[:5]:  # first five only, for readability
    print(f"\nMISMATCHED SENTENCE {idx + 1}")
    print("True Tags :", true_sent)
    print("Pred Tags :", pred_sent)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

         ABB     0.9433    0.9765    0.9596       426
         AUX     0.9228    0.9567    0.9394       300
          CC     0.9705    0.9578    0.9641       687
          CM     0.9429    0.9516    0.9472       434
         DET     0.9556    0.9597    0.9576      1165
         FRW     0.0000    0.0000    0.0000         1
          FS     0.9991    1.0000    0.9996      2252
         JCV     0.6689    0.6527    0.6607       622
          JJ     0.8517    0.8139    0.8324      3557
         NCV     0.7161    0.8026    0.7569       927
         NDT     0.0000    0.0000    0.0000        14
         NIP     0.9728    0.9603    0.9665       857
          NN     0.0000    0.0000    0.0000         1
         NNC     0.9120    0.8966    0.9042     12213
         NNJ     0.6509    0.7770    0.7084      1296
         NNP     0.9302    0.9212    0.9257      5063
         NNp     0.0000    0.0000    0.0000         1
         NUM     0.9392    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

         ABB     0.9433    0.9765    0.9596       426
         AUX     0.9228    0.9567    0.9394       300
          CC     0.9705    0.9578    0.9641       687
          CM     0.9429    0.9516    0.9472       434
         DET     0.9556    0.9597    0.9576      1165
         FRW     0.0000    0.0000    0.0000         1
          FS     0.9991    1.0000    0.9996      2252
         JCV     0.6689    0.6527    0.6607       622
          JJ     0.8517    0.8139    0.8324      3557
         NCV     0.7161    0.8026    0.7569       927
         NDT     0.0000    0.0000    0.0000        14
         NIP     0.9728    0.9603    0.9665       857
          NN     0.0000    0.0000    0.0000         1
         NNC     0.9120    0.8966    0.9042     12213
         NNJ     0.6509    0.7770    0.7084      1296
         NNP     0.9302    0.9212    0.9257      5063
         NNp     0.0000    0.0000    0.0000         1
         NUM     0.9392    

In [None]:
from collections import Counter
import pandas as pd
from sklearn.metrics import classification_report

# --- Step 1: Gold tag of every labelled position in each split ---
train_tags = []
for sent in tokenized_dataset["train"]["labels"]:
    train_tags.extend(id2tag[tag_id] for tag_id in sent if tag_id != -100)

test_tags = []
for sent in tokenized_dataset["test"]["labels"]:
    test_tags.extend(id2tag[tag_id] for tag_id in sent if tag_id != -100)

train_counter, test_counter = Counter(train_tags), Counter(test_tags)
train_tag_set, test_tag_set = set(train_counter), set(test_counter)

both_tags = sorted(train_tag_set.intersection(test_tag_set))
train_only_tags = sorted(train_tag_set.difference(test_tag_set))
test_only_tags = sorted(test_tag_set.difference(train_tag_set))

# --- Step 2: Per-tag scores as a dict ---
# Relies on `true_tags` / `predicted_tags` built by the evaluation cell above.
report = classification_report(true_tags, predicted_tags, output_dict=True, zero_division=0)


def get_metric(tag, metric):
    """Return `metric` for `tag` from the report, or 0.0 if the tag is absent."""
    if tag not in report:
        return 0.0
    return report[tag][metric]


def _tag_row(tag, origin, scored=True):
    """Build one table row; `scored=False` fills the three metrics with 0.0."""
    n_train = train_counter[tag]   # Counter yields 0 for tags it never saw
    n_test = test_counter[tag]
    return {
        "Tag": tag,
        "Train Count": n_train,
        "Test Count": n_test,
        "Total Count": n_train + n_test,
        "Tag Origin": origin,
        "Precision": get_metric(tag, "precision") if scored else 0.0,
        "Recall": get_metric(tag, "recall") if scored else 0.0,
        "F1-Score": get_metric(tag, "f1-score") if scored else 0.0,
    }


# --- Step 3: One row per tag, grouped by where the tag occurs ---
# Tags seen only in training have no gold instances in the test split, so
# their metrics are fixed at 0.0 instead of being read from the report.
rows = (
    [_tag_row(tag, "Both") for tag in both_tags]
    + [_tag_row(tag, "Train Only", scored=False) for tag in train_only_tags]
    + [_tag_row(tag, "Test Only") for tag in test_only_tags]
)

# --- Step 4: Build the table; within each origin group, best precision first,
# ties broken by larger total count ---
df = (
    pd.DataFrame(rows)
    .sort_values(
        by=["Tag Origin", "Precision", "Total Count"],
        ascending=[True, False, False],
    )
    .reset_index(drop=True)
)

# --- Optional: Display or export ---
pd.set_option("display.max_rows", None)  # show every row when printing
print(df)

# Optional export
#df.to_csv("tag_analysis_num-of-ep-1.csv", index=False)


      Tag  Train Count  Test Count  Total Count  Tag Origin  Precision  \
0      FS         9010        2252        11262        Both   0.999113   
1    PUNC         7055        1838         8893        Both   0.993499   
2     PRP         5510        1376         6886        Both   0.974474   
3     NIP         3489         857         4346        Both   0.972813   
4      CC         2990         687         3677        Both   0.970501   
5     DET         4656        1165         5821        Both   0.955556   
6     ABB         1608         426         2034        Both   0.943311   
7      CM         1796         434         2230        Both   0.942922   
8     NUM         4280        1166         5446        Both   0.939167   
9      RP         4859        1275         6134        Both   0.937931   
10    NNP        20054        5063        25117        Both   0.930195   
11   POST        14754        3628        18382        Both   0.926974   
12    AUX         1260         300    

### Save and Use the Model

In [None]:
# Write the fine-tuned model and its tokenizer to the same directory so the
# pair can be reloaded together by name ("sinhala-pos-xlm-r").
model.save_pretrained("sinhala-pos-xlm-r")
tokenizer.save_pretrained("sinhala-pos-xlm-r")


In [None]:
from transformers import pipeline

import torch

# The pipeline object is kept as the loader for the saved model/tokenizer pair.
pos_pipeline = pipeline("token-classification", model="sinhala-pos-xlm-r", tokenizer="sinhala-pos-xlm-r", aggregation_strategy="simple")


def tag_words(words):
    """Tag one pre-split sentence and return a (word, tag) pair per word.

    Calling the pipeline directly with a list of words treats the list as a
    batch of separate texts, so every word would be tagged as its own one-word
    sentence with no context. Here the whole sentence is encoded in one pass,
    and each word takes the prediction of its first sub-token, mirroring how
    labels were assigned in ``tokenize_and_align_labels``.

    Args:
        words: The sentence as a list of word strings.

    Returns:
        List of ``(word, tag_name)`` tuples in sentence order. Words cut off by
        truncation are omitted. An empty input yields an empty list.
    """
    if not words:
        return []

    encoding = pos_pipeline.tokenizer(
        words, is_split_into_words=True, truncation=True, return_tensors="pt"
    )
    # Read the word alignment before the tensors are moved to the model device.
    word_ids = encoding.word_ids(batch_index=0)

    with torch.no_grad():
        logits = pos_pipeline.model(**encoding.to(pos_pipeline.model.device)).logits[0]
    predicted_ids = logits.argmax(dim=-1).tolist()

    id2label = pos_pipeline.model.config.id2label
    tagged = []
    previous_word = None
    for position, word_index in enumerate(word_ids):
        # Skip special tokens (None) and continuation sub-tokens.
        if word_index is not None and word_index != previous_word:
            tagged.append((words[word_index], id2label[predicted_ids[position]]))
        previous_word = word_index
    return tagged


sentence = "මම පාසැල යමි"
tokens = sentence.split()  # Assuming simple whitespace tokenization
print(tag_words(tokens))


In [None]:
import torch

# Report the installed PyTorch build and what CUDA hardware it can see.
cuda_ok = torch.cuda.is_available()
for caption, value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA available:", cuda_ok),
    ("CUDA device count:", torch.cuda.device_count()),
    # Only query the device name when a CUDA device is actually usable.
    ("GPU name:", torch.cuda.get_device_name(0) if cuda_ok else "No GPU detected"),
):
    print(caption, value)


PyTorch version: 2.7.1+cu118
CUDA available: True
CUDA device count: 1
GPU name: NVIDIA GeForce GTX 1050 Ti
