### Prepare Dataset

In [1]:
def load_pos_data(path):
    dataset = []
    with open(path, "r", encoding="utf-8") as f:
        tokens = []
        tags = []
        for line in f:
            line = line.strip()
            if not line:
                # End of a sentence
                if tokens:
                    dataset.append({"tokens": tokens, "tags": tags})
                    tokens = []
                    tags = []
            else:
                parts = line.split()
                if len(parts) == 2:
                    word, tag = parts
                    tokens.append(word)
                    tags.append(tag)
        # Catch the last sentence if no newline at EOF
        if tokens:
            dataset.append({"tokens": tokens, "tags": tags})
    return dataset


In [2]:
print(load_pos_data("sinhala_pos.txt")[0:2])

[{'tokens': ['ඊශ්රායල්', 'මිසයිල', 'ප්රහාර', 'වලින්', 'පලස්තීනුවෝ', '4', 'ක්', 'මිය', 'යති', '.'], 'tags': ['NNP', 'NNJ', 'NNC', 'CM', 'NNP', 'NUM', 'RP', 'RRPCV', 'VFM', 'FS']}, {'tokens': ['ගාසා', 'තීරයේදී', '.'], 'tags': ['NNP', 'NNP', 'FS']}]


In [3]:
from datasets import Dataset, DatasetDict
import random

# all_data = load_pos_data("sinhala_pos.txt")
# random.shuffle(all_data)

# # Optional: 80% train, 20% test split
# split_idx = int(0.8 * len(all_data))
# train_data = all_data[:split_idx]
# test_data = all_data[split_idx:]

# dataset = DatasetDict({
#     "train": Dataset.from_list(train_data),
#     "test": Dataset.from_list(test_data),
# })

data = load_pos_data("sinhala_pos.txt")

dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.2)


  from .autonotebook import tqdm as notebook_tqdm


### tag2id mapping

In [4]:
unique_tags = set(tag for example in data for tag in example["tags"])

tag2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
label_list = list(tag2id.keys())

id2tag = {i: tag for tag, i in tag2id.items()}


### Tokenize and Align Labels

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def tokenize_and_align_labels(example):
    tokenized = tokenizer(
        example["tokens"], 
        is_split_into_words=True, 
        truncation=True, 
        padding="max_length",      # Pad to max length of the model or your max_length param
        max_length=256,            # or any max_length you want (optional)
        return_tensors=None        # don't convert to tensors here; Trainer does it later
    )
    
    word_ids = tokenized.word_ids()
    labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(tag2id[example["tags"][word_idx]])
        else:
            # Label only the first sub-token
            labels.append(-100)
        previous_word_idx = word_idx
    tokenized["labels"] = labels
    return tokenized

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)


Map: 100%|██████████| 9040/9040 [00:11<00:00, 777.38 examples/s] 
Map: 100%|██████████| 2261/2261 [00:02<00:00, 1022.79 examples/s]


### Define the Model

In [6]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id,
    local_files_only=True
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Train the Model

In [8]:
from transformers import TrainingArguments, Trainer
import numpy as np
# from seqeval.metrics import classification_report, accuracy_score, f1_score
from sklearn.metrics import accuracy_score, f1_score, classification_report

# def compute_metrics(p):
#     preds = np.argmax(p.predictions, axis=2)
#     labels = p.label_ids

#     true_preds = [
#         [id2tag[p] for (p, l) in zip(pred_seq, label_seq) if l != -100]
#         for pred_seq, label_seq in zip(preds, labels)
#     ]
#     true_labels = [
#         [id2tag[l] for (p, l) in zip(pred_seq, label_seq) if l != -100]
#         for pred_seq, label_seq in zip(preds, labels)
#     ]

#     return {
#         "accuracy": accuracy_score(true_labels, true_preds),
#         "report": classification_report(true_labels, true_preds),
#     }

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    # Convert IDs to tag names, skip padding (-100)

    
    true_preds = [
        id2tag[p] for pred_seq, label_seq in zip(preds, labels)
        for p, l in zip(pred_seq, label_seq) if l != -100
    ]
    true_labels = [
        id2tag[l] for pred_seq, label_seq in zip(preds, labels)
        for p, l in zip(pred_seq, label_seq) if l != -100
    ]

    # Optional: print detailed report to console (not return)
    print(classification_report(true_labels, true_preds, digits=4))

    return {
        "accuracy": accuracy_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds, average="weighted"),
        # Avoid printing report here in returned dict — it's too long
        # Use print manually if needed:
        # print(classification_report(true_labels, true_preds))
    }

training_args = TrainingArguments(
    output_dir="./pos-xlm-r",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3046,0.305525,0.91525,0.91538


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

         ABB     0.9451    0.9663    0.9556       445
         ACV     0.0000    0.0000    0.0000         1
         AUX     0.9760    0.9661    0.9710       295
          CC     0.9757    0.9525    0.9640       716
          CM     0.9581    0.9289    0.9433       394
         DET     0.9570    0.9684    0.9626      1171
         FRW     0.0000    0.0000    0.0000         1
          FS     0.9996    1.0000    0.9998      2253
         JCV     0.6865    0.7586    0.7208       638
          JJ     0.8618    0.8221    0.8415      3542
         NCV     0.7689    0.8263    0.7966       898
         NDT     0.0000    0.0000    0.0000        16
         NIP     0.9563    0.9698    0.9630       926
         NNC     0.9173    0.9111    0.9142     12326
         NNJ     0.6959    0.8012    0.7448      1348
         NNP     0.9420    0.9238    0.9328      5134
         NUM     0.9412    0.9634    0.9521      1146
         NVB     0.6826    

TrainOutput(global_step=565, training_loss=0.3061565297894773, metrics={'train_runtime': 9348.4442, 'train_samples_per_second': 0.967, 'train_steps_per_second': 0.06, 'total_flos': 1181488456581120.0, 'train_loss': 0.3061565297894773, 'epoch': 1.0})

### Evaluate

In [9]:
trainer.evaluate()


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

         ABB     0.9451    0.9663    0.9556       445
         ACV     0.0000    0.0000    0.0000         1
         AUX     0.9760    0.9661    0.9710       295
          CC     0.9757    0.9525    0.9640       716
          CM     0.9581    0.9289    0.9433       394
         DET     0.9570    0.9684    0.9626      1171
         FRW     0.0000    0.0000    0.0000         1
          FS     0.9996    1.0000    0.9998      2253
         JCV     0.6865    0.7586    0.7208       638
          JJ     0.8618    0.8221    0.8415      3542
         NCV     0.7689    0.8263    0.7966       898
         NDT     0.0000    0.0000    0.0000        16
         NIP     0.9563    0.9698    0.9630       926
         NNC     0.9173    0.9111    0.9142     12326
         NNJ     0.6959    0.8012    0.7448      1348
         NNP     0.9420    0.9238    0.9328      5134
         NUM     0.9412    0.9634    0.9521      1146
         NVB     0.6826    

{'eval_loss': 0.30552545189857483,
 'eval_accuracy': 0.9152501312229984,
 'eval_f1': 0.91537969480362,
 'eval_runtime': 123.4862,
 'eval_samples_per_second': 18.31,
 'eval_steps_per_second': 1.15,
 'epoch': 1.0}

In [12]:
from sklearn.metrics import classification_report

# Get predictions
predictions_output = trainer.predict(tokenized_dataset["test"])
preds = predictions_output.predictions.argmax(-1)  
labels = predictions_output.label_ids             

# Flatten lists but skip -100
true_labels = []
predicted_labels = []

for pred_seq, label_seq in zip(preds, labels):
    for pred, label in zip(pred_seq, label_seq):
        if label != -100:
            true_labels.append(label)
            predicted_labels.append(pred)

# Ensure true_labels and predicted_labels are flat lists
assert isinstance(true_labels, list) and isinstance(predicted_labels, list), "Labels must be flat lists."

# Evaluate with sklearn using id2tag for label names
target_names = [id2tag[i] for i in sorted(id2tag.keys())]
print(classification_report(true_labels, predicted_labels, target_names=target_names))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

         ABB     0.9451    0.9663    0.9556       445
         ACV     0.0000    0.0000    0.0000         1
         AUX     0.9760    0.9661    0.9710       295
          CC     0.9757    0.9525    0.9640       716
          CM     0.9581    0.9289    0.9433       394
         DET     0.9570    0.9684    0.9626      1171
         FRW     0.0000    0.0000    0.0000         1
          FS     0.9996    1.0000    0.9998      2253
         JCV     0.6865    0.7586    0.7208       638
          JJ     0.8618    0.8221    0.8415      3542
         NCV     0.7689    0.8263    0.7966       898
         NDT     0.0000    0.0000    0.0000        16
         NIP     0.9563    0.9698    0.9630       926
         NNC     0.9173    0.9111    0.9142     12326
         NNJ     0.6959    0.8012    0.7448      1348
         NNP     0.9420    0.9238    0.9328      5134
         NUM     0.9412    0.9634    0.9521      1146
         NVB     0.6826    

ValueError: Number of classes, 33, does not match size of target_names, 42. Try specifying the labels parameter

In [15]:
from sklearn.metrics import classification_report

# Get predictions
predictions_output = trainer.predict(tokenized_dataset["test"])
preds = predictions_output.predictions.argmax(-1)
labels = predictions_output.label_ids

# Flatten predictions, ignoring -100
true_labels = []
predicted_labels = []

for pred_seq, label_seq in zip(preds, labels):
    for pred, label in zip(pred_seq, label_seq):
        if label != -100:
            true_labels.append(label)
            predicted_labels.append(pred)

# Dynamically detect used label IDs
used_label_ids = sorted(set(true_labels + predicted_labels))

# Match label IDs to tag names
target_names = [id2tag[i] for i in used_label_ids]

# Evaluate safely with matching label set
print(classification_report(
    true_labels,
    predicted_labels,
    labels=used_label_ids,
    target_names=target_names,
    zero_division=0  # prevents warnings for unseen labels
))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

         ABB     0.9451    0.9663    0.9556       445
         ACV     0.0000    0.0000    0.0000         1
         AUX     0.9760    0.9661    0.9710       295
          CC     0.9757    0.9525    0.9640       716
          CM     0.9581    0.9289    0.9433       394
         DET     0.9570    0.9684    0.9626      1171
         FRW     0.0000    0.0000    0.0000         1
          FS     0.9996    1.0000    0.9998      2253
         JCV     0.6865    0.7586    0.7208       638
          JJ     0.8618    0.8221    0.8415      3542
         NCV     0.7689    0.8263    0.7966       898
         NDT     0.0000    0.0000    0.0000        16
         NIP     0.9563    0.9698    0.9630       926
         NNC     0.9173    0.9111    0.9142     12326
         NNJ     0.6959    0.8012    0.7448      1348
         NNP     0.9420    0.9238    0.9328      5134
         NUM     0.9412    0.9634    0.9521      1146
         NVB     0.6826    

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Step 1: Run predictions
predictions_output = trainer.predict(tokenized_dataset["test"])
predictions = predictions_output.predictions
label_ids = predictions_output.label_ids

# Step 2: Convert predictions to labels
pred_labels = np.argmax(predictions, axis=2)

# Step 3: Convert to flat lists while skipping -100
true_labels = []
predicted_labels = []

for i in range(len(label_ids)):
    for j in range(len(label_ids[i])):
        if label_ids[i][j] != -100:
            true_labels.append(label_ids[i][j])
            predicted_labels.append(pred_labels[i][j])

# Step 4: Print classification report
id2label = {v: k for k, v in label2id.items()}
target_names = [id2label[i] for i in sorted(id2label)]

print("Classification Report:\n")
print(classification_report(true_labels, predicted_labels, target_names=target_names))

# Optional: Print confusion matrix
print("Confusion Matrix:\n")
print(confusion_matrix(true_labels, predicted_labels))



ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.

In [14]:
# Evaluate the model on the test set
eval_results = trainer.evaluate()

print("Evaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value}")

# Get predictions on the test set
predictions_output = trainer.predict(tokenized_dataset["test"])

predictions = predictions_output.predictions
label_ids = predictions_output.label_ids

# Get the most likely label for each token
pred_labels = np.argmax(predictions, axis=2)

# Assuming you have a list like id2tag = {0: "NN", 1: "VB", ...}
id2tag = {i: tag for tag, i in tag2id.items()}

for i in range(len(pred_labels)):
    true_sent = []
    pred_sent = []
    for true_id, pred_id, input_id in zip(label_ids[i], pred_labels[i], tokenized_dataset["test"]["input_ids"][i]):
        if true_id != -100:
            true_sent.append(id2tag[true_id])
            pred_sent.append(id2tag[pred_id])
    
    if true_sent != pred_sent:
        print(f"\nMISMATCHED SENTENCE {i+1}")
        print("True :", true_sent)
        print("Pred :", pred_sent)

from sklearn.metrics import confusion_matrix, classification_report

true_flat = []
pred_flat = []

for i in range(len(label_ids)):
    for true_id, pred_id in zip(label_ids[i], pred_labels[i]):
        if true_id != -100:
            true_flat.append(true_id)
            pred_flat.append(pred_id)

# Confusion matrix
cm = confusion_matrix(true_flat, pred_flat)
print("Confusion Matrix:")
print(cm)

# Classification report
print("\nClassification Report:")
print(classification_report(true_flat, pred_flat, target_names=[id2tag[i] for i in range(len(id2tag))]))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

         ABB     0.9451    0.9663    0.9556       445
         ACV     0.0000    0.0000    0.0000         1
         AUX     0.9760    0.9661    0.9710       295
          CC     0.9757    0.9525    0.9640       716
          CM     0.9581    0.9289    0.9433       394
         DET     0.9570    0.9684    0.9626      1171
         FRW     0.0000    0.0000    0.0000         1
          FS     0.9996    1.0000    0.9998      2253
         JCV     0.6865    0.7586    0.7208       638
          JJ     0.8618    0.8221    0.8415      3542
         NCV     0.7689    0.8263    0.7966       898
         NDT     0.0000    0.0000    0.0000        16
         NIP     0.9563    0.9698    0.9630       926
         NNC     0.9173    0.9111    0.9142     12326
         NNJ     0.6959    0.8012    0.7448      1348
         NNP     0.9420    0.9238    0.9328      5134
         NUM     0.9412    0.9634    0.9521      1146
         NVB     0.6826    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

         ABB     0.9451    0.9663    0.9556       445
         ACV     0.0000    0.0000    0.0000         1
         AUX     0.9760    0.9661    0.9710       295
          CC     0.9757    0.9525    0.9640       716
          CM     0.9581    0.9289    0.9433       394
         DET     0.9570    0.9684    0.9626      1171
         FRW     0.0000    0.0000    0.0000         1
          FS     0.9996    1.0000    0.9998      2253
         JCV     0.6865    0.7586    0.7208       638
          JJ     0.8618    0.8221    0.8415      3542
         NCV     0.7689    0.8263    0.7966       898
         NDT     0.0000    0.0000    0.0000        16
         NIP     0.9563    0.9698    0.9630       926
         NNC     0.9173    0.9111    0.9142     12326
         NNJ     0.6959    0.8012    0.7448      1348
         NNP     0.9420    0.9238    0.9328      5134
         NUM     0.9412    0.9634    0.9521      1146
         NVB     0.6826    

ValueError: Number of classes, 33, does not match size of target_names, 42. Try specifying the labels parameter

### Save and Use the Model

In [None]:
model.save_pretrained("sinhala-pos-xlm-r")
tokenizer.save_pretrained("sinhala-pos-xlm-r")


In [None]:
from transformers import pipeline

pos_pipeline = pipeline("token-classification", model="sinhala-pos-xlm-r", tokenizer="sinhala-pos-xlm-r", aggregation_strategy="simple")

sentence = "මම පාසැල යමි"
tokens = sentence.split()  # Assuming simple whitespace tokenization
print(pos_pipeline(tokens))


In [None]:
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")


PyTorch version: 2.7.1+cu118
CUDA available: True
CUDA device count: 1
GPU name: NVIDIA GeForce GTX 1050 Ti
