### Prepare Dataset

In [8]:
def load_pos_data(path):
    dataset = []
    with open(path, "r", encoding="utf-8") as f:
        tokens = []
        tags = []
        for line in f:
            line = line.strip()
            if not line:
                # End of a sentence
                if tokens:
                    dataset.append({"tokens": tokens, "tags": tags})
                    tokens = []
                    tags = []
            else:
                parts = line.split()
                if len(parts) == 2:
                    word, tag = parts
                    tokens.append(word)
                    tags.append(tag)
        # Catch the last sentence if no newline at EOF
        if tokens:
            dataset.append({"tokens": tokens, "tags": tags})
    return dataset


In [9]:
print(load_pos_data("sinhala_pos.txt")[0:2])

[{'tokens': ['ඊශ්රායල්', 'මිසයිල', 'ප්රහාර', 'වලින්', 'පලස්තීනුවෝ', '4', 'ක්', 'මිය', 'යති', '.'], 'tags': ['NNP', 'NNJ', 'NNC', 'CM', 'NNP', 'NUM', 'RP', 'RRPCV', 'VFM', 'FS']}, {'tokens': ['ගාසා', 'තීරයේදී', '.'], 'tags': ['NNP', 'NNP', 'FS']}]


In [10]:
from datasets import Dataset, DatasetDict
import random

# all_data = load_pos_data("sinhala_pos.txt")
# random.shuffle(all_data)

# # Optional: 80% train, 20% test split
# split_idx = int(0.8 * len(all_data))
# train_data = all_data[:split_idx]
# test_data = all_data[split_idx:]

# dataset = DatasetDict({
#     "train": Dataset.from_list(train_data),
#     "test": Dataset.from_list(test_data),
# })

data = load_pos_data("sinhala_pos.txt")

dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.2)


### tag2id mapping

In [11]:
unique_tags = set(tag for example in data for tag in example["tags"])
tag2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
id2tag = {i: tag for tag, i in tag2id.items()}


### Tokenize and Align Labels

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def tokenize_and_align_labels(example):
    tokenized = tokenizer(
        example["tokens"], 
        is_split_into_words=True, 
        truncation=True, 
        padding="max_length",      # Pad to max length of the model or your max_length param
        max_length=128,            # or any max_length you want (optional)
        return_tensors=None        # don't convert to tensors here; Trainer does it later
    )
    
    word_ids = tokenized.word_ids()
    labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(tag2id[example["tags"][word_idx]])
        else:
            # Label only the first sub-token
            labels.append(-100)
        previous_word_idx = word_idx
    tokenized["labels"] = labels
    return tokenized

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)


Map: 100%|██████████| 9040/9040 [00:04<00:00, 2227.21 examples/s]
Map: 100%|██████████| 2261/2261 [00:01<00:00, 2159.21 examples/s]


### Define the Model

In [13]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id,
    local_files_only=True
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Train the Model

In [14]:
from transformers import TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, accuracy_score

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    true_preds = [
        [id2tag[p] for (p, l) in zip(pred_seq, label_seq) if l != -100]
        for pred_seq, label_seq in zip(preds, labels)
    ]
    true_labels = [
        [id2tag[l] for (p, l) in zip(pred_seq, label_seq) if l != -100]
        for pred_seq, label_seq in zip(preds, labels)
    ]

    return {
        "accuracy": accuracy_score(true_labels, true_preds),
        "report": classification_report(true_labels, true_preds),
    }

training_args = TrainingArguments(
    output_dir="./pos-xlm-r",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Report
1,0.3541,0.315354,0.913286,precision recall f1-score support  B 0.84 0.82 0.83 457  BB 0.96 0.96 0.96 365  BE 0.51 0.72 0.60 39  C 0.96 0.97 0.96 767  CV 0.83 0.86 0.84 1573  DT 0.00 0.00 0.00 23  ET 0.93 0.97 0.95 1140  FM 0.94 0.95 0.94 1264  IP 0.96 0.97 0.97 858  J 0.80 0.80 0.80 3108  M 0.97 0.95 0.96 478  NC 0.84 0.86 0.85 10158  NF 0.89 0.91 0.90 2282  NF[ 0.00 0.00 0.00 1  NJ 0.66 0.73 0.70 1215  NK 0.97 0.69 0.81 45  NN 0.90 0.90 0.90 1339  NP 0.89 0.83 0.86 3703  Np 0.00 0.00 0.00 1  OST 0.93 0.95 0.94 3508  P 0.92 0.92 0.92 4468  RP 0.96 0.94 0.95 1286  RPCV 0.91 0.86 0.88 783  S 1.00 1.00 1.00 2222  UE 0.00 0.00 0.00 20  UM 0.94 0.94 0.94 1017  UNC 0.99 1.00 0.99 1712  UX 0.98 0.98 0.98 292  VB 0.76 0.60 0.67 161  VF 0.00 0.00 0.00 1  micro avg 0.89 0.89 0.89 44286  macro avg 0.74 0.74 0.74 44286 weighted avg 0.89 0.89 0.89 44286
2,0.278,0.286433,0.919637,precision recall f1-score support  B 0.86 0.83 0.84 457  BB 0.95 0.98 0.96 365  BE 0.58 0.87 0.69 39  C 0.96 0.97 0.96 767  CV 0.80 0.88 0.84 1573  DT 0.81 0.57 0.67 23  ET 0.97 0.96 0.97 1140  FM 0.96 0.93 0.94 1264  IP 0.95 0.98 0.96 858  J 0.80 0.82 0.81 3108  M 0.96 0.95 0.96 478  NC 0.86 0.86 0.86 10158  NF 0.90 0.91 0.90 2282  NF[ 0.00 0.00 0.00 1  NJ 0.72 0.73 0.72 1215  NK 1.00 0.84 0.92 45  NN 0.85 0.91 0.88 1339  NP 0.89 0.89 0.89 3703  Np 0.00 0.00 0.00 1  OST 0.94 0.94 0.94 3508  P 0.92 0.94 0.93 4468  RP 0.96 0.94 0.95 1286  RPCV 0.89 0.89 0.89 783  S 1.00 1.00 1.00 2222  UE 1.00 0.25 0.40 20  UM 0.93 0.95 0.94 1017  UNC 1.00 1.00 1.00 1712  UX 0.97 0.99 0.98 292  VB 0.73 0.80 0.76 161  VF 0.00 0.00 0.00 1  micro avg 0.89 0.90 0.90 44286  macro avg 0.80 0.79 0.79 44286 weighted avg 0.90 0.90 0.90 44286
3,0.2291,0.270034,0.922792,precision recall f1-score support  B 0.83 0.85 0.84 457  BB 0.96 0.98 0.97 365  BE 0.52 0.85 0.65 39  C 0.97 0.97 0.97 767  CV 0.85 0.84 0.84 1573  DT 0.64 0.78 0.71 23  ET 0.98 0.96 0.97 1140  FM 0.94 0.93 0.94 1264  IP 0.96 0.97 0.96 858  J 0.83 0.81 0.82 3108  M 0.96 0.95 0.96 478  NC 0.86 0.87 0.86 10158  NF 0.93 0.90 0.91 2282  NF[ 0.00 0.00 0.00 1  NJ 0.67 0.78 0.72 1215  NK 0.97 0.80 0.88 45  NN 0.89 0.91 0.90 1339  NP 0.90 0.89 0.90 3703  Np 0.00 0.00 0.00 1  OST 0.94 0.95 0.94 3508  P 0.92 0.94 0.93 4468  RP 0.97 0.93 0.95 1286  RPCV 0.88 0.89 0.88 783  S 1.00 1.00 1.00 2222  UE 0.50 0.05 0.09 20  UM 0.95 0.93 0.94 1017  UNC 1.00 1.00 1.00 1712  UX 0.97 0.99 0.98 292  VB 0.72 0.72 0.72 161  VF 0.00 0.00 0.00 1  micro avg 0.90 0.90 0.90 44286  macro avg 0.78 0.78 0.77 44286 weighted avg 0.90 0.90 0.90 44286
4,0.1836,0.278556,0.922527,precision recall f1-score support  B 0.77 0.89 0.82 457  BB 0.95 0.98 0.97 365  BE 0.57 0.85 0.68 39  C 0.97 0.97 0.97 767  CV 0.86 0.83 0.84 1573  DT 0.69 0.78 0.73 23  ET 0.98 0.96 0.97 1140  FM 0.94 0.95 0.94 1264  IP 0.96 0.98 0.97 858  J 0.83 0.81 0.82 3108  M 0.97 0.95 0.96 478  NC 0.86 0.87 0.86 10158  NF 0.90 0.91 0.90 2282  NF[ 0.00 0.00 0.00 1  NJ 0.69 0.76 0.72 1215  NK 0.90 0.80 0.85 45  NN 0.90 0.91 0.90 1339  NP 0.90 0.89 0.90 3703  Np 0.00 0.00 0.00 1  OST 0.95 0.94 0.94 3508  P 0.94 0.93 0.93 4468  RP 0.94 0.94 0.94 1286  RPCV 0.84 0.91 0.88 783  S 1.00 1.00 1.00 2222  UE 1.00 0.15 0.26 20  UM 0.96 0.92 0.94 1017  UNC 1.00 1.00 1.00 1712  UX 0.97 0.99 0.98 292  VB 0.70 0.78 0.74 161  VF 0.00 0.00 0.00 1  micro avg 0.90 0.90 0.90 44286  macro avg 0.80 0.79 0.78 44286 weighted avg 0.90 0.90 0.90 44286
5,0.1443,0.293396,0.922446,precision recall f1-score support  B 0.85 0.86 0.85 457  BB 0.96 0.98 0.97 365  BE 0.69 0.85 0.76 39  C 0.95 0.97 0.96 767  CV 0.81 0.88 0.84 1573  DT 0.86 0.52 0.65 23  ET 0.97 0.96 0.97 1140  FM 0.94 0.94 0.94 1264  IP 0.96 0.98 0.97 858  J 0.85 0.79 0.82 3108  M 0.96 0.95 0.96 478  NC 0.86 0.86 0.86 10158  NF 0.91 0.90 0.90 2282  NF[ 0.00 0.00 0.00 1  NJ 0.71 0.75 0.73 1215  NK 0.90 0.80 0.85 45  NN 0.85 0.93 0.89 1339  NP 0.88 0.91 0.90 3703  Np 0.00 0.00 0.00 1  OST 0.94 0.94 0.94 3508  P 0.94 0.93 0.93 4468  RP 0.96 0.94 0.95 1286  RPCV 0.87 0.88 0.88 783  S 1.00 1.00 1.00 2222  UE 0.64 0.35 0.45 20  UM 0.96 0.93 0.94 1017  UNC 1.00 1.00 1.00 1712  UX 0.97 0.99 0.98 292  VB 0.72 0.73 0.72 161  VF 0.00 0.00 0.00 1  micro avg 0.90 0.90 0.90 44286  macro avg 0.80 0.78 0.79 44286 weighted avg 0.90 0.90 0.90 44286
6,0.116,0.313326,0.923504,precision recall f1-score support  B 0.85 0.85 0.85 457  BB 0.96 0.98 0.97 365  BE 0.69 0.85 0.76 39  C 0.96 0.97 0.96 767  CV 0.85 0.83 0.84 1573  DT 0.82 0.61 0.70 23  ET 0.97 0.96 0.97 1140  FM 0.93 0.94 0.94 1264  IP 0.96 0.97 0.97 858  J 0.81 0.82 0.82 3108  M 0.96 0.95 0.95 478  NC 0.86 0.87 0.86 10158  NF 0.91 0.91 0.91 2282  NF[ 0.00 0.00 0.00 1  NJ 0.70 0.76 0.73 1215  NK 0.90 0.80 0.85 45  NN 0.90 0.91 0.91 1339  NP 0.91 0.90 0.90 3703  Np 0.00 0.00 0.00 1  OST 0.94 0.94 0.94 3508  P 0.93 0.93 0.93 4468  RP 0.94 0.95 0.94 1286  RPCV 0.87 0.90 0.88 783  S 1.00 1.00 1.00 2222  UE 0.64 0.35 0.45 20  UM 0.95 0.94 0.94 1017  UNC 1.00 1.00 1.00 1712  UX 0.96 0.99 0.98 292  VB 0.76 0.74 0.75 161  VF 0.00 0.00 0.00 1  micro avg 0.90 0.90 0.90 44286  macro avg 0.80 0.79 0.79 44286 weighted avg 0.90 0.90 0.90 44286
7,0.0866,0.337023,0.924135,precision recall f1-score support  B 0.82 0.88 0.85 457  BB 0.96 0.98 0.97 365  BE 0.68 0.87 0.76 39  C 0.95 0.97 0.96 767  CV 0.83 0.85 0.84 1573  DT 0.77 0.74 0.76 23  ET 0.97 0.96 0.97 1140  FM 0.94 0.94 0.94 1264  IP 0.96 0.98 0.97 858  J 0.83 0.82 0.83 3108  M 0.97 0.94 0.95 478  NC 0.86 0.87 0.87 10158  NF 0.91 0.91 0.91 2282  NF[ 0.00 0.00 0.00 1  NJ 0.72 0.76 0.74 1215  NK 0.90 0.80 0.85 45  NN 0.91 0.92 0.91 1339  NP 0.90 0.90 0.90 3703  Np 0.00 0.00 0.00 1  OST 0.94 0.94 0.94 3508  P 0.93 0.93 0.93 4468  RP 0.95 0.94 0.95 1286  RPCV 0.89 0.89 0.89 783  S 1.00 1.00 1.00 2222  UE 0.58 0.35 0.44 20  UM 0.95 0.94 0.94 1017  UNC 1.00 1.00 1.00 1712  UX 0.97 0.99 0.98 292  VB 0.70 0.75 0.73 161  VF 0.00 0.00 0.00 1  micro avg 0.90 0.91 0.90 44286  macro avg 0.79 0.79 0.79 44286 weighted avg 0.90 0.91 0.90 44286
8,0.0655,0.358419,0.925296,precision recall f1-score support  B 0.81 0.87 0.84 457  BB 0.96 0.98 0.97 365  BE 0.67 0.85 0.75 39  C 0.96 0.97 0.96 767  CV 0.83 0.87 0.85 1573  DT 0.79 0.65 0.71 23  ET 0.97 0.97 0.97 1140  FM 0.94 0.94 0.94 1264  IP 0.96 0.98 0.97 858  J 0.82 0.82 0.82 3108  M 0.96 0.94 0.95 478  NC 0.87 0.87 0.87 10158  NF 0.91 0.91 0.91 2282  NF[ 0.00 0.00 0.00 1  NJ 0.74 0.74 0.74 1215  NK 0.90 0.80 0.85 45  NN 0.90 0.92 0.91 1339  NP 0.90 0.90 0.90 3703  Np 0.00 0.00 0.00 1  OST 0.94 0.95 0.94 3508  P 0.94 0.93 0.93 4468  RP 0.95 0.95 0.95 1286  RPCV 0.90 0.88 0.89 783  S 1.00 1.00 1.00 2222  UE 0.50 0.30 0.37 20  UM 0.95 0.94 0.95 1017  UNC 1.00 1.00 1.00 1712  UX 0.96 0.99 0.97 292  VB 0.72 0.75 0.74 161  VF 0.00 0.00 0.00 1  micro avg 0.90 0.91 0.90 44286  macro avg 0.79 0.79 0.79 44286 weighted avg 0.90 0.91 0.90 44286
9,0.0467,0.38415,0.925235,precision recall f1-score support  B 0.81 0.87 0.84 457  BB 0.96 0.98 0.97 365  BE 0.67 0.85 0.75 39  C 0.96 0.96 0.96 767  CV 0.82 0.87 0.84 1573  DT 0.77 0.74 0.76 23  ET 0.97 0.96 0.97 1140  FM 0.95 0.94 0.94 1264  IP 0.96 0.98 0.97 858  J 0.82 0.82 0.82 3108  M 0.96 0.94 0.95 478  NC 0.87 0.87 0.87 10158  NF 0.92 0.91 0.91 2282  NF[ 0.00 0.00 0.00 1  NJ 0.72 0.75 0.73 1215  NK 0.90 0.80 0.85 45  NN 0.89 0.93 0.91 1339  NP 0.90 0.90 0.90 3703  Np 0.00 0.00 0.00 1  OST 0.94 0.95 0.94 3508  P 0.93 0.94 0.93 4468  RP 0.95 0.94 0.95 1286  RPCV 0.90 0.88 0.89 783  S 1.00 1.00 1.00 2222  UE 0.55 0.30 0.39 20  UM 0.94 0.94 0.94 1017  UNC 1.00 1.00 1.00 1712  UX 0.97 0.99 0.98 292  VB 0.76 0.72 0.74 161  VF 0.00 0.00 0.00 1  micro avg 0.90 0.91 0.91 44286  macro avg 0.79 0.79 0.79 44286 weighted avg 0.90 0.91 0.91 44286
10,0.0378,0.394826,0.925805,precision recall f1-score support  B 0.82 0.87 0.85 457  BB 0.96 0.98 0.97 365  BE 0.67 0.85 0.75 39  C 0.96 0.97 0.96 767  CV 0.83 0.86 0.85 1573  DT 0.68 0.83 0.75 23  ET 0.98 0.96 0.97 1140  FM 0.94 0.95 0.94 1264  IP 0.96 0.98 0.97 858  J 0.83 0.82 0.82 3108  M 0.96 0.94 0.95 478  NC 0.87 0.87 0.87 10158  NF 0.92 0.91 0.91 2282  NF[ 0.00 0.00 0.00 1  NJ 0.71 0.76 0.73 1215  NK 0.90 0.80 0.85 45  NN 0.89 0.92 0.91 1339  NP 0.90 0.90 0.90 3703  Np 0.00 0.00 0.00 1  OST 0.94 0.95 0.94 3508  P 0.93 0.93 0.93 4468  RP 0.95 0.94 0.95 1286  RPCV 0.90 0.89 0.89 783  S 1.00 1.00 1.00 2222  UE 0.55 0.30 0.39 20  UM 0.95 0.94 0.94 1017  UNC 1.00 1.00 1.00 1712  UX 0.97 0.99 0.98 292  VB 0.79 0.76 0.77 161  VF 0.00 0.00 0.00 1  micro avg 0.90 0.91 0.91 44286  macro avg 0.79 0.80 0.79 44286 weighted avg 0.90 0.91 0.91 44286


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=11300, training_loss=0.1673330599016848, metrics={'train_runtime': 7805.7296, 'train_samples_per_second': 11.581, 'train_steps_per_second': 1.448, 'total_flos': 5907442282905600.0, 'train_loss': 0.1673330599016848, 'epoch': 10.0})

### Evaluate

In [15]:
trainer.evaluate()


{'eval_loss': 0.3948262631893158,
 'eval_accuracy': 0.9258045473975615,
 'eval_report': '              precision    recall  f1-score   support\n\n           B       0.82      0.87      0.85       457\n          BB       0.96      0.98      0.97       365\n          BE       0.67      0.85      0.75        39\n           C       0.96      0.97      0.96       767\n          CV       0.83      0.86      0.85      1573\n          DT       0.68      0.83      0.75        23\n          ET       0.98      0.96      0.97      1140\n          FM       0.94      0.95      0.94      1264\n          IP       0.96      0.98      0.97       858\n           J       0.83      0.82      0.82      3108\n           M       0.96      0.94      0.95       478\n          NC       0.87      0.87      0.87     10158\n          NF       0.92      0.91      0.91      2282\n         NF[       0.00      0.00      0.00         1\n          NJ       0.71      0.76      0.73      1215\n          NK       0.90      

### Save and Use the Model

In [16]:
model.save_pretrained("sinhala-pos-xlm-r")
tokenizer.save_pretrained("sinhala-pos-xlm-r")


('sinhala-pos-xlm-r\\tokenizer_config.json',
 'sinhala-pos-xlm-r\\special_tokens_map.json',
 'sinhala-pos-xlm-r\\tokenizer.json')

In [17]:
from transformers import pipeline

pos_pipeline = pipeline("token-classification", model="sinhala-pos-xlm-r", tokenizer="sinhala-pos-xlm-r", aggregation_strategy="simple")

sentence = "මම පාසැල යමි"
tokens = sentence.split()  # Assuming simple whitespace tokenization
print(pos_pipeline(tokens))


Device set to use cuda:0


[[{'entity_group': 'PRP', 'score': np.float32(0.99833137), 'word': 'මම', 'start': 0, 'end': 2}], [{'entity_group': 'NNC', 'score': np.float32(0.99576), 'word': 'පාසැල', 'start': 0, 'end': 5}], [{'entity_group': 'VFM', 'score': np.float32(0.9989997), 'word': 'යමි', 'start': 0, 'end': 3}]]


In [18]:
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")


PyTorch version: 2.7.1+cu118
CUDA available: True
CUDA device count: 1
GPU name: NVIDIA GeForce RTX 2060
