### Prepare Dataset

In [1]:
def load_pos_data(path):
    """Read a POS-tagged corpus file into a list of sentence records.

    The file is read line by line. Each non-blank line is split on
    whitespace; lines that yield exactly two fields are taken as a
    ``word tag`` pair. A blank line closes the current sentence.

    Args:
        path: Path of the UTF-8 encoded corpus file (with or without a BOM).

    Returns:
        A list of dicts, one per sentence, of the form
        ``{"tokens": [...], "tags": [...]}`` where both lists have the same
        length. Lines that do not split into exactly two fields are skipped,
        and sentences that end up with no tokens are not emitted.
    """
    dataset = []
    tokens = []
    tags = []
    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops a
    # leading byte-order mark; str.strip()/split() do not treat U+FEFF as
    # whitespace, so with "utf-8" a BOM would stay glued to the first word.
    with open(path, "r", encoding="utf-8-sig") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line: end of a sentence.
                if tokens:
                    dataset.append({"tokens": tokens, "tags": tags})
                    tokens = []
                    tags = []
                continue
            if len(parts) == 2:
                word, tag = parts
                tokens.append(word)
                tags.append(tag)
    # Catch the last sentence when the file does not end with a blank line.
    if tokens:
        dataset.append({"tokens": tokens, "tags": tags})
    return dataset


In [2]:
# Sanity check: show the first two parsed sentences.
print(load_pos_data("sinhala_pos.txt")[:2])

[{'tokens': ['ඊශ්රායල්', 'මිසයිල', 'ප්රහාර', 'වලින්', 'පලස්තීනුවෝ', '4', 'ක්', 'මිය', 'යති', '.'], 'tags': ['NNP', 'NNJ', 'NNC', 'CM', 'NNP', 'NUM', 'RP', 'RRPCV', 'VFM', 'FS']}, {'tokens': ['ගාසා', 'තීරයේදී', '.'], 'tags': ['NNP', 'NNP', 'FS']}]


In [3]:
from datasets import Dataset, DatasetDict
import random

# all_data = load_pos_data("sinhala_pos.txt")
# random.shuffle(all_data)

# # Optional: 80% train, 20% test split
# split_idx = int(0.8 * len(all_data))
# train_data = all_data[:split_idx]
# test_data = all_data[split_idx:]

# dataset = DatasetDict({
#     "train": Dataset.from_list(train_data),
#     "test": Dataset.from_list(test_data),
# })

# Parse the whole corpus into a list of {"tokens", "tags"} records.
data = load_pos_data("sinhala_pos.txt")

# Hold out 20% of the sentences for evaluation. The seed is fixed so that the
# same sentences land in train/test on every run, keeping results comparable.
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.2, seed=42)


  from .autonotebook import tqdm as notebook_tqdm


### tag2id mapping

In [4]:
# Every distinct tag that occurs anywhere in the loaded corpus.
unique_tags = {label for sentence in data for label in sentence["tags"]}

# Ids follow the sorted tag order; the two dicts are inverses of each other.
tag2id = {label: position for position, label in enumerate(sorted(unique_tags))}
id2tag = dict(enumerate(sorted(unique_tags)))


### Tokenize and Align Labels

In [5]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint name that the model cell loads.
tokenizer = AutoTokenizer.from_pretrained(
    "xlm-roberta-base",
)

def tokenize_and_align_labels(example):
    """Tokenize one pre-split sentence and attach per-sub-token label ids.

    Args:
        example: A record with a "tokens" list (words) and a parallel "tags"
            list (one tag string per word).

    Returns:
        The tokenizer output with an added "labels" entry. The first
        sub-token of each word carries that word's tag id (via ``tag2id``);
        every other position (positions with no word id, and later
        sub-tokens of a word) is set to -100.
    """
    encoding = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",  # every example is padded to exactly max_length
        max_length=128,
        return_tensors=None,   # keep plain Python lists
    )

    aligned = []
    last_seen = None
    for current in encoding.word_ids():
        # A label is emitted only where a new word begins.
        starts_new_word = current is not None and current != last_seen
        if starts_new_word:
            aligned.append(tag2id[example["tags"][current]])
        else:
            aligned.append(-100)
        last_seen = current

    encoding["labels"] = aligned
    return encoding

# Apply the alignment function one example at a time to every split.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=False,
)


Map: 100%|██████████| 9040/9040 [00:08<00:00, 1052.50 examples/s]
Map: 100%|██████████| 2261/2261 [00:01<00:00, 1271.48 examples/s]


### Define the Model

In [6]:
from transformers import AutoModelForTokenClassification

# Token-classification model sized to the tag inventory, with both label
# lookup tables passed to from_pretrained.
# NOTE(review): local_files_only=True presumably requires the checkpoint to
# already be in the local cache — confirm before running on a fresh machine.
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    label2id=tag2id,
    id2label=id2tag,
    num_labels=len(tag2id),
    local_files_only=True,
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Train the Model

In [7]:
from transformers import TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, accuracy_score

def compute_metrics(p):
    """Turn raw evaluation outputs into tag sequences and score them.

    Args:
        p: Object exposing ``predictions`` (scores whose argmax is taken over
            axis 2) and ``label_ids`` (gold label ids, with -100 marking
            positions to ignore).

    Returns:
        A dict with "accuracy" and "report", computed by ``accuracy_score``
        and ``classification_report`` on the tag-name sequences after the
        ignored positions have been dropped.
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    gold_ids = p.label_ids

    true_preds = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept_preds = []
        kept_golds = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                # Position carries no gold label; leave it out of both lists.
                continue
            kept_preds.append(id2tag[pred_id])
            kept_golds.append(id2tag[gold_id])
        true_preds.append(kept_preds)
        true_labels.append(kept_golds)

    # NOTE(review): "report" is presumably a formatted text report rather than
    # a number — confirm the training loggers cope with a non-scalar metric.
    return {
        "accuracy": accuracy_score(true_labels, true_preds),
        "report": classification_report(true_labels, true_preds),
    }

# Hyper-parameters for the fine-tuning run, grouped by purpose.
training_args = TrainingArguments(
    # where run outputs are written
    output_dir="./pos-xlm-r",
    # schedule
    num_train_epochs=10,
    eval_strategy="epoch",
    # optimisation
    learning_rate=5e-5,
    weight_decay=0.01,
    # batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Wire the model, the tokenized splits and the metric function into a Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is
# deprecated in recent transformers releases and triggers a warning when the
# Trainer is constructed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Start fine-tuning with the arguments configured above.
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

### Evaluate

In [None]:
# Run the trainer's evaluation; as the last expression in the cell, the
# returned value is shown as the cell output.
trainer.evaluate()


### Save and Use the Model

In [None]:
# Write the model first, then the tokenizer, into the same directory so the
# pair can later be loaded from that one path.
for artifact in (model, tokenizer):
    artifact.save_pretrained("sinhala-pos-xlm-r")


In [None]:
from transformers import pipeline

# Load the saved model and tokenizer from the saved directory into an inference pipeline.
# NOTE(review): the tags here carry no B-/I- prefixes, so aggregation may merge
# adjacent words that share a tag into one group — confirm "simple" gives the
# per-word output you want.
pos_pipeline = pipeline(
    "token-classification",
    model="sinhala-pos-xlm-r",
    tokenizer="sinhala-pos-xlm-r",
    aggregation_strategy="simple",
)

sentence = "මම පාසැල යමි"
# Pass the sentence as a single string. A list is treated as a batch of
# separate texts, so pre-split words would each be tagged in isolation,
# without the surrounding sentence as context.
print(pos_pipeline(sentence))


In [10]:
import torch

# Environment report: library version plus what CUDA hardware is visible.
cuda_ready = torch.cuda.is_available()
print("PyTorch version:", torch.__version__)
print("CUDA available:", cuda_ready)
print("CUDA device count:", torch.cuda.device_count())
if cuda_ready:
    gpu_label = torch.cuda.get_device_name(0)
else:
    gpu_label = "No GPU detected"
print("GPU name:", gpu_label)


PyTorch version: 2.7.1+cu118
CUDA available: True
CUDA device count: 1
GPU name: NVIDIA GeForce GTX 1050 Ti
