### Import Dependencies

In [1]:
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorForTokenClassification
)
from sklearn.metrics import classification_report, accuracy_score
import torch
from typing import List, Dict
import logging
import wandb  # For experiment tracking

# Configure the root logger at INFO level so that informational records
# (not only warnings and errors) are emitted.
logging.basicConfig(level=logging.INFO)

KeyboardInterrupt: 

### Data Loading and Preprocessing

In [None]:
class POSDataLoader:
    """Reads a whitespace-separated ``word tag`` corpus into sentence records.

    The file is expected to hold one ``word tag`` pair per line, with blank
    lines separating sentences.
    """

    def __init__(self, file_path: str):
        self.file_path = file_path

    def load_data(self) -> List[Dict]:
        """Parse the corpus file into a list of sentences.

        Returns:
            A list of ``{"tokens": [...], "tags": [...]}`` dicts, one per
            sentence. Lines that do not split into exactly two fields are
            logged as warnings and skipped.
        """
        sentences = []
        current_words, current_tags = [], []

        with open(self.file_path, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                stripped = raw_line.strip()

                if stripped:
                    fields = stripped.split()
                    # Exactly two fields (word, tag) are required.
                    if len(fields) != 2:
                        logging.warning(f"Skipping malformed line: {stripped}")
                        continue
                    current_words.append(fields[0])
                    current_tags.append(fields[1])
                elif current_words:
                    # Blank line closes the sentence collected so far.
                    sentences.append({"tokens": current_words, "tags": current_tags})
                    current_words, current_tags = [], []

        # The file may end without a trailing blank line.
        if current_words:
            sentences.append({"tokens": current_words, "tags": current_tags})

        logging.info(f"Loaded {len(sentences)} sentences")
        return sentences

    def create_tag_maps(self, data: List[Dict]):
        """Build tag<->id lookup tables from the tags present in ``data``.

        Ids are assigned in sorted tag order, starting at 0.

        Returns:
            A ``(tag2id, id2tag)`` tuple of dicts.
        """
        ordered_tags = sorted({tag for sentence in data for tag in sentence["tags"]})
        id2tag = dict(enumerate(ordered_tags))
        tag2id = {tag: idx for idx, tag in id2tag.items()}
        return tag2id, id2tag

### Dataset Creation and Tokenization

In [None]:
class POSDatasetProcessor:
    """Tokenizes word-level POS data and builds train/validation/test splits."""

    def __init__(self, tokenizer_name: str, max_length: int = 128):
        """Load the tokenizer and remember the maximum sequence length.

        Args:
            tokenizer_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
            max_length: Length every sequence is padded/truncated to.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length

    def tokenize_and_align_labels(self, examples, tag2id):
        """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

        Args:
            examples: Batch mapping with ``"tokens"`` (list of word lists) and
                ``"tags"`` (list of tag lists, parallel to the words).
            tag2id: Mapping from tag string to integer label id.

        Returns:
            The tokenizer output with an added ``"labels"`` entry. Only the first
            sub-token of each word carries the word's label id; special/padding
            positions (word id ``None``) and continuation sub-tokens get ``-100``,
            the value that metric computation later filters out.

        Raises:
            KeyError: If a tag in ``examples`` is missing from ``tag2id``.
        """
        tokenized = self.tokenizer(
            examples["tokens"],
            is_split_into_words=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_offsets_mapping=True
        )

        labels = []
        for i, label in enumerate(examples["tags"]):
            word_ids = tokenized.word_ids(i)
            previous_word_idx = None
            label_ids = []

            for word_idx in word_ids:
                if word_idx is None:
                    # Special or padding token: no word behind it.
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    # First sub-token of a new word carries the label.
                    label_ids.append(tag2id[label[word_idx]])
                else:
                    # Continuation sub-token of the same word.
                    label_ids.append(-100)
                previous_word_idx = word_idx

            labels.append(label_ids)

        tokenized["labels"] = labels
        return tokenized

    @staticmethod
    def _split_sizes(val_split: float, test_split: float):
        """Return ``(test_size, val_size)`` for two successive splits.

        ``test_size`` is a fraction of the full dataset; ``val_size`` is a
        fraction of what remains after the test set has been removed, scaled so
        the validation set ends up as ``val_split`` of the full dataset.

        Raises:
            ValueError: If either fraction is outside (0, 1) or they leave no
                room for training data.
        """
        if not (0 < val_split < 1 and 0 < test_split < 1):
            raise ValueError(
                f"val_split and test_split must be in (0, 1); got {val_split} and {test_split}"
            )
        if val_split + test_split >= 1:
            raise ValueError(
                f"val_split + test_split must be < 1; got {val_split + test_split}"
            )
        return test_split, val_split / (1 - test_split)

    def prepare_dataset(self, data: List[Dict], tag2id: Dict, val_split: float = 0.1,
                        test_split: float = 0.1, seed: int = 42):
        """Split ``data`` into train/validation/test and tokenize every split.

        Args:
            data: Sentence records with ``"tokens"`` and ``"tags"`` lists.
            tag2id: Mapping from tag string to integer label id.
            val_split: Fraction of the full dataset used for validation.
            test_split: Fraction of the full dataset used for testing.
            seed: Seed for both shuffling splits so the partition is reproducible.

        Returns:
            A ``DatasetDict`` with ``train``, ``validation`` and ``test`` splits in
            which the raw ``tokens``/``tags`` columns are replaced by the tokenizer
            output plus ``labels``.

        Raises:
            ValueError: If the split fractions are invalid.
        """
        test_size, val_size = self._split_sizes(val_split, test_split)

        dataset = Dataset.from_dict({
            'tokens': [example['tokens'] for example in data],
            'tags': [example['tags'] for example in data]
        })

        # Carve off exactly `test_split` for testing first. The earlier version
        # removed `test_split + val_split` here and used all of it as the test
        # set, which made the test set too large and the validation set too small.
        splits = dataset.train_test_split(test_size=test_size, seed=seed)
        # `val_size` is relative to the remainder, so validation ends up being
        # `val_split` of the full dataset.
        train_val = splits['train'].train_test_split(test_size=val_size, seed=seed)

        dataset_dict = DatasetDict({
            'train': train_val['train'],
            'validation': train_val['test'],
            'test': splits['test']
        })

        # Tokenize all splits; drop the raw string columns afterwards.
        tokenized_datasets = dataset_dict.map(
            lambda x: self.tokenize_and_align_labels(x, tag2id),
            batched=True,
            remove_columns=dataset_dict['train'].column_names
        )

        return tokenized_datasets

### Model Training and Evaluation

In [None]:
class POSTrainer:
    """Wraps model construction, metric computation and training for POS tagging."""

    def __init__(self, model_name: str, num_labels: int, id2tag: Dict, tag2id: Dict):
        """Create the token-classification model and its training configuration.

        Args:
            model_name: Checkpoint name or path for ``AutoModelForTokenClassification``.
            num_labels: Number of distinct tags.
            id2tag: Mapping from integer label id to tag string.
            tag2id: Mapping from tag string to integer label id.
        """
        # Keep the label maps on the instance so compute_metrics does not have
        # to rely on a notebook-level global of the same name being defined.
        self.id2tag = id2tag
        self.tag2id = tag2id

        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            id2label=id2tag,
            label2id=tag2id
        )

        # NOTE(review): load_best_model_at_end needs checkpoint saving to line up
        # with evaluation; save_steps is left at the library default here —
        # TODO confirm it is compatible with eval_steps for the installed version.
        self.training_args = TrainingArguments(
            output_dir="./sinhala-pos-model",
            evaluation_strategy="steps",
            eval_steps=100,
            logging_steps=100,
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=5,
            weight_decay=0.01,
            warmup_ratio=0.1,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            save_total_limit=2,
            report_to="wandb"
        )

    def _decode_valid_positions(self, predictions, labels):
        """Flatten predictions/labels to tag strings, dropping ``-100`` positions.

        Args:
            predictions: 2-D integer array of predicted label ids.
            labels: 2-D integer array of gold label ids, same shape, with
                ``-100`` marking positions to ignore.

        Returns:
            ``(gold_tags, predicted_tags)``: two flat, parallel lists of tag
            strings in row-major order.
        """
        keep = labels != -100
        # .tolist() yields plain Python ints for the dict lookups.
        gold_tags = [self.id2tag[idx] for idx in labels[keep].tolist()]
        predicted_tags = [self.id2tag[idx] for idx in predictions[keep].tolist()]
        return gold_tags, predicted_tags

    def compute_metrics(self, p):
        """Compute token-level accuracy and per-tag F1 for an evaluation pass.

        Args:
            p: Object exposing ``predictions`` (logits; argmax is taken over
                axis 2) and ``label_ids``.

        Returns:
            Dict with ``"accuracy"`` plus one ``"f1_<tag>"`` entry per tag that
            appears in the classification report.
        """
        predictions = np.argmax(p.predictions, axis=2)
        gold_tags, predicted_tags = self._decode_valid_positions(predictions, p.label_ids)

        results = {
            "accuracy": accuracy_score(gold_tags, predicted_tags),
        }

        # zero_division=0 reports 0 for tags with no support/predictions without
        # emitting a warning at every evaluation step.
        report = classification_report(
            gold_tags,
            predicted_tags,
            output_dict=True,
            zero_division=0
        )

        # Keep only the per-tag rows; skip the aggregate entries.
        results.update({
            f"f1_{k}": v['f1-score']
            for k, v in report.items()
            if k not in ['accuracy', 'macro avg', 'weighted avg']
        })

        return results

    def train(self, tokenized_datasets, tokenizer):
        """Fine-tune the model with early stopping and return the ``Trainer``.

        Args:
            tokenized_datasets: Mapping with ``"train"`` and ``"validation"`` splits.
            tokenizer: Tokenizer used for the data collator and passed to the trainer.

        Returns:
            The ``Trainer`` instance after ``train()`` has completed.
        """
        data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

        trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"],
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        trainer.train()
        return trainer

### Main Execution

In [None]:
if __name__ == "__main__":
    # Single source of truth so the tokenizer and the model always come from
    # the same checkpoint, and model + tokenizer are saved to the same folder.
    model_checkpoint = "xlm-roberta-base"
    final_model_dir = "final-sinhala-pos-model"

    # Initialize wandb (requires a valid API key to be configured)
    wandb.init(project="sinhala-pos-tagger")

    # Load data and derive the label maps from the tags that occur in it
    loader = POSDataLoader("sinhala_pos.txt")
    data = loader.load_data()
    tag2id, id2tag = loader.create_tag_maps(data)

    # Process dataset: split into train/validation/test and tokenize
    processor = POSDatasetProcessor(model_checkpoint)
    tokenized_datasets = processor.prepare_dataset(data, tag2id)

    # Train model
    trainer = POSTrainer(
        model_name=model_checkpoint,
        num_labels=len(tag2id),
        id2tag=id2tag,
        tag2id=tag2id
    )

    trained_trainer = trainer.train(tokenized_datasets, processor.tokenizer)

    # Evaluate on the held-out test split. The "test" prefix keeps these
    # metrics distinct from the validation metrics, which use the default
    # "eval" prefix.
    results = trained_trainer.evaluate(
        tokenized_datasets["test"],
        metric_key_prefix="test"
    )
    print("Final Evaluation Results:", results)

    # Save model and tokenizer side by side so the folder can be reloaded
    # with from_pretrained for inference
    trained_trainer.save_model(final_model_dir)
    processor.tokenizer.save_pretrained(final_model_dir)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models


ValueError: API key must be 40 characters long, yours was 12

### Inference Example

In [None]:
def predict_pos(text: str, model_path: str):
    """Tag each whitespace-separated word of ``text`` with a POS label.

    Args:
        text: Raw input sentence; it is split on whitespace into words.
        model_path: Directory or hub id holding both the fine-tuned model and
            its tokenizer.

    Returns:
        A list of ``(word, tag)`` tuples in input order. Returns an empty list
        when ``text`` contains no words. Words that receive no sub-token
        (for example because the input was truncated) are omitted, and a
        warning is logged.
    """
    tokens = text.split()
    if not tokens:
        # Nothing to tag; avoid loading the model for empty input.
        return []

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path)

    inputs = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True
    )

    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)

    predicted_ids = predictions[0].tolist()

    # A word may be split into several sub-tokens, so predictions cannot be
    # zipped with the words directly. Use the word ids to pick the prediction
    # of the FIRST sub-token of every word, mirroring how labels were aligned
    # during training.
    results = []
    previous_word_idx = None
    for position, word_idx in enumerate(inputs.word_ids()):
        if word_idx is not None and word_idx != previous_word_idx:
            tag = model.config.id2label[predicted_ids[position]]
            results.append((tokens[word_idx], tag))
        previous_word_idx = word_idx

    if len(results) < len(tokens):
        logging.warning(
            f"Only {len(results)} of {len(tokens)} words received a tag "
            "(input truncated or some words produced no sub-tokens)"
        )

    return results