In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# 1. Read your labeled CoNLL data
def read_conll(path):
    """Read a CoNLL-style file into parallel lists of tokens and tags.

    Each non-blank line contributes one token (its first whitespace-separated
    field) and one tag (its last field). Blank lines separate sentences.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(sentences, labels)`` tuple where ``sentences[i]`` is the list of
        tokens of the i-th sentence and ``labels[i]`` the matching list of tags.
    """
    sentences, labels = [], []
    tokens, tags = [], []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: close the current sentence, if any.
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue
            tokens.append(fields[0])
            tags.append(fields[-1])
    # Flush the final sentence when the file does not end with a blank line;
    # otherwise it would be silently dropped.
    if tokens:
        sentences.append(tokens)
        labels.append(tags)
    return sentences, labels

sentences, labels = read_conll("ner_labeled_data.conll")

# 2. Tokenizer
# A fast tokenizer is required here: tokenize_and_align_labels calls
# word_ids() on the encoding, which the slow (pure-Python) tokenizers do not
# provide. The slow XLMRobertaTokenizer is also the class that raised the
# SentencePiece ImportError.
tokenizer = AutoTokenizer.from_pretrained("Davlan/afro-xlmr-base", use_fast=True)

# 3. Convert to Hugging Face Dataset
dataset_dict = {"tokens": sentences, "ner_tags": labels}
dataset = Dataset.from_dict(dataset_dict)

# 4. Label mappings
# Sort the distinct tags so the tag -> id assignment is identical on every
# run; iterating a plain set of strings gives an order that can change
# between interpreter sessions.
label_list = sorted({tag for sent in labels for tag in sent})
label_to_id = {l: i for i, l in enumerate(label_list)}
id_to_label = {i: l for l, i in label_to_id.items()}

# 5. Tokenization and alignment
def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and attach per-token label ids.

    The example's ``"tokens"`` are encoded with the module-level ``tokenizer``.
    The first sub-token of every word receives the id of that word's tag
    (looked up in ``label_to_id``); every other position -- those with no
    word index and the continuation pieces of a word -- receives ``-100``
    (presumably the value the training loss ignores; confirm against the
    model's loss configuration).

    Args:
        example: Mapping with ``"tokens"`` (list of words) and ``"ner_tags"``
            (list of tag strings, one per word).

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoding = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    aligned = []
    last_word = None
    for current_word in encoding.word_ids():
        starts_new_word = current_word is not None and current_word != last_word
        if starts_new_word:
            aligned.append(label_to_id[example["ner_tags"][current_word]])
        else:
            aligned.append(-100)
        last_word = current_word
    encoding["labels"] = aligned
    return encoding

# 6. Apply tokenization
tokenized_dataset = dataset.map(tokenize_and_align_labels)

# Hold out part of the data for evaluation so the reported metrics are not
# computed on the very examples the model was trained on. The fixed seed
# keeps the split reproducible.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

# 7. Load pre-trained model
# Passing the label mappings stores the real tag names in the saved config
# instead of generic LABEL_<n> placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    "Davlan/afro-xlmr-base",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)

# 8. Set training arguments
# NOTE(review): newer transformers releases name this option `eval_strategy`;
# switch if the installed version rejects `evaluation_strategy`.
args = TrainingArguments(
    output_dir="./ner-model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
)

# 9. Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# 10. Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 11. Train and evaluate
trainer.train()
trainer.save_model("amharic-ner-finetuned")
metrics = trainer.evaluate()
print(metrics)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ImportError: 
XLMRobertaTokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
