# Define Labels and Load CoNLL Data

In [1]:
# Load and parse CoNLL data
def parse_conll(filepath):
    """Read a two-column CoNLL file into parallel token and tag lists.

    Every non-blank line must hold exactly two whitespace-separated
    fields, ``token tag``. Blank lines separate sentences. A line with any
    other number of fields is reported with ``print`` and skipped.

    Args:
        filepath: Path of the UTF-8 encoded CoNLL file.

    Returns:
        A ``(sentences, labels)`` tuple where ``sentences[i]`` is the token
        list of the i-th sentence and ``labels[i]`` is its tag list of the
        same length.
    """
    sentences, labels = [], []
    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading
    # byte-order mark; with plain "utf-8" the BOM would stay attached to
    # the first token because str.strip() does not treat it as whitespace.
    with open(filepath, encoding='utf-8-sig') as f:
        tokens, tags = [], []
        for line in f:
            line = line.strip()
            if not line:
                # Blank line: close the current sentence, if there is one.
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue
            fields = line.split()
            if len(fields) != 2:
                # If any line doesn't contain two elements
                print(f"Skipping bad line: {line}")
                continue
            token, tag = fields
            tokens.append(token)
            tags.append(tag)
        if tokens:  # Catch last sentence if file doesn't end with newline
            sentences.append(tokens)
            labels.append(tags)
    return sentences, labels

# Location of the labeled CoNLL file
conll_file_path = "../data/labels/conll_labeled_subset.txt"  # Update if needed

# Read every sentence together with its per-token tags
sentences, ner_tags = parse_conll(conll_file_path)

# Build the label inventory from the tags that occur in the loaded data,
# then the two lookup tables (tag -> integer id and integer id -> tag).
all_labels = {tag for sentence_tags in ner_tags for tag in sentence_tags}
label_list = sorted(all_labels)
label_to_id = dict(zip(label_list, range(len(label_list))))
id_to_label = dict(enumerate(label_list))

# Sanity check: show the labels and the first sentence as (token, tag) pairs
print("Label list:", label_list)
print("sentence:", list(zip(sentences[0], ner_tags[0])))


Label list: ['B-PRODUCT', 'I-LOC', 'I-PRICE', 'I-PRODUCT', 'O']
sentence: [('3pcs', 'B-PRODUCT'), ('silicon', 'I-PRODUCT'), ('brush', 'I-PRODUCT'), ('spatulas', 'I-PRODUCT'), ('እስከ', 'O'), ('260°c', 'O'), ('ሙቀት', 'O'), ('መቆቆም', 'O'), ('የሚችል', 'O'), ('ዋጋ-550ብር', 'I-PRICE'), ('አድራሻ', 'O'), ('ቁ.1', 'O'), ('ስሪ', 'O'), ('ኤም', 'O'), ('ሲቲ', 'O'), ('ሞል', 'O'), ('ሁለተኛ', 'O'), ('ፎቅ', 'O'), ('ቢሮ', 'O'), ('ቁ.', 'O'), ('SL-05A(ከ', 'O'), ('ሊፍቱ', 'O'), ('ፊት', 'O'), ('ለ', 'O'), ('ፊት)', 'O'), ('ቁ.2', 'O'), ('ለቡ', 'I-LOC'), ('መዳህኒዓለም', 'O'), ('ቤተ/ክርስቲያን', 'O'), ('ፊት', 'O'), ('ለፊት', 'O'), ('#ዛም_ሞል', 'O'), ('2ኛ', 'O'), ('ፎቅ', 'O'), ('ቢሮ', 'O'), ('ቁጥር.214', 'O'), ('ለቡ', 'I-LOC'), ('ቅርንጫፍ0973611819', 'O'), ('0909522840', 'O'), ('0923350054', 'O'), ('በTelegram', 'O'), ('ለማዘዝ', 'O'), ('ይጠቀሙ', 'O'), ('@shager_onlinestore', 'O'), ('ለተጨማሪ', 'O'), ('ማብራሪያ', 'O'), ('የቴሌግራም', 'O'), ('ገፃችን', 'O'), ('https://t.me/Shageronlinestore', 'O')]


# Tokenize & Align Labels

In [2]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; the tokenizer below is loaded from it,
# and the same name is reused when the token-classification model is created.
model_checkpoint = "xlm-roberta-base"  # alternatives: "Davlan/bert-tiny-amharic", "Davlan/afro-xlmr-base"
# Tokenizer that belongs to the checkpoint selected above.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize + align labels
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and build per-sub-token label ids.

    Args:
        examples: Batch mapping with ``"tokens"`` (one word list per
            sentence) and ``"ner_tags"`` (one tag-string list per sentence).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For every
        sub-token the label is:

        * ``-100`` when the sub-token maps to no word (special tokens);
        * the word's label id when it is the first sub-token of the word;
        * for later sub-tokens of the same word, the word's label id if
          the tag starts with ``"I"``, otherwise ``-100``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        previous = None
        for current in encoded.word_ids(batch_index=row):
            if current is None:
                # -100 is ignored in loss computation
                label_id = -100
            else:
                tag = word_tags[current]
                starts_new_word = current != previous
                if starts_new_word or tag.startswith("I"):
                    label_id = label_to_id[tag]
                else:
                    label_id = -100
            row_labels.append(label_id)
            previous = current
        aligned.append(row_labels)
    encoded["labels"] = aligned
    return encoded


# Prepare Dataset and Tokenize

In [3]:
from datasets import Dataset

# Create Hugging Face dataset
data = Dataset.from_dict({"tokens": sentences, "ner_tags": ner_tags})

# Split into train/validation. The fixed seed makes the shuffle, and so the
# split, identical on every run; without it the validation set changes each
# time the cell is executed and the reported metrics are not comparable.
data = data.train_test_split(test_size=0.2, seed=42)
train_dataset = data["train"]
val_dataset = data["test"]

# Tokenize both splits and attach the aligned label ids
train_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True)
val_tokenized = val_dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

# Define Model and Trainer

In [None]:

from transformers import TrainingArguments,AutoModelForTokenClassification,Trainer
from seqeval.metrics import classification_report

# Load model with appropriate number of labels. The label maps are stored in
# the model config, so the checkpoint saved later reports the real tag names
# (e.g. "B-PRODUCT") instead of generic "LABEL_0"-style names at inference.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)

# Training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir="./ner_model",  # checkpoints are written here
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,  # keep at most two checkpoints in output_dir
    logging_dir="./logs",
)

# Define metrics
import numpy as np
from datasets import load_metric
# seqeval scorer used by compute_metrics below.
# NOTE(review): `datasets.load_metric` is deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model predictions into overall seqeval scores.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is reduced with
            ``argmax`` over axis 2 to get one label id per position;
            ``labels`` holds the reference ids, with ``-100`` marking
            positions to skip.

    Returns:
        A dict with ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"`` taken from the metric's overall results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_preds = []
    for gold_row, pred_row in zip(gold_ids, predicted_ids):
        # Keep only positions with a real reference label (not -100).
        kept = [(gold, pred) for gold, pred in zip(gold_row, pred_row) if gold != -100]
        true_labels.append([id_to_label[gold] for gold, _ in kept])
        true_preds.append([id_to_label[pred] for _, pred in kept])

    results = metric.compute(predictions=true_preds, references=true_labels)
    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

from transformers import DataCollatorForTokenClassification

# Sentences in a batch have different lengths, so `labels` must be padded
# together with the inputs. The default collator that Trainer builds from the
# tokenizer pads only the tokenizer's own fields and leaves `labels` ragged,
# which fails when the batch is converted to tensors. This collator pads
# `labels` with -100 so the padded positions are ignored by the loss.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



# Train the Model

In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train()

# Evaluate and Save

In [None]:
# Evaluate on the dataset given to the Trainer as eval_dataset
metrics = trainer.evaluate()
print(metrics)

# Save model for future inference; the tokenizer is written to the same
# directory so both can be reloaded from one path.
trainer.save_model("./ner_amharic_model")
tokenizer.save_pretrained("./ner_amharic_model")
