<a href="https://colab.research.google.com/github/Liya-F/amharic-ecommerce-data-extractor-w4/blob/googleColab/notebooks/googleColab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: Install dependencies (run once)
# NOTE(review): `--upgrade` is used without version pins, so the installed
# versions depend on when this cell is run.
!pip install --upgrade transformers datasets seqeval accelerate huggingface_hub




In [2]:
# Cell 1 (repeated): this is the same install command as the cell above.
# NOTE(review): likely a redundant copy — consider deleting this cell.
!pip install --upgrade transformers datasets seqeval accelerate huggingface_hub




In [3]:
# Cell 2: Imports and utility functions
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import torch
from torch.utils.data import random_split

# Read CoNLL-style file
def read_conll_file(filename):
    """Parse a whitespace-separated ``token label`` file into sentences.

    Blank lines separate sentences. A non-blank line must split into exactly
    two whitespace-delimited fields; any other line is reported on stdout and
    ignored.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences, each a non-empty list of ``(word, label)`` tuples.
    """
    sentences = []
    current = []
    with open(filename, encoding='utf-8') as handle:
        for raw in handle:
            stripped = raw.strip()
            if stripped:
                fields = stripped.split()
                if len(fields) != 2:
                    print(f"Skipping malformed line: {stripped}")
                    continue
                current.append((fields[0], fields[1]))
            elif current:
                # Blank line closes the sentence collected so far.
                sentences.append(current)
                current = []
    # The file may end without a trailing blank line.
    if current:
        sentences.append(current)
    return sentences

# Tokenize and align labels with tokenizer subwords
def tokenize_and_align(sentences, tokenizer, label_to_id):
    """Tokenize pre-split sentences and build one label id per produced token.

    Each sentence is tokenized on its own (no padding, truncation enabled) and
    the word-level labels are spread over the tokens using the encoding's
    ``word_ids``:

    * tokens that belong to no word (special tokens) get ``-100``;
    * the first token of a word gets the word's label;
    * further tokens of the same word get the word's label, with a leading
      ``B-`` prefix swapped for ``I-``.

    Empty sentences are skipped, so the returned lists can be shorter than
    ``sentences``.

    Args:
        sentences: Iterable of sentences, each a sequence of ``(word, label)``
            pairs.
        tokenizer: Callable tokenizer whose result exposes
            ``word_ids(batch_index=0)``.
        label_to_id: Mapping from label string to integer id. It must contain
            the ``I-`` counterpart of every ``B-`` label that can be split
            into several tokens.

    Returns:
        ``(tokenized_inputs, tokenized_labels)``: the tokenizer outputs and,
        in the same order, the aligned label-id lists.

    Raises:
        KeyError: If a (possibly ``I-`` converted) label is not in
            ``label_to_id``.
    """
    tokenized_inputs = []
    tokenized_labels = []
    for sentence in sentences:
        if not sentence:
            # Nothing to unpack or tokenize; zip(*[]) would fail on unpacking.
            continue
        words, labels = zip(*sentence)
        tokenized = tokenizer(
            list(words),
            is_split_into_words=True,
            truncation=True,
            padding=False,
            return_offsets_mapping=True,
            return_tensors="pt"
        )
        word_ids = tokenized.word_ids(batch_index=0)
        aligned_labels = []
        prev_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(-100)  # special token
            else:
                label = labels[word_idx]
                if word_idx == prev_word_idx and label.startswith("B-"):
                    # Continuation of a word: swap only the leading prefix.
                    # str.replace would also rewrite a "B-" inside the entity
                    # type (e.g. "B-JOB-TITLE" -> "I-JOI-TITLE").
                    label = "I-" + label[len("B-"):]
                aligned_labels.append(label_to_id[label])
            prev_word_idx = word_idx
        tokenized_inputs.append(tokenized)
        tokenized_labels.append(aligned_labels)
    return tokenized_inputs, tokenized_labels

# Prepare HuggingFace datasets from tokenized inputs and labels
def prepare_datasets(sentences, tokenizer, label_to_id, seed=42):
    """Tokenize the sentences and split them 90/10 into train/validation sets.

    The split is drawn with an explicitly seeded generator so that repeated
    calls over the same number of sentences select the same positions for
    validation. This keeps validation results comparable when several models
    (with different tokenizers) are trained on the same data.

    Args:
        sentences: Sentences as lists of ``(word, label)`` pairs.
        tokenizer: Tokenizer forwarded to ``tokenize_and_align``.
        label_to_id: Mapping from label string to integer id.
        seed: Seed for the train/validation split.

    Returns:
        ``(train_dataset, val_dataset)`` as ``datasets.Dataset`` objects with
        ``input_ids``, ``attention_mask`` and ``labels`` columns.
    """
    tokenized_inputs, tokenized_labels = tokenize_and_align(sentences, tokenizer, label_to_id)
    features = [
        {
            # squeeze(0) drops only the leading batch axis of the single
            # encoded sentence; a bare squeeze() would also collapse a
            # length-1 sequence axis.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label_ids, dtype=torch.long),
        }
        for encoding, label_ids in zip(tokenized_inputs, tokenized_labels)
    ]
    train_size = int(0.9 * len(features))
    val_size = len(features) - train_size
    split_generator = torch.Generator().manual_seed(seed)
    train_dataset_raw, val_dataset_raw = random_split(
        features, [train_size, val_size], generator=split_generator
    )
    train_dataset = Dataset.from_list([dict(sample) for sample in train_dataset_raw])
    val_dataset = Dataset.from_list([dict(sample) for sample in val_dataset_raw])
    return train_dataset, val_dataset


In [4]:
# Cell 3: Upload and load your data file
from google.colab import files
uploaded = files.upload()  # upload your .conll or similar file here
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a labeled data file.")
# Use the first uploaded file.
filename = next(iter(uploaded))

sentences = read_conll_file(filename)
if not sentences:
    raise ValueError(f"No labeled sentences could be parsed from {filename!r}.")
print(f"✅ Loaded {len(sentences)} labeled sentences")
print("🔎 Example (first 10 tokens of first sentence):")
print(sentences[0][:10])

# Define your label list here
label_list = ["O", "B-PRODUCT", "I-PRODUCT", "B-LOC", "I-LOC", "B-PRICE", "I-PRICE"]
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

# Fail fast: a label missing from label_list would otherwise only surface
# later as a KeyError while the labels are converted to ids.
unknown_labels = sorted(
    {label for sentence in sentences for _, label in sentence} - set(label_list)
)
if unknown_labels:
    raise ValueError(f"Labels in {filename!r} missing from label_list: {unknown_labels}")


Saving labeled_telegram_product_price_location.txt to labeled_telegram_product_price_location (1).txt
✅ Loaded 152 labeled sentences
🔎 Example (first 10 tokens of first sentence):
[('3pcs', 'B-PRODUCT'), ('silicon', 'I-PRODUCT'), ('brush', 'I-PRODUCT'), ('spatulas', 'I-PRODUCT'), ('እስከ', 'O'), ('260°c', 'O'), ('ሙቀት', 'O'), ('መቆቆም', 'O'), ('የሚችል', 'O'), ('ዋጋ-550ብር', 'I-PRICE')]


In [5]:
# Cell 4: Define a reusable function to train and evaluate a model by name
def train_and_evaluate_model(model_name, sentences, label_to_id, id_to_label, output_dir,
                             num_train_epochs=1, learning_rate=2e-5, batch_size=16):
    """Fine-tune a token-classification checkpoint and evaluate it.

    Loads the tokenizer and model for ``model_name``, builds train/validation
    datasets with ``prepare_datasets``, trains with the Hugging Face
    ``Trainer``, evaluates on the validation set, and saves the model and
    tokenizer to ``output_dir``.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        sentences: Sentences as lists of ``(word, label)`` pairs.
        label_to_id: Mapping from label string to integer id.
        id_to_label: Inverse mapping, stored on the model config.
        output_dir: Directory for the Trainer and for the saved model/tokenizer.
        num_train_epochs: Number of training epochs.
        learning_rate: Optimizer learning rate.
        batch_size: Per-device batch size for both training and evaluation.

    Returns:
        The dict returned by ``trainer.evaluate()``. No ``compute_metrics``
        function is supplied, so it holds the evaluation loss and the
        runtime statistics only.
    """
    print(f"\n\n=== Training model: {model_name} ===")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset, val_dataset = prepare_datasets(sentences, tokenizer, label_to_id)

    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_to_id))
    model.config.id2label = id_to_label
    model.config.label2id = label_to_id

    training_args = TrainingArguments(
        output_dir=output_dir,
        save_strategy="no",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        logging_strategy="no",
        # Mixed precision needs a CUDA device; fall back to full precision
        # instead of failing on a CPU-only runtime.
        fp16=torch.cuda.is_available(),
        report_to=[],
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)

    # The tokenizer is not handed to Trainer: padding is done by the explicit
    # collator and the tokenizer is saved explicitly below. This avoids the
    # `tokenizer=` keyword, which recent transformers releases deprecate.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
    )

    trainer.train()
    eval_results = trainer.evaluate()
    print(f"Evaluation results for {model_name}:\n", eval_results)

    # Save model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    return eval_results


In [6]:
# Cell 5: Fine-tune XLM-Roberta-base and keep its evaluation results
eval_xlm = train_and_evaluate_model(
    model_name="xlm-roberta-base",
    sentences=sentences,
    label_to_id=label_to_id,
    id_to_label=id_to_label,
    output_dir="./xlm-roberta-output",
)




=== Training model: xlm-roberta-base ===


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


Evaluation results for xlm-roberta-base:
 {'eval_loss': 1.523220181465149, 'eval_runtime': 0.1111, 'eval_samples_per_second': 144.063, 'eval_steps_per_second': 9.004, 'epoch': 1.0}


In [7]:
# Cell 6: Fine-tune DistilBERT-base-uncased and keep its evaluation results
eval_distilbert = train_and_evaluate_model(
    model_name="distilbert-base-uncased",
    sentences=sentences,
    label_to_id=label_to_id,
    id_to_label=id_to_label,
    output_dir="./distilbert-output",
)




=== Training model: distilbert-base-uncased ===


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


Evaluation results for distilbert-base-uncased:
 {'eval_loss': 0.9017500281333923, 'eval_runtime': 0.0216, 'eval_samples_per_second': 739.875, 'eval_steps_per_second': 46.242, 'epoch': 1.0}


In [8]:
# Cell 7: Print both evaluation dicts one after the other
print("\n\n=== Model Comparison ===")
for heading, metrics in (
    ("XLM-Roberta evaluation:", eval_xlm),
    ("DistilBERT evaluation:", eval_distilbert),
):
    print(heading, metrics)




=== Model Comparison ===
XLM-Roberta evaluation: {'eval_loss': 1.523220181465149, 'eval_runtime': 0.1111, 'eval_samples_per_second': 144.063, 'eval_steps_per_second': 9.004, 'epoch': 1.0}
DistilBERT evaluation: {'eval_loss': 0.9017500281333923, 'eval_runtime': 0.0216, 'eval_samples_per_second': 739.875, 'eval_steps_per_second': 46.242, 'epoch': 1.0}


In [10]:
# Install SHAP for the explanation below. transformers is already installed
# by the first install cell of this notebook.
!pip install shap transformers

import shap
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Reload the fine-tuned NER model and tokenizer from the DistilBERT output directory
model_path = "./distilbert-output"
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Wrap the pipeline
class NerWrapper:
    """Adapt an NER pipeline to a ``texts -> (n_texts, 1) array`` scoring function.

    For each text the score is the highest ``score`` among the entities the
    pipeline returns, or ``0.0`` when it returns none. A graded score gives
    the explainer something to attribute; a 0/1 "any entity found" flag stays
    constant across most perturbations of the input.
    """

    def __init__(self, pipeline):
        # Callable that maps one text to a list of entity dicts with a "score" key.
        self.pipeline = pipeline

    def __call__(self, texts):
        """Return one single-element row of scores per input text."""
        rows = []
        for text in texts:
            entities = self.pipeline(text)
            best = max((float(entity["score"]) for entity in entities), default=0.0)
            rows.append([best])
        return np.array(rows)

# Explain one sample sentence with SHAP, using the tokenizer as the text masker
sample_text = ["I bought a new iPhone in Addis Ababa for 30000 birr."]
scoring_fn = NerWrapper(ner_pipeline)
explainer = shap.Explainer(scoring_fn, tokenizer)
shap_values = explainer(sample_text)

# Render the per-token attributions for the first (only) sample
first_explanation = shap_values[0]
shap.plots.text(first_explanation)




Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


  0%|          | 0/306 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:12, 12.58s/it]               
