In [1]:
!pip install --upgrade transformers datasets accelerate evaluate seqeval telethon pandas

from google.colab import drive
import pandas as pd
import numpy as np
import os
import asyncio
import nest_asyncio
from telethon.sync import TelegramClient
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import evaluate

print("--- All libraries upgraded and imported. ---")

--- All libraries upgraded and imported. ---


In [2]:
drive.mount('/content/drive')
nest_asyncio.apply()

scraped_data_path = '/content/drive/My Drive/Data/scraped_telegram_data.csv'
df_scraped = pd.read_csv(scraped_data_path)
print("Scraped data loaded from CSV.")

def load_conll_file(file_path):
    tokens_list, tags_list = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        current_tokens, current_tags = [], []
        for line in f:
            line = line.strip()
            if line == "":
                if current_tokens:
                    tokens_list.append(current_tokens)
                    tags_list.append(current_tags)
                    current_tokens, current_tags = [], []
            else:
                parts = line.split(maxsplit=1)
                if len(parts) == 2:
                    current_tokens.append(parts[0])
                    current_tags.append(parts[1])
        if current_tokens:
            tokens_list.append(current_tokens)
            tags_list.append(current_tags)
    return {"tokens": tokens_list, "ner_tags": tags_list}

labeled_file_path = '/content/drive/My Drive/Data/amharic_ner_train.txt'
data = load_conll_file(labeled_file_path)
print("Labeled CoNLL data loaded.")

Mounted at /content/drive
Scraped data loaded from CSV.
Labeled CoNLL data loaded.


In [3]:
# STEP 6: Create label mappings
unique_tags = sorted(list(set(tag for tags in data['ner_tags'] for tag in tags)))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for i, tag in enumerate(unique_tags)}

# STEP 7: Create Hugging Face Dataset object
data['ner_tags_ids'] = [[tag2id[tag] for tag in tags] for tags in data['ner_tags']]
raw_dataset = Dataset.from_dict({
    "id": range(len(data['tokens'])),
    "tokens": data['tokens'],
    "ner_tags": data['ner_tags_ids']
})

print("Raw dataset created:")
print(raw_dataset)
print("Label mapping:", id2tag)

Raw dataset created:
Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 65
})
Label mapping: {0: 'B-LOC', 1: 'B-PRICE', 2: 'B-PRODUCT', 3: 'I-LOC', 4: 'I-PRICE', 5: 'I-PRODUCT', 6: 'O'}


In [4]:
# STEP 8: Define model and tokenizer, then tokenize the data
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None: label_ids.append(-100)
            elif word_idx != previous_word_idx: label_ids.append(label[word_idx])
            else: label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = raw_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2)
print("Datasets tokenized and split.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Datasets tokenized and split.


In [6]:
# STEP 9: Define everything for the Trainer and start training
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(id2tag), id2label=id2tag, label2id=tag2id
)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [[id2tag[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[id2tag[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}

training_args = TrainingArguments(
    output_dir="./results_xlm_roberta",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

trainer.save_model("/content/drive/My Drive/Data/models/xlm-roberta-amharic-ner")

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


In [7]:
# CELL 6 - Manual Evaluation

print("--- Training complete. Now running manual evaluation. ---")

predictions = trainer.predict(tokenized_datasets["test"])

final_metrics = compute_metrics((predictions.predictions, predictions.label_ids))

print("\n--- Final Evaluation Metrics ---")
print(final_metrics)

--- Training complete. Now running manual evaluation. ---



--- Final Evaluation Metrics ---
{'precision': np.float64(0.06451612903225806), 'recall': np.float64(0.04878048780487805), 'f1': np.float64(0.05555555555555555), 'accuracy': 0.5772058823529411}


In [14]:
import time
import pandas as pd
import numpy as np
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoTokenizer


def fine_tune_and_evaluate(model_checkpoint, tokenized_datasets, id2tag, tag2id):
    """
    Fine-tunes a model and returns its performance metrics and training time.
    """
    print(f"--- Starting fine-tuning for: {model_checkpoint} ---")

    # The simpler, original code
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint, num_labels=len(id2tag), id2label=id2tag, label2id=tag2id
    )

    # Re-tokenize data for the specific model's tokenizer
    def tokenize_and_align(examples):
        tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
        labels = []
        for i, label in enumerate(examples["ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None: label_ids.append(-100)
                elif word_idx != previous_word_idx: label_ids.append(label[word_idx])
                else: label_ids.append(-100)
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tokenized_data_for_model = tokenized_datasets.map(tokenize_and_align, batched=True, remove_columns=tokenized_datasets['train'].column_names)

    # Use the simple TrainingArguments that we know works
    training_args = TrainingArguments(
        output_dir=f"./results_{model_checkpoint.replace('/', '_')}",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        learning_rate=2e-5,
        weight_decay=0.01,
        report_to="none",
    )

    # Data Collator
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # Trainer - REMOVED EVALUATION ARGS
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data_for_model["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Train and time it
    start_time = time.time()
    trainer.train()
    end_time = time.time()
    training_time = end_time - start_time

    # Manually evaluate on the test set AFTER training is done
    print("--- Training complete. Evaluating on the test set. ---")
    predictions = trainer.predict(tokenized_data_for_model["test"])
    eval_results = compute_metrics((predictions.predictions, predictions.label_ids))

    # Save the model
    model_save_path = f"/content/drive/My Drive/Data/models/{model_checkpoint.replace('/', '_')}"
    trainer.save_model(model_save_path)
    print(f"Model saved to {model_save_path}")

    return {
        "model": model_checkpoint,
        "f1-score": eval_results["f1"],
        "precision": eval_results["precision"],
        "recall": eval_results["recall"],
        "training_time_sec": training_time,
    }

models_to_compare = [
    "xlm-roberta-base",
    "bert-base-multilingual-cased",
    "disentangle/bert-tiny-amharic"
]

results_list = []
for model_name in models_to_compare:
    result = fine_tune_and_evaluate(model_name, tokenized_datasets, id2tag, tag2id)
    results_list.append(result)

df_results = pd.DataFrame(results_list)
print("\n--- Model Comparison Results ---")
print(df_results)

df_results.to_csv('/content/drive/My Drive/Data/model_comparison_results.csv', index=False)

--- Starting fine-tuning for: xlm-roberta-base ---


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/52 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss


--- Training complete. Evaluating on the test set. ---


Model saved to /content/drive/My Drive/Data/models/xlm-roberta-base
--- Starting fine-tuning for: bert-base-multilingual-cased ---


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/52 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss


--- Training complete. Evaluating on the test set. ---


Model saved to /content/drive/My Drive/Data/models/bert-base-multilingual-cased
--- Starting fine-tuning for: disentangle/bert-tiny-amharic ---


OSError: disentangle/bert-tiny-amharic is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [15]:
!pip install shap -q

In [16]:
import shap
from transformers import pipeline

best_model_path = "/content/drive/My Drive/Data/models/xlm-roberta-base"

ner_pipeline = pipeline("ner", model=best_model_path, aggregation_strategy="simple")

explainer = shap.Explainer(ner_pipeline)

sample_text = "Lenovo ላፕቶፕ ዋጋ 30000 ብር ቦሌ ይገኛል"


shap_values = explainer([sample_text])

print("--- SHAP Explanation Plot ---")
shap.plots.text(shap_values)

Device set to use cpu


KeyError: 'label'

In [None]:
import pandas as pd
import re
from tqdm.auto import tqdm

df_scraped = pd.read_csv('/content/drive/My Drive/Data/scraped_telegram_data.csv')
df_scraped['date'] = pd.to_datetime(df_scraped['date'])

# --- 3. Create a helper function to extract price ---
def extract_price(text, nlp_pipeline):
    """
    Extracts the first price found in a text using the NER pipeline.
    Cleans the price to be a numeric value.
    """
    try:
        entities = nlp_pipeline(text)
        for entity in entities:
            if entity['entity_group'] == 'PRICE':
                price_str = entity['word']
                cleaned_price = re.sub(r'[^\d.]', '', price_str)
                if cleaned_price:
                    return float(cleaned_price)
    except Exception as e:

        return None
    return None


tqdm.pandas(desc="Extracting Prices")
df_scraped['extracted_price'] = df_scraped['text'].progress_apply(lambda x: extract_price(x, ner_pipeline))

vendor_analytics = []
grouped = df_scraped.groupby('channel').filter(lambda x: len(x) > 1)

for channel, group_df in grouped.groupby('channel'):
    time_span_days = (group_df['date'].max() - group_df['date'].min()).days
    if time_span_days == 0:
        time_span_days = 1

    posting_frequency_weekly = (len(group_df) / time_span_days) * 7

    avg_views_per_post = group_df['views'].mean()
    avg_price_point = group_df['extracted_price'].mean()

    vendor_analytics.append({
        'Vendor Channel': channel,
        'Total Posts Scraped': len(group_df),
        'Avg. Views/Post': avg_views_per_post,
        'Posts/Week': posting_frequency_weekly,
        'Avg. Price (ETB)': avg_price_point
    })

df_scorecard = pd.DataFrame(vendor_analytics)

df_scorecard['norm_views'] = df_scorecard['Avg. Views/Post'] / df_scorecard['Avg. Views/Post'].max()
df_scorecard['norm_freq'] = df_scorecard['Posts/Week'] / df_scorecard['Posts/Week'].max()

df_scorecard['Lending Score'] = (df_scorecard['norm_views'] * 0.5) + (df_scorecard['norm_freq'] * 0.5)

final_report_table = df_scorecard[['Vendor Channel', 'Avg. Views/Post', 'Posts/Week', 'Avg. Price (ETB)', 'Lending Score']]
final_report_table = final_report_table.sort_values(by='Lending Score', ascending=False).reset_index(drop=True)

print("\n--- FinTech Vendor Scorecard ---")
print(final_report_table.to_string(formatters={
    'Avg. Views/Post': '{:,.0f}'.format,
    'Posts/Week': '{:,.1f}'.format,
    'Avg. Price (ETB)': '{:,.2f}'.format,
    'Lending Score': '{:,.3f}'.format
}))