<a href="https://colab.research.google.com/github/Liya-F/amharic-ecommerce-data-extractor-w4/blob/googleColab/notebooks/task6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
!pip install --upgrade transformers



In [44]:
!pip install datasets seqeval accelerate



In [45]:
!pip install -U huggingface_hub



In [46]:
from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import AutoTokenizer

In [47]:
# Open Colab's file-upload dialog. The returned mapping is keyed by file name;
# the first key is used further down as the path of the labelled CoNLL file.
from google.colab import files
uploaded = files.upload()

Saving labeled_telegram_product_price_location.txt to labeled_telegram_product_price_location (3).txt


In [48]:
def read_conll_file(filename):
    """Parse a two-column, whitespace-separated CoNLL-style file.

    Every non-blank line is expected to hold exactly one token followed by
    one label. Blank (or whitespace-only) lines separate sentences. Lines
    that do not split into exactly two fields are reported on stdout and
    dropped.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences, each of which is a list of ``(word, label)``
        tuples. Empty sentences are never emitted.
    """
    sentences = []
    current = []

    with open(filename, encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            # A blank line closes the sentence being built, if there is one.
            if not stripped:
                if current:
                    sentences.append(current)
                    current = []
                continue

            fields = stripped.split()
            if len(fields) != 2:
                # Report and drop anything that is not a "token label" pair.
                print(f"Skipping malformed line: {stripped}")
                continue

            current.append((fields[0], fields[1]))

    # Flush the trailing sentence when the file does not end with a blank line.
    if current:
        sentences.append(current)

    return sentences

# Parse the first file returned by the Colab upload widget above.
filename = list(uploaded)[0]
sentences = read_conll_file(filename)

print(f"✅ Loaded {len(sentences)} labeled sentences")
print("🔎 Example:")
# Preview only the first 10 (word, label) pairs of the first sentence.
first_sentence = sentences[0]
print(first_sentence[:10])


✅ Loaded 152 labeled sentences
🔎 Example:
[('3pcs', 'B-PRODUCT'), ('silicon', 'I-PRODUCT'), ('brush', 'I-PRODUCT'), ('spatulas', 'I-PRODUCT'), ('እስከ', 'O'), ('260°c', 'O'), ('ሙቀት', 'O'), ('መቆቆም', 'O'), ('የሚችል', 'O'), ('ዋጋ-550ብር', 'I-PRICE')]


In [49]:
# Checkpoint identifier shared by the tokenizer loaded here and the
# token-classification model loaded later in the notebook.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [50]:
# BIO tag inventory; a tag's position in this list is its integer class id.
label_list = ["O", "B-PRODUCT", "I-PRODUCT", "B-LOC", "I-LOC", "B-PRICE", "I-PRICE"]

# Forward (tag -> id) and reverse (id -> tag) lookups derived from that order.
label_to_id = dict(zip(label_list, range(len(label_list))))
id_to_label = dict(enumerate(label_list))

In [51]:
def tokenize_and_align(sentences):
    """Tokenize pre-split sentences and project word labels onto sub-tokens.

    Relies on the notebook-level ``tokenizer`` and ``label_to_id``.

    Label assignment per sub-token:
      * special tokens (no source word) get ``-100``;
      * the first sub-token of a word gets the word's own label id;
      * later sub-tokens of the same word get the word's label with a
        leading ``B-`` rewritten to ``I-`` (other labels are repeated as is).

    Args:
        sentences: Iterable of sentences, each a non-empty sequence of
            ``(word, label)`` pairs.

    Returns:
        A ``(encodings, label_rows)`` tuple of two parallel lists: the
        tokenizer output for each sentence (requested with
        ``return_tensors="pt"``) and the aligned list of label ids.
    """
    encodings = []
    label_rows = []

    for sentence in sentences:
        words, labels = zip(*sentence)

        # One sentence per call; offsets are requested but not consumed here.
        encoding = tokenizer(
            list(words),
            is_split_into_words=True,
            truncation=True,
            padding=False,
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        row = []
        previous = None
        # word_ids() yields, for every sub-token, the index of its source word
        # (or None for special tokens).
        for current in encoding.word_ids(batch_index=0):
            if current is None:
                row.append(-100)
            else:
                tag = labels[current]
                # Continuation piece of a word: a B- tag becomes its I- twin.
                if current == previous and tag.startswith("B-"):
                    tag = tag.replace("B-", "I-")
                row.append(label_to_id[tag])
            previous = current

        encodings.append(encoding)
        label_rows.append(row)

    return encodings, label_rows

# Encode every loaded sentence once; the two returned lists are parallel
# (one encoding and one aligned label row per sentence).
tokenized_inputs, tokenized_labels = tokenize_and_align(sentences)
print("✅ Tokenized and aligned", len(tokenized_inputs), "samples")


✅ Tokenized and aligned 152 samples


In [52]:
from torch.utils.data import random_split
import torch
from datasets import Dataset

# Seed for the train/validation partition so that a fresh
# "Restart & Run All" reproduces the same split.
SPLIT_SEED = 42

# The encodings were requested with return_tensors="pt" one sentence at a
# time, so their tensors are expected to carry a leading batch dimension of
# size 1. squeeze(0) removes only that dimension; a bare squeeze() would also
# collapse any other size-1 dimension.
input_ids = [ti["input_ids"].squeeze(0) for ti in tokenized_inputs]
attention_masks = [ti["attention_mask"].squeeze(0) for ti in tokenized_inputs]

# One feature dict per sentence, in the column layout the Trainer consumes.
features = [
    {
        "input_ids": ids,
        "attention_mask": mask,
        "labels": torch.tensor(label, dtype=torch.long)
    }
    for ids, mask, label in zip(input_ids, attention_masks, tokenized_labels)
]

# 90 / 10 split; the validation set takes the remainder so the sizes always
# add up to len(features).
train_size = int(0.9 * len(features))
val_size = len(features) - train_size

# An explicitly seeded generator makes the random partition deterministic.
train_dataset_raw, val_dataset_raw = random_split(
    features,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_list([dict(sample) for sample in train_dataset_raw])
val_dataset = Dataset.from_list([dict(sample) for sample in val_dataset_raw])

print(f"✅ Dataset ready: {len(train_dataset)} train / {len(val_dataset)} validation samples")


✅ Dataset ready: 136 train / 16 validation samples


In [53]:
from transformers import (
    TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoModelForTokenClassification
)

# Aliases under the names the Transformers model config uses.
label2id = label_to_id
id2label = id_to_label

# Load the pretrained encoder with a fresh token-classification head.
# Passing the label maps here stores them in the model config at load time,
# instead of patching the config after the fact.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Pads input_ids / attention_mask / labels to a common length per batch.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Deliberately minimal run: a single epoch, no checkpoints, no logging.
training_args = TrainingArguments(
    output_dir="./ner-model-output",
    save_strategy="no",                    # Don't save every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,        # Bigger batch = fewer steps
    per_device_eval_batch_size=16,
    num_train_epochs=1,                    # Just 1 epoch
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="no",                 # Skip logging
    load_best_model_at_end=False,          # Skip model selection
    save_total_limit=1,
    report_to=[],                          # No WandB, no overhead
    # Mixed precision needs a CUDA device; requesting it unconditionally
    # makes the cell fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
)

# `processing_class` is the current name of the argument formerly called
# `tokenizer`; the old spelling is deprecated, and this notebook installs the
# latest transformers release.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=9, training_loss=1.4934797286987305, metrics={'train_runtime': 2.3261, 'train_samples_per_second': 58.468, 'train_steps_per_second': 3.869, 'total_flos': 17156540705568.0, 'train_loss': 1.4934797286987305, 'epoch': 1.0})

In [54]:
# Persist the fine-tuned model and its tokenizer side by side in the same
# directory. The last call returns the tuple of tokenizer file paths it wrote,
# which the notebook displays as this cell's output.
model.save_pretrained("./nerr-model")
tokenizer.save_pretrained("./nerr-model")

('./nerr-model/tokenizer_config.json',
 './nerr-model/special_tokens_map.json',
 './nerr-model/sentencepiece.bpe.model',
 './nerr-model/added_tokens.json',
 './nerr-model/tokenizer.json')

In [55]:
# Evaluate on the eval_dataset given to the Trainer. No compute_metrics
# function was configured, so the result holds loss and runtime statistics only.
trainer.evaluate()

{'eval_loss': 1.125124454498291,
 'eval_runtime': 0.0762,
 'eval_samples_per_second': 210.021,
 'eval_steps_per_second': 13.126,
 'epoch': 1.0}

In [None]:
import pandas as pd

# Relative weights of the two engagement signals in the lending score.
VIEWS_WEIGHT = 0.5
FREQUENCY_WEIGHT = 0.5

# Load your posts data CSV (replace with your actual file path)
df = pd.read_csv("telegram_posts_with_ner.csv", parse_dates=["timestamp"])

# Ensure price is numeric; values that cannot be parsed become NaN.
df['price'] = pd.to_numeric(df['price'], errors='coerce')

# Posts that carry a usable price. The per-vendor average below is derived
# from each vendor's own group instead; this frame is retained because later
# cells may reference it.
df_price = df.dropna(subset=['price'])

# Group by vendor
vendor_groups = df.groupby('vendor_id')

scorecard = []

for vendor, group in vendor_groups:
    # Posting frequency: posts per week over the vendor's active span.
    # A span shorter than one day yields 0 weeks, so fall back to 1 to avoid
    # dividing by zero.
    span_weeks = (group['timestamp'].max() - group['timestamp'].min()).days / 7
    total_weeks = span_weeks if span_weeks > 0 else 1
    posts_per_week = len(group) / total_weeks

    # Average views per post (missing view counts are ignored by mean()).
    views = group['views']
    avg_views = views.mean()

    # Top performing post (highest views). idxmax() cannot pick a row when
    # every view count is missing, so such vendors get empty top-post fields
    # instead of aborting the whole scorecard.
    if views.notna().any():
        top_post = group.loc[views.idxmax()]
        top_product = top_post['product']
        top_price = top_post['price']
    else:
        top_product = None
        top_price = float('nan')

    # Average price point over this vendor's priced posts; 0 when none of the
    # vendor's posts has a price. Using the group avoids rescanning the full
    # frame once per vendor.
    vendor_prices = group['price'].dropna()
    avg_price = vendor_prices.mean() if not vendor_prices.empty else 0

    # Simple Lending Score: weighted sum of the two engagement signals.
    lending_score = (avg_views * VIEWS_WEIGHT) + (posts_per_week * FREQUENCY_WEIGHT)

    scorecard.append({
        'vendor_id': vendor,
        'avg_views_per_post': round(avg_views, 2),
        'posts_per_week': round(posts_per_week, 2),
        'avg_price_ETB': round(avg_price, 2),
        'lending_score': round(lending_score, 2),
        'top_product': top_product,
        'top_product_price': top_price
    })

# Build the frame once from the collected records.
scorecard_df = pd.DataFrame(scorecard)

# Save to CSV
scorecard_df.to_csv("vendor_scorecard.csv", index=False)

print("✅ Vendor scorecard saved to vendor_scorecard.csv")
print(scorecard_df.head())