In [4]:
import pandas as pd
import re
import os
from datasets import Dataset

os.makedirs('data/labeled', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)

raw_data = pd.read_csv('../data/processed/processed_data_20250622_231744.csv')

product_patterns = ['የፍራፍሬ', 'ሶፋና', 'አልጋ', 'የቀለሜ', 'መነሻ', 'መጥበሻ', 'ጆግና', 'የሕፃን', 'የሽንት', 'ምንጣፍ']
price_patterns = [r'\d+\s*ብር', r'\d+']
location_patterns = ['ገርጂ', 'መገናኛ', '4ኪሎ', 'ልደታ', 'ድሬዳዋ', 'መሰረት', 'ደፋር', 'ሞል', 'ሜክሲኮ']

def label_message(text):
    if not isinstance(text, str):
        return {'tokens': [], 'ner_tags': []}
    tokens = text.split()
    labels = ['O'] * len(tokens)
    for idx, token in enumerate(tokens):
        if any(re.match(pattern, token) for pattern in price_patterns):
            labels[idx] = 'B-PRICE'
            if idx + 1 < len(tokens) and tokens[idx + 1] == 'ብር':
                labels[idx + 1] = 'I-PRICE'
        elif any(product in token for product in product_patterns):
            labels[idx] = 'B-Product'
            for next_idx in range(idx + 1, len(tokens)):
                if next_idx == len(tokens) - 1 or tokens[next_idx] in ['እና', '፤', '፣']:
                    break
                labels[next_idx] = 'I-Product'
        elif any(loc in token for loc in location_patterns):
            labels[idx] = 'B-LOC'
            for next_idx in range(idx + 1, len(tokens)):
                if next_idx == len(tokens) - 1 or tokens[next_idx] in ['እና', '፤', '፣']:
                    break
                labels[next_idx] = 'I-LOC'
    return {'tokens': tokens, 'ner_tags': labels}

labeled_data = raw_data['Message Text'].apply(label_message)
data_dict = {'tokens': labeled_data.apply(lambda x: x['tokens']).tolist(),
             'ner_tags': labeled_data.apply(lambda x: x['ner_tags']).tolist()}

data_dict = {'tokens': [t for t in data_dict['tokens'] if t],
             'ner_tags': [l for l in data_dict['ner_tags'] if l]}

with open('../data/labeled/relabeled_data_20250622_232809.conll', 'w', encoding='utf-8') as f:
    for tokens, labels in zip(data_dict['tokens'], data_dict['ner_tags']):
        for token, label in zip(tokens, labels):
            f.write(f"{token} {label}\n")
        f.write("\n")


In [5]:
dataset = Dataset.from_dict(data_dict)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset.save_to_disk('../data/processed/tokenized_dataset')

pd.DataFrame({'tokens': dataset['train']['tokens'][:5], 'ner_tags': dataset['train']['ner_tags'][:5]})


Saving the dataset (1/1 shards): 100%|██████████| 364/364 [00:00<00:00, 11065.80 examples/s]


Saving the dataset (1/1 shards): 100%|██████████| 92/92 [00:00<00:00, 4190.75 examples/s]


Unnamed: 0,tokens,ner_tags
0,"[8️⃣pcs, Glass, Kettle, Set, ⭐️ማራኪ, እና, በዉብ, ዲ...","[B-PRICE, O, O, O, O, O, O, O, B-PRICE, O, O, ..."
1,"[ከስምንት, አመት, ጀምሮ, ላሉ, ልጆች, የሚመከር, 📱0989939393,...","[O, O, O, O, O, O, O, O, B-PRICE, O, O, O, O, O]"
2,"[NIKE, PENNY, 1, original, 💯, Size, 40#41#42#4...","[O, O, B-PRICE, O, O, O, B-PRICE, O, O, O, O, ..."
3,"[ድክዬ, ባለ, ድምፅ, የህፃናት, መመሪያ!!, ልጆችዎ, ድክ, ድክ, ብለ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[Under, armour, Curry, 11ORIGINAL, 💯, Size, 40...","[O, O, O, B-PRICE, O, O, B-PRICE, O, O, O, O, ..."
