# Set Up Paths and Read CSV

In [10]:
import pandas as pd
from pathlib import Path

# Read the scraped Telegram export into a DataFrame and preview its first rows.
raw_csv_path = Path("../data/raw/telegram_data.csv")
df = pd.read_csv(raw_csv_path)
df.head()


Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,Zemen Express¬Æ,@ZemenExpress,6994,,2025-06-21 16:35:51+00:00,../data/raw/photos\@ZemenExpress_6994.jpg
1,Zemen Express¬Æ,@ZemenExpress,6993,,2025-06-21 16:35:51+00:00,../data/raw/photos\@ZemenExpress_6993.jpg
2,Zemen Express¬Æ,@ZemenExpress,6992,,2025-06-21 16:35:51+00:00,../data/raw/photos\@ZemenExpress_6992.jpg
3,Zemen Express¬Æ,@ZemenExpress,6991,üí•üí•...................................üí•üí•\n\nüìåSa...,2025-06-21 16:35:51+00:00,../data/raw/photos\@ZemenExpress_6991.jpg
4,Zemen Express¬Æ,@ZemenExpress,6990,,2025-06-21 08:07:31+00:00,../data/raw/photos\@ZemenExpress_6990.jpg


# Amharic Text Preprocessing

In [11]:
import re
import emoji

# Everything that is NOT a word character, whitespace, or one of the Ethiopic
# marks below is treated as noise. The marks are written as \u escapes so the
# pattern cannot be corrupted when the notebook is saved or loaded with the
# wrong encoding:
#   U+1361..U+1368  Ethiopic wordspace, full stop, comma, semicolon, colon,
#                   preface colon, question mark, paragraph separator
#   U+137C          Ethiopic number ten thousand
_NON_TEXT_RE = re.compile(r'[^\w\s\u1361-\u1368\u137C]+')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_amharic_text(text):
    """Normalize one scraped message for downstream Amharic text processing.

    Steps, in order:
      1. Missing values (anything ``pd.isna`` reports as NA) become ``""``.
      2. Emoji are removed with ``emoji.replace_emoji``.
      3. Each run of characters that is neither a word character, whitespace,
         nor an Ethiopic mark listed above is replaced by one space. This
         drops Latin punctuation and other symbols while keeping letters and
         digits of any script.
      4. Whitespace runs are collapsed to one space and the ends are trimmed.

    Args:
        text: A message cell. Normally a ``str`` or NaN; other scalar values
            are converted with ``str()`` instead of raising.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    if pd.isna(text):
        return ""
    # str() guards against stray non-string cells (e.g. a purely numeric message).
    text = emoji.replace_emoji(str(text), "")
    text = _NON_TEXT_RE.sub(' ', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# Add a cleaned version of every message next to the raw text.
df['cleaned_text'] = df['Message'].apply(clean_amharic_text)

# Preview a few rows. The seed is fixed so "Restart & Run All" shows the same
# rows every time, and the sample size is capped so a short frame cannot raise.
df[['Channel Title', 'cleaned_text']].sample(min(5, len(df)), random_state=42)


Unnamed: 0,Channel Title,cleaned_text
3748,·àç·ã© ·ä•·âÉ,·ã®·çà·àà·åâ·âµ·äï 1 ·ãï·âÉ ·àò·à≠·å†·ãç ·â†·äê·çÉ ·ã≠·ãç·à∞·ã± ·ä®·àã·ã≠ ·ä®·â∞·àà·å†·çâ·âµ 32 ·ä•·âÉ·ãé·âΩ ·ãâ...
2006,·àò·äê·àª·ã¨,·åà·àã·åç·àå ·ã®·àç·åÜ·âΩ ·àù·åç·â• ·àò·àµ·à™·ã´ ·ã®·àç·åÜ·âΩ·äï ·àç·â• ·ã®·àö·ã´·à∏·äï·çç·ç£ ·ã®·ä•·äì·âµ·äï ·åä·ãú ·ã®...
4741,Fashion tera,
2589,·àò·äê·àª·ã¨,Building block the restaurant ·àç·åÜ·âΩ ·ã®·à´·à≥·â∏·ãç·äï ·à¨·àµ·â∂·à´·äï...
77,Zemen Express¬Æ,


# Save Cleaned Data

In [12]:
# Make sure the output folder exists (no error if it already does), then
# write the frame, including the new cleaned_text column, as UTF-8 CSV
# without the index.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

cleaned_csv = "../data/processed/cleaned_telegram_data.csv"
df.to_csv(cleaned_csv, index=False, encoding="utf-8")
print("Saved to data/processed/cleaned_telegram_data.csv")


Saved to data/processed/cleaned_telegram_data.csv


# Load Existing Labeled Files

In [None]:
# Sanity-check the labeled files: load every line, then show the first 20.
# Judging by the printed samples, each line holds a token and its tag.

# General-purpose NER training file.
with open("../data/labels/train.txt", encoding="utf-8") as f:
    lines = list(f)
print("Sample from train.txt:")
print(*lines[:20], sep="")

# Telegram-specific product / price / location labels.
with open("../data/labels/labeled_telegram_product_price_location.txt", encoding="utf-8") as f:
    labeled_lines = list(f)
print("\nSample from labeled_telegram_product_price_location.txt:")
print(*labeled_lines[:20], sep="")
    


Sample from train.txt:
·ä¢·ã¥·çì B-ORG
·â†·ã®·ä≠·àç·àâ O
·â†·àö·äï·âÄ·à≥·âÄ·àµ·â†·âµ O
·åä·ãú O
·àÅ·àâ O
·ã®·àÄ·åà·à™·â±·äï O
·ä†·å†·âÉ·àã·ã≠ O
·àï·åç·ä•·äï·ã≤·àÅ·àù O
·ã®·ä†·ä´·â£·â¢·ãç·äï O
·â£·àÖ·àç·äì O
·âã·äï·âã O
·ä†·ä≠·â•·àÆ O
·â†·ä†·ä´·â£·â¢·ãç O
·ã®·àö·åà·äô O
·ã®·çñ·àà·â≤·ä´ O
·ãµ·à≠·åÖ·â∂·âΩ·äï·àù O
·ä†·ä≠·â•·àÆ·äì O
·àò·â•·â≥·â∏·ãç·äï O
·å†·â•·âÜ O
·â†·å®·ãã·äê·âµ O


Sample from labeled_telegram_product_price_location.txt:
3pcs B-PRODUCT
silicon I-PRODUCT
brush I-PRODUCT
spatulas I-PRODUCT
·ä•·àµ·ä® O
260¬∞c O
·àô·âÄ·âµ O
·àò·âÜ·âÜ·àù O
·ã®·àö·âΩ·àç O
·ãã·åã-550·â•·à≠ I-PRICE
·ä†·ãµ·à´·àª O
·âÅ.1 O
·àµ·à™ O
·ä§·àù O
·à≤·â≤ O
·àû·àç O
·àÅ·àà·â∞·äõ O
·çé·âÖ O
·â¢·àÆ O
·âÅ. O

