# Set Up Paths and Read CSV

In [10]:
import pandas as pd
from pathlib import Path

# Read the scraped Telegram messages into a DataFrame and preview the first rows
raw_messages_csv = "../data/raw/telegram_data.csv"
df = pd.read_csv(raw_messages_csv)
df.head()


Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,Zemen Express®,@ZemenExpress,6994,,2025-06-21 16:35:51+00:00,../data/raw/photos\@ZemenExpress_6994.jpg
1,Zemen Express®,@ZemenExpress,6993,,2025-06-21 16:35:51+00:00,../data/raw/photos\@ZemenExpress_6993.jpg
2,Zemen Express®,@ZemenExpress,6992,,2025-06-21 16:35:51+00:00,../data/raw/photos\@ZemenExpress_6992.jpg
3,Zemen Express®,@ZemenExpress,6991,💥💥...................................💥💥\n\n📌Sa...,2025-06-21 16:35:51+00:00,../data/raw/photos\@ZemenExpress_6991.jpg
4,Zemen Express®,@ZemenExpress,6990,,2025-06-21 08:07:31+00:00,../data/raw/photos\@ZemenExpress_6990.jpg


# Amharic Text Preprocessing

In [11]:
import re
import emoji

def clean_amharic_text(text):
    """Normalize one scraped message into plain, single-spaced text.

    Processing order:
      1. Missing values (anything ``pd.isna`` reports as missing) yield "".
      2. Non-string values are converted with ``str``.
      3. Emoji are replaced by a space via ``emoji.replace_emoji``.
      4. Each run of characters that is not a regex word character, not
         whitespace, and not one of the Ethiopic marks listed in the pattern
         is replaced by a single space.
      5. Whitespace runs are collapsed to one space and both ends are trimmed.

    Args:
        text: The message to clean, or a missing value.

    Returns:
        The cleaned string; "" when the input is missing.
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        # emoji/re only accept strings; coerce stray numeric cells instead of raising.
        text = str(text)
    # Substitute a space rather than "": an emoji is frequently the only
    # separator between two tokens, and deleting it would glue them together.
    # The extra spaces are collapsed by the whitespace pass below.
    text = emoji.replace_emoji(text, replace=" ")
    # Keep word characters, whitespace and the listed Ethiopic marks; every
    # other symbol (Latin punctuation included) becomes a space.
    text = re.sub(r'[^\w\s፡።፣፤፥፦፧፼፨]+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Store the cleaned text next to the raw 'Message' column
df['cleaned_text'] = df['Message'].apply(clean_amharic_text)

# Spot-check a few rows. A fixed random_state keeps this preview identical on
# every "Restart & Run All"; min() avoids the ValueError that sample() raises
# when asked for more rows than the frame contains.
df[['Channel Title', 'cleaned_text']].sample(n=min(5, len(df)), random_state=42)


Unnamed: 0,Channel Title,cleaned_text
3748,ልዩ እቃ,የፈለጉትን 1 ዕቃ መርጠው በነፃ ይውሰዱ ከላይ ከተለጠፉት 32 እቃዎች ዉ...
2006,መነሻዬ,ገላግሌ የልጆች ምግብ መስሪያ የልጆችን ልብ የሚያሸንፍ፣ የእናትን ጊዜ የ...
4741,Fashion tera,
2589,መነሻዬ,Building block the restaurant ልጆች የራሳቸውን ሬስቶራን...
77,Zemen Express®,


# Save Cleaned Data

In [12]:
# Make sure the output folder exists before writing into it
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# Write the frame (including the new cleaned_text column) without the index
cleaned_csv_path = "../data/processed/cleaned_telegram_data.csv"
df.to_csv(cleaned_csv_path, encoding="utf-8", index=False)
print("Saved to data/processed/cleaned_telegram_data.csv")


Saved to data/processed/cleaned_telegram_data.csv


# Load Existing Labeled Files

In [None]:
# Check your labeled files
def _preview_label_file(path, heading, n_lines=20):
    """Read a text file, print its first ``n_lines`` lines, and return all lines.

    Args:
        path: Location of the file; it is opened as UTF-8 text.
        heading: Text printed immediately before the preview.
        n_lines: Number of leading lines to print (default 20).

    Returns:
        Every line of the file, line endings included, as produced by
        ``readlines()``.
    """
    with open(path, encoding="utf-8") as fh:
        file_lines = fh.readlines()
    # Print after the handle is closed; only the leading slice is shown.
    print(heading)
    print("".join(file_lines[:n_lines]))
    return file_lines


lines = _preview_label_file("../data/labels/train.txt", "Sample from train.txt:")
labeled_lines = _preview_label_file(
    "../data/labels/labeled_telegram_product_price_location.txt",
    "\nSample from labeled_telegram_product_price_location.txt:",
)
    


Sample from train.txt:
ኢዴፓ B-ORG
በየክልሉ O
በሚንቀሳቀስበት O
ጊዜ O
ሁሉ O
የሀገሪቱን O
አጠቃላይ O
ሕግእንዲሁም O
የአካባቢውን O
ባህልና O
ቋንቋ O
አክብሮ O
በአካባቢው O
የሚገኙ O
የፖለቲካ O
ድርጅቶችንም O
አክብሮና O
መብታቸውን O
ጠብቆ O
በጨዋነት O


Sample from labeled_telegram_product_price_location.txt:
3pcs B-PRODUCT
silicon I-PRODUCT
brush I-PRODUCT
spatulas I-PRODUCT
እስከ O
260°c O
ሙቀት O
መቆቆም O
የሚችል O
ዋጋ-550ብር I-PRICE
አድራሻ O
ቁ.1 O
ስሪ O
ኤም O
ሲቲ O
ሞል O
ሁለተኛ O
ፎቅ O
ቢሮ O
ቁ. O

