In [11]:
!pip install python-dotenv





In [2]:
import pandas as pd
import sys  
import os
import nest_asyncio
import numpy as np
from telethon.sync import TelegramClient
import re
import emoji
import csv
from dotenv import load_dotenv


# Selecting 5 channels nand Scrape messages 

In [15]:
# Allow nested asyncio loops (needed in notebooks like Jupyter)
nest_asyncio.apply()

# Load environment variables from .env file
load_dotenv('.env')

# Retrieve API credentials
API_ID = os.getenv('TG_API_ID')
API_HASH = os.getenv('TG_API_HASH')

if API_ID is None:
    raise ValueError("API_ID not found in environment variables")
else:
    API_ID = int(API_ID)

if API_HASH is None:
    raise ValueError("API_HASH not found in environment variables")

# Async function to fetch messages and media from a channel
async def fetch_channel_messages(client, channel_handle, csv_writer, download_folder):
    channel_entity = await client.get_entity(channel_handle)
    channel_name = channel_entity.title
    async for msg in client.iter_messages(channel_entity, limit=10000):
        media_file_path = None
        if msg.media and hasattr(msg.media, 'photo'):
            filename = f"{channel_handle}_{msg.id}.jpg"
            media_file_path = os.path.join(download_folder, filename)
            await client.download_media(msg.media, media_file_path)

        csv_writer.writerow([
            channel_name,
            channel_handle,
            msg.id,
            msg.message,
            msg.date,
            media_file_path
        ])

# Main async function to run the scraper
async def run_scraper():
    client = TelegramClient('session_scraper', API_ID, API_HASH)
    await client.start()

    # Define where to save data and images
    base_folder = r"C:\Users\HP\10 Acadamy PRojects\New folder (4)\amharic-ecommerce-scraper\data"
    media_folder = os.path.join(base_folder, 'downloaded_images')
    os.makedirs(media_folder, exist_ok=True)

    csv_path = os.path.join(base_folder, 'output_telegram_data.csv')

    with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Channel Title', 'Channel Handle', 'Message ID', 'Content', 'Timestamp', 'Media File'])

        target_channels = [
            'ZemenExpress',
            'meneshayeofficial',
            'ethio_brand_collection',
            'Shewabrand',
            'qnashcom'
        ]

        for channel in target_channels:
            print(f"Scraping channel: {channel} ...")
            await fetch_channel_messages(client, channel, writer, media_folder)
            print(f"✅ Completed scraping {channel}")

    await client.disconnect()

# Run the scraper in a notebook or async environment
await run_scraper()


Scraping channel: ZemenExpress ...
✅ Completed scraping ZemenExpress
Scraping channel: meneshayeofficial ...
✅ Completed scraping meneshayeofficial
Scraping channel: ethio_brand_collection ...
✅ Completed scraping ethio_brand_collection
Scraping channel: Shewabrand ...
✅ Completed scraping Shewabrand
Scraping channel: qnashcom ...
✅ Completed scraping qnashcom


# Cleaning The Dataset

In [7]:

# Paths
base_dir = r"C:\Users\HP\10 Acadamy PRojects\New folder (4)\amharic-ecommerce-scraper\data"
input_file = os.path.join(base_dir, "output_telegram_data.csv")
output_file = os.path.join(base_dir, "output_telegram_data_cleaned.csv")

# Load data
df = pd.read_csv(input_file, encoding='utf-8')


# Clean Amharic text
def clean_amharic_text(text):
    if pd.isnull(text):
        return ""
    
    text = emoji.replace_emoji(text, replace='')  # Remove emojis
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # URLs
    text = re.sub(r"[A-Za-z]", '', text)  # English
    text = re.sub(r"[^\u1200-\u137F\u1380-\u139F\u2D80-\u2DDF0-9፡።፣፤፥፦፧፨\s]", '', text)
    text = re.sub(r"\s+", " ", text)  # Normalize spaces
    return text.strip()

# Tokenizer
def tokenize_text(text):
    return text.split() if text else []

# Apply
df['clean_text'] = df['Content'].apply(clean_amharic_text)
df['tokens'] = df['clean_text'].apply(tokenize_text)

# Save result
df.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"✅ Cleaned data saved to:\n{output_file}")


✅ Cleaned data saved to:
C:\Users\HP\10 Acadamy PRojects\New folder (4)\amharic-ecommerce-scraper\data\output_telegram_data_cleaned.csv


In [21]:
df = pd.read_csv("C:/Users/HP/10 Acadamy PRojects/New folder (4)/amharic-ecommerce-scraper/data/output_telegram_data_cleaned.csv")
df.head()

Unnamed: 0,Channel Title,Channel Handle,Message ID,Content,Timestamp,Media File,clean_text,tokens
0,Zemen Express®,ZemenExpress,6982,💥💥...................................💥💥\n\n📌Im...,2025-06-18 06:01:10+00:00,,በኤሌክትሪክየሚሰራ ለቤት መልካም መዓዛን የሚሰጥ ዋጋ፦ 1400 ብር ውስን...,"['በኤሌክትሪክየሚሰራ', 'ለቤት', 'መልካም', 'መዓዛን', 'የሚሰጥ',..."
1,Zemen Express®,ZemenExpress,6981,💥💥...................................💥💥\n\n📌 B...,2025-06-16 12:21:00+00:00,,በፈለጉት አቅጣጫ ልጅዎን በምቾት ማዘል ያስችልዎታል ዋጋ፦ 2400 ብር ው...,"['በፈለጉት', 'አቅጣጫ', 'ልጅዎን', 'በምቾት', 'ማዘል', 'ያስችል..."
2,Zemen Express®,ZemenExpress,6980,,2025-06-16 05:11:57+00:00,C:\Users\HP\10 Acadamy PRojects\New folder (4)...,,[]
3,Zemen Express®,ZemenExpress,6979,,2025-06-16 05:11:57+00:00,C:\Users\HP\10 Acadamy PRojects\New folder (4)...,,[]
4,Zemen Express®,ZemenExpress,6978,,2025-06-16 05:11:57+00:00,C:\Users\HP\10 Acadamy PRojects\New folder (4)...,,[]


# Selecting 50 Massages from the DF

In [13]:
import pandas as pd
import os

# Load cleaned data
base_dir = r"C:\Users\HP\10 Acadamy PRojects\New folder (4)\amharic-ecommerce-scraper\data"
input_file = os.path.join(base_dir, "output_telegram_data_cleaned.csv")
df = pd.read_csv(input_file)

# Define keyword lists
price_keywords = ['ብር', 'ዋጋ', 'በ']
product_keywords = ['ልጅ', 'ሻርተ', 'እቃ', 'ቦትል', 'ኩባያ']
location_keywords = ['አዲስ', 'ቦሌ', 'ሀዋሳ', 'መጫኛ', 'ትኩሳት']

# Combine them
all_keywords = price_keywords + product_keywords + location_keywords

# Function to check if any keyword exists in text
def contains_keywords(text, keywords):
    return any(kw in str(text) for kw in keywords)

# Filter rows with relevant content
filtered_df = df[df['clean_text'].apply(lambda x: contains_keywords(x, all_keywords))]

# Limit to 50 messages
filtered_sample = filtered_df.sample(n=50, random_state=42)

# Save
output_sample_path = os.path.join(base_dir, "ner_labeling_sample.csv")
filtered_sample.to_csv(output_sample_path, index=False, encoding='utf-8-sig')

print(f"✅ Filtered 50 messages and saved to:\n{output_sample_path}")


✅ Filtered 50 messages and saved to:
C:\Users\HP\10 Acadamy PRojects\New folder (4)\amharic-ecommerce-scraper\data\ner_labeling_sample.csv


In [19]:
df = pd.read_csv("C:/Users/HP/10 Acadamy PRojects/New folder (4)/amharic-ecommerce-scraper/data/ner_labeling_sample.csv")
df.head()

Unnamed: 0,Channel Title,Channel Handle,Message ID,Content,Timestamp,Media File,clean_text,tokens
0,Zemen Express®,ZemenExpress,4150,💥💥...................................💥💥\n\n📌Fl...,2024-03-23 09:34:00+00:00,,የራሱ ሪሞት ያለው ከስቲም በተጨማሪ ውብ ብርሀን የሚሰጥ 200 ውሀ የመያ...,"['የራሱ', 'ሪሞት', 'ያለው', 'ከስቲም', 'በተጨማሪ', 'ውብ', '..."
1,qnash.com - ቅናሽ ®️,qnashcom,1686,🤩 ታላቅ ቅናሽ 💯 🤩 \n❇️ Universal water-saving dish...,2022-05-31 05:56:50+00:00,,ታላቅ ቅናሽ 360 100 ዋጋ፦ 350 ብር ውስን ፍሬ ነው የቀረው ጥራት ...,"['ታላቅ', 'ቅናሽ', '360', '100', 'ዋጋ፦', '350', 'ብር..."
2,Zemen Express®,ZemenExpress,999,🎯Electric Coffee Maker\n🔰300ml የሚይዝ ለ 6 ስኒ\n\n...,2022-02-03 08:22:27+00:00,C:\Users\HP\10 Acadamy PRojects\New folder (4)...,300 የሚይዝ ለ 6 ስኒ ዋጋ፦ 1450 ብር አድራሻ መገናኛ ደራርቱ ህንፃ...,"['300', 'የሚይዝ', 'ለ', '6', 'ስኒ', 'ዋጋ፦', '1450',..."
3,Shewa Brand,Shewabrand,3710,jordan 1 original 💯 \nSize 40#41#42#43\nMADE I...,2025-06-20 06:15:40+00:00,C:\Users\HP\10 Acadamy PRojects\New folder (4)...,1 40414243 አድራሻ ድሬዳዋ አሸዋ ሚና ህንፃ 1ኛ ፎቅ ላይ እንገኛለ...,"['1', '40414243', 'አድራሻ', 'ድሬዳዋ', 'አሸዋ', 'ሚና',..."
4,qnash.com - ቅናሽ ®️,qnashcom,3282,😃 የሸቶ መቀነሻ Mini spray pour\n\nየሸቶ መቀነሻ 👉\n\n ...,2023-10-13 07:30:00+00:00,,የሸቶ መቀነሻ የሸቶ መቀነሻ 200 ብር ውስን ፍሬ ነው የቀረው ጥራት ዋስ...,"['የሸቶ', 'መቀነሻ', 'የሸቶ', 'መቀነሻ', '200', 'ብር', 'ው..."
