In [5]:
# ---[ Environment Setup ]---
import os
import sys
import pandas as pd
import re
from tqdm import tqdm
from dotenv import load_dotenv
import nest_asyncio
from IPython.display import display, HTML

# Enable asyncio compatibility in notebooks
nest_asyncio.apply()

# Add project root to path to find scripts/
sys.path.append(os.path.abspath(".."))

# Load .env credentials
load_dotenv()

# ---[ Import fetch function ]---
from scripts.telegram_scraper import fetch_messages_async  # Make sure function is defined there

# ---[ Telegram Channels ]---
# Public Telegram channel handles to scrape, in fetch order.
channels = [
    "@ZemenExpress",
    "@nevacomputer",
    "@meneshayeofficial",
    "@ethio_brand_collection",
    "@helloomarketethiopia",
    "@modernshoppingcenter",
    "@kuruwear",
]

# ---[ Fetch Messages ]---
all_dfs = []

for channel in tqdm(channels):
    try:
        print(f"📥 Fetching from: {channel}")
        df = await fetch_messages_async(channel, limit=300)
        print(f"✅ {channel} → {len(df)} messages")
        all_dfs.append(df)
    except Exception as e:
        print(f"❌ {channel} failed: {e}")

# ---[ Combine and Save Raw Data ]---
if all_dfs:
    # Stack the per-channel results into one table with a fresh 0..n-1 index.
    combined_df = pd.concat(all_dfs, ignore_index=True)

    # Write the untouched scrape to disk before any cleaning is applied;
    # the target folder is created on first run.
    raw_data_path = "../data/raw/telegram_posts.csv"
    os.makedirs(os.path.dirname(raw_data_path), exist_ok=True)
    combined_df.to_csv(raw_data_path, index=False)

    # ---[ Clean Amharic Text ]---
    def clean_text(text):
        """Normalise one post's text.

        Removes every character that is not a word character, whitespace, or
        inside the Ethiopic ranges listed in the pattern (U+1200-137F,
        U+1380-139F, U+2D80-2DDF), then collapses each whitespace run to a
        single space and trims both ends.

        Args:
            text: Value to clean. Non-string values are converted with
                ``str``; ``None`` and float NaN are treated as empty text.

        Returns:
            The cleaned string, possibly empty.
        """
        # Missing values would otherwise be stringified to "None" / "nan".
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
        # Strip unwanted characters BEFORE collapsing whitespace: deleting an
        # emoji or punctuation mark that sits between two spaces would
        # otherwise leave a double space behind.
        text = re.sub(r'[^\w\s፡።\u1200-\u137F\u1380-\u139F\u2D80-\u2DDF]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    combined_df['cleaned_text'] = combined_df['text'].fillna("").apply(clean_text)

    # ---[ Save Cleaned Data ]---
    os.makedirs("../data/processed", exist_ok=True)
    cleaned_data_path = "../data/processed/cleaned_telegram_posts.csv"
    combined_df.to_csv(cleaned_data_path, index=False)

    # ---[ Display Output ]---
    display(HTML("<h3>Cleaned Telegram Posts</h3>"))
    display(combined_df.head(10))

    (cleaned_data_path, raw_data_path)

else:
    raise ValueError("❌ All fetches failed — no data to save or process.")


  0%|          | 0/7 [00:00<?, ?it/s]

📥 Fetching from: @ZemenExpress


Invalid code. Please try again.


Signed in successfully as ClickHealth; remember to not break the ToS or you will risk an account ban!


 14%|█▍        | 1/7 [01:24<08:28, 84.74s/it]

✅ @ZemenExpress → 40 messages
📥 Fetching from: @nevacomputer


 29%|██▊       | 2/7 [01:25<02:56, 35.22s/it]

✅ @nevacomputer → 32 messages
📥 Fetching from: @meneshayeofficial


 43%|████▎     | 3/7 [01:26<01:18, 19.59s/it]

✅ @meneshayeofficial → 92 messages
📥 Fetching from: @ethio_brand_collection


 57%|█████▋    | 4/7 [01:26<00:36, 12.11s/it]

✅ @ethio_brand_collection → 99 messages
📥 Fetching from: @helloomarketethiopia


 71%|███████▏  | 5/7 [01:27<00:15,  7.97s/it]

✅ @helloomarketethiopia → 76 messages
📥 Fetching from: @modernshoppingcenter


 86%|████████▌ | 6/7 [01:28<00:05,  5.45s/it]

✅ @modernshoppingcenter → 25 messages
📥 Fetching from: @kuruwear


100%|██████████| 7/7 [01:28<00:00, 12.68s/it]

✅ @kuruwear → 74 messages





Unnamed: 0,channel,id,date,text,views,media,cleaned_text
0,@ZemenExpress,7004,2025-06-23 14:55:46+00:00,💥💥👀 ...........💥💥\n\n📌 Electric Charcoal Burne...,1760,True,Electric Charcoal Burner በቀላሉ ከሰል ለማያያዝ የሚሆን ...
1,@ZemenExpress,7000,2025-06-23 14:55:40+00:00,💥💥👀 ...........💥💥\n\n📌 Electric Charcoal Burne...,1458,True,Electric Charcoal Burner በቀላሉ ከሰል ለማያያዝ የሚሆን ...
2,@ZemenExpress,6999,2025-06-23 14:55:30+00:00,💥💥👀 ...........💥💥\n\n📌 Electric Charcoal Burne...,1463,True,Electric Charcoal Burner በቀላሉ ከሰል ለማያያዝ የሚሆን ...
3,@ZemenExpress,6995,2025-06-23 08:23:14+00:00,💥💥...................................💥💥\n\n📌Fo...,2374,True,Food mould Tool ስድስት አይነት ቅርጽ ዋጋ፦ 700 ብር ውስን ...
4,@ZemenExpress,6991,2025-06-21 16:35:51+00:00,💥💥...................................💥💥\n\n📌Sa...,3185,True,Saachi Electric Kettle Borosilicate Glass Body...
5,@ZemenExpress,6987,2025-06-21 08:07:31+00:00,💥💥...................................💥💥\n\n3pc...,3160,True,3pcs Bottle Stopper በማንኛውም ጠርሙስ ጫፍ የሚገጠም ለዘይት ...
6,@ZemenExpress,6986,2025-06-21 08:07:11+00:00,💥💥...................................💥💥\n\n3pc...,2750,True,3pcs Bottle Stopper በማንኛውም ጠርሙስ ጫፍ የሚገጠም ለዘይት ...
7,@ZemenExpress,6985,2025-06-21 05:42:46+00:00,💥💥...................................💥💥\n\n📌1 ...,2841,True,1 pairs Sneaker Crease Protector ዋጋ፦ 400 ብር ው...
8,@ZemenExpress,6983,2025-06-21 05:42:19+00:00,💥💥...................................💥💥\n\n📌1 ...,3139,True,1 pairs Sneaker Crease Protector ዋጋ፦ 400 ብር ው...
9,@ZemenExpress,6982,2025-06-18 06:01:10+00:00,💥💥...................................💥💥\n\n📌Im...,4582,True,Imitation Volcano Humidifier with LED Light በኤ...


Server closed the connection: [Errno 54] Connection reset by peer
Server closed the connection: [Errno 54] Connection reset by peer


In [4]:
import os
# Sanity check for the relative paths used by the scraper cell: the working
# directory should end in /notebooks and the scraper module should be found.
working_dir = os.getcwd()
print(working_dir)

scraper_present = os.path.exists("../scripts/telegram_scraper.py")
print(scraper_present)  # expected: True


/Users/jerus/Desktop/KAIM/ethio-ner-ecommerce-extractor/notebooks
True
