## Task 1: Data Ingestion and Data Preprocessing

In [None]:
%pip install dotenv

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, dotenv
Successfully installed dotenv-0.9.9 python-dotenv-1.1.0


In [13]:
import os
import json
from telethon import TelegramClient
from telethon.tl.types import MessageMediaPhoto, MessageMediaDocument
from telethon.errors import SessionPasswordNeededError
import asyncio
from dotenv import load_dotenv
import pandas as pd

# ----------------------
# Instructions:
# 1. Register your app at https://my.telegram.org to get API_ID and API_HASH
# 2. Add your API credentials to a .env file (API_ID, API_HASH, PHONE)
# 3. Add your target channel usernames to the CHANNELS list
# ----------------------
load_dotenv()

# Credentials are read from the environment (populated from .env by load_dotenv()).
API_ID = os.getenv('API_ID')
# The documented key is API_HASH. The misspelled 'API_HASHH' that this notebook
# used to read is still accepted as a fallback so an existing .env keeps working.
API_HASH = os.getenv('API_HASH') or os.getenv('API_HASHH')
SESSION_NAME = 'amharic_ecommerce'
CHANNELS = [
    '@ZemenExpress',
    '@Leyueqa',
    '@Shewabrand',
    '@qnashcom',
    '@aradabrand2'
]  # Add at least 5 channels

# Output locations, relative to the notebook's working directory.
OUTPUT_DIR = '../data/raw'
MEDIA_DIR = os.path.join(OUTPUT_DIR, 'media')
RAW_DATA_PATH = os.path.join(OUTPUT_DIR, 'raw_messages.jsonl')
CSV_DATA_PATH = os.path.join(OUTPUT_DIR, 'raw_messages.csv')

# MEDIA_DIR lives under OUTPUT_DIR, so this creates both.
os.makedirs(MEDIA_DIR, exist_ok=True)

MAX_MESSAGES_PER_CHANNEL = 500  # Set your max limit here

# Fail early with an actionable message instead of a generic client error.
if not API_ID or not API_HASH:
    raise ValueError(
        'Missing Telegram credentials: set API_ID and API_HASH in your .env file.'
    )

client = TelegramClient(SESSION_NAME, API_ID, API_HASH)

async def download_media(message, media_dir):
    """Download a message's photo or document attachment.

    Args:
        message: Message object exposing ``.media`` and an awaitable
            ``.download_media(file=...)``.
        media_dir: Destination passed through as ``file=`` to the download call.

    Returns:
        Whatever ``message.download_media`` returns for photo/document media,
        or ``None`` when the message has no media or the media is of another kind.
    """
    # Photos and documents are handled the same way, so a single tuple check
    # covers both; a missing (None) media fails the isinstance test as well.
    downloadable_kinds = (MessageMediaPhoto, MessageMediaDocument)
    if not message.media or not isinstance(message.media, downloadable_kinds):
        return None
    saved_to = await message.download_media(file=media_dir)
    return saved_to

async def fetch_and_save_messages():
    """Fetch recent messages from every channel in CHANNELS and save them.

    For each channel, up to MAX_MESSAGES_PER_CHANNEL messages are read with
    ``client.iter_messages``. Every message is written immediately as one JSON
    line to RAW_DATA_PATH, and all collected records are written to
    CSV_DATA_PATH at the end. Attached media is downloaded into MEDIA_DIR via
    ``download_media``.

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        Any error raised by the Telegram client or by file I/O propagates to
        the caller; the client is disconnected in either case.
    """
    all_messages = []
    try:
        # client.start() connects and drives the whole interactive login
        # (code and two-step password prompts) itself, so no manual
        # send_code_request/sign_in fallback is needed afterwards.
        # Supplying PHONE from .env avoids the phone-number prompt; without it
        # start() falls back to asking interactively.
        phone = os.getenv('PHONE')
        if phone:
            await client.start(phone=phone)
        else:
            await client.start()

        with open(RAW_DATA_PATH, 'w', encoding='utf-8') as outfile:
            for channel in CHANNELS:
                print(f'Fetching messages from {channel} (max {MAX_MESSAGES_PER_CHANNEL})...')
                # `limit` already caps the iteration, so no separate counter is needed.
                async for message in client.iter_messages(channel, limit=MAX_MESSAGES_PER_CHANNEL):
                    msg_data = {
                        'channel': channel,
                        'message_id': message.id,
                        'date': str(message.date),
                        'sender_id': message.sender_id,
                        'text': message.text,
                        'media_type': None,
                        'media_path': None,
                    }
                    if message.media:
                        # media_type is recorded for every kind of media, while
                        # media_path stays None for kinds download_media skips.
                        msg_data['media_type'] = type(message.media).__name__
                        msg_data['media_path'] = await download_media(message, MEDIA_DIR)
                    # ensure_ascii=False keeps non-ASCII (e.g. Amharic) text readable in the file.
                    outfile.write(json.dumps(msg_data, ensure_ascii=False) + '\n')
                    all_messages.append(msg_data)
    finally:
        # Release the connection and the session file so the cell can be re-run.
        await client.disconnect()
    print(f'All messages saved to {RAW_DATA_PATH}')

    # Save to CSV
    df = pd.DataFrame(all_messages)
    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv(CSV_DATA_PATH, index=False, encoding='utf-8-sig')
    print(f'All messages also saved to {CSV_DATA_PATH}')

# Entry point: run the whole fetch-and-save pipeline when this cell/script is executed directly.
if __name__ == '__main__':
    # nest_asyncio is imported here rather than with the other imports, and
    # apply() must run before asyncio.run() below.
    # NOTE(review): presumably needed so asyncio.run() works inside the event
    # loop Jupyter already has running — confirm.
    import nest_asyncio
    nest_asyncio.apply()
    asyncio.run(fetch_and_save_messages())


Please enter your phone (or bot token): [REDACTED]
Please enter the code you received: [REDACTED]
Please enter your password: ··········
Signed in successfully as Petros; remember to not break the ToS or you will risk an account ban!
Fetching messages from @ZemenExpress (max 500)...
Fetching messages from @Leyueqa (max 500)...
Fetching messages from @Shewabrand (max 500)...
Fetching messages from @qnashcom (max 500)...
Fetching messages from @aradabrand2 (max 500)...
All messages saved to data/raw_messages.jsonl
All messages also saved to data/raw_messages.csv
