# Task 1: Data Scraping and Collection (Extract & Load)

This notebook demonstrates how to scrape messages and images from public Telegram channels related to Ethiopian medical businesses, save the raw data in a partitioned directory structure, and implement robust logging for reproducibility and traceability.

In [None]:
# Section 1: Import Required Libraries
import os
import json
import logging
from datetime import datetime
from telethon.sync import TelegramClient
from telethon.tl.types import MessageMediaPhoto
from dotenv import load_dotenv

In [None]:
# Section 2: Set Up Telegram API Credentials
# Load environment variables from a .env file (if present) into os.environ.
load_dotenv()

_raw_api_id = os.getenv("TELEGRAM_API_ID")
API_HASH = os.getenv("TELEGRAM_API_HASH")

# Fail early with an actionable message: int(None) would otherwise raise an
# opaque TypeError, and a missing API_HASH would only surface later when the
# client is created.
if not _raw_api_id or not API_HASH:
    raise RuntimeError(
        "TELEGRAM_API_ID and TELEGRAM_API_HASH must be set in the environment "
        "or in a .env file"
    )
try:
    API_ID = int(_raw_api_id)
except ValueError as exc:
    # The offending value is deliberately not echoed back.
    raise ValueError("TELEGRAM_API_ID must be an integer") from exc

# List of target channels
CHANNELS = [
    "https://t.me/lobelia4cosmetics",
    "https://t.me/tikvahpharma",
    "https://t.me/CheMed123",  # Example: Chemed channel
]

In [None]:
# Section 3: Define Utility Functions for Scraping

def ensure_dir(path):
    """Create directory ``path`` together with any missing parent directories.

    Calling this for a directory that already exists is a no-op.
    """
    os.makedirs(name=path, exist_ok=True)

# Function to scrape messages from a channel
def fetch_channel_messages(client, channel_url, limit=500):
    """Collect message records and photo-bearing messages from one channel.

    Args:
        client: Object exposing ``iter_messages(entity, limit=...)``.
        channel_url: Channel reference, passed unchanged to
            ``client.iter_messages``.
        limit: Maximum number of messages requested from the client.

    Returns:
        tuple: ``(messages, images)``. ``messages`` is a list of dicts with
        the keys ``id``, ``date`` (stringified), ``text`` and ``media``
        (a bool: whether the message carries any media). ``images`` is the
        list of original message objects whose media is a
        ``MessageMediaPhoto``.
    """
    records = []
    photo_messages = []
    for item in client.iter_messages(channel_url, limit=limit):
        record = dict(
            id=item.id,
            date=str(item.date),
            text=item.text,
            media=bool(item.media),
        )
        records.append(record)

        # Only photo media is kept for download; other media types are
        # recorded through the boolean flag above but not collected.
        attachment = item.media
        if attachment and isinstance(attachment, MessageMediaPhoto):
            photo_messages.append(item)
    return records, photo_messages

# Function to download images from messages
def download_images(client, images, out_dir):
    """Download the media of each message into ``out_dir``, best effort.

    Args:
        client: Object exposing ``download_media(message, file=...)``.
        images: Iterable of message objects; each must have an ``id``
            attribute (used in log messages).
        out_dir: Destination directory; created if it does not exist.

    Returns:
        list: The paths returned by ``client.download_media`` for the
        messages that were saved, in input order. Messages whose download
        raised, or for which the client returned ``None``, are logged and
        skipped.
    """
    ensure_dir(out_dir)
    saved_paths = []
    for msg in images:
        try:
            file_path = client.download_media(msg, file=out_dir)
        except Exception as e:
            # Deliberate best effort: one failed download must not abort
            # the remaining images of the channel.
            logging.error(f"Failed to download image {msg.id}: {e}")
            continue
        if file_path is None:
            # Telethon documents a None return when there was nothing to
            # download; surface it instead of silently dropping the message.
            logging.warning(f"No file was saved for image {msg.id}")
            continue
        saved_paths.append(file_path)
    return saved_paths

In [None]:
# Section 4: Scrape Messages from Multiple Telegram Channels
# Section 5: Download Images from Telegram Messages

# Root directory for scraped message JSON. scrape_and_save() writes one
# <YYYY-MM-DD>/<channel_name>.json file per channel beneath it. The path is
# relative, so it is resolved against the kernel's working directory.
RAW_DATA_DIR = "data/raw/telegram_messages"
# Root directory for downloaded images, partitioned by scrape_and_save() as
# <YYYY-MM-DD>/<channel_name>/.
IMAGE_DATA_DIR = "data/raw/telegram_images"

def scrape_and_save():
    """Scrape every channel in ``CHANNELS`` and persist messages and images.

    For each channel, the messages returned by ``fetch_channel_messages`` are
    written to ``RAW_DATA_DIR/<YYYY-MM-DD>/<channel_name>.json`` and the
    photo messages are passed to ``download_images`` with the target
    ``IMAGE_DATA_DIR/<YYYY-MM-DD>/<channel_name>/``. One INFO log record is
    emitted per channel.

    Any exception raised while processing a channel propagates and stops the
    run; channels processed before it keep their output.
    """
    # Resolve the partition date once: computing it per channel would split a
    # run that crosses midnight (local time) across two date partitions.
    date_str = datetime.now().strftime("%Y-%m-%d")
    out_dir = os.path.join(RAW_DATA_DIR, date_str)

    # 'anon' is the Telethon session name.
    with TelegramClient('anon', API_ID, API_HASH) as client:
        ensure_dir(out_dir)
        for channel_url in CHANNELS:
            # Strip a trailing slash first; otherwise "https://t.me/name/"
            # yields an empty channel name and a file called ".json".
            channel_name = channel_url.rstrip('/').split('/')[-1]
            out_path = os.path.join(out_dir, f"{channel_name}.json")
            # Scrape messages and images
            messages, images = fetch_channel_messages(client, channel_url)
            # Save messages as JSON; ensure_ascii=False keeps non-ASCII text
            # readable in the file instead of \u escapes.
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(messages, f, ensure_ascii=False, indent=2)
            # Download images
            img_out_dir = os.path.join(IMAGE_DATA_DIR, date_str, channel_name)
            download_images(client, images, img_out_dir)
            logging.info(f"Scraped {len(messages)} messages and {len(images)} images from {channel_url}")

# Configure file logging BEFORE the first logging call. The module-level
# helpers (logging.info / logging.error) implicitly call basicConfig() with
# defaults when the root logger has no handlers; that drops INFO records and
# makes any later basicConfig(filename=...) a silent no-op.
ensure_dir(RAW_DATA_DIR)
logging.basicConfig(
    filename=os.path.join(RAW_DATA_DIR, "scraping.log"),
    level=logging.INFO,
    format="%(asctime)s %(levelname)s:%(message)s",
    # Python 3.8+: replace handlers left over from a previous run of this
    # cell so re-running it stays idempotent.
    force=True,
)

scrape_and_save()

## Saving Raw Data

All scraped messages are saved as JSON files in a partitioned directory structure: `data/raw/telegram_messages/YYYY-MM-DD/channel_name.json`. Images are saved under `data/raw/telegram_images/YYYY-MM-DD/channel_name/`. This structure supports incremental processing and easy data management.

In [None]:
# Section 7: Implement Logging for Scraping Process
LOG_FILE = os.path.join(RAW_DATA_DIR, "scraping.log")
# The log file lives inside RAW_DATA_DIR; create the directory first so the
# file handler can open the file even when nothing has been scraped yet.
ensure_dir(RAW_DATA_DIR)
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s %(levelname)s:%(message)s",
    # force=True (Python 3.8+) replaces any handlers already installed on the
    # root logger, e.g. the default stderr handler that an earlier bare
    # logging.info()/logging.error() call installs implicitly. Without it
    # this call is silently ignored and LOG_FILE is never written.
    force=True,
)
print(f"Logging to {LOG_FILE}")

In [None]:
# Section 8: Verify Directory and File Creation
import glob

def list_created_files():
    """Print the scrape outputs found on disk.

    Lists every ``*.json`` file one level below ``RAW_DATA_DIR`` and every
    entry two levels below ``IMAGE_DATA_DIR``, each group under its own
    heading, in the order ``glob.glob`` returns them.
    """
    sections = (
        ("Created message files:", os.path.join(RAW_DATA_DIR, "*", "*.json")),
        ("\nCreated image directories:", os.path.join(IMAGE_DATA_DIR, "*", "*")),
    )
    for heading, pattern in sections:
        print(heading)
        for match in glob.glob(pattern):
            print(match)

# Print the message files and image directories currently present on disk.
list_created_files()