# Data scraping and collection pipeline


# Telegram Scraping

### Authenticate with Telegram API

In [1]:
import asyncio
import nest_asyncio
from telethon import TelegramClient

import os  # needed only to read the credentials from the environment

# Apply fix for nested event loops
nest_asyncio.apply()

# SECURITY: Telegram credentials must never be hard-coded in the notebook,
# because the source (and its saved outputs) get shared and committed.
# Export these variables before starting the kernel. Any credentials that
# were previously committed in this notebook should be treated as compromised
# and rotated.
_required_vars = ("TELEGRAM_API_ID", "TELEGRAM_API_HASH", "TELEGRAM_PHONE")
_missing_vars = [name for name in _required_vars if not os.environ.get(name)]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_vars)
    )

api_id = int(os.environ["TELEGRAM_API_ID"])  # the API id is numeric
api_hash = os.environ["TELEGRAM_API_HASH"]
phone_number = os.environ["TELEGRAM_PHONE"]

# "session_name" is the session identifier handed to Telethon
client = TelegramClient("session_name", api_id, api_hash)

async def main():
    """Start the module-level ``client`` with ``phone_number`` and confirm.

    Prints "Client Authenticated" once ``client.start`` has returned.
    """
    await client.start(phone=phone_number)
    print("Client Authenticated")

# Run the authentication coroutine. Top-level `await` is only valid inside an
# IPython/Jupyter cell; it is a syntax error in a plain Python script.
await main()  # Works inside Jupyter

Signed in successfully as Kibrom; remember to not break the ToS or you will risk an account ban!
Client Authenticated


### Scrape Data from Telegram Channels

In [2]:
from telethon import functions, types
import pandas as pd

# Requires the authenticated `client` created in the authentication cell above.
# It is passed explicitly to the scraping functions below instead of being read as a global.

# Define scrape_channel function
async def scrape_channel(channel_username, client, limit=100):
    """Fetch recent messages from one Telegram channel as plain dict records.

    Args:
        channel_username: Username of the channel to read.
        client: Client object exposing awaitable ``get_entity`` and
            ``get_messages`` methods.
        limit: Maximum number of messages to request. Defaults to 100, the
            value that used to be hard-coded.

    Returns:
        A list with one dict per message, each holding the keys ``'date'``,
        ``'text'`` (``None`` when the message has no text) and
        ``'media_type'`` (class name of the attached media, or ``None``).
        If anything fails, the error is printed and an empty list is
        returned so that one bad channel does not abort a multi-channel run.
    """
    try:
        channel = await client.get_entity(channel_username)
        messages = await client.get_messages(channel, limit=limit)

        # Built inside the try block so that a malformed message is reported
        # the same way as a failed request.
        return [
            {
                'date': message.date,
                'text': message.text or None,
                'media_type': type(message.media).__name__ if message.media else None,
            }
            for message in messages
        ]
    except Exception as e:
        print(f"Error scraping channel {channel_username}: {e}")
        return []

# Telegram channel usernames whose messages are collected
channels = [
    'DoctorsET',
    'Chemed',
    'lobelia4cosmetics',
    'yetenaweg',
    'EAHCI',
]


async def scrape_all_channels(client):
    """Scrape every entry of ``channels`` and write the combined rows to CSV.

    Channels are processed one after another in list order; the records
    returned by ``scrape_channel`` for each one are concatenated and saved to
    'telegram_raw_data.csv' without an index column. A summary line is
    printed when the file has been written.

    Args:
        client: Client object forwarded unchanged to ``scrape_channel``.
    """
    records = []
    for name in channels:
        records += await scrape_channel(name, client)

    # One row per scraped message
    pd.DataFrame(records).to_csv('telegram_raw_data.csv', index=False)
    print(f"Scraped data from {len(channels)} channels and saved to 'telegram_raw_data.csv'.")

# Run the full scrape over every channel in `channels`.
# `client` must be the instance created and started in the authentication cell;
# top-level `await` works only inside an IPython/Jupyter cell.
await scrape_all_channels(client)

Scraped data from 5 channels and saved to 'telegram_raw_data.csv'.


# Image Scraping for Object Detection


In [3]:
import os

async def download_images(channel_username, save_dir, limit=50):
    """Download photo attachments from a channel's most recent messages.

    Uses the module-level ``client``.

    Args:
        channel_username: Username of the channel to read.
        save_dir: Directory the images are written to; created if missing.
        limit: Maximum number of messages to inspect. Defaults to 50, the
            value that used to be hard-coded.

    Files are named ``image_<i>.jpg`` where ``<i>`` is the position of the
    message in the fetched batch, so numbering may contain gaps.
    """
    os.makedirs(save_dir, exist_ok=True)

    channel = await client.get_entity(channel_username)
    messages = await client.get_messages(channel, limit=limit)

    for i, message in enumerate(messages):
        # Only photos are wanted: checking `message.media` alone would also
        # save videos, documents, polls, etc. under a misleading .jpg name.
        if message.photo:
            await message.download_media(file=os.path.join(save_dir, f'image_{i}.jpg'))

# Example: Download images from a channel
channel_username = 'lobelia4cosmetics'
save_directory = 'downloaded_images'
# Drives the coroutine through the client's own event loop rather than a top-level `await`.
# NOTE(review): inside a notebook that loop is presumably already running, so this
# likely depends on the nest_asyncio patch applied in the authentication cell — confirm.
client.loop.run_until_complete(download_images(channel_username, save_directory))

# Storing Raw Data

### Moving to Local Database

In [4]:
import sqlite3
import pandas as pd

# Load the CSV written by the scraping step
df = pd.read_csv('telegram_raw_data.csv')

conn = sqlite3.connect('telegram_data.db')
try:
    # 'replace' drops and recreates the table, so re-running this cell
    # does not append duplicate rows.
    df.to_sql('raw_messages', conn, if_exists='replace', index=False)
finally:
    # Always release the database handle, even if the write fails
    conn.close()

# Monitoring and Logging

### Logging:

In [7]:
import logging
from datetime import datetime
import os

# Timestamped log file so each run gets its own file
log_file_path = os.path.join('../notebooks', 'telegram_scraping_log_{}.log'.format(datetime.now().strftime('%Y%m%d_%H%M%S')))

# basicConfig cannot create missing parent directories; make sure the target exists
os.makedirs(os.path.dirname(log_file_path), exist_ok=True)

# basicConfig does nothing when the root logger already has handlers (for
# example when this cell was run earlier in the same kernel). Remove stale
# handlers first so records really go to the file announced below.
root_logger = logging.getLogger()
for stale_handler in list(root_logger.handlers):
    root_logger.removeHandler(stale_handler)
    stale_handler.close()

# Configure logging
logging.basicConfig(
    filename=log_file_path,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Print the log file path for debugging
print("Log file will be created at:", log_file_path)

# Example usage in your scraping script
channel_name = 'DoctorsET'  # Example channel name
try:
    logging.info('Scraping started for channel: %s', channel_name)
    # Your scraping code goes here (raise an exception to exercise the error path)
except Exception as e:
    # logging.exception logs at ERROR level and appends the traceback,
    # which a plain logging.error(str(e)) call would discard.
    logging.exception('Error occurred while scraping: %s', e)
finally:
    logging.info('Scraping process concluded')

Log file will be created at: ../notebooks\telegram_scraping_log_20250131_182257.log
