# Telegram Channel Scraper

This notebook scrapes messages from a list of specified Telegram channels and saves them into individual CSV files.

## 1. Setup: Install and Import Libraries

First, ensure you have the necessary libraries installed. Then, import them.

In [None]:
# Install required libraries (run this cell once)
!pip install telethon python-dotenv asyncio

In [None]:
import asyncio
from telethon import TelegramClient, events, sync, errors
from telethon.tl.functions.messages import GetDialogsRequest
from telethon.tl.types import InputPeerEmpty
import csv
import os
import logging

# Optional: show log records at INFO level and above, each prefixed with a
# timestamp and the level name, which makes debugging easier.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

## 2. Configuration and Authentication

Set your API credentials in the cell below and authenticate the Telegram client. The first time you run this, you will be asked for the login code sent to your Telegram account (and for your two-step verification password, if one is enabled).

In [None]:


# Telegram API credentials: replace the three placeholders with your own values.
api_id = "YOUR_API_ID"
api_hash = "YOUR_API_HASH"
phone = "YOUR_PHONE_NUMBER_WITH_COUNTRY_CODE"

# Fail early with a clear message when a value is empty or still holds its
# "YOUR_..." placeholder (placeholders are non-empty strings, so a plain
# truthiness check would let them through).
missing = [
    label
    for label, value in (("api_id", api_id), ("api_hash", api_hash), ("phone", phone))
    if not value or str(value).startswith("YOUR_")
]
if missing:
    raise ValueError(
        f"Please set {', '.join(missing)} in this cell before running the notebook."
    )

# The client is given api_id as an integer; report a non-numeric value explicitly.
try:
    api_id = int(api_id)
except (TypeError, ValueError):
    raise ValueError(f"api_id must be a number, got {api_id!r}.") from None

session_name = "my_telegram_session"  # Use a unique session name

# Create the client instance (asynchronous)
client = TelegramClient(session_name, api_id, api_hash)

async def authenticate_client():
    """Connect the module-level ``client`` and make sure it is signed in.

    If the session is not authorized yet, a login code is requested for the
    module-level ``phone`` and read interactively; when the account has
    two-step verification enabled, the password is asked for as well.
    On success the name of the logged-in account is printed.

    Raises:
        Exception: any error raised while requesting the code or signing in
            (including a failed two-step verification) is re-raised after
            the client has been disconnected.
    """
    # Local import: read the 2FA password without echoing it into the output.
    from getpass import getpass

    print("Connecting to Telegram...")
    await client.connect()

    if await client.is_user_authorized():
        print("Client already authorized.")
    else:
        print("First time login or session expired. Sending code...")
        # One outer try covers the whole login flow. An exception raised
        # inside an ``except`` clause is not seen by sibling ``except``
        # clauses, so the 2FA step needs this extra level to share the cleanup.
        try:
            await client.send_code_request(phone)
            try:
                await client.sign_in(phone, input('Enter the code sent to your Telegram: '))
                print("Signed in successfully!")
            except errors.SessionPasswordNeededError:
                await client.sign_in(password=getpass('Two-step verification password needed: '))
                print("Signed in successfully with 2FA!")
        except Exception as e:
            print(f"Error during sign in: {e}")
            await client.disconnect()
            raise  # Reraise the exception to stop execution

    # Verify connection
    me = await client.get_me()
    # Accounts without a public username would otherwise be shown as "@None".
    handle = f"@{me.username}" if me.username else "no username"
    print(f"Connected as: {me.first_name} ({handle})")

# Run the authentication. Jupyter/IPython runs an event loop and supports
# top-level await, so the coroutine can be awaited directly in a cell.
# In an environment without top-level await you might need:
# asyncio.run(authenticate_client())
# NOTE(review): every asyncio.run() call creates a new event loop; confirm the
# shared client still works across separate calls, or run all steps in one coroutine.
await authenticate_client()

## 3. Load Target Channel List

Specify the path to your text file containing comma-separated channel usernames or links, and load them into a list.

In [None]:
seed_file_path = 'channels.txt'  # <-- IMPORTANT: Update this path if needed
channels = []  # stays empty unless the seed file is read successfully

try:
    with open(seed_file_path, 'r', encoding='utf-8') as f:
        raw_content = f.read()
except FileNotFoundError:
    print(f"Error: Seed file not found at '{seed_file_path}'. Please create it.")
    # Create the parent directory so the file can be dropped in right away.
    # A bare filename has an empty dirname, and os.makedirs('') raises, so
    # only create a directory when the path actually contains one.
    seed_dir = os.path.dirname(seed_file_path)
    if seed_dir:
        os.makedirs(seed_dir, exist_ok=True)
    print(f"Please create the file '{seed_file_path}' and add channel names, separated by commas.")
except (OSError, UnicodeError) as e:
    print(f"An error occurred while reading the seed file: {e}")
else:
    # Entries may be separated by commas and/or line breaks. Surrounding
    # whitespace is stripped and empty entries are dropped.
    names = (name.strip() for name in raw_content.replace('\n', ',').split(','))
    # dict.fromkeys removes duplicates while keeping the original order, so a
    # channel listed twice is not scraped (and its CSV overwritten) twice.
    channels = list(dict.fromkeys(name for name in names if name))

    if not channels:
        print(f"Warning: No channel names found or file is empty at {seed_file_path}")
    else:
        print(f"Loaded {len(channels)} channel(s) to scrape:")
        print(channels)

## 4. Prepare Output Directory

Define the directory where the scraped CSV files will be saved and create it if it doesn't exist.

In [None]:
# Folder that receives one CSV file per scraped channel.
output_dir = 'export/'

# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(name=output_dir, exist_ok=True)
print(f"Output directory '{output_dir}' ensured.")

## 5. Scraping Logic

Define the asynchronous function to iterate through the channels, fetch messages, and save them to CSV files. Includes error handling for inaccessible channels.

In [None]:
def _message_to_row(channel_name, message):
    """Flatten one message into a list ordered like the CSV header row.

    Missing optional parts (date, text, sender, forward header, reply header,
    media, entities, edit date, reactions) become empty strings; a missing
    reply counter becomes 0. Other fields are passed through unchanged.
    """
    peer = message.peer_id
    # Take whichever ID attribute the peer has: channel first, then chat, then user.
    peer_value = getattr(peer, 'channel_id', getattr(peer, 'chat_id', getattr(peer, 'user_id', '')))

    return [
        channel_name,
        message.id,
        peer_value,
        message.date.isoformat() if message.date else '',  # ISO 8601 timestamps
        message.message if message.message else '',
        message.mentioned,
        message.post,
        getattr(message.from_id, 'user_id', '') if message.from_id else '',
        str(message.fwd_from) if message.fwd_from else '',  # basic string representation
        # Not every reply header carries reply_to_msg_id, so read it defensively
        # instead of letting one message abort the whole channel.
        getattr(message.reply_to, 'reply_to_msg_id', '') if message.reply_to else '',
        type(message.media).__name__ if message.media else '',  # media class name only
        str(message.entities) if message.entities else '',
        message.views,
        message.forwards,
        message.replies.replies if message.replies else 0,
        message.edit_date.isoformat() if message.edit_date else '',
        message.post_author,
        message.grouped_id,
        str(message.reactions.results) if message.reactions else '',
    ]


async def scrape_channels(client, channel_list, output_directory, *,
                          message_limit=None, request_delay=2):
    """Scrape messages from a list of channels and save them to CSV files.

    Each channel is resolved with ``client.get_entity`` and its messages are
    written to ``<output_directory>/<sanitized channel name>.csv`` (one header
    row, then one row per message). Channels that cannot be resolved or
    accessed are skipped and listed in the summary printed at the end.

    Args:
        client: Telegram client used to resolve channels and iterate messages.
        channel_list: Channel usernames, links or IDs to scrape.
        output_directory: Existing directory that receives the CSV files.
        message_limit: Maximum number of messages per channel; ``None``
            (default) fetches all of them. Use a small value for test runs.
        request_delay: Seconds to pause before each channel to avoid hitting
            rate limits too quickly (default 2).

    Raises:
        errors.AuthKeyError: re-raised to stop the run when the session is
            no longer valid.

    Notes:
        If the client is disconnected, reconnection goes through
        ``authenticate_client()``, which works on the notebook's module-level
        client. A channel that fails midway leaves its partially written CSV
        file on disk.
    """
    skipped_channels = []  # channels that were skipped, with the reason where known
    processed_channels = 0

    # is_connected() is a plain method returning a bool (the rest of the
    # notebook calls it without await); awaiting it would raise TypeError.
    if not client.is_connected():
        print("Client seems disconnected. Attempting to reconnect...")
        await authenticate_client()  # Try to re-authenticate

    # CSV column names, in the same order as the values built by _message_to_row.
    headers = [
        'channel_name', 'message_id', 'peer_id', 'date', 'message_text',
        'mentioned', 'is_post', 'from_id', 'fwd_from', 'reply_to_msg_id',
        'media_type', 'entities', 'views', 'forwards', 'replies_count',
        'edit_date', 'post_author', 'grouped_id', 'reactions',
    ]

    for channel_name in channel_list:
        print(f"\n--- Processing channel: {channel_name} ---")
        try:
            # Small pause between channels to avoid hitting rate limits too quickly.
            await asyncio.sleep(request_delay)

            # Resolve the channel entity (username, link, or ID).
            try:
                channel_entity = await client.get_entity(channel_name)
                print(f"Successfully found entity for {channel_name} (ID: {channel_entity.id})")
            except ValueError as ve:
                print(f"Error resolving entity for '{channel_name}': {ve}. Might be an invalid username/link or not joined. Skipping.")
                skipped_channels.append(channel_name)
                continue
            except errors.FloodWaitError as fwe:
                print(f"Flood wait error for {channel_name}: waiting {fwe.seconds} seconds...")
                await asyncio.sleep(fwe.seconds + 5)  # wait an extra 5s as a buffer
                # Single retry; a second failure is handled by the outer handlers.
                channel_entity = await client.get_entity(channel_name)
                print(f"Successfully found entity for {channel_name} after wait.")
            except Exception as ge:
                print(f"Unexpected error getting entity for {channel_name}: {ge}. Skipping.")
                skipped_channels.append(channel_name)
                continue

            # Keep only characters that are safe in a filename.
            safe_channel_name = "".join(c for c in channel_name if c.isalnum() or c in ('_', '-'))
            if not safe_channel_name:
                safe_channel_name = f"channel_{channel_entity.id}"  # fall back to the numeric ID

            output_file_path = os.path.join(output_directory, f"{safe_channel_name}.csv")
            print(f"Outputting to: {output_file_path}")

            message_count = 0
            with open(output_file_path, "w", encoding='UTF-8', newline='') as f:
                writer = csv.writer(f, delimiter=",", lineterminator="\n")
                writer.writerow(headers)

                async for message in client.iter_messages(channel_entity, limit=message_limit):
                    writer.writerow(_message_to_row(channel_name, message))
                    message_count += 1
                    if message_count % 500 == 0:
                        print(f"  ... scraped {message_count} messages from {channel_name}")

            print(f'Finished scraping {message_count} messages from {channel_name}. Data saved to {output_file_path}')
            processed_channels += 1

        except (errors.ChannelInvalidError, errors.ChannelPrivateError, errors.ChatAdminRequiredError, errors.UserNotParticipantError) as e:
            print(f"Access Error: Could not access channel '{channel_name}': {type(e).__name__} - {str(e)}. Skipping.")
            skipped_channels.append(channel_name)
        except errors.FloodWaitError as e:
            print(f"Flood wait error for {channel_name}: waiting {e.seconds} seconds...")
            await asyncio.sleep(e.seconds + 5)  # wait and add a buffer
            # The channel is not retried automatically (re-queueing it could
            # loop forever); it is reported so it can be scraped in a later run.
            print(f"Skipping {channel_name} for now; re-run it later.")
            skipped_channels.append(f"{channel_name} (FloodWait - try later)")
        except errors.AuthKeyError as e:
            print(f"Authentication error for {channel_name}: {e}. Session might be invalid. Stopping.")
            # Re-authenticating or deleting the session file may be required.
            raise
        except Exception as e:
            # Other failures (network issues, timeouts, ...) skip only this channel.
            print(f"An unexpected error occurred with channel '{channel_name}': {type(e).__name__} - {str(e)}")
            # The full traceback is emitted only when DEBUG logging is enabled.
            logging.debug("Traceback for channel %r", channel_name, exc_info=True)
            skipped_channels.append(f"{channel_name} (Error: {type(e).__name__})")

    print("\n--- Scraping Complete ---")
    print(f"Successfully processed {processed_channels} channel(s).")

    # Print skipped channels at the end.
    if skipped_channels:
        print("\nSkipped Channels/Errors:")
        for channel_info in skipped_channels:
            print(f"- {channel_info}")
    else:
        print("\nNo channels were skipped.")

## 6. Execute Scraping Process

Run the scraping function defined above. This will iterate through the loaded channels and perform the scraping.

In [None]:
async def run_scraper():
    """Run the scrape over the loaded channels, reconnecting first if needed.

    Returns without scraping when no channels were loaded, or when the client
    is still disconnected after one re-authentication attempt.
    """
    if not channels:
        print("No channels loaded. Skipping scraping process.")
        return

    # Make sure the client is connected before starting the main scraping task.
    connected = client.is_connected()
    if not connected:
        print("Client is not connected. Trying to authenticate again.")
        await authenticate_client()
        connected = client.is_connected()

    if not connected:
        print("Failed to connect/authenticate client. Aborting scrape.")
        return

    print("\nStarting the scraping process...")
    await scrape_channels(client, channels, output_dir)
    print("Scraping process finished.")

# Execute the main scraping task.
# Await it directly when the environment supports top-level await (as
# Jupyter/IPython does); in a plain Python script, asyncio.run() may be
# needed instead.
await run_scraper()

## 7. Disconnect Client

It's good practice to disconnect the client when you're finished.

In [None]:
async def disconnect_client():
    """Disconnect the module-level client, or report that it already is."""
    if not client.is_connected():
        print("\nClient is already disconnected.")
        return

    print("\nDisconnecting the client...")
    await client.disconnect()
    print("Client disconnected.")

# Disconnect the client now that scraping is finished (top-level await, as in
# the cells above).
await disconnect_client()