<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Telegram_Search_Source.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
!pip install -q telethon python-dotenv nest_asyncio

In [28]:
import asyncio
import json
import os
from telethon import TelegramClient, errors
from telethon.sessions import StringSession
from telethon.tl.types import (
    MessageMediaPhoto,
    MessageMediaWebPage,
    MessageMediaDocument,
    MessageMediaPoll,
    MessageMediaGiveaway,
    PeerChannel,
    PeerUser,
)
from dotenv import load_dotenv
import nest_asyncio

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Load environment variables
load_dotenv()

# Telegram API Keys
TELEGRAM_API_ID = 20441646
TELEGRAM_API_HASH = "d78a891287e9ba6a2a8c4bb0e4ca506a"

In [29]:
# Telegram Channels to Scrape
TELEGRAM_CHANNELS = [
    "cveNotify",
    "ctinow",
    "CyberSecurityTechnologies",
    "cybersecurity_outlook",
    "cibsecurity",
    "thehackernews",
    "Cyber_Security_Channel",
    "cloudandcybersecurity",
    "androidMalware",
    "DarkfeedNews",
    "PentestingNews",
    "malwr",
    "vxunderground",
    "ransomwatcher",
    "KrebsOnSecurity",
    "SecurityAffairs",
    "CyberScoopNews",
    "CyberWire",
    "ThreatIntelligence",
    "CERTNews",
    "DataBreachToday",
    "InfoSecNews",
    "ZeroDay_TI",
    "OSINT_Tactical",
    "malware_traffic",
    "vulnerability_lab",
    "CyberDefenseMagazine",
    "CyberArk",
    "DarkWebMonitor",
]

In [30]:
# File to store the session string
SESSION_FILE = "telegram_session.txt"


async def generate_session_string():
    """Generate a new session string and save it to a file."""
    print("Generating a new session string...")
    async with TelegramClient(StringSession(), TELEGRAM_API_ID, TELEGRAM_API_HASH) as client:
        session_string = client.session.save()
        with open(SESSION_FILE, "w") as f:
            f.write(session_string)
        print(f"Session string saved to {SESSION_FILE}")
    return session_string


async def load_session_string():
    """Load the session string from the file. If the file doesn't exist, generate a new session string."""
    if os.path.exists(SESSION_FILE):
        with open(SESSION_FILE, "r") as f:
            return f.read().strip()
    else:
        return await generate_session_string()


async def start_telegram_client():
    """Start the Telegram client using the saved session string."""
    try:
        await telegram_client.connect()
        if not await telegram_client.is_user_authorized():
            print("Session is invalid. Please log in again.")
            await telegram_client.start(phone=lambda: input("Please enter your phone number: "))
            if not await telegram_client.is_user_authorized():
                await telegram_client.send_code_request(input("Please enter your phone number: "))
                await telegram_client.sign_in(input("Please enter the code you received: "))
            # Save the new session string
            with open(SESSION_FILE, "w") as f:
                f.write(telegram_client.session.save())
            print("New session string saved.")
        print("Telegram client started successfully")
    except Exception as e:
        print(f"Error starting Telegram client: {e}")


async def check_channel_validity(channel_name: str):
    """Check if a Telegram channel is valid and accessible."""
    try:
        await telegram_client.get_entity(channel_name)
        return True
    except Exception as e:
        print(f"Channel {channel_name} is invalid: {e}")
        return False


async def scrape_telegram_channel(channel_name: str):
    """Scrape messages from a Telegram channel."""
    try:
        print(f"Scraping channel: {channel_name}")
        messages = []
        async for message in telegram_client.iter_messages(channel_name, limit=10):  # Limit to 10 messages for testing
            try:
                # Extract sender information
                sender = None
                if message.sender_id:
                    try:
                        sender = await telegram_client.get_entity(message.sender_id)
                        sender = {
                            "id": sender.id,
                            "username": sender.username,
                            "first_name": getattr(sender, "first_name", None),
                            "last_name": getattr(sender, "last_name", None),
                        }
                    except Exception as e:
                        print(f"Could not fetch sender details: {e}")

                # Extract media details
                media_urls = []
                media_content = []
                if hasattr(message, "media") and message.media:
                    if isinstance(message.media, MessageMediaPhoto):
                        media_path = await telegram_client.download_media(
                            message.media, file=f"./media/{channel_name}/{message.id}.jpg"
                        )
                        media_urls.append(media_path)
                        media_content.append({"type": "photo", "url": media_path})
                    elif isinstance(message.media, MessageMediaWebPage):
                        if hasattr(message.media.webpage, "url"):
                            media_urls.append(message.media.webpage.url)
                            media_content.append({"type": "webpage", "url": message.media.webpage.url})
                    elif isinstance(message.media, MessageMediaDocument):
                        media_path = await telegram_client.download_media(
                            message.media, file=f"./media/{channel_name}/{message.id}.doc"
                        )
                        media_urls.append(media_path)
                        media_content.append({"type": "document", "url": media_path})
                    elif isinstance(message.media, MessageMediaPoll):
                        media_content.append({"type": "poll", "question": message.media.poll.question})
                    elif isinstance(message.media, MessageMediaGiveaway):
                        media_content.append({"type": "giveaway", "details": "Giveaway details here"})
                    else:
                        print(f"Unhandled media type: {type(message.media)}")

                # Extract hashtags and mentions
                hashtags = [word for word in message.message.split() if word.startswith("#")]
                mentions = [word for word in message.message.split() if word.startswith("@")]

                # Extract reactions
                reactions = []
                if message.reactions:
                    reactions = [{"emoticon": reaction.emoticon, "count": reaction.count} for reaction in message.reactions.results]

                # Extract message link
                message_link = f"https://t.me/{channel_name}/{message.id}"

                # Append message data
                messages.append(
                    {
                        "id": message.id,
                        "date": message.date.isoformat(),
                        "message": message.message,
                        "sender": sender,
                        "media_urls": media_urls,
                        "media_content": media_content,
                        "views": message.views,
                        "forwards": message.forwards,
                        "hashtags": hashtags,
                        "mentions": mentions,
                        "reactions": reactions,
                        "message_link": message_link,
                    }
                )
            except errors.FloodWaitError as e:
                print(f"Rate limit hit. Waiting for {e.seconds} seconds...")
                await asyncio.sleep(e.seconds)
                continue
            except Exception as e:
                print(f"Error processing message {message.id}: {e}")
                continue

        return messages
    except Exception as e:
        print(f"Error scraping channel {channel_name}: {e}")
        return []

In [31]:
async def scrape_all_channels(channels: list):
    """Scrape messages from all specified Telegram channels."""
    valid_channels = []
    for channel in channels:
        if await check_channel_validity(channel):
            valid_channels.append(channel)
        else:
            print(f"Skipping invalid channel: {channel}")

    tasks = [scrape_telegram_channel(channel) for channel in valid_channels]
    return await asyncio.gather(*tasks)


def save_to_json(data, filename):
    """Save scraped data to a JSON file."""
    with open(filename, "w") as f:
        json.dump(data, f, indent=4)
    print(f"Data saved to {filename}")


def print_sample_data(data):
    """Print the first few data points from each channel."""
    for channel_data in data:
        if channel_data:
            print(f"Sample data from channel: {channel_data[0].get('channel', 'Unknown')}")
            for message in channel_data[:5]:  # Print first 5 messages
                print(json.dumps(message, indent=4))
                print("-" * 40)


def get_telegram_channel_data(channels: list):
    """Get data from all specified Telegram channels."""
    all_telegram_data = asyncio.run(scrape_all_channels(channels))
    return all_telegram_data

In [32]:
# Main execution
if __name__ == "__main__":
    # Ensure media directory exists
    os.makedirs("./media", exist_ok=True)
    for channel in TELEGRAM_CHANNELS:
        os.makedirs(f"./media/{channel}", exist_ok=True)

    # Initialize Telegram client with StringSession
    session_string = asyncio.run(load_session_string())
    telegram_client = TelegramClient(StringSession(session_string), TELEGRAM_API_ID, TELEGRAM_API_HASH)

    # Start the Telegram client
    asyncio.run(start_telegram_client())

    # Get data from all specified Telegram channels
    all_telegram_data = get_telegram_channel_data(TELEGRAM_CHANNELS)

    # Flatten the list of lists
    all_telegram_data = [item for sublist in all_telegram_data for item in sublist]

    # Save data to JSON
    save_to_json(all_telegram_data, "telegram_data.json")

    # Print sample data
    print_sample_data(all_telegram_data)

    print("Scraping completed successfully")

Telegram client started successfully
Channel SecurityAffairs is invalid: No user has "securityaffairs" as username
Skipping invalid channel: SecurityAffairs
Channel CyberScoopNews is invalid: No user has "cyberscoopnews" as username
Skipping invalid channel: CyberScoopNews
Channel ZeroDay_TI is invalid: No user has "zeroday_ti" as username
Skipping invalid channel: ZeroDay_TI
Channel malware_traffic is invalid: No user has "malware_traffic" as username
Skipping invalid channel: malware_traffic
Channel CyberDefenseMagazine is invalid: No user has "cyberdefensemagazine" as username
Skipping invalid channel: CyberDefenseMagazine
Scraping channel: cveNotify
Scraping channel: ctinow
Scraping channel: CyberSecurityTechnologies
Scraping channel: cybersecurity_outlook
Scraping channel: cibsecurity
Scraping channel: thehackernews
Scraping channel: Cyber_Security_Channel
Scraping channel: cloudandcybersecurity
Scraping channel: androidMalware
Scraping channel: DarkfeedNews
Scraping channel: Pent

KeyError: 0