<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Telegram_Search_Source.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
!pip install -q telethon python-dotenv nest_asyncio

In [22]:
import asyncio
import json
import os
import logging
from telethon import TelegramClient, errors
from telethon.sessions import StringSession
from telethon.tl.types import (
    MessageMediaPhoto,
    MessageMediaWebPage,
    MessageMediaDocument,
    MessageMediaPoll,
    MessageMediaGiveaway,
)
from dotenv import load_dotenv
import nest_asyncio

# Apply nest_asyncio to allow nested event loops, so the asyncio.run() calls
# further down can be issued while another loop is already running.
# NOTE(review): presumably required because this file is executed as a
# notebook, which already runs an event loop - confirm.
nest_asyncio.apply()

# Load environment variables (python-dotenv reads them from a .env file when
# one is present).
load_dotenv()

# Telegram API credentials.
# Values are taken from the environment variables TELEGRAM_API_ID and
# TELEGRAM_API_HASH (load_dotenv() has already merged any .env file into the
# environment at this point). Unset or empty variables fall back to the
# literals below so existing setups keep working.
# SECURITY NOTE: the fallback literals are real-looking credentials committed
# to source; they should be rotated and removed once the environment
# variables are configured.
TELEGRAM_API_ID = int(os.getenv("TELEGRAM_API_ID") or 20441646)
TELEGRAM_API_HASH = os.getenv("TELEGRAM_API_HASH") or "d78a891287e9ba6a2a8c4bb0e4ca506a"

In [23]:
# Telegram Channels to Scrape
# Each entry is a channel username written without a leading "@". The main
# section passes this list to get_telegram_channel_data(), and entries that
# check_channel_validity() cannot resolve are skipped with a warning.
# NOTE(review): the run log recorded at the end of this notebook reports
# SecurityAffairs, CyberScoopNews, ZeroDay_TI, malware_traffic and
# CyberDefenseMagazine as unresolvable ("No user has ... as username");
# consider correcting or removing them.
TELEGRAM_CHANNELS = [
    "cveNotify",
    "ctinow",
    "CyberSecurityTechnologies",
    "cybersecurity_outlook",
    "cibsecurity",
    "thehackernews",
    "Cyber_Security_Channel",
    "cloudandcybersecurity",
    "androidMalware",
    "DarkfeedNews",
    "PentestingNews",
    "malwr",
    "vxunderground",
    "ransomwatcher",
    "KrebsOnSecurity",
    "SecurityAffairs",
    "CyberScoopNews",
    "CyberWire",
    "ThreatIntelligence",
    "CERTNews",
    "DataBreachToday",
    "InfoSecNews",
    "ZeroDay_TI",
    "OSINT_Tactical",
    "malware_traffic",
    "vulnerability_lab",
    "CyberDefenseMagazine",
    "CyberArk",
    "DarkWebMonitor",
]

In [24]:
# Path of the text file in which the Telethon session string is persisted
SESSION_FILE = "telegram_session.txt"

# Root logging setup: INFO and above, timestamp - level - message
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger used by every function in this file
logger = logging.getLogger(__name__)


async def generate_session_string():
    """Create a fresh session string, persist it to SESSION_FILE and return it.

    A client is opened on an empty ``StringSession``; while it is open the
    session is serialized with ``client.session.save()`` and written to disk.
    NOTE(review): entering the client context presumably triggers Telethon's
    interactive login - confirm.
    """
    logger.info("Generating a new session string...")
    client = TelegramClient(StringSession(), TELEGRAM_API_ID, TELEGRAM_API_HASH)
    async with client:
        session_string = client.session.save()
        with open(SESSION_FILE, "w") as session_file:
            session_file.write(session_string)
        logger.info(f"Session string saved to {SESSION_FILE}")
    return session_string


async def load_session_string():
    """Return the stored session string, generating one when none is stored.

    When SESSION_FILE exists its contents are returned with surrounding
    whitespace stripped; otherwise the result of generate_session_string()
    is returned.
    """
    if not os.path.exists(SESSION_FILE):
        return await generate_session_string()

    with open(SESSION_FILE, "r") as session_file:
        stored = session_file.read()
    return stored.strip()


async def start_telegram_client():
    """Connect the module-level ``telegram_client`` and make sure it is logged in.

    If the stored session is not authorized, an interactive login is run and
    the resulting session string is written to SESSION_FILE so the next run
    can reuse it.

    Any exception is logged and swallowed (the function always returns
    ``None``), so callers must not assume the client is usable afterwards.
    """
    try:
        await telegram_client.connect()
        if not await telegram_client.is_user_authorized():
            logger.info("Session is invalid. Please log in again.")
            await telegram_client.start(phone=lambda: input("Please enter your phone number: "))
            if not await telegram_client.is_user_authorized():
                # Manual fallback login. The phone number must be passed to
                # sign_in() together with the code: sign_in()'s first
                # positional parameter is the phone, so passing only the code
                # would be interpreted as a phone number.
                phone = input("Please enter your phone number: ")
                await telegram_client.send_code_request(phone)
                code = input("Please enter the code you received: ")
                await telegram_client.sign_in(phone, code)
            # Save the new session string
            with open(SESSION_FILE, "w") as f:
                f.write(telegram_client.session.save())
            logger.info("New session string saved.")
        logger.info("Telegram client started successfully")
    except Exception as e:
        logger.error(f"Error starting Telegram client: {e}")


async def check_channel_validity(channel_name: str):
    """Report whether ``telegram_client`` can resolve ``channel_name``.

    Returns True when ``get_entity`` succeeds. Any exception is logged as an
    error and reported as False.
    """
    is_valid = True
    try:
        await telegram_client.get_entity(channel_name)
    except Exception as exc:
        logger.error(f"Channel {channel_name} is invalid: {exc}")
        is_valid = False
    return is_valid


async def _collect_message_media(message, channel_name: str):
    """Download or describe the media attached to a single message.

    Returns a ``(media_urls, media_content)`` pair of lists. Photos and
    documents are downloaded under ``./media/<channel_name>/``; web pages
    contribute their URL; polls and giveaways contribute a descriptive entry.
    Both lists are empty when the message carries no media.
    """
    media_urls = []
    media_content = []
    media = getattr(message, "media", None)
    if not media:
        return media_urls, media_content

    if isinstance(media, MessageMediaPhoto):
        media_path = await telegram_client.download_media(
            media, file=f"./media/{channel_name}/{message.id}.jpg"
        )
        media_urls.append(media_path)
        media_content.append({"image_url": media_path})
    elif isinstance(media, MessageMediaWebPage):
        if hasattr(media.webpage, "url"):
            media_urls.append(media.webpage.url)
            media_content.append({"webpage_url": media.webpage.url})
    elif isinstance(media, MessageMediaDocument):
        # NOTE: every document is stored with a ".doc" suffix regardless of
        # its real type; kept as-is so existing file paths do not change.
        media_path = await telegram_client.download_media(
            media, file=f"./media/{channel_name}/{message.id}.doc"
        )
        media_urls.append(media_path)
        media_content.append({"document_url": media_path})
    elif isinstance(media, MessageMediaPoll):
        # Round-trip through Telethon's own JSON encoder so the entry holds
        # only JSON-safe values; the raw Poll object would make json.dump()
        # fail when the results are saved.
        media_content.append({"poll": json.loads(media.poll.to_json())})
    elif isinstance(media, MessageMediaGiveaway):
        media_content.append({"giveaway": "Giveaway details here"})
    else:
        logger.warning(f"Unhandled media type: {type(media)}")

    return media_urls, media_content


async def scrape_telegram_channel(channel_name: str):
    """Scrape up to 100 recent messages from a Telegram channel.

    Args:
        channel_name: Username of the channel to read.

    Returns:
        A list with one dict per message holding ``channel``, ``id``,
        ``date`` (ISO 8601 string), ``message``, ``media_urls``,
        ``media_content``, ``views`` and ``forwards``. If iteration fails
        part-way, the messages collected up to that point are returned
        (an empty list if none were collected).

    Errors never propagate: a failing message is logged and skipped, and a
    failure of the iteration itself is logged and ends the scrape.
    """
    logger.info(f"Scraping channel: {channel_name}")
    # Defined outside the try so partial results survive a late failure.
    messages = []
    try:
        async for message in telegram_client.iter_messages(channel_name, limit=100):  # Limit to 100 messages
            try:
                try:
                    media_urls, media_content = await _collect_message_media(message, channel_name)
                except errors.FloodWaitError as e:
                    logger.warning(f"Rate limit hit. Waiting for {e.seconds} seconds...")
                    await asyncio.sleep(e.seconds)
                    # Retry once after the mandated wait instead of dropping
                    # the message; a second failure is handled below.
                    media_urls, media_content = await _collect_message_media(message, channel_name)

                messages.append(
                    {
                        # Keeps the origin identifiable once results from
                        # several channels are merged into one list.
                        "channel": channel_name,
                        "id": message.id,
                        "date": message.date.isoformat(),
                        "message": message.message,
                        "media_urls": media_urls,
                        "media_content": media_content,
                        "views": message.views,
                        "forwards": message.forwards,
                    }
                )
            except Exception as e:
                logger.error(f"Error processing message {message.id}: {e}")
                continue
    except Exception as e:
        logger.error(f"Error scraping channel {channel_name}: {e}")

    return messages

In [25]:
async def scrape_all_channels(channels: list):
    """Validate every channel, then scrape the valid ones concurrently.

    Channels are checked one after another with check_channel_validity();
    rejected ones are logged and left out. Returns the ``asyncio.gather``
    result: one list of messages per valid channel, in input order.
    """
    accepted = []
    for name in channels:
        is_ok = await check_channel_validity(name)
        if not is_ok:
            logger.warning(f"Skipping invalid channel: {name}")
            continue
        accepted.append(name)

    pending = [scrape_telegram_channel(name) for name in accepted]
    return await asyncio.gather(*pending)


def save_to_json(data, filename):
    """Save scraped data to a JSON file.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to (over)write.

    Raises:
        TypeError: If ``data`` contains a value json cannot serialize. In
            that case ``filename`` is left untouched.
    """
    # Serialize before opening the file: opening with "w" truncates it, so
    # streaming json.dump() into it would leave a partial file (and destroy a
    # previous good one) whenever serialization fails half-way.
    payload = json.dumps(data, indent=4)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(payload)
    logger.info(f"Data saved to {filename}")


def get_telegram_channel_data(channels: list):
    """Synchronous entry point: run scrape_all_channels() and return its result."""
    return asyncio.run(scrape_all_channels(channels))

In [None]:
# Main execution
if __name__ == "__main__":

    async def _disconnect_client():
        """Close the connection held by the module-level telegram_client."""
        await telegram_client.disconnect()

    # Ensure media directory exists
    os.makedirs("./media", exist_ok=True)
    for channel in TELEGRAM_CHANNELS:
        os.makedirs(f"./media/{channel}", exist_ok=True)

    # Initialize Telegram client with StringSession
    session_string = asyncio.run(load_session_string())
    telegram_client = TelegramClient(StringSession(session_string), TELEGRAM_API_ID, TELEGRAM_API_HASH)

    try:
        # Start the Telegram client
        asyncio.run(start_telegram_client())

        # Get data from all specified Telegram channels
        all_telegram_data = get_telegram_channel_data(TELEGRAM_CHANNELS)

        # Flatten the list of lists (one sublist per scraped channel)
        all_telegram_data = [item for sublist in all_telegram_data for item in sublist]

        # Save data to JSON
        save_to_json(all_telegram_data, "telegram_data.json")

        logger.info("Scraping completed successfully")
    finally:
        # Always release the connection, even when scraping or saving failed;
        # previously the client was left connected at the end of the run.
        asyncio.run(_disconnect_client())

ERROR:__main__:Channel SecurityAffairs is invalid: No user has "securityaffairs" as username
ERROR:__main__:Channel CyberScoopNews is invalid: No user has "cyberscoopnews" as username
ERROR:__main__:Channel ZeroDay_TI is invalid: No user has "zeroday_ti" as username
ERROR:__main__:Channel malware_traffic is invalid: No user has "malware_traffic" as username
ERROR:__main__:Channel CyberDefenseMagazine is invalid: No user has "cyberdefensemagazine" as username
ERROR:__main__:Error processing message 4432: Request was unsuccessful 6 time(s)
