<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [1]:
!pip install -qU yt-dlp aiohttp decord pandas pillow soundfile tqdm crawl4ai[all] nest_asyncio playwright telethon

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m102.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m96.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 MB[0m [31m17.8 MB/s[0m eta [36

In [26]:
import os
import asyncio
import aiohttp
import json
import subprocess
from datetime import datetime
import pandas as pd
import yt_dlp
from tqdm.asyncio import tqdm
from playwright.async_api import async_playwright

# Import crawl4ai components (if available)
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# Import Telethon for Telegram scraping
from telethon import TelegramClient

#########################################
# Setup Directories and Global Variables
#########################################
BASE_DIR = os.getcwd()
SCRAPED_URLS_DIR = os.path.join(BASE_DIR, 'scraped_urls')
IMAGES_DIR      = os.path.join(BASE_DIR, 'deepfake_images')
VIDEOS_DIR      = os.path.join(BASE_DIR, 'deepfake_videos')
AUDIOS_DIR      = os.path.join(BASE_DIR, 'deepfake_audios')
TEXT_DIR        = os.path.join(BASE_DIR, 'deepfake_texts')
TELEGRAM_DIR    = os.path.join(BASE_DIR, 'telegram_messages')
METADATA_FILE   = os.path.join(BASE_DIR, 'download_metadata.json')

for d in [SCRAPED_URLS_DIR, IMAGES_DIR, VIDEOS_DIR, AUDIOS_DIR, TEXT_DIR, TELEGRAM_DIR]:
    os.makedirs(d, exist_ok=True)

download_metadata = []  # Global metadata list

#########################################
# Helper: Ensure Playwright Browsers Are Installed
#########################################
def install_playwright_browsers():
    try:
        print("[SYSTEM] Installing Playwright browsers...")
        subprocess.run(["playwright", "install"], check=True)
        print("[SYSTEM] Playwright browsers installed.")
    except Exception as e:
        print(f"[SYSTEM] Error installing Playwright browsers: {e}")

In [27]:
#########################################
# 1. URL Scraping Functions using crawl4ai
#########################################
async def scrape_media_urls(modality, page_url, parse_function, output_filename):
    """
    Scrape media URLs from a given page URL using crawl4ai's AsyncWebCrawler.
    Saves URLs to CSV and returns a unique list.
    """
    print(f"[{modality.upper()}] Scraping URLs from {page_url}")
    schema = {"extracted": list}
    extraction_strategy = JsonCssExtractionStrategy(parse_function=parse_function, schema=schema)
    crawler = AsyncWebCrawler(start_urls=[page_url], extraction_strategy=extraction_strategy)
    try:
        results = await crawler.start()
    except Exception as e:
        print(f"[{modality.upper()}] Error crawling {page_url}: {e}")
        results = []

    scraped_urls = []
    for result in results:
        scraped_urls.extend(result.get('extracted', []))
    scraped_urls = list({url for url in scraped_urls if url})
    df = pd.DataFrame({'url': scraped_urls})
    df.to_csv(output_filename, index=False)
    print(f"[{modality.upper()}] Saved {len(scraped_urls)} URLs to {output_filename}")
    return scraped_urls

# Standard parse functions:
def parse_image_links(response):
    return response.css("img::attr(src)").getall()

def parse_video_links(response):
    return response.css("a::attr(href)").getall() or response.css("video::attr(src)").getall()

def parse_audio_links(response):
    return response.css("audio::attr(src)").getall()

def parse_text_links(response):
    return response.css("a::attr(href)").getall()

In [28]:
#########################################
# 2. Functions to Load URLs (with Fallbacks)
#########################################
def load_scraped_urls(modality):
    filepath = os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv")
    if os.path.exists(filepath):
        df = pd.read_csv(filepath)
        urls = df['url'].dropna().tolist()
        print(f"[{modality.upper()}] Loaded {len(urls)} URLs from {filepath}")
        return urls
    else:
        print(f"[{modality.upper()}] File {filepath} not found. Using fallback method.")
        return []

# Fallback function that avoids general nature images by using a keyword for manipulated media
def fallback_image_urls(n=20, seed_keyword="deepfake"):
    return [f"https://picsum.photos/seed/{seed_keyword}{i}/600/400" for i in range(n)]

def fallback_video_urls(n=20):
    return []  # Let yt-dlp fallback handle this

def fallback_audio_urls(n=20):
    return []

def fallback_text_urls(n=20):
    sample_articles = [
        "https://www.theguardian.com/technology/2024/oct/16/its-not-me-its-just-my-face-the-models-who-found-their-likenesses-had-been-used-in-ai-propaganda",
        "https://www.wired.com/story/generative-ai-deepfakes/",
        "https://www.ft.com/content/7f22ce59-1c6c-4d84-bca8-dc539992e286",
        "https://nypost.com/2024/07/17/lifestyle/how-people-are-being-tricked-by-deepfake-doctor-videos/",
        "https://www.openfox.com/deepfakes-and-their-impact-on-society/"
    ]
    return (sample_articles * ((n // len(sample_articles)) + 1))[:n]

In [29]:
#########################################
# 3. Asynchronous Download Functions with Retry Logic
#########################################
async def download_file(session, url, filename, modality, retries=3):
    for attempt in range(1, retries + 1):
        try:
            async with session.get(url) as resp:
                if resp.status == 200:
                    content = await resp.read()
                    with open(filename, "wb") as f:
                        f.write(content)
                    print(f"[{modality.upper()}] Downloaded: {filename}")
                    download_metadata.append({
                        "modality": modality,
                        "filename": filename,
                        "url": url,
                        "download_time": datetime.now().isoformat()
                    })
                    return
                else:
                    print(f"[{modality.upper()}] Attempt {attempt}: Status {resp.status} for {url}")
        except Exception as e:
            print(f"[{modality.upper()}] Attempt {attempt}: Exception for {url}: {e}")
        await asyncio.sleep(1)
    print(f"[{modality.upper()}] Failed to download {url} after {retries} attempts.")

async def download_images(urls):
    print("[IMAGE] Starting image downloads...")
    async with aiohttp.ClientSession() as session:
        tasks = []
        for idx, url in enumerate(urls):
            filename = os.path.join(IMAGES_DIR, f"deepfake_image_{idx+1:03d}.jpg")
            tasks.append(download_file(session, url, filename, modality="image"))
        await asyncio.gather(*tasks)

async def download_audios(urls):
    if not urls:
        print("[AUDIO] No audio URLs provided for download.")
        return
    print("[AUDIO] Starting audio downloads...")
    async with aiohttp.ClientSession() as session:
        tasks = []
        for idx, url in enumerate(urls):
            filename = os.path.join(AUDIOS_DIR, f"deepfake_audio_{idx+1:03d}.mp3")
            tasks.append(download_file(session, url, filename, modality="audio"))
        await asyncio.gather(*tasks)

async def download_texts(urls):
    if not urls:
        print("[TEXT] No text URLs provided for download.")
        return
    print("[TEXT] Starting text downloads...")
    async with aiohttp.ClientSession() as session:
        tasks = []
        for idx, url in enumerate(urls):
            filename = os.path.join(TEXT_DIR, f"deepfake_text_{idx+1:03d}.html")
            tasks.append(download_file(session, url, filename, modality="text"))
        await asyncio.gather(*tasks)

def download_videos(video_urls, num_videos=20):
    if video_urls:
        download_list = video_urls[:num_videos]
        print("[VIDEO] Downloading videos from scraped URLs...")
        for url in download_list:
            ydl_opts = {
                'format': 'bestvideo+bestaudio/best',
                'outtmpl': os.path.join(VIDEOS_DIR, '%(id)s.%(ext)s'),
                'merge_output_format': 'mp4',
                'quiet': True,
                'no_warnings': True,
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                try:
                    ydl.download([url])
                    download_metadata.append({
                        "modality": "video",
                        "filename": url,
                        "url": url,
                        "download_time": datetime.now().isoformat()
                    })
                    print(f"[VIDEO] Downloaded: {url}")
                except Exception as e:
                    print(f"[VIDEO] Error downloading {url}: {e}")
    else:
        search_query = "ytsearch20:deepfake compilation"
        ydl_opts = {
            'format': 'bestvideo+bestaudio/best',
            'outtmpl': os.path.join(VIDEOS_DIR, '%(id)s.%(ext)s'),
            'merge_output_format': 'mp4',
            'quiet': True,
            'no_warnings': True,
        }
        print("[VIDEO] No scraped video URLs; using yt-dlp search fallback...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            try:
                ydl.download([search_query])
                print("[VIDEO] Fallback video downloads complete.")
            except Exception as e:
                print(f"[VIDEO] yt-dlp search fallback error: {e}")

In [30]:
#########################################
# 4. Audio Extraction via Playwright (Fallback)
#########################################
async def extract_audio_links():
    print("[AUDIO] Extracting audio URLs via Playwright fallback...")
    extracted_links = []
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()
            url = "https://uberduck.ai/explore"
            await page.goto(url)
            await page.wait_for_timeout(5000)
            audio_elements = await page.query_selector_all("audio")
            for el in audio_elements:
                src = await el.get_attribute("src")
                if src:
                    extracted_links.append(src)
            await browser.close()
            print(f"[AUDIO] Extracted {len(extracted_links)} audio URLs via Playwright.")
    except Exception as e:
        if "Executable doesn't exist" in str(e):
            install_playwright_browsers()
        print(f"[AUDIO] Playwright extraction error: {e}")
    return extracted_links

In [31]:
#########################################
# 5. Telegram Scraping Using Telethon
#########################################
# Updated configuration with actual Telegram channel usernames.
# Ensure that you have valid API credentials and that these channels are accessible.
TELEGRAM_CONFIG = {
    "api_id": 123456,               # Replace with your actual Telegram API ID
    "api_hash": "YOUR_API_HASH",    # Replace with your actual Telegram API Hash
    "phone": "+111111111111",        # Replace with your phone number (with country code)
    "channels": [
        "deepfakesarchive",       # e.g., t.me/deepfakesarchive
        "deepfakes",              # e.g., t.me/deepfakes
        "fakemediaforensics"      # e.g., t.me/fakemediaforensics
    ]
}

async def scrape_telegram_messages(config):
    """
    Scrape messages from a list of Telegram channels using Telethon and save them to a CSV file.
    """
    print("[TELEGRAM] Starting Telegram scraping...")
    client = TelegramClient('telegram_session', config["api_id"], config["api_hash"])
    await client.start(phone=config["phone"])

    all_messages = []
    # Iterate over each channel in the provided list
    for channel_identifier in config["channels"]:
        try:
            channel = await client.get_entity(channel_identifier)
            print(f"[TELEGRAM] Scraping channel: {channel_identifier}")
            messages_data = []
            async for message in client.iter_messages(channel, limit=100):  # Adjust limit as needed
                messages_data.append({
                    "channel": channel_identifier,
                    "message_id": message.id,
                    "date": message.date.isoformat() if message.date else None,
                    "sender_id": message.sender_id,
                    "text": message.text
                })
            print(f"[TELEGRAM] Retrieved {len(messages_data)} messages from {channel_identifier}")
            all_messages.extend(messages_data)
        except Exception as e:
            print(f"[TELEGRAM] Error scraping {channel_identifier}: {e}")

    # Save the aggregated messages to a CSV file
    df = pd.DataFrame(all_messages)
    telegram_filename = os.path.join(TELEGRAM_DIR, "telegram_messages.csv")
    df.to_csv(telegram_filename, index=False, encoding='utf-8')
    print(f"[TELEGRAM] Saved {len(all_messages)} messages to {telegram_filename}")
    await client.disconnect()
    return all_messages

In [32]:
#########################################
# 6. Save Download Metadata to JSON
#########################################
def save_download_metadata(metadata_list, filename=METADATA_FILE):
    try:
        with open(filename, 'w') as f:
            json.dump(metadata_list, f, indent=4)
        print(f"[SYSTEM] Download metadata saved to {filename}")
    except Exception as e:
        print(f"[SYSTEM] Error saving metadata: {e}")

In [33]:
#########################################
# 7. Main Pipeline with Expanded Modalities (including diverse sources and Telegram channels)
#########################################
async def main_pipeline():
    # Define source pages for various modalities using publicly available forensic datasets.
    pages = {
        "politician_manipulated_images": "https://github.com/ondyari/FaceForensics",   # FaceForensics++ (manipulated faces)
        "celebrity_manipulated_videos": "https://sites.google.com/view/celebdf",         # Celeb-DF dataset page
        "human_manipulation_audios": "https://www.asvspoof.org/2019/index.html",         # ASVspoof 2019 dataset page
        "deepfake_voice_detection": "https://www.kaggle.com/datasets/search?query=deepfake+voice",
        "deepfake_video_detection": "https://www.kaggle.com/datasets/search?query=deepfake+video+detection",
        "ai_generated_images": "https://thiswaifudoesnotexist.net/"
    }

    # Process each modality:
    for modality, page_url in pages.items():
        if any(key in modality for key in ["image", "photoshop", "face", "ai_generated"]):
            urls = await scrape_media_urls(modality, page_url, parse_image_links,
                                           os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv"))
        elif "video" in modality:
            urls = await scrape_media_urls(modality, page_url, parse_video_links,
                                           os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv"))
        elif "audio" in modality or "voice" in modality:
            urls = await scrape_media_urls(modality, page_url, parse_audio_links,
                                           os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv"))
        elif "text" in modality:
            urls = await scrape_media_urls(modality, page_url, parse_text_links,
                                           os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv"))
        else:
            urls = []

        # Use fallback only if scraping yielded no URLs
        if not urls:
            print(f"[{modality.upper()}] No scraped URLs; using fallback.")
            if any(key in modality for key in ["image", "photoshop", "face", "ai_generated"]):
                seed = modality  # the modality itself implies manipulated media
                urls = fallback_image_urls(20, seed_keyword=seed)
            elif "video" in modality:
                urls = fallback_video_urls(20)
            elif "audio" in modality:
                urls = fallback_audio_urls(20)
            elif "text" in modality:
                urls = fallback_text_urls(20)
        else:
            urls = urls[:20]

        # Download files based on modality
        if any(key in modality for key in ["image", "photoshop", "face", "ai_generated"]):
            await download_images(urls)
        elif "video" in modality:
            download_videos(urls, num_videos=20)
        elif "audio" in modality or "voice" in modality:
            await download_audios(urls)
        elif "text" in modality:
            await download_texts(urls)
        else:
            print(f"[{modality.upper()}] Modality not recognized for downloading.")

    # Extra audio extraction via Playwright fallback
    extra_audio_urls = await extract_audio_links()
    if extra_audio_urls:
        await download_audios(extra_audio_urls)

    # ----- New Telegram Scraping Step -----
    print("[TELEGRAM] Scraping Telegram messages from target channels...")
    await scrape_telegram_messages(TELEGRAM_CONFIG)

    # Save all download metadata
    save_download_metadata(download_metadata)
    print("[SYSTEM] All downloads complete. Check the respective directories for files.")

#########################################
# Entry Point with Fixes for Running Event Loop
#########################################
import nest_asyncio
nest_asyncio.apply()

try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    loop = None

if loop and loop.is_running():
    print("[SYSTEM] Detected running event loop. Using loop.run_until_complete()...")
    loop.run_until_complete(main_pipeline())
else:
    asyncio.run(main_pipeline())

[SYSTEM] Detected running event loop. Using loop.run_until_complete()...
[POLITICIAN_MANIPULATED_IMAGES] Scraping URLs from https://github.com/ondyari/FaceForensics
[INIT].... → Crawl4AI 0.4.248


TypeError: 'AsyncWebCrawler' object is not iterable