<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [1]:
!pip install -qU yt-dlp aiohttp decord pandas pillow soundfile tqdm crawl4ai[all] nest_asyncio playwright

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import nest_asyncio
nest_asyncio.apply()

import os
import asyncio
import aiohttp
import json
import subprocess
from datetime import datetime

import pandas as pd
import yt_dlp
from tqdm.asyncio import tqdm
from playwright.async_api import async_playwright

# Import crawl4ai components
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

#########################################
# Setup Directories and Global Variables
#########################################
# All output locations are resolved against the directory the process was started in.
BASE_DIR = os.getcwd()

# Output folders for scraped URL lists and for each media type, plus the JSON log path.
SCRAPED_URLS_DIR = os.path.join(BASE_DIR, "scraped_urls")
IMAGES_DIR = os.path.join(BASE_DIR, "deepfake_images")
VIDEOS_DIR = os.path.join(BASE_DIR, "deepfake_videos")
AUDIOS_DIR = os.path.join(BASE_DIR, "deepfake_audios")
METADATA_FILE = os.path.join(BASE_DIR, "download_metadata.json")

# Create the folders up front; exist_ok=True makes re-running this cell harmless.
for d in (SCRAPED_URLS_DIR, IMAGES_DIR, VIDEOS_DIR, AUDIOS_DIR):
    os.makedirs(d, exist_ok=True)

# Module-level list the download functions append one metadata dict to per saved file.
download_metadata = list()

In [3]:
#########################################
# Helper: Ensure Playwright Browsers Are Installed
#########################################
def install_playwright_browsers():
    """
    Download the browser binaries Playwright needs (best effort).

    Runs ``<current interpreter> -m playwright install`` so the installer comes
    from the same Python environment this code is executing in, rather than
    from whichever ``playwright`` script happens to be first on PATH (or is
    missing from PATH altogether).

    Returns:
        True when the installer exits successfully, False otherwise. Failures
        are reported on stdout and are never raised to the caller.
    """
    import sys  # local import: only this helper needs the interpreter path

    print("[SYSTEM] Installing Playwright browsers...")
    try:
        subprocess.run([sys.executable, "-m", "playwright", "install"], check=True)
    except Exception as e:
        # Deliberately broad: installation is optional and must not abort the pipeline.
        print(f"[SYSTEM] Error installing Playwright browsers: {e}")
        return False
    print("[SYSTEM] Playwright browsers installed.")
    return True

In [4]:
#########################################
# 1. Define URL Scraping Functions Using crawl4ai
#########################################
def _unique_urls_from_results(results):
    """Flatten the 'extracted' lists of dict results into unique, non-empty URL strings (first-seen order)."""
    if not isinstance(results, (list, tuple)):
        return []
    ordered = {}
    for result in results:
        if not isinstance(result, dict):
            continue
        extracted = result.get('extracted') or []
        if not isinstance(extracted, (list, tuple)):
            continue
        for url in extracted:
            if isinstance(url, str) and url:
                # dict keys keep insertion order, so de-duplication is deterministic.
                ordered.setdefault(url, None)
    return list(ordered)


async def scrape_media_urls(modality, page_url, parse_function, output_filename):
    """
    Scrape media URLs from a page with crawl4ai and persist them as a CSV file.

    Args:
        modality: Label used as the upper-cased log prefix (e.g. "image").
        page_url: Page to crawl.
        parse_function: Callable handed to the extraction strategy.
        output_filename: Path of the CSV file (single ``url`` column) to write.

    Returns:
        List of unique, non-empty URLs in first-seen order. Crawl failures are
        logged and yield an empty list; the CSV is written in either case.
    """
    tag = f"[{modality.upper()}]"
    print(f"{tag} Scraping URLs from {page_url}")

    # Define a simple extraction schema.
    schema = {"extracted": list}

    extraction_strategy = JsonCssExtractionStrategy(
        parse_function=parse_function,
        schema=schema
    )

    # NOTE(review): these constructor arguments and the `start()` call below should be
    # verified against the installed crawl4ai version -- TODO confirm the supported API.
    crawler = AsyncWebCrawler(start_urls=[page_url], extraction_strategy=extraction_strategy)

    try:
        # Use the new method `start()` (adjust if your crawl4ai version differs)
        results = await crawler.start()
    except Exception as e:
        print(f"{tag} Error during crawling {page_url}: {e}")
        results = []

    if not isinstance(results, (list, tuple)):
        # Anything other than a list of results cannot be iterated for URLs;
        # treat it as "nothing scraped" so callers can fall back instead of crashing.
        print(f"{tag} Crawler returned no result list for {page_url}; treating as 0 URLs.")
        results = []

    # Remove duplicates and empty values while keeping a stable order, so that
    # callers truncating the list always keep the same leading entries.
    scraped_urls = _unique_urls_from_results(results)

    # Save URLs to CSV for future reference
    df = pd.DataFrame({'url': scraped_urls})
    df.to_csv(output_filename, index=False)
    print(f"{tag} Saved {len(scraped_urls)} URLs to {output_filename}")
    return scraped_urls

# Define parse functions (adjust CSS selectors to match your target deepfake pages)
def parse_image_links(response):
    """
    Return every match of the image selector on *response*.

    Example selector: the ``src`` attribute of images with class "deepfake-img".
    Assumes *response* offers a ``.css(query)`` method whose result has
    ``.getall()`` -- adjust the selector to the target page.
    """
    image_selection = response.css("img.deepfake-img::attr(src)")
    return image_selection.getall()

def parse_video_links(response):
    """
    Return every match of the video selector on *response*.

    Example selector: the ``href`` attribute of anchors with class "deepfake-video".
    Assumes *response* offers a ``.css(query)`` method whose result has
    ``.getall()`` -- adjust the selector to the target page.
    """
    video_selection = response.css("a.deepfake-video::attr(href)")
    return video_selection.getall()

def parse_audio_links(response):
    """
    Return every match of the audio selector on *response*.

    Example selector: the ``src`` attribute of audio tags with class "deepfake-audio".
    Assumes *response* offers a ``.css(query)`` method whose result has
    ``.getall()`` -- adjust the selector to the target page.
    """
    audio_selection = response.css("audio.deepfake-audio::attr(src)")
    return audio_selection.getall()

In [5]:
#########################################
# 2. Define Functions to Load URLs (with Fallbacks)
#########################################
def load_scraped_urls(modality):
    """
    Read previously scraped URLs for *modality* from its CSV in SCRAPED_URLS_DIR.

    The file is expected at ``<SCRAPED_URLS_DIR>/<modality>_urls.csv`` with a
    ``url`` column. Returns the non-null URLs as a list, or an empty list when
    the file does not exist (signalling the caller to use a fallback).
    """
    csv_path = os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv")

    # Guard clause: no cached CSV means nothing to load.
    if not os.path.exists(csv_path):
        print(f"[{modality.upper()}] File {csv_path} not found. Using fallback method.")
        return []

    frame = pd.read_csv(csv_path)
    loaded = frame['url'].dropna().tolist()
    print(f"[{modality.upper()}] Loaded {len(loaded)} URLs from {csv_path}")
    return loaded

# Fallback generators for deepfake-specific media
def fallback_image_urls(n=20):
    """Return *n* picsum.photos URLs (600x400), each seeded with ``deepfake<index>``."""
    url_template = "https://picsum.photos/seed/deepfake{}/600/400"
    return [url_template.format(seed_index) for seed_index in range(n)]

def fallback_video_urls(n=20):
    """
    Return an empty list of video URLs; *n* is accepted but unused.

    An empty result is intentional: it makes the video downloader take its
    yt-dlp search fallback (a deepfake compilation query).
    """
    no_urls = []
    return no_urls

def fallback_audio_urls(n=20):
    """
    Return an empty list of audio URLs; *n* is accepted but unused.

    An empty result is intentional: it leaves audio discovery to the
    Playwright-based extraction fallback.
    """
    no_urls = []
    return no_urls

In [6]:
#########################################
# 3. Define Asynchronous Download Functions with Retries & Metadata Logging
#########################################
async def download_file(session, url, filename, modality, retries=3, retry_delay=1):
    """
    Download a file (image or audio) asynchronously using aiohttp with retry logic.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and an awaitable ``read()``
            (an ``aiohttp.ClientSession`` in this file).
        url: Address to fetch.
        filename: Destination path; written in binary mode on success.
        modality: Label used for the log prefix and stored in the metadata.
        retries: Maximum number of attempts.
        retry_delay: Seconds to wait between attempts (not after the last one).

    Returns:
        None. On success the body is saved and a record is appended to the
        module-level ``download_metadata`` list; failures are only logged.
    """
    tag = f"[{modality.upper()}]"
    for attempt in range(1, retries + 1):
        try:
            async with session.get(url) as resp:
                if resp.status == 200:
                    content = await resp.read()
                    with open(filename, "wb") as f:
                        f.write(content)
                    print(f"{tag} Downloaded: {filename}")
                    download_metadata.append({
                        "modality": modality,
                        "filename": filename,
                        "url": url,
                        "download_time": datetime.now().isoformat()
                    })
                    return
                print(f"{tag} Attempt {attempt}: Failed to download {url} (Status {resp.status})")
        except Exception as e:
            # Best-effort download: any failure counts as one failed attempt.
            print(f"{tag} Attempt {attempt}: Exception downloading {url}: {e}")
        # Only pause when another attempt will follow; waiting after the final
        # failure would just delay the whole batch for no benefit.
        if attempt < retries:
            await asyncio.sleep(retry_delay)
    print(f"{tag} Failed to download {url} after {retries} attempts.")

async def download_images(urls):
    """
    Download every URL in *urls* concurrently into IMAGES_DIR.

    Files are named ``deepfake_image_NN.jpg`` where NN is the 1-based position
    of the URL in *urls*; each download is delegated to ``download_file``.
    """
    print("[IMAGE] Starting asynchronous image downloads...")
    async with aiohttp.ClientSession() as http:
        pending = [
            download_file(
                http,
                image_url,
                os.path.join(IMAGES_DIR, f"deepfake_image_{number:02d}.jpg"),
                modality="image",
            )
            for number, image_url in enumerate(urls, start=1)
        ]
        # One shared session; all downloads are awaited together.
        await asyncio.gather(*pending)

async def download_audios(urls):
    """
    Download every URL in *urls* concurrently into AUDIOS_DIR.

    Returns immediately (after logging) when *urls* is empty. Otherwise files
    are named ``deepfake_audio_NN.mp3`` where NN is the 1-based position of the
    URL in *urls*; each download is delegated to ``download_file``.
    """
    if not urls:
        print("[AUDIO] No audio URLs provided for download.")
        return

    print("[AUDIO] Starting asynchronous audio downloads...")
    async with aiohttp.ClientSession() as http:
        pending = [
            download_file(
                http,
                audio_url,
                os.path.join(AUDIOS_DIR, f"deepfake_audio_{number:02d}.mp3"),
                modality="audio",
            )
            for number, audio_url in enumerate(urls, start=1)
        ]
        # One shared session; all downloads are awaited together.
        await asyncio.gather(*pending)

In [7]:
#########################################
# 4. Define Synchronous Video Download Function Using yt-dlp
#########################################
def download_videos(video_urls, num_videos=20):
    """
    Download videos using yt-dlp, falling back to a yt-dlp search when no URLs are given.

    Args:
        video_urls: Scraped video page URLs; only the first ``num_videos`` are used.
        num_videos: Maximum number of videos to fetch. It also sizes the
            fallback search ("ytsearch<num_videos>:deepfake compilation").

    Returns:
        None. Each successfully downloaded scraped URL is recorded in the
        module-level ``download_metadata`` list; errors are logged and skipped.
    """
    if num_videos < 1:
        # Nothing requested: avoid issuing an empty or invalid search/download.
        print("[VIDEO] num_videos is less than 1; nothing to download.")
        return

    # Shared by both paths: best quality, merged to mp4, saved as <video id>.<ext>.
    ydl_opts = {
        'format': 'bestvideo+bestaudio/best',
        'outtmpl': os.path.join(VIDEOS_DIR, '%(id)s.%(ext)s'),
        'merge_output_format': 'mp4',
        'quiet': True,
        'no_warnings': True,
    }

    if not video_urls:
        # Use yt-dlp fallback search for deepfake compilations, honouring num_videos.
        search_query = f"ytsearch{num_videos}:deepfake compilation"
        print("[VIDEO] No scraped video URLs found; using yt-dlp search fallback...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            try:
                ydl.download([search_query])
                print("[VIDEO] Video downloads (fallback search) complete.")
            except Exception as e:
                print(f"[VIDEO] Error with yt-dlp search fallback: {e}")
        return

    print("[VIDEO] Downloading videos from scraped URLs...")
    # One downloader instance is reused for every URL; each URL is still
    # attempted independently so a single failure does not stop the rest.
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        for url in video_urls[:num_videos]:
            try:
                ydl.download([url])
            except Exception as e:
                print(f"[VIDEO] Error downloading {url}: {e}")
                continue
            download_metadata.append({
                "modality": "video",
                "filename": url,  # You may wish to parse actual filename later.
                "url": url,
                "download_time": datetime.now().isoformat()
            })
            print(f"[VIDEO] Downloaded: {url}")

In [8]:
#########################################
# 5. Define Audio Extraction Using Playwright as a Fallback
#########################################
async def _collect_audio_sources(page_url):
    """Load page_url in headless Chromium and return the unique, absolute ``src`` URLs of its <audio> elements."""
    from urllib.parse import urljoin  # local import: only needed by this helper

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()
            await page.goto(page_url)
            # Fixed pause to give dynamically rendered content time to appear.
            await page.wait_for_timeout(5000)
            found = {}
            # Adjust the selector based on the site's structure
            for el in await page.query_selector_all("audio"):
                src = await el.get_attribute("src")
                if src:
                    # Resolve relative paths against the final page URL so the
                    # result can be fetched directly; dict keys de-duplicate.
                    found.setdefault(urljoin(page.url, src), None)
            return list(found)
        finally:
            # Close the browser even when navigation or scraping fails.
            await browser.close()


async def extract_audio_links():
    """
    Use Playwright to extract deepfake-specific audio URLs from a dynamic page.
    Update the URL and CSS selectors to match a real deepfake audio source.

    If the first attempt fails because the browser executable is missing, the
    Playwright browsers are installed and the extraction is retried once.

    Returns:
        List of audio URLs (possibly empty). Errors are logged, never raised.
    """
    print("[AUDIO] Extracting audio URLs via Playwright fallback...")
    # Update this URL to a site known for synthetic or deepfake audio samples
    url = "https://uberduck.ai/explore"  # Example URL; may need replacement

    for attempt in (1, 2):
        try:
            extracted_links = await _collect_audio_sources(url)
        except Exception as e:
            # Best-effort fallback: report and give up unless a retry can help.
            print(f"[AUDIO] Error during Playwright audio extraction: {e}")
            if attempt == 1 and "Executable doesn't exist" in str(e):
                install_playwright_browsers()
                continue
            return []
        print(f"[AUDIO] Extracted {len(extracted_links)} audio URLs via Playwright.")
        return extracted_links
    return []

In [9]:
#########################################
# 6. Save Download Metadata to a JSON File
#########################################
def save_download_metadata(metadata_list, filename=METADATA_FILE):
    """
    Write *metadata_list* to *filename* as JSON indented by 4 spaces.

    Any error while opening or serialising is caught and reported on stdout;
    nothing is raised and nothing is returned.
    """
    try:
        with open(filename, 'w') as out:
            json.dump(metadata_list, out, indent=4)
    except Exception as err:
        print(f"[SYSTEM] Error saving download metadata: {err}")
    else:
        print(f"[SYSTEM] Download metadata saved to {filename}")

In [10]:
#########################################
# 7. Main Pipeline
#########################################
async def main_pipeline(max_items=20):
    """
    Run the full pipeline: scrape URLs, apply fallbacks, download media, save metadata.

    Args:
        max_items: Upper bound on the number of images, videos and audio files
            handled per modality (default 20).

    The download metadata is written even if a stage raises, so records of
    files that were already fetched are not lost.
    """
    # --- Configuration: Define the source pages for each modality ---
    pages = {
        "images": "https://deepfakesampleimages.com/gallery",   # Replace with a real deepfake image page if available.
        "videos": "https://deepfakesamplevideos.com/collection",  # Replace with a real deepfake video page if available.
        "audio":  "https://deepfakesampleaudio.com/samples"       # Replace with a real deepfake audio page if available.
    }

    try:
        # --- Scrape URLs Using crawl4ai (if pages are accessible) ---
        image_urls = await scrape_media_urls("image", pages["images"], parse_image_links,
                                             os.path.join(SCRAPED_URLS_DIR, "image_urls.csv"))
        video_urls = await scrape_media_urls("video", pages["videos"], parse_video_links,
                                             os.path.join(SCRAPED_URLS_DIR, "video_urls.csv"))
        audio_urls = await scrape_media_urls("audio", pages["audio"], parse_audio_links,
                                             os.path.join(SCRAPED_URLS_DIR, "audio_urls.csv"))

        # --- Use scraped URLs (capped at max_items); otherwise, use fallbacks ---
        if not image_urls:
            print("[IMAGE] No scraped image URLs; using deepfake fallback image URLs.")
            image_urls = fallback_image_urls(max_items)
        else:
            image_urls = image_urls[:max_items]

        if not video_urls:
            print("[VIDEO] No scraped video URLs; using deepfake fallback (yt-dlp search).")
            video_urls = fallback_video_urls(max_items)
        else:
            video_urls = video_urls[:max_items]

        if not audio_urls:
            print("[AUDIO] No scraped audio URLs; using Playwright fallback extraction.")
            audio_urls = fallback_audio_urls(max_items)
        else:
            audio_urls = audio_urls[:max_items]

        # --- Download Images Asynchronously ---
        await download_images(image_urls)

        # --- Download Videos Synchronously via yt-dlp ---
        download_videos(video_urls, num_videos=max_items)

        # --- Download Audios ---
        # If no audio URLs are available from scraping/fallback, try extracting using Playwright.
        if not audio_urls:
            # Apply the same cap as the other modalities to the extracted links.
            audio_urls = (await extract_audio_links())[:max_items]
        await download_audios(audio_urls)
    finally:
        # --- Save Download Metadata (also on failure, to keep partial results) ---
        save_download_metadata(download_metadata)

    print("[SYSTEM] All downloads complete. Check the respective directories for files.")

# Run the entire pipeline.
if __name__ == "__main__":
    # Entry point when executed as a script: drive the async pipeline to completion.
    # nest_asyncio.apply() earlier in this file is what allows asyncio.run to be
    # used when an event loop is already running (e.g. inside a notebook).
    asyncio.run(main_pipeline())

[IMAGE] Scraping URLs from https://deepfakesampleimages.com/gallery
[IMAGE] Error during crawling https://deepfakesampleimages.com/gallery: BrowserType.launch: Executable doesn't exist at /root/.cache/ms-playwright/chromium-1155/chrome-linux/chrome
╔════════════════════════════════════════════════════════════╗
║ Looks like Playwright was just installed or updated.       ║
║ Please run the following command to download new browsers: ║
║                                                            ║
║     playwright install                                     ║
║                                                            ║
║ <3 Playwright Team                                         ║
╚════════════════════════════════════════════════════════════╝
[IMAGE] Saved 0 URLs to /content/scraped_urls/image_urls.csv
[VIDEO] Scraping URLs from https://deepfakesamplevideos.com/collection
[VIDEO] Error during crawling https://deepfakesamplevideos.com/collection: BrowserType.launch: Executable doesn't e