<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [18]:
!pip install -qU yt-dlp aiohttp decord pandas pillow soundfile tqdm crawl4ai[all] nest_asyncio playwright

In [19]:
# Import and apply nest_asyncio (useful in notebooks)
import nest_asyncio
nest_asyncio.apply()

# Standard libraries
import os
import asyncio
import aiohttp
import json
import time
from datetime import datetime

# Third-party libraries
import pandas as pd
import yt_dlp
from tqdm.asyncio import tqdm
from playwright.async_api import async_playwright

# crawl4ai components
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

#########################################
# Setup Directories and Global Variables
#########################################
BASE_DIR = os.getcwd()
SCRAPED_URLS_DIR = os.path.join(BASE_DIR, 'scraped_urls')
IMAGES_DIR      = os.path.join(BASE_DIR, 'deepfake_images')
VIDEOS_DIR      = os.path.join(BASE_DIR, 'deepfake_videos')
AUDIOS_DIR      = os.path.join(BASE_DIR, 'deepfake_audios')
METADATA_FILE   = os.path.join(BASE_DIR, 'download_metadata.json')

for d in [SCRAPED_URLS_DIR, IMAGES_DIR, VIDEOS_DIR, AUDIOS_DIR]:
    os.makedirs(d, exist_ok=True)

# Global list to hold download metadata
download_metadata = []

In [22]:
#########################################
# 1. Define URL Scraping Functions Using crawl4ai
#########################################
async def scrape_media_urls(modality, page_url, parse_function, output_filename):
    """
    Scrape media URLs from a given page URL using crawl4ai's AsyncWebCrawler and a CSS extraction strategy.
    Saves the scraped URLs as a CSV file and returns a list of unique URLs.
    """
    print(f"[{modality.upper()}] Scraping URLs from {page_url}")
    extraction_strategy = JsonCssExtractionStrategy(parse_function=parse_function)
    crawler = AsyncWebCrawler(start_urls=[page_url], extraction_strategy=extraction_strategy)

    try:
        # Run the crawler. The API is assumed to return a list of results.
        results = await crawler.run()
    except Exception as e:
        print(f"[{modality.upper()}] Error during crawling {page_url}: {e}")
        results = []

    scraped_urls = []
    # Assuming each result is a dict that contains the key "extracted" with the URLs.
    for result in results:
        scraped_urls.extend(result.get('extracted', []))

    # Remove duplicates and empty values
    scraped_urls = list({url for url in scraped_urls if url})

    # Save to CSV
    df = pd.DataFrame({'url': scraped_urls})
    df.to_csv(output_filename, index=False)
    print(f"[{modality.upper()}] Saved {len(scraped_urls)} URLs to {output_filename}")
    return scraped_urls

# Define parse functions (adjust CSS selectors based on the actual page structure)
def parse_image_links(response):
    # Example: extract src attribute from images with class "deepfake-img"
    return response.css("img.deepfake-img::attr(src)").getall()

def parse_video_links(response):
    # Example: extract href attribute from anchors with class "deepfake-video"
    return response.css("a.deepfake-video::attr(href)").getall()

def parse_audio_links(response):
    # Example: extract src attribute from audio tags with class "deepfake-audio"
    return response.css("audio.deepfake-audio::attr(src)").getall()

In [23]:
#########################################
# 2. Define Functions to Load URLs (with Fallbacks)
#########################################
def load_scraped_urls(modality):
    filepath = os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv")
    if os.path.exists(filepath):
        df = pd.read_csv(filepath)
        urls = df['url'].dropna().tolist()
        print(f"[{modality.upper()}] Loaded {len(urls)} URLs from {filepath}")
        return urls
    else:
        print(f"[{modality.upper()}] File {filepath} not found. Using fallback method.")
        return []

# Fallback generators if scraping fails
def fallback_image_urls(n=20):
    # For images, use a site that returns a new image every time
    return ["https://thispersondoesnotexist.com/image"] * n

def fallback_video_urls(n=20):
    # Leave empty so that we trigger the yt-dlp search fallback later
    return []

def fallback_audio_urls(n=20):
    # Leave empty so that we can extract using Playwright later
    return []

In [24]:
#########################################
# 3. Define Asynchronous Download Functions with Retries & Metadata Logging
#########################################
async def download_file(session, url, filename, modality, retries=3):
    """
    Download a file (image or audio) asynchronously using aiohttp with simple retry logic.
    Saves the file to disk and logs download metadata.
    """
    for attempt in range(1, retries + 1):
        try:
            async with session.get(url) as resp:
                if resp.status == 200:
                    content = await resp.read()
                    with open(filename, "wb") as f:
                        f.write(content)
                    print(f"[{modality.upper()}] Downloaded: {filename}")

                    # Record metadata
                    download_metadata.append({
                        "modality": modality,
                        "filename": filename,
                        "url": url,
                        "download_time": datetime.now().isoformat()
                    })
                    return
                else:
                    print(f"[{modality.upper()}] Attempt {attempt}: Failed to download {url} (Status {resp.status})")
        except Exception as e:
            print(f"[{modality.upper()}] Attempt {attempt}: Exception downloading {url}: {e}")
        await asyncio.sleep(1)  # Wait a bit before retrying
    print(f"[{modality.upper()}] Failed to download {url} after {retries} attempts.")

async def download_images(urls):
    print("[IMAGE] Starting asynchronous image downloads...")
    async with aiohttp.ClientSession() as session:
        tasks = []
        for idx, url in enumerate(urls):
            filename = os.path.join(IMAGES_DIR, f"image_{idx+1:02d}.jpg")
            tasks.append(download_file(session, url, filename, modality="image"))
        await asyncio.gather(*tasks)

async def download_audios(urls):
    if not urls:
        print("[AUDIO] No audio URLs provided for download.")
        return
    print("[AUDIO] Starting asynchronous audio downloads...")
    async with aiohttp.ClientSession() as session:
        tasks = []
        for idx, url in enumerate(urls):
            filename = os.path.join(AUDIOS_DIR, f"audio_{idx+1:02d}.mp3")
            tasks.append(download_file(session, url, filename, modality="audio"))
        await asyncio.gather(*tasks)

In [25]:
#########################################
# 4. Define Synchronous Video Download Function Using yt-dlp
#########################################
def download_videos(video_urls, num_videos=20):
    """
    Download videos using yt-dlp. If no URLs are provided, use yt-dlp's search fallback.
    """
    if video_urls:
        download_list = video_urls[:num_videos]
        print("[VIDEO] Downloading videos from scraped URLs...")
        for url in download_list:
            ydl_opts = {
                'format': 'bestvideo+bestaudio/best',
                'outtmpl': os.path.join(VIDEOS_DIR, '%(id)s.%(ext)s'),
                'merge_output_format': 'mp4',
                'quiet': True,
                'no_warnings': True,
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                try:
                    ydl.download([url])
                    # Record a basic metadata record
                    download_metadata.append({
                        "modality": "video",
                        "filename": url,  # filename extraction would require additional parsing
                        "url": url,
                        "download_time": datetime.now().isoformat()
                    })
                    print(f"[VIDEO] Downloaded: {url}")
                except Exception as e:
                    print(f"[VIDEO] Error downloading {url}: {e}")
    else:
        # Fallback using yt-dlp search
        search_query = "ytsearch20:deepfake compilation"
        ydl_opts = {
            'format': 'bestvideo+bestaudio/best',
            'outtmpl': os.path.join(VIDEOS_DIR, '%(id)s.%(ext)s'),
            'merge_output_format': 'mp4',
            'quiet': True,
            'no_warnings': True,
        }
        print("[VIDEO] No scraped video URLs found; using yt-dlp search fallback...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            try:
                ydl.download([search_query])
                print("[VIDEO] Video downloads (fallback search) complete.")
            except Exception as e:
                print(f"[VIDEO] Error with yt-dlp search fallback: {e}")

In [26]:
#########################################
# 5. Define Audio Extraction Using Playwright as a Fallback
#########################################
async def extract_audio_links():
    """
    Use Playwright to extract audio URLs from a dynamic page (as a fallback if no scraped audio URLs exist).
    Adjust the CSS selectors as needed.
    """
    print("[AUDIO] Extracting audio URLs via Playwright fallback...")
    extracted_links = []
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()
            # Example URL; adjust to a real page with audio samples.
            url = "https://uberduck.ai/explore"
            await page.goto(url)
            # Wait for dynamic content to load (adjust timeout as needed)
            await page.wait_for_timeout(5000)

            # Extract audio elements; adjust the selector as needed.
            audio_elements = await page.query_selector_all("audio")
            for el in audio_elements:
                src = await el.get_attribute("src")
                if src:
                    extracted_links.append(src)
            await browser.close()
            print(f"[AUDIO] Extracted {len(extracted_links)} audio URLs via Playwright.")
    except Exception as e:
        print(f"[AUDIO] Error during Playwright audio extraction: {e}")
    return extracted_links

In [27]:
#########################################
# 6. Save Download Metadata to a JSON File
#########################################
def save_download_metadata(metadata_list, filename=METADATA_FILE):
    try:
        with open(filename, 'w') as f:
            json.dump(metadata_list, f, indent=4)
        print(f"Download metadata saved to {filename}")
    except Exception as e:
        print(f"Error saving download metadata: {e}")

In [28]:
#########################################
# 7. Main Pipeline
#########################################
async def main_pipeline():
    # --- Configuration: Define the source pages for each modality ---
    pages = {
        "images": "https://deepfakesampleimages.com/gallery",   # Change to actual URL
        "videos": "https://deepfakesamplevideos.com/collection",  # Change to actual URL
        "audio":  "https://deepfakesampleaudio.com/samples"       # Change to actual URL
    }

    # --- Scrape URLs Using crawl4ai ---
    image_urls = await scrape_media_urls("image", pages["images"], parse_image_links,
                                           os.path.join(SCRAPED_URLS_DIR, "image_urls.csv"))
    video_urls = await scrape_media_urls("video", pages["videos"], parse_video_links,
                                           os.path.join(SCRAPED_URLS_DIR, "video_urls.csv"))
    audio_urls = await scrape_media_urls("audio", pages["audio"], parse_audio_links,
                                           os.path.join(SCRAPED_URLS_DIR, "audio_urls.csv"))

    # --- Load URLs (if available); otherwise, fall back ---
    if not image_urls:
        image_urls = fallback_image_urls(20)
    else:
        image_urls = image_urls[:20]

    if not video_urls:
        video_urls = fallback_video_urls(20)
    else:
        video_urls = video_urls[:20]

    if not audio_urls:
        audio_urls = fallback_audio_urls(20)
    else:
        audio_urls = audio_urls[:20]

    # --- Download Images Asynchronously ---
    await download_images(image_urls)

    # --- Download Videos Synchronously via yt-dlp ---
    download_videos(video_urls, num_videos=20)

    # --- Download Audios ---
    # If audio_urls is empty, attempt extraction using Playwright fallback.
    if not audio_urls:
        audio_urls = await extract_audio_links()
    await download_audios(audio_urls)

    # --- Save Download Metadata ---
    save_download_metadata(download_metadata)
    print("All downloads complete. Check the respective directories for files.")


# Run the entire pipeline.
if __name__ == '__main__':
    asyncio.run(main_pipeline())

[IMAGE] Scraping URLs from https://deepfakesampleimages.com/gallery


TypeError: JsonCssExtractionStrategy.__init__() missing 1 required positional argument: 'schema'