<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [39]:
!pip install -qU yt-dlp aiohttp decord pandas pillow soundfile tqdm crawl4ai[all] nest_asyncio playwright telethon feedparser

In [48]:
import os
import asyncio
import aiohttp
import json
import subprocess
from datetime import datetime
import pandas as pd
import yt_dlp
from tqdm.asyncio import tqdm
from playwright.async_api import async_playwright
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

#########################################
# Setup Directories and Global Variables
#########################################
BASE_DIR = os.getcwd()
SCRAPED_URLS_DIR = os.path.join(BASE_DIR, 'scraped_urls')
IMAGES_DIR = os.path.join(BASE_DIR, 'deepfake_images')
VIDEOS_DIR = os.path.join(BASE_DIR, 'deepfake_videos')
AUDIOS_DIR = os.path.join(BASE_DIR, 'deepfake_audios')
METADATA_FILE = os.path.join(BASE_DIR, 'download_metadata.json')

# Create only the needed directories
for d in [SCRAPED_URLS_DIR, IMAGES_DIR, VIDEOS_DIR, AUDIOS_DIR]:
    os.makedirs(d, exist_ok=True)

download_metadata = []
download_stats = {
    "images": {"count": 0, "sources": {}},
    "videos": {"count": 0, "sources": {}},
    "audios": {"count": 0, "sources": {}},
}

#########################################
# Helper: Ensure Playwright Browsers Are Installed
#########################################
def install_playwright_browsers():
    try:
        print("[SYSTEM] Installing Playwright browsers...")
        subprocess.run(["playwright", "install"], check=True)
        print("[SYSTEM] Playwright browsers installed.")
    except Exception as e:
        print(f"[SYSTEM] Error installing Playwright browsers: {e}")

In [49]:
#########################################
# 1. URL Scraping Functions using Crawl4AI
#########################################
async def scrape_media_urls(modality, page_url, parse_function, output_filename):
    """
    Scrape media URLs from a given page URL using Crawl4AI’s AsyncWebCrawler.
    Saves URLs to CSV and returns a unique list.
    """
    print(f"[{modality.upper()}] Scraping URLs from {page_url}")
    schema = {"extracted": list}
    extraction_strategy = JsonCssExtractionStrategy(parse_function=parse_function, schema=schema)

    crawler_cfg = CrawlerRunConfig(
        exclude_external_links=True,
        exclude_social_media_links=True,
        exclude_external_images=True,
        wait_for_images=True,
        verbose=True
    )

    try:
        async with AsyncWebCrawler(extraction_strategy=extraction_strategy) as crawler:
            result = await crawler.arun(page_url, config=crawler_cfg)
    except Exception as e:
        print(f"[{modality.upper()}] Error crawling {page_url}: {e}")
        result = None

    scraped_urls = []
    if result and result.success:
        media_items = result.media.get(modality, [])
        scraped_urls = [item['src'] for item in media_items if item['src']]
    scraped_urls = list(set(scraped_urls))
    df = pd.DataFrame({'url': scraped_urls})
    df.to_csv(output_filename, index=False)
    print(f"[{modality.upper()}] Saved {len(scraped_urls)} URLs to {output_filename}")
    return scraped_urls

def parse_image_links(response):
    return response.css("img::attr(src)").getall()

def parse_video_links(response):
    return response.css("video source::attr(src)").getall()

def parse_audio_links(response):
    return response.css("audio::attr(src)").getall()

In [50]:
#########################################
# 2. Functions to Load URLs (with Fallbacks)
#########################################
def load_scraped_urls(modality):
    filepath = os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv")
    if os.path.exists(filepath):
        df = pd.read_csv(filepath)
        urls = df['url'].dropna().tolist()
        print(f"[{modality.upper()}] Loaded {len(urls)} URLs from {filepath}")
        return urls
    else:
        print(f"[{modality.upper()}] File {filepath} not found. Using fallback method.")
        return []

def fallback_image_urls(n=20, seed_keyword="deepfake"):
    return [f"https://picsum.photos/seed/{seed_keyword}{i}/600/400" for i in range(n)]

def fallback_video_urls(n=20):
    return []  # Let yt-dlp search fallback handle video search if needed

def fallback_audio_urls(n=20):
    return []  # Return an empty list; our extraction fallback will be used

In [51]:
#########################################
# 3. Asynchronous Download Functions with Retry Logic
#########################################
async def download_file(session, url, filename, modality, source, retries=3):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    for attempt in range(1, retries + 1):
        try:
            async with session.get(url, headers=headers) as resp:
                if resp.status == 200:
                    content = await resp.read()
                    with open(filename, "wb") as f:
                        f.write(content)
                    print(f"[{modality.upper()}] Downloaded: {filename}")
                    download_metadata.append({
                        "modality": modality,
                        "filename": filename,
                        "url": url,
                        "source": source,
                        "download_time": datetime.now().isoformat()
                    })
                    download_stats[modality]["count"] += 1
                    if source in download_stats[modality]["sources"]:
                        download_stats[modality]["sources"][source] += 1
                    else:
                        download_stats[modality]["sources"][source] = 1
                    return
                else:
                    print(f"[{modality.upper()}] Attempt {attempt}: Status {resp.status} for {url}")
        except Exception as e:
            print(f"[{modality.upper()}] Attempt {attempt}: Exception for {url}: {e}")
        await asyncio.sleep(1)
    print(f"[{modality.upper()}] Failed to download {url} after {retries} attempts.")

async def download_images(urls, source):
    print("[IMAGE] Starting image downloads...")
    async with aiohttp.ClientSession() as session:
        tasks = []
        for idx, url in enumerate(urls):
            filename = os.path.join(IMAGES_DIR, f"deepfake_image_{idx+1:03d}.jpg")
            tasks.append(download_file(session, url, filename, modality="image", source=source))
        await asyncio.gather(*tasks)

async def download_audios(urls, source):
    if not urls:
        print("[AUDIO] No audio URLs provided for download.")
        return
    print("[AUDIO] Starting audio downloads...")
    async with aiohttp.ClientSession() as session:
        tasks = []
        for idx, url in enumerate(urls):
            filename = os.path.join(AUDIOS_DIR, f"deepfake_audio_{idx+1:03d}.mp3")
            tasks.append(download_file(session, url, filename, modality="audio", source=source))
        await asyncio.gather(*tasks)

def download_videos(video_urls, source, num_videos=20):
    if video_urls:
        download_list = video_urls[:num_videos]
        print("[VIDEO] Downloading videos from scraped URLs...")
        for url in download_list:
            ydl_opts = {
                'format': 'bestvideo+bestaudio/best',
                'outtmpl': os.path.join(VIDEOS_DIR, '%(id)s.%(ext)s'),
                'merge_output_format': 'mp4',
                'quiet': True,
                'no_warnings': True,
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                try:
                    ydl.download([url])
                    download_metadata.append({
                        "modality": "video",
                        "filename": url,
                        "url": url,
                        "source": source,
                        "download_time": datetime.now().isoformat()
                    })
                    download_stats["videos"]["count"] += 1
                    if source in download_stats["videos"]["sources"]:
                        download_stats["videos"]["sources"][source] += 1
                    else:
                        download_stats["videos"]["sources"][source] = 1
                    print(f"[VIDEO] Downloaded: {url}")
                except Exception as e:
                    print(f"[VIDEO] Error downloading {url}: {e}")
    else:
        search_query = "ytsearch20:deepfake compilation"
        ydl_opts = {
            'format': 'bestvideo+bestaudio/best',
            'outtmpl': os.path.join(VIDEOS_DIR, '%(id)s.%(ext)s'),
            'merge_output_format': 'mp4',
            'quiet': True,
            'no_warnings': True,
        }
        print("[VIDEO] No scraped video URLs; using yt-dlp search fallback...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            try:
                ydl.download([search_query])
                print("[VIDEO] Fallback video downloads complete.")
            except Exception as e:
                print(f"[VIDEO] yt-dlp search fallback error: {e}")

In [52]:
#########################################
# 4. Audio Extraction via Playwright (Fallback)
#########################################
async def extract_audio_links():
    print("[AUDIO] Extracting audio URLs via Playwright fallback...")
    extracted_links = []
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()
            url = "https://uberduck.ai/explore"
            await page.goto(url)
            await page.wait_for_selector("audio", state="visible", timeout=60000)
            audio_elements = await page.query_selector_all("audio")
            for el in audio_elements:
                src = await el.get_attribute("src")
                if src:
                    extracted_links.append(src)
            await browser.close()
            print(f"[AUDIO] Extracted {len(extracted_links)} audio URLs via Playwright.")
    except Exception as e:
        if "Executable doesn't exist" in str(e):
            install_playwright_browsers()
        print(f"[AUDIO] Playwright extraction error: {e}")
    return extracted_links

In [53]:
#########################################
# 5. Save Download Metadata to JSON
#########################################
def save_download_metadata(metadata_list, filename=METADATA_FILE):
    try:
        with open(filename, 'w') as f:
            json.dump(metadata_list, f, indent=4)
        print(f"[SYSTEM] Download metadata saved to {filename}")
    except Exception as e:
        print(f"[SYSTEM] Error saving metadata: {e}")

In [54]:
#########################################
# 6. Main Pipeline with Expanded Deepfake Sources
#########################################
async def main_pipeline():
    # VIDEO SOURCES: Each tuple is (Dataset name, URL)
    video_sources = [
        ("UADFV", "https://ai.facebook.com/datasets/dfdc/"),
        ("Deepfake-TIMIT", "https://github.com/ondyari/FaceForensics"),
        ("DFFD", "https://github.com/EndlessSora/DeeperForensics-1.0"),
        ("Celeb-DF", "https://github.com/cypw/Celeb-DF"),
        ("DFDC", "https://ai.facebook.com/datasets/dfdc/"),
        ("FaceForensics++", "https://github.com/ondyari/FaceForensics"),
        ("FFIW-10K", "https://github.com/ondyari/FaceForensics"),
        ("DeeperForensics-1.0", "https://github.com/EndlessSora/DeeperForensics-1.0"),
        ("WildDeepfake", "https://github.com/deepfakeinthewild/deepfake-in-the-wild"),
        ("ForgeryNet", "https://github.com/duxingdong/ForgeryNet"),
        ("AV-Deepfake1M", "https://github.com/ControlNet/AV-Deepfake1M"),
        ("DeepFake MNIST+", "https://github.com/yourplaceholder/DeepFakeMNISTplus"),
        ("VideoSham", "https://github.com/adobe-research/VideoSham-dataset"),
        ("RWDF-23", "https://rwdf23.example.com")  # Placeholder URL; replace with actual link if available
    ]

    # IMAGE SOURCES: Each tuple is (Dataset name, URL)
    image_sources = [
        ("DFFD_Image", "https://github.com/ondyari/FaceForensics"),
        ("iFakeFaceDB", "https://github.com/YourRepository/iFakeFaceDB"),  # Replace with actual URL if available
        ("100kFaces", "https://thiswaifudoesnotexist.net/"),
        ("FFHQ", "https://github.com/NVlabs/ffhq-dataset"),
        ("ForgeryNet_Image", "https://github.com/duxingdong/ForgeryNet"),
        ("DeepFakeRealImages", "https://www.kaggle.com/datasets/manjilkarki/deepfake-and-real-images"),
        ("DeepfakeDetection_Images", "https://github.com/yourrepository/deepfake-detection-images")  # Placeholder
    ]

    # AUDIO SOURCES: Each tuple is (Dataset name, URL)
    audio_sources = [
        ("WaveFake", "https://paperswithcode.com/dataset/wavefake"),
        ("DEEP-VOICE", "https://www.kaggle.com/datasets/birdy654/deep-voice-deepfake-voice-recognition"),
        ("InTheWild_Audio", "https://www.kaggle.com/datasets/abdallamohamed312/in-the-wild-audio-deepfake"),
        ("Codecfake", "https://arxiv.org/abs/2405.04880"),
        ("CrossDomain_ADD", "https://arxiv.org/abs/2404.04904"),
        ("DeepFake-Audio-Rangers_Arabic", "https://huggingface.co/datasets/DeepFake-Audio-Rangers/Arabic_Audio_Deepfake"),
        ("SONICS", "https://paperswithcode.com/dataset/sonics"),
        ("ASVspoof", "https://www.asvspoof.org/"),
        ("ADD2022", "https://example.com/add2022")  # Placeholder URL; update if available
    ]

    #############################
    # Process Video Sources
    #############################
    for name, url in video_sources:
        print(f"\n[VIDEO SOURCE: {name}]")
        output_file = os.path.join(SCRAPED_URLS_DIR, f"video_{name}_urls.csv")
        urls = await scrape_media_urls(name, url, parse_video_links, output_file)
        if urls:
            urls = urls[:20]
        else:
            print(f"[{name.upper()}] No URLs scraped; using fallback.")
            urls = fallback_video_urls(20)
        download_videos(urls, source=url, num_videos=20)

    #############################
    # Process Image Sources
    #############################
    for name, url in image_sources:
        print(f"\n[IMAGE SOURCE: {name}]")
        output_file = os.path.join(SCRAPED_URLS_DIR, f"image_{name}_urls.csv")
        urls = await scrape_media_urls(name, url, parse_image_links, output_file)
        if urls:
            urls = urls[:20]
        else:
            print(f"[{name.upper()}] No URLs scraped; using fallback.")
            urls = fallback_image_urls(20, seed_keyword=name)
        await download_images(urls, source=url)

    #############################
    # Process Audio Sources
    #############################
    all_audio_urls = []
    for name, url in audio_sources:
        print(f"\n[AUDIO SOURCE: {name}]")
        output_file = os.path.join(SCRAPED_URLS_DIR, f"audio_{name}_urls.csv")
        urls = await scrape_media_urls(name, url, parse_audio_links, output_file)
        if urls:
            print(f"[{name.upper()}] Scraped {len(urls)} audio URLs.")
            all_audio_urls.extend(urls)
        else:
            print(f"[{name.upper()}] No URLs scraped from page {url}.")
    # Remove duplicates and limit to 50 audio URLs.
    all_audio_urls = list(set(all_audio_urls))[:50]
    # If no audio URLs were found, use fallback extraction.
    if not all_audio_urls:
        print("[AUDIO] No audio URLs scraped from all sources; using Playwright extraction fallback.")
        all_audio_urls = await extract_audio_links()
    await download_audios(all_audio_urls, source="Playwright fallback")

    #############################
    # Save all download metadata
    #############################
    save_download_metadata(download_metadata)
    print("[SYSTEM] All downloads complete. Check the respective directories for files.")

    #############################
    # Print Download Statistics
    #############################
    print("\n[SYSTEM] Download Statistics:")
    print(f"Total Images Downloaded: {download_stats['images']['count']}")
    print(f"Total Videos Downloaded: {download_stats['videos']['count']}")
    print(f"Total Audios Downloaded: {download_stats['audios']['count']}")

    print("\n[SYSTEM] Source-wise Download Statistics:")
    print("Images:")
    for source, count in download_stats['images']['sources'].items():
        print(f"  - {source}: {count}")
    print("Videos:")
    for source, count in download_stats['videos']['sources'].items():
        print(f"  - {source}: {count}")
    print("Audios:")
    for source, count in download_stats['audios']['sources'].items():
        print(f"  - {source}: {count}")

In [55]:
#########################################
# 7. Entry Point with Event Loop Handling
#########################################
import nest_asyncio
nest_asyncio.apply()

try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    loop = None

if loop and loop.is_running():
    print("[SYSTEM] Detected running event loop. Using loop.run_until_complete()...")
    loop.run_until_complete(main_pipeline())
else:
    asyncio.run(main_pipeline())

[SYSTEM] Detected running event loop. Using loop.run_until_complete()...

[VIDEO SOURCE: UADFV]
[UADFV] Scraping URLs from https://ai.facebook.com/datasets/dfdc/
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://ai.facebook.com/datasets/dfdc/... | Status: True | Time: 0.01s
[COMPLETE] ● https://ai.facebook.com/datasets/dfdc/... | Status: True | Total: 0.42s
[UADFV] Saved 0 URLs to /content/scraped_urls/video_UADFV_urls.csv
[UADFV] No URLs scraped; using fallback.
[VIDEO] No scraped video URLs; using yt-dlp search fallback...


ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[VIDEO SOURCE: Deepfake-TIMIT]
[DEEPFAKE-TIMIT] Scraping URLs from https://github.com/ondyari/FaceForensics
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://github.com/ondyari/FaceForensics... | Status: True | Time: 0.01s
[COMPLETE] ● https://github.com/ondyari/FaceForensics... | Status: True | Total: 0.40s
[DEEPFAKE-TIMIT] Saved 0 URLs to /content/scraped_urls/video_Deepfake-TIMIT_urls.csv
[DEEPFAKE-TIMIT] No URLs scraped; using fallback.
[VIDEO] No scraped video URLs; using yt-dlp search fallback...


ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[VIDEO SOURCE: DFFD]
[DFFD] Scraping URLs from https://github.com/EndlessSora/DeeperForensics-1.0
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://github.com/EndlessSora/DeeperForensics-1.0... | Status: True | Time: 0.01s
[COMPLETE] ● https://github.com/EndlessSora/DeeperForensics-1.0... | Status: True | Total: 0.43s
[DFFD] Saved 0 URLs to /content/scraped_urls/video_DFFD_urls.csv
[DFFD] No URLs scraped; using fallback.
[VIDEO] No scraped video URLs; using yt-dlp search fallback...


ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[VIDEO SOURCE: Celeb-DF]
[CELEB-DF] Scraping URLs from https://github.com/cypw/Celeb-DF
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://github.com/cypw/Celeb-DF... | Status: True | Time: 0.01s
[COMPLETE] ● https://github.com/cypw/Celeb-DF... | Status: True | Total: 0.42s
[CELEB-DF] Saved 0 URLs to /content/scraped_urls/video_Celeb-DF_urls.csv
[CELEB-DF] No URLs scraped; using fallback.
[VIDEO] No scraped video URLs; using yt-dlp search fallback...


ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[VIDEO SOURCE: DFDC]
[DFDC] Scraping URLs from https://ai.facebook.com/datasets/dfdc/
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://ai.facebook.com/datasets/dfdc/... | Status: True | Time: 0.01s
[COMPLETE] ● https://ai.facebook.com/datasets/dfdc/... | Status: True | Total: 0.44s
[DFDC] Saved 0 URLs to /content/scraped_urls/video_DFDC_urls.csv
[DFDC] No URLs scraped; using fallback.
[VIDEO] No scraped video URLs; using yt-dlp search fallback...


ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[VIDEO SOURCE: FaceForensics++]
[FACEFORENSICS++] Scraping URLs from https://github.com/ondyari/FaceForensics
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://github.com/ondyari/FaceForensics... | Status: True | Time: 0.01s
[COMPLETE] ● https://github.com/ondyari/FaceForensics... | Status: True | Total: 0.44s
[FACEFORENSICS++] Saved 0 URLs to /content/scraped_urls/video_FaceForensics++_urls.csv
[FACEFORENSICS++] No URLs scraped; using fallback.
[VIDEO] No scraped video URLs; using yt-dlp search fallback...


ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[VIDEO SOURCE: FFIW-10K]
[FFIW-10K] Scraping URLs from https://github.com/ondyari/FaceForensics
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://github.com/ondyari/FaceForensics... | Status: True | Time: 0.01s
[COMPLETE] ● https://github.com/ondyari/FaceForensics... | Status: True | Total: 0.45s
[FFIW-10K] Saved 0 URLs to /content/scraped_urls/video_FFIW-10K_urls.csv
[FFIW-10K] No URLs scraped; using fallback.
[VIDEO] No scraped video URLs; using yt-dlp search fallback...


ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[VIDEO SOURCE: DeeperForensics-1.0]
[DEEPERFORENSICS-1.0] Scraping URLs from https://github.com/EndlessSora/DeeperForensics-1.0
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://github.com/EndlessSora/DeeperForensics-1.0... | Status: True | Time: 0.01s
[COMPLETE] ● https://github.com/EndlessSora/DeeperForensics-1.0... | Status: True | Total: 0.44s
[DEEPERFORENSICS-1.0] Saved 0 URLs to /content/scraped_urls/video_DeeperForensics-1.0_urls.csv
[DEEPERFORENSICS-1.0] No URLs scraped; using fallback.
[VIDEO] No scraped video URLs; using yt-dlp search fallback...


ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[VIDEO SOURCE: WildDeepfake]
[WILDDEEPFAKE] Scraping URLs from https://github.com/deepfakeinthewild/deepfake-in-the-wild
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://github.com/deepfakeinthewild/deepfake-in-t... | Status: True | Time: 0.01s
[COMPLETE] ● https://github.com/deepfakeinthewild/deepfake-in-t... | Status: True | Total: 0.45s
[WILDDEEPFAKE] Saved 0 URLs to /content/scraped_urls/video_WildDeepfake_urls.csv
[WILDDEEPFAKE] No URLs scraped; using fallback.
[VIDEO] No scraped video URLs; using yt-dlp search fallback...


ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[VIDEO SOURCE: ForgeryNet]
[FORGERYNET] Scraping URLs from https://github.com/duxingdong/ForgeryNet
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://github.com/duxingdong/ForgeryNet... | Status: True | Time: 0.01s
[COMPLETE] ● https://github.com/duxingdong/ForgeryNet... | Status: True | Total: 0.48s
[FORGERYNET] Saved 0 URLs to /content/scraped_urls/video_ForgeryNet_urls.csv
[FORGERYNET] No URLs scraped; using fallback.
[VIDEO] No scraped video URLs; using yt-dlp search fallback...


ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[VIDEO SOURCE: AV-Deepfake1M]
[AV-DEEPFAKE1M] Scraping URLs from https://github.com/ControlNet/AV-Deepfake1M
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://github.com/ControlNet/AV-Deepfake1M... | Status: True | Time: 0.01s
[COMPLETE] ● https://github.com/ControlNet/AV-Deepfake1M... | Status: True | Total: 0.45s
[AV-DEEPFAKE1M] Saved 0 URLs to /content/scraped_urls/video_AV-Deepfake1M_urls.csv
[AV-DEEPFAKE1M] No URLs scraped; using fallback.
[VIDEO] No scraped video URLs; using yt-dlp search fallback...


ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[VIDEO SOURCE: DeepFake MNIST+]
[DEEPFAKE MNIST+] Scraping URLs from https://github.com/yourplaceholder/DeepFakeMNISTplus
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://github.com/yourplaceholder/DeepFakeMNISTpl... | Status: True | Time: 0.04s
[COMPLETE] ● https://github.com/yourplaceholder/DeepFakeMNISTpl... | Status: True | Total: 0.47s
[DEEPFAKE MNIST+] Saved 0 URLs to /content/scraped_urls/video_DeepFake MNIST+_urls.csv
[DEEPFAKE MNIST+] No URLs scraped; using fallback.
[VIDEO] No scraped video URLs; using yt-dlp search fallback...


ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[VIDEO SOURCE: VideoSham]
[VIDEOSHAM] Scraping URLs from https://github.com/adobe-research/VideoSham-dataset
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://github.com/adobe-research/VideoSham-datase... | Status: True | Time: 0.01s
[COMPLETE] ● https://github.com/adobe-research/VideoSham-datase... | Status: True | Total: 0.46s
[VIDEOSHAM] Saved 0 URLs to /content/scraped_urls/video_VideoSham_urls.csv
[VIDEOSHAM] No URLs scraped; using fallback.
[VIDEO] No scraped video URLs; using yt-dlp search fallback...


ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[VIDEO SOURCE: RWDF-23]
[RWDF-23] Scraping URLs from https://rwdf23.example.com
[INIT].... → Crawl4AI 0.4.248
[ERROR]... × https://rwdf23.example.com... | Error: 
┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ × Unexpected error in _crawl_web at line 1354 in _crawl_web (../usr/local/lib/python3.11/dist-                        │
│ packages/crawl4ai/async_crawler_strategy.py):                                                                         │
│   Error: Failed on navigating ACS-GOTO:                                                                               │
│   Page.goto: net::ERR_NAME_NOT_RESOLVED at https://rwdf23.example.com/                                                │
│   Call log:                                                                                                           │
│ 

ERROR: unable to download video data: HTTP Error 403: Forbidden


[VIDEO] yt-dlp search fallback error: ERROR: unable to download video data: HTTP Error 403: Forbidden

[IMAGE SOURCE: DFFD_Image]
[DFFD_IMAGE] Scraping URLs from https://github.com/ondyari/FaceForensics
[INIT].... → Crawl4AI 0.4.248
[FETCH]... ↓ https://github.com/ondyari/FaceForensics... | Status: True | Time: 0.01s
[COMPLETE] ● https://github.com/ondyari/FaceForensics... | Status: True | Total: 0.48s
[DFFD_IMAGE] Saved 0 URLs to /content/scraped_urls/image_DFFD_Image_urls.csv
[DFFD_IMAGE] No URLs scraped; using fallback.
[IMAGE] Starting image downloads...
[IMAGE] Downloaded: /content/deepfake_images/deepfake_image_003.jpg
[IMAGE] Attempt 1: Exception for https://picsum.photos/seed/DFFD_Image2/600/400: 'image'
[IMAGE] Downloaded: /content/deepfake_images/deepfake_image_017.jpg
[IMAGE] Attempt 1: Exception for https://picsum.photos/seed/DFFD_Image16/600/400: 'image'
[IMAGE] Downloaded: /content/deepfake_images/deepfake_image_007.jpg
[IMAGE] Attempt 1: Exception for https://picsum.phot