<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [14]:
!pip install -qU yt-dlp aiohttp decord pandas pillow soundfile tqdm crawl4ai[all] nest_asyncio playwright telethon

In [15]:
import os
import asyncio
import aiohttp
import json
import subprocess
from datetime import datetime
import pandas as pd
import yt_dlp
from tqdm.asyncio import tqdm
from playwright.async_api import async_playwright
import feedparser

# Import crawl4ai components (if available)
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

#########################################
# Setup Directories and Global Variables
#########################################
BASE_DIR = os.getcwd()
SCRAPED_URLS_DIR = os.path.join(BASE_DIR, 'scraped_urls')
IMAGES_DIR      = os.path.join(BASE_DIR, 'deepfake_images')
VIDEOS_DIR      = os.path.join(BASE_DIR, 'deepfake_videos')
AUDIOS_DIR      = os.path.join(BASE_DIR, 'deepfake_audios')
TEXT_DIR        = os.path.join(BASE_DIR, 'deepfake_texts')
METADATA_FILE   = os.path.join(BASE_DIR, 'download_metadata.json')

for d in [SCRAPED_URLS_DIR, IMAGES_DIR, VIDEOS_DIR, AUDIOS_DIR, TEXT_DIR]:
    os.makedirs(d, exist_ok=True)

download_metadata = []

#########################################
# Helper: Ensure Playwright Browsers Are Installed
#########################################
def install_playwright_browsers():
    try:
        print("[SYSTEM] Installing Playwright browsers...")
        subprocess.run(["playwright", "install"], check=True)
        print("[SYSTEM] Playwright browsers installed.")
    except Exception as e:
        print(f"[SYSTEM] Error installing Playwright browsers: {e}")

In [16]:
#########################################
# 1. URL Scraping Functions using crawl4ai and feedparser
#########################################
async def scrape_media_urls(modality, page_url, parse_function, output_filename):
    """
    Scrape media URLs from a given page URL using crawl4ai's AsyncWebCrawler.
    Saves URLs to CSV and returns a unique list.
    """
    print(f"[{modality.upper()}] Scraping URLs from {page_url}")
    schema = {"extracted": list}
    extraction_strategy = JsonCssExtractionStrategy(parse_function=parse_function, schema=schema)
    crawler = AsyncWebCrawler(start_urls=[page_url], extraction_strategy=extraction_strategy)
    try:
        results = await crawler.start()
    except Exception as e:
        print(f"[{modality.upper()}] Error crawling {page_url}: {e}")
        results = []
    scraped_urls = []
    for result in results:
        scraped_urls.extend(result.get('extracted', []))
    scraped_urls = list({url for url in scraped_urls if url})
    df = pd.DataFrame({'url': scraped_urls})
    df.to_csv(output_filename, index=False)
    print(f"[{modality.upper()}] Saved {len(scraped_urls)} URLs to {output_filename}")
    return scraped_urls

def parse_image_links(response):
    return response.css("img::attr(src)").getall()

def parse_video_links(response):
    return response.css("a::attr(href)").getall() or response.css("video::attr(src)").getall()

def parse_audio_links(response):
    return response.css("audio::attr(src)").getall()

def parse_text_links(response):
    return response.css("a::attr(href)").getall()

def parse_generic_links(response):
    # Generic parser to extract all links from anchor tags.
    return response.css("a::attr(href)").getall()

async def scrape_rss_feed_urls(modality, page_url, output_filename):
    """
    Scrape an RSS feed from a given URL using feedparser.
    Filters entries that mention 'deepfake' (case-insensitive) in their title or summary.
    Saves unique article links to CSV.
    """
    print(f"[{modality.upper()}] Scraping RSS feed from {page_url}")
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(page_url) as resp:
                feed_text = await resp.text()
        feed = feedparser.parse(feed_text)
        rss_urls = []
        for entry in feed.entries:
            # Filter entries that mention deepfake in title or summary
            if ('deepfake' in entry.title.lower()) or ('deepfake' in entry.get('summary', '').lower()):
                rss_urls.append(entry.link)
        rss_urls = list(set(rss_urls))
        df = pd.DataFrame({'url': rss_urls})
        df.to_csv(output_filename, index=False)
        print(f"[{modality.upper()}] Saved {len(rss_urls)} RSS feed URLs to {output_filename}")
        return rss_urls
    except Exception as e:
        print(f"[{modality.upper()}] Error scraping RSS feed from {page_url}: {e}")
        return []

In [17]:
#########################################
# 2. Functions to Load URLs (with Fallbacks)
#########################################
def load_scraped_urls(modality):
    filepath = os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv")
    if os.path.exists(filepath):
        df = pd.read_csv(filepath)
        urls = df['url'].dropna().tolist()
        print(f"[{modality.upper()}] Loaded {len(urls)} URLs from {filepath}")
        return urls
    else:
        print(f"[{modality.upper()}] File {filepath} not found. Using fallback method.")
        return []

def fallback_image_urls(n=20, seed_keyword="deepfake"):
    return [f"https://picsum.photos/seed/{seed_keyword}{i}/600/400" for i in range(n)]

def fallback_video_urls(n=20):
    return []  # Let yt-dlp fallback handle this

def fallback_audio_urls(n=20):
    return []

def fallback_text_urls(n=20):
    sample_articles = [
        "https://www.theguardian.com/technology/2024/oct/16/its-not-me-its-just-my-face-the-models-who-found-their-likenesses-had-been-used-in-ai-propaganda",
        "https://www.wired.com/story/generative-ai-deepfakes/",
        "https://www.ft.com/content/7f22ce59-1c6c-4d84-bca8-dc539992e286",
        "https://nypost.com/2024/07/17/lifestyle/how-people-are-being-tricked-by-deepfake-doctor-videos/",
        "https://www.openfox.com/deepfakes-and-their-impact-on-society/"
    ]
    return (sample_articles * ((n // len(sample_articles)) + 1))[:n]

In [18]:
#########################################
# 3. Asynchronous Download Functions with Retry Logic
#########################################
async def download_file(session, url, filename, modality, retries=3):
    for attempt in range(1, retries + 1):
        try:
            async with session.get(url) as resp:
                if resp.status == 200:
                    content = await resp.read()
                    with open(filename, "wb") as f:
                        f.write(content)
                    print(f"[{modality.upper()}] Downloaded: {filename}")
                    download_metadata.append({
                        "modality": modality,
                        "filename": filename,
                        "url": url,
                        "download_time": datetime.now().isoformat()
                    })
                    return
                else:
                    print(f"[{modality.upper()}] Attempt {attempt}: Status {resp.status} for {url}")
        except Exception as e:
            print(f"[{modality.upper()}] Attempt {attempt}: Exception for {url}: {e}")
        await asyncio.sleep(1)
    print(f"[{modality.upper()}] Failed to download {url} after {retries} attempts.")

async def download_images(urls):
    print("[IMAGE] Starting image downloads...")
    async with aiohttp.ClientSession() as session:
        tasks = []
        for idx, url in enumerate(urls):
            filename = os.path.join(IMAGES_DIR, f"deepfake_image_{idx+1:03d}.jpg")
            tasks.append(download_file(session, url, filename, modality="image"))
        await asyncio.gather(*tasks)

async def download_audios(urls):
    if not urls:
        print("[AUDIO] No audio URLs provided for download.")
        return
    print("[AUDIO] Starting audio downloads...")
    async with aiohttp.ClientSession() as session:
        tasks = []
        for idx, url in enumerate(urls):
            filename = os.path.join(AUDIOS_DIR, f"deepfake_audio_{idx+1:03d}.mp3")
            tasks.append(download_file(session, url, filename, modality="audio"))
        await asyncio.gather(*tasks)

async def download_texts(urls):
    if not urls:
        print("[TEXT] No text URLs provided for download.")
        return
    print("[TEXT] Starting text downloads...")
    async with aiohttp.ClientSession() as session:
        tasks = []
        for idx, url in enumerate(urls):
            filename = os.path.join(TEXT_DIR, f"deepfake_text_{idx+1:03d}.html")
            tasks.append(download_file(session, url, filename, modality="text"))
        await asyncio.gather(*tasks)

def download_videos(video_urls, num_videos=20):
    if video_urls:
        download_list = video_urls[:num_videos]
        print("[VIDEO] Downloading videos from scraped URLs...")
        for url in download_list:
            ydl_opts = {
                'format': 'bestvideo+bestaudio/best',
                'outtmpl': os.path.join(VIDEOS_DIR, '%(id)s.%(ext)s'),
                'merge_output_format': 'mp4',
                'quiet': True,
                'no_warnings': True,
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                try:
                    ydl.download([url])
                    download_metadata.append({
                        "modality": "video",
                        "filename": url,
                        "url": url,
                        "download_time": datetime.now().isoformat()
                    })
                    print(f"[VIDEO] Downloaded: {url}")
                except Exception as e:
                    print(f"[VIDEO] Error downloading {url}: {e}")
    else:
        search_query = "ytsearch20:deepfake compilation"
        ydl_opts = {
            'format': 'bestvideo+bestaudio/best',
            'outtmpl': os.path.join(VIDEOS_DIR, '%(id)s.%(ext)s'),
            'merge_output_format': 'mp4',
            'quiet': True,
            'no_warnings': True,
        }
        print("[VIDEO] No scraped video URLs; using yt-dlp search fallback...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            try:
                ydl.download([search_query])
                print("[VIDEO] Fallback video downloads complete.")
            except Exception as e:
                print(f"[VIDEO] yt-dlp search fallback error: {e}")

In [19]:
#########################################
# 4. Audio Extraction via Playwright (Fallback)
#########################################
async def extract_audio_links():
    print("[AUDIO] Extracting audio URLs via Playwright fallback...")
    extracted_links = []
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()
            url = "https://uberduck.ai/explore"
            await page.goto(url)
            await page.wait_for_timeout(5000)
            audio_elements = await page.query_selector_all("audio")
            for el in audio_elements:
                src = await el.get_attribute("src")
                if src:
                    extracted_links.append(src)
            await browser.close()
            print(f"[AUDIO] Extracted {len(extracted_links)} audio URLs via Playwright.")
    except Exception as e:
        if "Executable doesn't exist" in str(e):
            install_playwright_browsers()
        print(f"[AUDIO] Playwright extraction error: {e}")
    return extracted_links

In [20]:
#########################################
# 5. Save Download Metadata to JSON
#########################################
def save_download_metadata(metadata_list, filename=METADATA_FILE):
    try:
        with open(filename, 'w') as f:
            json.dump(metadata_list, f, indent=4)
        print(f"[SYSTEM] Download metadata saved to {filename}")
    except Exception as e:
        print(f"[SYSTEM] Error saving metadata: {e}")

In [21]:
#########################################
# 6. Main Pipeline with Expanded Modalities and Additional Sources
#########################################
async def main_pipeline():
    # Define source pages for various modalities.
    # Existing sources are kept, and new entries are added for more specific human-manipulated content.
    pages = {
        # Existing/general sources
        "politician_manipulated_images": "https://github.com/ondyari/FaceForensics",
        "celebrity_manipulated_videos": "https://sites.google.com/view/celebdf",
        "human_manipulation_audios": "https://www.asvspoof.org/2019/index.html",
        "deepfake_voice_detection": "https://www.kaggle.com/datasets/search?query=deepfake+voice",
        "deepfake_video_detection": "https://www.kaggle.com/datasets/search?query=deepfake+video+detection",
        "ai_generated_images": "https://thiswaifudoesnotexist.net/",
        "deepfake_datasets_github": "https://github.com/Daisy-Zhang/Awesome-Deepfakes",
        "kaggle_deepfake_detection": "https://www.kaggle.com/datasets/sanikatiwarekar/deep-fake-detection-dfd-entire-original-dataset",
        "deepfake_news_rss": "https://www.wired.com/feed/category/tech/latest/rss",
        # New specific sources for human manipulated media
        "face_swapping": "https://github.com/deepfakes/faceswap",  # Example GitHub repo for face swapping
        "face_forgery": "https://github.com/ondyari/FaceForensics",  # Reusing an established source
        "speaker_verification_spoofing": "https://www.asvspoof.org/",  # ASVspoof website for spoofing attacks
        "audio_tampering_splicing": "https://example.com/datasets/audio_tampering",  # Replace with actual source
        "replay_attacks": "https://www.kaggle.com/datasets/xxxx/replay-attacks",       # Replace with actual source
        "multi_modal_fake_detection": "https://example.com/datasets/multimodal-fake",     # Replace with actual source
        "general_video_manipulation": "https://example.com/datasets/video-manipulation",  # Replace with actual source
        "image_splicing_copy_move": "https://example.com/datasets/image-splicing",        # Replace with actual source
        "ai_generated_image_manipulations": "https://thiswaifudoesnotexist.net/",         # Using the existing AI image generator
        "large_scale_photoshop_edits": "https://example.com/datasets/photoshop-edits"      # Replace with actual source
    }

    # Process each modality/source:
    for modality, page_url in pages.items():
        # Choose the extraction strategy based on modality keywords.
        if modality == "deepfake_news_rss":
            urls = await scrape_rss_feed_urls(modality, page_url, os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv"))
        elif modality in ["deepfake_datasets_github", "kaggle_deepfake_detection"]:
            urls = await scrape_media_urls(modality, page_url, parse_generic_links,
                                           os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv"))
        elif any(key in modality for key in ["image", "ai_generated", "face_forgery", "image_splicing"]):
            urls = await scrape_media_urls(modality, page_url, parse_image_links,
                                           os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv"))
        elif any(key in modality for key in ["video", "celebrity", "general_video", "deepfake_video_detection"]):
            urls = await scrape_media_urls(modality, page_url, parse_video_links,
                                           os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv"))
        elif any(key in modality for key in ["audio", "voice", "speaker", "audio_tampering", "replay_attacks"]):
            urls = await scrape_media_urls(modality, page_url, parse_audio_links,
                                           os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv"))
        elif "text" in modality:
            urls = await scrape_media_urls(modality, page_url, parse_text_links,
                                           os.path.join(SCRAPED_URLS_DIR, f"{modality}_urls.csv"))
        else:
            urls = []

        # If no URLs were scraped, use fallback URLs for that modality.
        if not urls:
            print(f"[{modality.upper()}] No scraped URLs; using fallback.")
            if any(key in modality for key in ["image", "ai_generated", "face", "splicing"]):
                seed = modality
                urls = fallback_image_urls(20, seed_keyword=seed)
            elif any(key in modality for key in ["video"]):
                urls = fallback_video_urls(20)
            elif any(key in modality for key in ["audio", "voice"]):
                urls = fallback_audio_urls(20)
            elif "text" in modality:
                urls = fallback_text_urls(20)
        else:
            urls = urls[:20]  # Limit to 20 URLs per source

        # Download files based on modality keywords.
        if any(key in modality for key in ["image", "ai_generated", "face_forgery", "face_swapping", "splicing"]):
            await download_images(urls)
        elif any(key in modality for key in ["video", "celebrity", "general_video", "deepfake_video_detection"]):
            download_videos(urls, num_videos=20)
        elif any(key in modality for key in ["audio", "voice", "speaker", "audio_tampering", "replay_attacks"]):
            await download_audios(urls)
        elif "text" in modality:
            await download_texts(urls)
        elif modality == "deepfake_news_rss":
            print(f"[{modality.upper()}] RSS feed URLs collected; consider further processing them for full articles.")
        else:
            print(f"[{modality.upper()}] Modality not recognized for downloading.")

    # Extra audio extraction via Playwright fallback
    extra_audio_urls = await extract_audio_links()
    if extra_audio_urls:
        await download_audios(extra_audio_urls)

    # Save all download metadata
    save_download_metadata(download_metadata)
    print("[SYSTEM] All downloads complete. Check the respective directories for files.")

In [22]:
#########################################
# Entry Point with Fixes for Running Event Loop
#########################################
import nest_asyncio
nest_asyncio.apply()

try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    loop = None

if loop and loop.is_running():
    print("[SYSTEM] Detected running event loop. Using loop.run_until_complete()...")
    loop.run_until_complete(main_pipeline())
else:
    asyncio.run(main_pipeline())

[SYSTEM] Detected running event loop. Using loop.run_until_complete()...
[POLITICIAN_MANIPULATED_IMAGES] Scraping URLs from https://github.com/ondyari/FaceForensics
[INIT].... → Crawl4AI 0.4.248


TypeError: 'AsyncWebCrawler' object is not iterable