<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [7]:
!pip install -q yt-dlp aiohttp decord pandas pillow soundfile tqdm

In [8]:
import os
import logging
import shutil
import json
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
import decord
from decord import VideoReader
import hashlib
import aiohttp
import asyncio
from PIL import Image
import soundfile as sf
from typing import Dict, List, Optional, Tuple

# Import yt_dlp for video downloads
import yt_dlp

In [9]:
# Configure the root logger once: INFO and above, with timestamped records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by the code defined below.
logger = logging.getLogger(__name__)

In [10]:
class DeepfakeMediaCollector:
    """Download, validate and catalogue real/fake media samples.

    Files are stored under ``<base_dir>/<modality>/<category>/`` where the
    modality is ``video``, ``image`` or ``audio`` and the category is ``real``
    or ``fake``. Every file that passes validation gets an entry in
    ``self.metadata``; ``save_metadata`` writes those entries to
    ``metadata.csv`` together with a ``summary.json``.
    """

    def __init__(
        self,
        base_dir: str = "./deepfake_dataset",
        max_samples: int = 20,  # Download 20 samples per category
        max_retries: int = 3,
        timeout: int = 30,
        max_workers: int = 4  # Controls concurrent downloads
    ):
        """Store the settings and create the on-disk directory layout.

        Args:
            base_dir: Root directory of the dataset.
            max_samples: Maximum number of URLs processed per category.
            max_retries: Number of attempts made per direct HTTP download.
            timeout: Total time budget, in seconds, for one HTTP request.
            max_workers: Maximum number of concurrent direct HTTP downloads.

        Raises:
            Exception: Whatever directory creation raised (logged first).
        """
        self.base_dir = Path(base_dir)
        self.max_samples = max_samples
        self.max_retries = max_retries
        self.timeout = timeout
        self.max_workers = max_workers
        self.metadata: List[Dict] = []
        self.temp_dir = self.base_dir / "temp"
        self.session: Optional[aiohttp.ClientSession] = None
        self.semaphore = asyncio.Semaphore(self.max_workers)
        self._create_directory_structure()

    def _create_directory_structure(self):
        """Create the necessary directory structure with error handling."""
        try:
            for dir_type in ['video', 'image', 'audio']:
                for category in ['real', 'fake']:
                    (self.base_dir / dir_type / category).mkdir(parents=True, exist_ok=True)
            self.temp_dir.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            logger.error(f"Failed to create directory structure: {str(e)}")
            raise

    @staticmethod
    def _filename_from_url(url: str) -> str:
        """Return a file name derived from the path component of *url*.

        The query string and fragment are ignored so that
        ``.../photo.jpg?size=large`` is stored as ``photo.jpg``. When the path
        has no usable last component, a name built from a hash of the URL is
        returned instead.
        """
        # Local imports keep this block self-contained.
        from pathlib import PurePosixPath
        from urllib.parse import urlparse

        name = PurePosixPath(urlparse(url).path).name
        if name in ("", ".", ".."):
            name = "download_" + hashlib.sha256(url.encode("utf-8")).hexdigest()[:16]
        return name

    @staticmethod
    def _hash_file(file_path: Path, chunk_size: int = 1 << 20) -> Tuple[str, str]:
        """Return ``(md5_hex, sha256_hex)`` of a file, read in chunks."""
        md5 = hashlib.md5()
        sha256 = hashlib.sha256()
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(chunk_size), b''):
                md5.update(chunk)
                sha256.update(chunk)
        return md5.hexdigest(), sha256.hexdigest()

    async def init_session(self):
        """Initialize a single aiohttp session for all downloads.

        A new session is also created when the previous one was closed, so the
        collector can be reused after ``cleanup``.
        """
        if self.session is None or self.session.closed:
            self.session = aiohttp.ClientSession()

    async def download_file(self, url: str, output_path: Path) -> bool:
        """Download a single file with retry logic using the shared session.

        The payload is streamed into ``<output_path>.part`` and only renamed to
        ``output_path`` after the whole body was written, so an interrupted
        download is never mistaken for a finished one by the "already exists"
        check on a later run.

        Args:
            url: Address to fetch.
            output_path: Final location of the downloaded file.

        Returns:
            True when the file exists (already present or freshly downloaded),
            False when every attempt failed.
        """
        # Skip download if file already exists and is nonempty
        if output_path.exists() and output_path.stat().st_size > 0:
            logger.info(f"File already exists, skipping: {output_path}")
            return True

        await self.init_session()
        output_path.parent.mkdir(parents=True, exist_ok=True)
        partial_path = output_path.with_name(output_path.name + ".part")
        request_timeout = aiohttp.ClientTimeout(total=self.timeout)

        async with self.semaphore:
            for attempt in range(self.max_retries):
                try:
                    async with self.session.get(url, timeout=request_timeout) as response:
                        if response.status == 200:
                            total_size = int(response.headers.get('content-length', 0))
                            with open(partial_path, 'wb') as f, tqdm(
                                desc=output_path.name,
                                # None lets tqdm show an open-ended bar when the
                                # server did not announce a content length.
                                total=total_size or None,
                                unit='iB',
                                unit_scale=True
                            ) as pbar:
                                async for chunk in response.content.iter_chunked(8192):
                                    pbar.update(f.write(chunk))
                            partial_path.replace(output_path)
                            return True
                        logger.warning(f"Attempt {attempt + 1}: HTTP {response.status} for {url}")
                except asyncio.TimeoutError:
                    logger.warning(f"Timeout on attempt {attempt + 1} for {url}")
                except Exception as e:
                    logger.error(f"Download failed on attempt {attempt + 1} for {url}: {str(e)}")

                # Every failure path ends up here: drop partial data and back
                # off before the next attempt (no pointless sleep after the last).
                partial_path.unlink(missing_ok=True)
                if attempt + 1 < self.max_retries:
                    await asyncio.sleep(2 ** attempt)  # Exponential backoff
            return False

    async def download_with_yt_dlp(self, url: str, output_dir: Path) -> Optional[str]:
        """
        Download a video using yt-dlp.

        The download runs in a worker thread. The returned path is taken from
        the ``requested_downloads`` entries of the info dict when available and
        falls back to the filename reported by the progress hook.

        Returns:
            Path of the downloaded file as a string, or None when the download
            failed or no existing output file could be determined.
        """
        hook_filename: Optional[str] = None

        def hook(d):
            nonlocal hook_filename
            if d.get('status') == 'finished':
                hook_filename = d.get('filename')

        ydl_opts = {
            'outtmpl': str(output_dir / '%(title)s.%(ext)s'),
            'quiet': True,
            'no_warnings': True,
            'progress_hooks': [hook],
        }

        def run_download() -> Optional[str]:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=True)
            # NOTE(review): the progress hook fires per downloaded stream; when
            # yt-dlp merges separate video/audio streams the hook's filename is
            # presumably an intermediate file, so prefer the final 'filepath'.
            for item in (info or {}).get('requested_downloads') or []:
                final_path = item.get('filepath')
                if final_path and Path(final_path).exists():
                    return final_path
            return hook_filename

        try:
            # Run yt-dlp download in a separate thread.
            downloaded_file = await asyncio.to_thread(run_download)
        except Exception as e:
            logger.error(f"yt-dlp download failed for {url}: {str(e)}")
            return None

        if downloaded_file and not Path(downloaded_file).exists():
            logger.error(f"yt-dlp reported a file that does not exist: {downloaded_file}")
            return None
        return downloaded_file

    def validate_media_file(self, file_path: Path, media_type: str) -> Tuple[bool, Optional[str]]:
        """Validate downloaded media files.

        Args:
            file_path: File to check.
            media_type: One of ``'video'``, ``'image'`` or ``'audio'``.

        Returns:
            ``(True, None)`` when the file could be opened and decoded,
            otherwise ``(False, reason)``. An unknown ``media_type`` is
            reported as a failure instead of returning nothing.
        """
        if not file_path.exists():
            return False, "File does not exist"
        if file_path.stat().st_size == 0:
            return False, "File is empty"
        try:
            if media_type == 'video':
                vr = VideoReader(str(file_path))
                if len(vr) == 0:
                    return False, "Video has no frames"
                # Decoding the first frame proves the stream is readable.
                _ = vr[0].asnumpy()
                return True, None
            elif media_type == 'image':
                with Image.open(file_path) as img:
                    img.verify()
                return True, None
            elif media_type == 'audio':
                data, _ = sf.read(str(file_path))
                if len(data) == 0:
                    return False, "Audio file is empty"
                return True, None
            else:
                return False, f"Unsupported media type: {media_type}"
        except Exception as e:
            return False, str(e)

    def add_to_metadata(self, modality: str, category: str, file_path: Path):
        """Add file metadata to collection.

        Records hashes, size and timestamps for every file, plus
        modality-specific fields (frame count and dimensions for video,
        dimensions/mode/format for images, duration/sample rate/channels for
        audio). Failures are logged and the file is left out of the metadata.
        """
        try:
            md5_hash, sha256_hash = self._hash_file(file_path)
            file_stats = file_path.stat()
            metadata_entry = {
                'modality': modality,
                'category': category,
                'filename': file_path.name,
                'file_path': str(file_path),
                'file_size': file_stats.st_size,
                'md5_hash': md5_hash,
                'sha256_hash': sha256_hash,
                'creation_time': file_stats.st_ctime,
                'modification_time': file_stats.st_mtime
            }
            # Add modality-specific metadata.
            if modality == 'video':
                vr = VideoReader(str(file_path))
                first_frame_shape = vr[0].shape  # decode the first frame once
                metadata_entry.update({
                    'frame_count': len(vr),
                    'width': first_frame_shape[1],
                    'height': first_frame_shape[0],
                })
            elif modality == 'image':
                with Image.open(file_path) as img:
                    metadata_entry.update({
                        'width': img.width,
                        'height': img.height,
                        'mode': img.mode,
                        'format': img.format,
                    })
            elif modality == 'audio':
                data, samplerate = sf.read(str(file_path))
                metadata_entry.update({
                    'duration': len(data) / samplerate,
                    'samplerate': samplerate,
                    'channels': data.shape[1] if len(data.shape) > 1 else 1,
                })
            self.metadata.append(metadata_entry)
        except Exception as e:
            logger.error(f"Failed to add metadata for {file_path}: {str(e)}")

    def _validate_and_record(self, output_path: Path, category: str, media_type: str) -> bool:
        """Validate a downloaded file; record it, or delete it when invalid."""
        is_valid, error_msg = self.validate_media_file(output_path, media_type)
        if is_valid:
            self.add_to_metadata(media_type, category, output_path)
            return True
        logger.error(f"Invalid {media_type} file {output_path}: {error_msg}")
        output_path.unlink(missing_ok=True)
        return False

    async def process_url(self, url: str, category: str, media_type: str):
        """
        Process a single URL: download (using yt-dlp for supported sites),
        validate, and add metadata.
        """
        output_dir = self.base_dir / media_type / category
        if any(domain in url for domain in ['youtube.com', 'youtu.be']):
            output_dir.mkdir(parents=True, exist_ok=True)
            logger.info(f"Using yt-dlp to download: {url}")
            downloaded_file = await self.download_with_yt_dlp(url, output_dir)
            if downloaded_file:
                self._validate_and_record(Path(downloaded_file), category, media_type)
            else:
                logger.error(f"Failed to download with yt-dlp: {url}")
        else:
            output_path = output_dir / self._filename_from_url(url)
            if await self.download_file(url, output_path):
                self._validate_and_record(output_path, category, media_type)
            else:
                logger.error(f"Failed to download: {url}")

    async def process_urls(self, urls_dict: Dict[str, List[str]], media_type: str):
        """Process a batch of URLs for a specific media type.

        At most ``max_samples`` URLs are taken per category. An unexpected
        error in one URL is logged and does not abort the remaining ones.
        """
        jobs = [
            (url, self.process_url(url, category, media_type))
            for category, urls in urls_dict.items()
            for url in urls[:self.max_samples]
        ]
        results = await asyncio.gather(*(job for _, job in jobs), return_exceptions=True)
        for (url, _), result in zip(jobs, results):
            if isinstance(result, Exception):
                logger.error(f"Unexpected error while processing {url}: {result}")

    def save_metadata(self):
        """Save metadata and generate summary.

        Writes ``metadata.csv`` and ``summary.json`` into ``base_dir``. Nothing
        is written when no metadata was collected; failures are logged.
        """
        if not self.metadata:
            logger.warning("No metadata to save")
            return
        try:
            metadata_df = pd.DataFrame(self.metadata)
            metadata_df.to_csv(self.base_dir / "metadata.csv", index=False)
            by_modality = metadata_df.groupby('modality')['filename'].count().to_dict()
            by_category = metadata_df.groupby('category')['filename'].count().to_dict()
            # Cast to built-in numbers so json.dump never sees numpy scalars.
            summary = {
                'total_files': len(metadata_df),
                'total_size_mb': float(metadata_df['file_size'].sum()) / (1024 * 1024),
                'by_modality': {key: int(count) for key, count in by_modality.items()},
                'by_category': {key: int(count) for key, count in by_category.items()}
            }
            with open(self.base_dir / "summary.json", 'w') as f:
                json.dump(summary, f, indent=2)
            logger.info(f"Metadata and summary saved to {self.base_dir}")
        except Exception as e:
            logger.error(f"Failed to save metadata: {str(e)}")

    async def close_session(self):
        """Properly close the aiohttp session and forget it."""
        if self.session:
            await self.session.close()
            self.session = None

    async def cleanup(self):
        """Clean up temporary files and close session."""
        try:
            if self.temp_dir.exists():
                shutil.rmtree(self.temp_dir)
            logger.info("Cleanup completed successfully")
        except Exception as e:
            logger.error(f"Cleanup failed: {str(e)}")
        await self.close_session()

async def run_collector(collector: DeepfakeMediaCollector, urls: Dict[str, Dict[str, List[str]]]):
    """Run the collector with proper async handling.

    Processes the video, image and audio URL groups in that order, saves the
    metadata, and always runs the collector's cleanup (which closes the HTTP
    session), even when one of the steps raises.

    Args:
        collector: Collector that performs downloads and bookkeeping.
        urls: Mapping ``modality -> category -> list of URLs``. A modality
            that is missing or empty is skipped with a warning.
    """
    try:
        await collector.init_session()
        # Process each modality separately by passing only the corresponding dictionary.
        for modality in ('video', 'image', 'audio'):
            modality_urls = urls.get(modality)
            if not modality_urls:
                logger.warning(f"No URLs provided for modality: {modality}")
                continue
            await collector.process_urls(modality_urls, modality)
        collector.save_metadata()
    finally:
        # Runs on success and on failure so the session never leaks.
        await collector.cleanup()

In [14]:
def initialize_urls():
    """
    Build the URL catalogue consumed by the DeepfakeMediaCollector.

    Returns a freshly built nested dictionary of the form
    ``{modality: {category: [url, ...]}}`` with the modalities 'video',
    'image' and 'audio' and the categories 'real' and 'fake'.
    """
    return {
        'video': {
            # Real videos (verified authentic content from official channels)
            'real': [
                # Music Videos
                "https://www.youtube.com/watch?v=9bZkp7q19f0",  # PSY - Gangnam Style
                "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # Rick Astley - Never Gonna Give You Up
                "https://www.youtube.com/watch?v=kJQP7kiw5Fk",  # Luis Fonsi - Despacito
                # Sports Highlights
                "https://www.youtube.com/watch?v=jofNR_WkoCE",  # Official NFL Highlights
                "https://www.youtube.com/watch?v=ZnXA0PoEE6Y",  # NBA Top Plays
                # Nature/Documentary
                "https://www.youtube.com/watch?v=K1Y6PchDYfw",  # National Geographic
                "https://www.youtube.com/watch?v=B1wOK9yGUYM",  # BBC Earth
                # Tech Reviews
                "https://www.youtube.com/watch?v=mW6hFttt_KE",  # MKBHD Tech Review
                "https://www.youtube.com/watch?v=8jD6F1F4sds",  # Linus Tech Tips
                # Cooking Videos
                "https://www.youtube.com/watch?v=PUP7U5vTMM0",  # Gordon Ramsay Cooking
            ],
            # Known deepfake videos (clearly labeled as AI-generated/deepfake content)
            'fake': [
                # Celebrity Deepfakes
                "https://www.youtube.com/watch?v=cQ54GDm1eL0",  # Morgan Freeman Deepfake
                "https://www.youtube.com/watch?v=bPhUhypV27w",  # Tom Cruise Deepfake
                "https://www.youtube.com/watch?v=oxXpB9pSETo",  # Biden Deepfake
                # AI Generated Content
                "https://www.youtube.com/watch?v=o-YBDTqX_ZU",  # AI News Anchor
                "https://www.youtube.com/watch?v=AmUC4m6w1wo",  # AI Generated Speech
                # Educational Deepfake Examples
                "https://www.youtube.com/watch?v=C8FO0P2a3dA",  # Deepfake Detection
                "https://www.youtube.com/watch?v=T76bK2t2r8g",  # AI Voice Demo
                # Synthetic Media Demos
                "https://www.youtube.com/watch?v=u_sJ3HgKZpk",  # AI Dance Generation
                "https://www.youtube.com/watch?v=6h_BARSvBGw",  # Synthetic Media Example
                "https://www.youtube.com/watch?v=MVaMv6VzRyk",  # AI Video Generation
            ],
        },
        'image': {
            # Real images from verified sources
            'real': [
                # Portrait Photography
                "https://upload.wikimedia.org/wikipedia/commons/2/23/Official_Presidential_portrait_of_Barack_Obama.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/d/d3/Albert_Einstein_Head.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/1/16/Official_Portrait_of_President_Reagan_1981.jpg",
                # Nature Photography
                "https://upload.wikimedia.org/wikipedia/commons/e/e3/Magnificent_CM_Leung.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/3/36/Hopetoun_falls.jpg",
                # Photojournalism
                "https://upload.wikimedia.org/wikipedia/commons/5/5b/January_6_Electoral_College_Vote_Count.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/b/b0/Celebrations_on_VJ_Day_in_Hawaii.jpg",
                # Sports Photography
                "https://upload.wikimedia.org/wikipedia/commons/8/8c/2021_US_Open_Tennis.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/b/b3/2016_Summer_Olympics_opening_ceremony_1035321-olimpiadas_abertura-4066.jpg",
                # Architecture Photography
                "https://upload.wikimedia.org/wikipedia/commons/a/a5/Eiffel_Tower_March_2014.jpg",
            ],
            # AI-generated images
            'fake': [
                # ThisPersonDoesNotExist
                "https://thispersondoesnotexist.xyz/img/1.jpg",
                "https://thispersondoesnotexist.xyz/img/2.jpg",
                "https://thispersondoesnotexist.xyz/img/3.jpg",
                "https://thispersondoesnotexist.xyz/img/4.jpg",
                "https://thispersondoesnotexist.xyz/img/5.jpg",
                # Generated Art
                "https://storage.googleapis.com/ai_generated_images/art1.jpg",
                "https://storage.googleapis.com/ai_generated_images/art2.jpg",
                "https://storage.googleapis.com/ai_generated_images/art3.jpg",
                "https://storage.googleapis.com/ai_generated_images/art4.jpg",
                "https://storage.googleapis.com/ai_generated_images/art5.jpg",
            ],
        },
        'audio': {
            # Real audio samples
            'real': [
                # Speech Recordings
                "https://www2.cs.uic.edu/~i101/SoundFiles/gettysburg.wav",
                "https://www2.cs.uic.edu/~i101/SoundFiles/speech.wav",
                "https://www2.cs.uic.edu/~i101/SoundFiles/introduction.wav",
                # Music Samples
                "https://www2.cs.uic.edu/~i101/SoundFiles/BabyElephantWalk60.wav",
                "https://www2.cs.uic.edu/~i101/SoundFiles/CantinaBand60.wav",
                "https://www2.cs.uic.edu/~i101/SoundFiles/PinkPanther30.wav",
                # Nature Sounds
                "https://www2.cs.uic.edu/~i101/SoundFiles/birds.wav",
                "https://www2.cs.uic.edu/~i101/SoundFiles/ocean.wav",
                # Instrument Samples
                "https://www2.cs.uic.edu/~i101/SoundFiles/piano.wav",
                "https://www2.cs.uic.edu/~i101/SoundFiles/guitar.wav",
            ],
            # Synthetic audio samples
            'fake': [
                # AI Generated Speech
                "https://storage.googleapis.com/synthetic_speech/speech1.wav",
                "https://storage.googleapis.com/synthetic_speech/speech2.wav",
                "https://storage.googleapis.com/synthetic_speech/speech3.wav",
                "https://storage.googleapis.com/synthetic_speech/speech4.wav",
                "https://storage.googleapis.com/synthetic_speech/speech5.wav",
                # AI Music
                "https://storage.googleapis.com/ai_music/song1.wav",
                "https://storage.googleapis.com/ai_music/song2.wav",
                "https://storage.googleapis.com/ai_music/song3.wav",
                # Voice Cloning
                "https://storage.googleapis.com/voice_cloning/clone1.wav",
                "https://storage.googleapis.com/voice_cloning/clone2.wav",
            ],
        },
    }

def main():
    """Build the collector and run the full download pipeline.

    When called from code that already runs inside an event loop (for example
    a notebook cell), the pipeline is scheduled on that loop and the created
    task is returned so the caller can ``await`` it. Otherwise the pipeline is
    run to completion on a fresh event loop and ``None`` is returned.
    """
    # Initialize the collector with expanded dataset settings
    collector = DeepfakeMediaCollector(
        max_samples=10,
        base_dir="./expanded_deepfake_dataset",
        timeout=180,
        max_workers=6
    )

    # Initialize URLs
    urls = initialize_urls()

    # Configure logging for YouTube downloads
    logging.getLogger('yt_dlp').setLevel(logging.WARNING)

    # Only the loop lookup is guarded: a RuntimeError raised by the pipeline
    # itself must propagate instead of triggering a second full run.
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        running_loop = None

    if running_loop is not None:
        # The loop belongs to the host (e.g. the notebook kernel); schedule the
        # work on it and never try to close it.
        return running_loop.create_task(run_collector(collector, urls))

    # asyncio.run creates, runs and closes its own loop.
    asyncio.run(run_collector(collector, urls))
    return None

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()