<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [1]:
# Install required dependencies
%pip install -qU soundfile numpy datasets pandas pillow tqdm huggingface_hub decord

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m99.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m95.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m464.1/464.1 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m108.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import logging
import shutil
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
import decord
from decord import VideoReader
import hashlib
import requests
from PIL import Image
import soundfile as sf
import numpy as np
from typing import Dict, List, Optional, Tuple
import aiohttp
import asyncio
from concurrent.futures import ThreadPoolExecutor

In [3]:
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [4]:
class DeepfakeMediaCollector:
    def __init__(
        self,
        base_dir: str = "./deepfake_dataset",
        max_samples: int = 20,
        max_retries: int = 3,
        timeout: int = 30,
        max_workers: int = 4
    ):
        """
        Initialize the DeepfakeMediaCollector with improved configuration options.

        Args:
            base_dir (str): Base directory for storing downloaded files
            max_samples (int): Maximum number of samples to collect per category
            max_retries (int): Maximum number of download retry attempts
            timeout (int): Download timeout in seconds
            max_workers (int): Maximum number of concurrent download workers
        """
        self.base_dir = Path(base_dir)
        self.max_samples = max_samples
        self.max_retries = max_retries
        self.timeout = timeout
        self.max_workers = max_workers
        self.metadata: List[Dict] = []
        self.temp_dir = self.base_dir / "temp"

        # Create directories with error handling
        self._create_directory_structure()

        # Initialize session for connection pooling
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def _create_directory_structure(self):
        """Create the necessary directory structure with error handling"""
        try:
            for dir_type in ['video', 'image', 'audio']:
                for category in ['real', 'fake']:
                    dir_path = self.base_dir / dir_type / category
                    dir_path.mkdir(parents=True, exist_ok=True)
            self.temp_dir.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            logger.error(f"Failed to create directory structure: {str(e)}")
            raise

    async def download_with_progress(self, url: str, output_path: Path) -> bool:
        """
        Download file with progress bar using aiohttp for better async support.

        Args:
            url (str): URL to download from
            output_path (Path): Where to save the file

        Returns:
            bool: True if download successful, False otherwise
        """
        for attempt in range(self.max_retries):
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url, timeout=self.timeout) as response:
                        if response.status != 200:
                            logger.warning(f"Attempt {attempt + 1} failed: HTTP {response.status}")
                            continue

                        total_size = int(response.headers.get('content-length', 0))

                        with open(output_path, 'wb') as f, tqdm(
                            desc=output_path.name,
                            total=total_size,
                            unit='iB',
                            unit_scale=True
                        ) as pbar:
                            async for chunk in response.content.iter_chunked(8192):
                                size = f.write(chunk)
                                pbar.update(size)

                        return True

            except asyncio.TimeoutError:
                logger.warning(f"Timeout on attempt {attempt + 1} for {url}")
            except Exception as e:
                logger.error(f"Download failed on attempt {attempt + 1} for {url}: {str(e)}")
                if output_path.exists():
                    output_path.unlink()

            # Add exponential backoff between retries
            await asyncio.sleep(2 ** attempt)

        return False

    def validate_media_file(self, file_path: Path, media_type: str) -> Tuple[bool, Optional[str]]:
        """
        Validate media files with improved error handling and metadata extraction.

        Args:
            file_path (Path): Path to media file
            media_type (str): Type of media ('video', 'image', or 'audio')

        Returns:
            Tuple[bool, Optional[str]]: (is_valid, error_message)
        """
        if not file_path.exists():
            return False, "File does not exist"

        if file_path.stat().st_size == 0:
            return False, "File is empty"

        try:
            if media_type == 'video':
                vr = VideoReader(str(file_path))
                frame_count = len(vr)
                if frame_count == 0:
                    return False, "Video has no frames"
                # Test first frame access
                _ = vr[0].asnumpy()
                return True, None

            elif media_type == 'image':
                with Image.open(file_path) as img:
                    img.verify()
                    # Extract basic image metadata
                    return True, None

            elif media_type == 'audio':
                data, samplerate = sf.read(file_path)
                if len(data) == 0:
                    return False, "Audio file is empty"
                return True, None

        except Exception as e:
            return False, str(e)

    def add_to_metadata(self, modality: str, category: str, file_path: Path):
        """Add entry to metadata with enhanced file analysis"""
        try:
            # Calculate both MD5 and SHA256 hashes
            with open(file_path, 'rb') as f:
                content = f.read()
                md5_hash = hashlib.md5(content).hexdigest()
                sha256_hash = hashlib.sha256(content).hexdigest()

            file_stats = file_path.stat()

            metadata_entry = {
                'modality': modality,
                'category': category,
                'filename': file_path.name,
                'file_path': str(file_path),
                'file_size': file_stats.st_size,
                'md5_hash': md5_hash,
                'sha256_hash': sha256_hash,
                'creation_time': file_stats.st_ctime,
                'modification_time': file_stats.st_mtime,
                'manipulation': 'None' if category == 'real' else 'unknown'
            }

            # Add modality-specific metadata
            if modality == 'video':
                vr = VideoReader(str(file_path))
                metadata_entry.update({
                    'frame_count': len(vr),
                    'width': vr[0].shape[1],
                    'height': vr[0].shape[0],
                })
            elif modality == 'image':
                with Image.open(file_path) as img:
                    metadata_entry.update({
                        'width': img.width,
                        'height': img.height,
                        'mode': img.mode,
                        'format': img.format,
                    })
            elif modality == 'audio':
                data, samplerate = sf.read(file_path)
                metadata_entry.update({
                    'duration': len(data) / samplerate,
                    'samplerate': samplerate,
                    'channels': data.shape[1] if len(data.shape) > 1 else 1,
                })

            self.metadata.append(metadata_entry)

        except Exception as e:
            logger.error(f"Failed to add metadata for {file_path}: {str(e)}")

    def save_metadata(self):
        """Save metadata with enhanced reporting"""
        if not self.metadata:
            logger.warning("No metadata to save")
            return

        try:
            # Save detailed CSV
            metadata_df = pd.DataFrame(self.metadata)
            metadata_path = self.base_dir / "metadata.csv"
            metadata_df.to_csv(metadata_path, index=False)

            # Generate comprehensive summary
            summary = {
                'total_files': len(metadata_df),
                'total_size_mb': metadata_df['file_size'].sum() / (1024 * 1024),
                'by_modality': metadata_df.groupby('modality').agg({
                    'filename': 'count',
                    'file_size': ['sum', 'mean', 'min', 'max']
                }).round(2),
                'by_category': metadata_df.groupby('category').agg({
                    'filename': 'count',
                    'file_size': ['sum', 'mean']
                }).round(2)
            }

            # Save detailed summary report
            summary_path = self.base_dir / "summary.txt"
            with open(summary_path, 'w') as f:
                f.write("Dataset Summary Report\n")
                f.write("=" * 50 + "\n\n")

                f.write("Overall Statistics:\n")
                f.write(f"Total files: {summary['total_files']}\n")
                f.write(f"Total size: {summary['total_size_mb']:.2f} MB\n\n")

                f.write("By Modality:\n")
                f.write(str(summary['by_modality']))
                f.write("\n\nBy Category:\n")
                f.write(str(summary['by_category']))

            # Save JSON version for programmatic access
            with open(self.base_dir / "summary.json", 'w') as f:
                json.dump(summary, f, indent=2, default=str)

            logger.info(f"Metadata and summaries saved to {self.base_dir}")

        except Exception as e:
            logger.error(f"Failed to save metadata: {str(e)}")

    def cleanup(self):
        """Clean up temporary files and resources"""
        try:
            if self.temp_dir.exists():
                shutil.rmtree(self.temp_dir)

            # Close the requests session
            self.session.close()

            logger.info("Cleanup completed successfully")

        except Exception as e:
            logger.error(f"Cleanup failed: {str(e)}")

    async def async_download_all(self):
        """Download all samples asynchronously"""
        tasks = []

        for url_dict in [self.video_urls, self.image_urls, self.audio_urls]:
            for category, urls in url_dict.items():
                for url in urls:
                    output_path = self._get_output_path(url, category)
                    tasks.append(self.download_with_progress(url, output_path))

        results = await asyncio.gather(*tasks)
        return results

def main():
    try:
        collector = DeepfakeMediaCollector()

        # Run async downloads
        asyncio.run(collector.async_download_all())

        # Save metadata and cleanup
        collector.save_metadata()
        collector.cleanup()

    except Exception as e:
        logger.error(f"An error occurred in main: {str(e)}")
        raise

In [5]:
if __name__ == "__main__":
    main()

ERROR:__main__:An error occurred in main: asyncio.run() cannot be called from a running event loop


RuntimeError: asyncio.run() cannot be called from a running event loop