<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [None]:
# Install required dependencies
%pip install -qU soundfile numpy datasets pandas pillow tqdm huggingface_hub decord

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m99.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m95.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m464.1/464.1 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m108.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [14]:
import asyncio
import hashlib
import json
import logging
import os
import shutil
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import unquote, urlparse

import aiohttp
import decord
import pandas as pd
import soundfile as sf
from decord import VideoReader
from PIL import Image
from tqdm.auto import tqdm

In [15]:
# Configure root logging once (timestamp - level - message, INFO and above)
# and create the module-level logger used by everything below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [16]:
class DeepfakeMediaCollector:
    """Download, validate and catalogue real/fake media samples.

    Files are stored under ``<base_dir>/<media_type>/<category>/`` where
    ``media_type`` is one of ``video``/``image``/``audio`` and ``category``
    is ``real`` or ``fake``. Every file that downloads and validates
    successfully gets one metadata record; ``save_metadata`` writes the
    records to ``metadata.csv`` and an aggregate ``summary.json`` in
    ``base_dir``.

    Args:
        base_dir: Root directory of the dataset (created if missing).
        max_samples: Maximum number of URLs processed per category.
        max_retries: Download attempts per URL before giving up.
        timeout: Total time budget, in seconds, for one download attempt.
        max_workers: Maximum number of URLs processed concurrently.
    """

    MEDIA_TYPES = ('video', 'image', 'audio')
    CATEGORIES = ('real', 'fake')
    _DOWNLOAD_CHUNK_SIZE = 8192
    _HASH_CHUNK_SIZE = 1024 * 1024

    def __init__(
        self,
        base_dir: str = "./deepfake_dataset",
        max_samples: int = 5,
        max_retries: int = 3,
        timeout: int = 30,
        max_workers: int = 4
    ):
        self.base_dir = Path(base_dir)
        self.max_samples = max_samples
        self.max_retries = max_retries
        self.timeout = timeout
        self.max_workers = max_workers
        self.metadata: List[Dict] = []
        self.temp_dir = self.base_dir / "temp"
        self._create_directory_structure()

    def _create_directory_structure(self):
        """Create the media/category directories and the temp directory.

        Raises:
            OSError: If any directory cannot be created (logged, then re-raised).
        """
        try:
            for dir_type in self.MEDIA_TYPES:
                for category in self.CATEGORIES:
                    (self.base_dir / dir_type / category).mkdir(parents=True, exist_ok=True)
            self.temp_dir.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            logger.error(f"Failed to create directory structure: {str(e)}")
            raise

    @staticmethod
    def _filename_from_url(url: str) -> str:
        """Derive a safe local file name from the path component of ``url``.

        Query strings and fragments are ignored and percent-escapes are
        decoded. When the path yields no usable name (empty, ``.`` or ``..``)
        a short hash of the URL is used instead, so the result can never
        point at a directory.
        """
        name = Path(unquote(urlparse(url).path)).name
        if name in ('', '.', '..'):
            name = hashlib.sha256(url.encode('utf-8')).hexdigest()[:16]
        return name

    @staticmethod
    async def _stream_to_file(response: "aiohttp.ClientResponse", destination: Path, label: str) -> None:
        """Write the response body to ``destination`` in chunks with a progress bar."""
        # total=None makes tqdm show an open-ended bar when the size is unknown.
        total_size = int(response.headers.get('content-length', 0)) or None
        with open(destination, 'wb') as f, tqdm(
            desc=label,
            total=total_size,
            unit='iB',
            unit_scale=True
        ) as pbar:
            async for chunk in response.content.iter_chunked(
                DeepfakeMediaCollector._DOWNLOAD_CHUNK_SIZE
            ):
                pbar.update(f.write(chunk))

    async def download_file(self, url: str, output_path: Path) -> bool:
        """Download ``url`` to ``output_path`` with retries and backoff.

        Data is streamed into a sibling ``<name>.part`` file and moved into
        place only after the whole body has been received, so a failed or
        timed-out attempt never leaves a truncated file at ``output_path``.

        Returns:
            True when the file was downloaded, False once all
            ``max_retries`` attempts have failed.
        """
        partial_path = output_path.parent / (output_path.name + '.part')
        client_timeout = aiohttp.ClientTimeout(total=self.timeout)

        for attempt in range(self.max_retries):
            try:
                async with aiohttp.ClientSession(timeout=client_timeout) as session:
                    async with session.get(url) as response:
                        if response.status == 200:
                            await self._stream_to_file(response, partial_path, output_path.name)
                            partial_path.replace(output_path)
                            return True
                        logger.warning(f"Attempt {attempt + 1} failed: HTTP {response.status}")
            except asyncio.TimeoutError:
                logger.warning(f"Timeout on attempt {attempt + 1} for {url}")
            except Exception as e:
                logger.error(f"Download failed on attempt {attempt + 1} for {url}: {str(e)}")

            # Every failure mode (HTTP error, timeout, exception) ends up here:
            # drop any partial data and back off before the next attempt.
            partial_path.unlink(missing_ok=True)
            if attempt + 1 < self.max_retries:
                await asyncio.sleep(2 ** attempt)  # Exponential backoff

        return False

    def validate_media_file(self, file_path: Path, media_type: str) -> Tuple[bool, Optional[str]]:
        """Check that ``file_path`` exists, is non-empty and decodes as ``media_type``.

        Returns:
            ``(True, None)`` when the file is usable, otherwise
            ``(False, reason)``. An unknown ``media_type`` is reported as
            invalid rather than returning ``None``.
        """
        if not file_path.exists():
            return False, "File does not exist"

        if file_path.stat().st_size == 0:
            return False, "File is empty"

        try:
            if media_type == 'video':
                vr = VideoReader(str(file_path))
                if len(vr) == 0:
                    return False, "Video has no frames"
                # Decoding the first frame catches containers that open but hold no usable data.
                _ = vr[0].asnumpy()
                return True, None

            if media_type == 'image':
                with Image.open(file_path) as img:
                    img.verify()
                return True, None

            if media_type == 'audio':
                data, _ = sf.read(str(file_path))
                if len(data) == 0:
                    return False, "Audio file is empty"
                return True, None

        except Exception as e:
            return False, str(e)

        return False, f"Unsupported media type: {media_type}"

    def add_to_metadata(self, modality: str, category: str, file_path: Path):
        """Append a metadata record for ``file_path`` to ``self.metadata``.

        The record holds file name/path/size, MD5 and SHA-256 digests,
        timestamps and modality-specific fields (frame count and dimensions
        for video, dimensions/mode/format for images, duration/sample rate/
        channels for audio). Failures are logged and the file is skipped.
        """
        try:
            # Hash in chunks so large videos are not loaded into memory at once.
            md5 = hashlib.md5()
            sha256 = hashlib.sha256()
            with open(file_path, 'rb') as f:
                while chunk := f.read(self._HASH_CHUNK_SIZE):
                    md5.update(chunk)
                    sha256.update(chunk)

            file_stats = file_path.stat()
            metadata_entry = {
                'modality': modality,
                'category': category,
                'filename': file_path.name,
                'file_path': str(file_path),
                'file_size': file_stats.st_size,
                'md5_hash': md5.hexdigest(),
                'sha256_hash': sha256.hexdigest(),
                'creation_time': file_stats.st_ctime,
                'modification_time': file_stats.st_mtime
            }

            # Add modality-specific metadata
            if modality == 'video':
                vr = VideoReader(str(file_path))
                first_frame_shape = vr[0].shape  # decode the first frame only once
                metadata_entry.update({
                    'frame_count': len(vr),
                    'width': first_frame_shape[1],
                    'height': first_frame_shape[0],
                })
            elif modality == 'image':
                with Image.open(file_path) as img:
                    metadata_entry.update({
                        'width': img.width,
                        'height': img.height,
                        'mode': img.mode,
                        'format': img.format,
                    })
            elif modality == 'audio':
                data, samplerate = sf.read(str(file_path))
                metadata_entry.update({
                    'duration': len(data) / samplerate,
                    'samplerate': samplerate,
                    'channels': data.shape[1] if len(data.shape) > 1 else 1,
                })

            self.metadata.append(metadata_entry)

        except Exception as e:
            logger.error(f"Failed to add metadata for {file_path}: {str(e)}")

    async def process_url(self, url: str, category: str, media_type: str):
        """Download one URL, validate the file and record its metadata.

        Files that fail validation are logged and deleted.
        """
        output_path = self.base_dir / media_type / category / self._filename_from_url(url)

        if await self.download_file(url, output_path):
            is_valid, error_msg = self.validate_media_file(output_path, media_type)
            if is_valid:
                self.add_to_metadata(media_type, category, output_path)
            else:
                logger.error(f"Invalid {media_type} file {output_path}: {error_msg}")
                output_path.unlink(missing_ok=True)

    async def process_urls(self, urls_dict: Dict[str, List[str]], media_type: str):
        """Process up to ``max_samples`` URLs per category for one media type.

        At most ``max_workers`` URLs are in flight at any time.
        """
        semaphore = asyncio.Semaphore(max(1, self.max_workers))

        async def process_bounded(url: str, category: str) -> None:
            async with semaphore:
                await self.process_url(url, category, media_type)

        tasks = []
        for category, urls in urls_dict.items():
            for url in urls[:self.max_samples]:
                tasks.append(process_bounded(url, category))
        await asyncio.gather(*tasks)

    def save_metadata(self):
        """Write ``metadata.csv`` and ``summary.json`` into ``base_dir``.

        Does nothing (besides a warning) when no metadata was collected.
        Errors while writing are logged, not raised.
        """
        if not self.metadata:
            logger.warning("No metadata to save")
            return

        try:
            metadata_df = pd.DataFrame(self.metadata)
            metadata_df.to_csv(self.base_dir / "metadata.csv", index=False)

            by_modality = metadata_df.groupby('modality')['filename'].count()
            by_category = metadata_df.groupby('category')['filename'].count()

            # Convert to built-in types explicitly: json cannot serialise
            # NumPy integer scalars, which pandas may hand back.
            summary = {
                'total_files': int(len(metadata_df)),
                'total_size_mb': float(metadata_df['file_size'].sum()) / (1024 * 1024),
                'by_modality': {str(key): int(count) for key, count in by_modality.items()},
                'by_category': {str(key): int(count) for key, count in by_category.items()}
            }

            with open(self.base_dir / "summary.json", 'w', encoding='utf-8') as f:
                json.dump(summary, f, indent=2)

            logger.info(f"Metadata and summary saved to {self.base_dir}")

        except Exception as e:
            logger.error(f"Failed to save metadata: {str(e)}")

    def cleanup(self):
        """Remove the temporary directory; failures are logged, not raised."""
        try:
            if self.temp_dir.exists():
                shutil.rmtree(self.temp_dir)
            logger.info("Cleanup completed successfully")
        except Exception as e:
            logger.error(f"Cleanup failed: {str(e)}")

async def run_collector(collector: "DeepfakeMediaCollector", urls: Dict[str, Dict[str, List[str]]]):
    """Process every media type in ``urls``, then save metadata and clean up.

    Args:
        collector: The collector that downloads and catalogues the files.
        urls: Mapping of media type -> category -> list of URLs.

    Media types are processed one after another in dictionary order. The
    metadata is saved and temporary files are removed even when a batch
    raises, so records collected before the failure are not lost; the
    exception is then propagated to the caller.
    """
    try:
        for media_type, urls_dict in urls.items():
            await collector.process_urls(urls_dict, media_type)
    finally:
        collector.save_metadata()
        collector.cleanup()

def _log_background_failure(task):
    """Log the exception of a finished background collection task, if any."""
    if task.cancelled():
        return
    error = task.exception()
    if error is not None:
        logger.error(f"Background collection failed: {error}")


def main():
    """Run the example collection in a script or inside a notebook.

    Returns:
        ``None`` when no event loop is running: the collection is executed
        to completion with ``asyncio.run``. When called from inside a
        running event loop (e.g. a Jupyter notebook) the collection is
        scheduled on that loop and the ``asyncio.Task`` is returned so the
        caller can ``await`` it; failures of the task are logged.
    """
    # Example usage
    urls = {
        'video': {
            'real': [
                'https://github.com/ondyari/FaceForensics/raw/master/dataset/videos/real/000_003.mp4',
                'https://github.com/ondyari/FaceForensics/raw/master/dataset/videos/real/001_009.mp4',
                # NOTE(review): the entries below do not look like direct media
                # files (presumably dataset landing pages); if so they will fail
                # video validation and be discarded - confirm and replace.
                'https://paperswithcode.com/datasets?task=deepfake-detection',
                'https://github.com/Daisy-Zhang/Awesome-Deepfakes-Detection',
                'https://www.researchgate.net/publication/382316749_Video_and_Audio_Deepfake_Datasets_and_Open_Issues_in_Deepfake_Technology_Being_Ahead_of_the_Curve',
                'https://github.com/Daisy-Zhang/Awesome-Deepfakes',
                'https://github.com/DASH-Lab/FakeAVCeleb',
                'https://www.kaggle.com/competitions/deepfake-detection-challenge',
                'https://www.kaggle.com/datasets/abdallamohamed312/in-the-wild-audio-deepfake'
            ],
            'fake': [
                'https://github.com/ondyari/FaceForensics/raw/master/dataset/videos/fake/000_003.mp4',
                'https://github.com/ondyari/FaceForensics/raw/master/dataset/videos/fake/001_009.mp4'
            ]
        },
        'image': {
            'real': [
                'https://github.com/danmohami/celeb-df/raw/master/dataset/images/real/000001.png',
                'https://github.com/danmohami/celeb-df/raw/master/dataset/images/real/000002.png'
            ],
            'fake': [
                'https://github.com/danmohami/celeb-df/raw/master/dataset/images/fake/000001.png',
                'https://github.com/danmohami/celeb-df/raw/master/dataset/images/fake/000002.png'
            ]
        },
        'audio': {
            'real': [
                'https://github.com/DariusAf/FakeAVCeleb/raw/master/dataset/audio/real/000001.wav',
                'https://github.com/DariusAf/FakeAVCeleb/raw/master/dataset/audio/real/000002.wav'
            ],
            'fake': [
                'https://github.com/DariusAf/FakeAVCeleb/raw/master/dataset/audio/fake/000001.wav',
                'https://github.com/DariusAf/FakeAVCeleb/raw/master/dataset/audio/fake/000002.wav'
            ]
        }
    }

    collector = DeepfakeMediaCollector()

    # Only the loop lookup is guarded: a RuntimeError raised by the collection
    # itself must propagate instead of silently triggering a second run.
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        loop = None

    if loop is None:
        # Regular Python environment: asyncio.run creates and closes its own loop.
        asyncio.run(run_collector(collector, urls))
        return None

    # A loop is already running (e.g. Jupyter): schedule the work on it and
    # never close it - the loop belongs to the host environment. Returning the
    # task keeps a strong reference to it and lets the caller await completion.
    task = loop.create_task(run_collector(collector, urls))
    task.add_done_callback(_log_background_failure)
    return task

In [17]:
# For Jupyter notebook usage, you can also use this helper function
async def run_in_notebook(urls_dict):
    """Collect the media listed in ``urls_dict`` with a default collector.

    Meant to be awaited directly from a Jupyter notebook cell::

        await run_in_notebook(urls_dict)
    """
    await run_collector(DeepfakeMediaCollector(), urls_dict)

# Entry point: run the example collection when this module is executed directly.
if __name__ == "__main__":
    main()

