<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [18]:
# Install required dependencies
!pip install -q datasets pandas pillow tqdm huggingface_hub decord
!apt-get install -y ffmpeg  # Install FFmpeg for video processing

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.


In [24]:
import os
import logging
from pathlib import Path
from typing import List, Dict, Optional
import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm
import decord
from decord import VideoReader
import warnings
import hashlib
from concurrent.futures import ThreadPoolExecutor
from huggingface_hub import login
import requests

In [25]:
# Configure logging.
# force=True removes any handlers already attached to the root logger before
# installing ours. Without it basicConfig() is a silent no-op whenever the
# root logger is already configured (as notebook front ends commonly do), and
# the level/format below would be ignored.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)
logger = logging.getLogger(__name__)

In [26]:
class DeepfakeDataCollector:
    """
    Collect a small real/fake sample of media files from Hugging Face datasets.

    For every configured dataset the collector streams the "train" split,
    writes up to ``max_samples`` files per label under
    ``<base_dir>/<category_name>/<real|fake>/``, validates each file and
    records one metadata row (including a SHA-256 checksum) per kept file.
    """

    # Spellings accepted for each modality, mapped to the canonical name the
    # validator dispatches on. Plural forms are accepted because callers use
    # the same string as a directory name.
    _MEDIA_TYPE_ALIASES = {
        'audio': 'audio', 'audios': 'audio',
        'video': 'video', 'videos': 'video',
        'image': 'image', 'images': 'image',
    }

    # Seconds to wait for the server (connect / between bytes) before a
    # download is abandoned; without a timeout requests can block forever.
    _DOWNLOAD_TIMEOUT = 30

    def __init__(self, base_dir: str, max_samples: int = 20):
        """
        Initialize the deepfake data collector.

        Args:
            base_dir: Base directory for storing downloaded data
            max_samples: Maximum number of samples per category (real/fake)
        """
        self.base_dir = Path(base_dir)
        self.max_samples = max_samples
        self.metadata = []
        self.base_dir.mkdir(parents=True, exist_ok=True)

    @classmethod
    def _normalize_media_type(cls, media_type: str) -> str:
        """Map a media type spelling ('Videos', 'images', ...) to its canonical name.

        Unknown values are returned lower-cased and stripped so the caller can
        reject them.
        """
        cleaned = str(media_type).strip().lower()
        return cls._MEDIA_TYPE_ALIASES.get(cleaned, cleaned)

    @staticmethod
    def _discard(path: Path) -> None:
        """Best-effort removal of a partial or invalid file."""
        try:
            Path(path).unlink()
        except OSError:
            # Nothing to clean up (file never created) or not removable;
            # either way this must not mask the original failure.
            pass

    def validate_media_file(self, file_path: Path, media_type: str) -> bool:
        """
        Validate downloaded media files.

        Args:
            file_path: Path to the media file
            media_type: Type of media ('audio', 'video', or 'image');
                plural spellings and mixed case are accepted

        Returns:
            bool: True if file is valid, False otherwise (including when the
            file is missing, the media type is unknown, or decoding fails)
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return False

            kind = self._normalize_media_type(media_type)

            if kind == 'video':
                # VideoReader is constructed directly rather than in a `with`
                # block: decord does not document context-manager support,
                # and a failing `with` would be swallowed below and make
                # every video look invalid.
                reader = VideoReader(str(file_path))
                # Check if we can read at least one frame
                first_frame = reader[0]
                return first_frame is not None

            if kind == 'image':
                from PIL import Image
                with Image.open(file_path) as img:
                    img.verify()
                return True

            if kind == 'audio':
                import wave
                with wave.open(str(file_path), 'rb') as audio:
                    return audio.getnframes() > 0

            return False

        except Exception as e:
            logger.warning(f"Validation failed for {file_path}: {str(e)}")
            return False

    def download_file(self, url: str, save_path: Path) -> bool:
        """
        Download a file with proper error handling and verification.

        Args:
            url: URL to download from
            save_path: Path to save the file

        Returns:
            bool: True if download successful, False otherwise. On failure any
            partially written file at ``save_path`` is removed.
        """
        try:
            # The context manager releases the connection even if writing fails.
            with requests.get(url, stream=True, timeout=self._DOWNLOAD_TIMEOUT) as response:
                response.raise_for_status()

                # Save the file
                with open(save_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            return True

        except Exception as e:
            logger.error(f"Download failed for {url}: {str(e)}")
            self._discard(save_path)
            return False

    def _write_sample(self, payload, file_path: Path) -> bool:
        """Write one dataset field to ``file_path``; return True if a file was produced.

        Supported payloads are a dict carrying raw ``'bytes'`` and a string,
        which is treated as a URL. Anything else (including a dict whose
        ``'bytes'`` entry is None or empty) is skipped.
        """
        if isinstance(payload, dict) and 'bytes' in payload:
            raw = payload['bytes']
            if not raw:
                return False
            with open(file_path, 'wb') as f:
                f.write(raw)
            return True

        if isinstance(payload, str):
            return self.download_file(payload, file_path)

        return False

    def process_dataset(self,
                       dataset_name: str,
                       category_name: str,
                       file_ext: str,
                       key: str) -> None:
        """
        Process and download a specific dataset.

        Samples whose ``'label'`` is 0 are stored as "real" and samples whose
        label is 1 as "fake". Any exception raised while loading or iterating
        the dataset is logged and ends processing of this dataset only.

        Args:
            dataset_name: Name of the dataset on Hugging Face
            category_name: Category name ('audio', 'video', 'image'); used as
                the output sub-directory and as the media type for validation
            file_ext: File extension
            key: Key for accessing media in dataset
        """
        logger.info(f"Processing {category_name} dataset: {dataset_name}")

        try:
            dataset = load_dataset(dataset_name, split="train", streaming=True)
            save_dir = self.base_dir / category_name

            for category in ['real', 'fake']:
                category_dir = save_dir / category
                category_dir.mkdir(parents=True, exist_ok=True)

                if self.max_samples <= 0:
                    continue

                wanted_label = 1 if category == 'fake' else 0
                sample_count = 0

                for sample in dataset:
                    if sample['label'] != wanted_label:
                        continue

                    file_path = category_dir / f"{category}_{sample_count:03d}.{file_ext}"

                    # Handle both URL and bytes data
                    written = self._write_sample(sample[key], file_path)

                    if written and self.validate_media_file(file_path, category_name):
                        self.metadata.append({
                            'modality': category_name,
                            'category': category,
                            'filename': file_path.name,
                            'file_path': str(file_path),
                            'source_dataset': dataset_name,
                            'checksum': self._get_file_hash(file_path)
                        })
                        sample_count += 1
                        # Stop as soon as the quota is met so no further
                        # sample is pulled from the stream.
                        if sample_count >= self.max_samples:
                            break
                    elif written:
                        # Do not leave an unusable file in the output tree.
                        self._discard(file_path)

        except Exception as e:
            logger.error(f"Error processing dataset {dataset_name}: {str(e)}")

    def _get_file_hash(self, file_path: Path) -> str:
        """Calculate SHA-256 hash of a file, reading it in 4 KiB blocks."""
        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    def save_metadata(self) -> None:
        """Save metadata to ``<base_dir>/metadata.csv``; warn if there is none."""
        if self.metadata:
            metadata_path = self.base_dir / "metadata.csv"
            pd.DataFrame(self.metadata).to_csv(metadata_path, index=False)
            logger.info(f"Metadata saved to {metadata_path}")
        else:
            logger.warning("No metadata to save")

    def collect_datasets(self, dataset_configs: List[Dict]) -> None:
        """
        Collect multiple datasets based on configuration.

        Args:
            dataset_configs: List of dataset configurations; each dict is
                passed to ``process_dataset`` as keyword arguments
        """
        for config in dataset_configs:
            self.process_dataset(**config)
        self.save_metadata()

In [None]:
def main():
    """Download the configured audio, video and image samples and write metadata.csv.

    Files are stored under ``~/Downloads/deepfake_dataset``.
    """
    # Dataset configurations.
    # category_name is used both as the output sub-directory and as the media
    # type handed to validate_media_file, which dispatches on the exact
    # strings 'audio', 'video' and 'image' - so the singular forms are required.
    dataset_configs = [
        {
            "dataset_name": "thenewsupercell/fixed-fakeavceleb",
            "category_name": "audio",
            "file_ext": "wav",
            "key": "audio"
        },
        {
            "dataset_name": "mkhLlamaLearn/dfdc",
            "category_name": "video",
            "file_ext": "mp4",
            "key": "video"
        },
        {
            "dataset_name": "taohu/faceforensics_h5",
            "category_name": "image",
            "file_ext": "jpg",
            "key": "image"
        }
    ]

    # Initialize collector
    collector = DeepfakeDataCollector(
        base_dir=Path.home() / "Downloads" / "deepfake_dataset",
        max_samples=20
    )

    # Collect datasets
    collector.collect_datasets(dataset_configs)

if __name__ == "__main__":
    main()

ERROR:__main__:Error processing dataset thenewsupercell/fixed-fakeavceleb: Error reading /kaggle/working/new_video_dataset/train/FakeVideo-FakeAudio/00001_0_id06152_wavtolip.mp4...


Resolving data files:   0%|          | 0/2100 [00:00<?, ?it/s]