<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [1]:
# Install required dependencies
!pip install -q datasets pandas pillow tqdm huggingface_hub decord
!apt-get install -y ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.


In [2]:
import os
import logging
from pathlib import Path
from typing import List, Dict, Optional
import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm
import decord
from decord import VideoReader
import warnings
import hashlib
import requests
from huggingface_hub import login

In [3]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [4]:
class DeepfakeDataCollector:
    def __init__(self, base_dir: str, max_samples: int = 20):
        """
        Initialize the deepfake data collector.

        Args:
            base_dir: Base directory for storing downloaded data
            max_samples: Maximum number of samples per category (real/fake)
        """
        self.base_dir = Path(base_dir)
        self.max_samples = max_samples
        self.metadata = []
        self.base_dir.mkdir(parents=True, exist_ok=True)

    def validate_media_file(self, file_path: Path, media_type: str) -> bool:
        """
        Validate downloaded media files.
        """
        try:
            if not file_path.exists():
                return False

            if media_type == 'video':
                # Add error handling for video validation
                try:
                    with VideoReader(str(file_path)) as vr:
                        frame = vr[0]
                        return frame is not None
                except Exception as e:
                    logger.warning(f"Video validation failed: {str(e)}")
                    return False

            elif media_type == 'image':
                from PIL import Image
                try:
                    with Image.open(file_path) as img:
                        img.verify()
                    return True
                except Exception as e:
                    logger.warning(f"Image validation failed: {str(e)}")
                    return False

            elif media_type == 'audio':
                import wave
                try:
                    with wave.open(str(file_path), 'rb') as audio:
                        return audio.getnframes() > 0
                except Exception as e:
                    logger.warning(f"Audio validation failed: {str(e)}")
                    return False

            return False

        except Exception as e:
            logger.warning(f"Validation failed for {file_path}: {str(e)}")
            return False

    def download_file(self, url: str, save_path: Path) -> bool:
        """
        Download a file with proper error handling and verification.
        """
        try:
            # Add timeout and headers
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, stream=True, timeout=30, headers=headers)
            response.raise_for_status()

            # Create parent directories if they don't exist
            save_path.parent.mkdir(parents=True, exist_ok=True)

            # Save the file with progress bar
            total_size = int(response.headers.get('content-length', 0))
            block_size = 8192

            with open(save_path, 'wb') as f:
                with tqdm(total=total_size, unit='iB', unit_scale=True) as pbar:
                    for chunk in response.iter_content(chunk_size=block_size):
                        f.write(chunk)
                        pbar.update(len(chunk))

            return True

        except Exception as e:
            logger.error(f"Download failed for {url}: {str(e)}")
            if save_path.exists():
                save_path.unlink()  # Delete partial downloads
            return False

    def process_dataset(self, dataset_name: str, category_name: str, file_ext: str, key: str) -> None:
        """
        Process and download a specific dataset.
        """
        logger.info(f"Processing {category_name} dataset: {dataset_name}")

        try:
            # Add streaming=True and split="train" options
            dataset = load_dataset(dataset_name, split="train", streaming=True)
            save_dir = self.base_dir / category_name

            for category in ['real', 'fake']:
                category_dir = save_dir / category
                category_dir.mkdir(parents=True, exist_ok=True)

                sample_count = 0
                for sample in dataset:
                    try:
                        if sample['label'] == (1 if category == 'fake' else 0):
                            if sample_count >= self.max_samples:
                                break

                            file_path = category_dir / f"{category}_{sample_count:03d}.{file_ext}"

                            success = False
                            if isinstance(sample.get(key), dict) and 'bytes' in sample[key]:
                                with open(file_path, 'wb') as f:
                                    f.write(sample[key]['bytes'])
                                success = True
                            elif isinstance(sample.get(key), str):
                                success = self.download_file(sample[key], file_path)
                            else:
                                logger.warning(f"Unsupported data format for key '{key}' in sample")
                                continue

                            if success and self.validate_media_file(file_path, category_name):
                                self.metadata.append({
                                    'modality': category_name,
                                    'category': category,
                                    'filename': file_path.name,
                                    'file_path': str(file_path),
                                    'source_dataset': dataset_name,
                                    'checksum': self._get_file_hash(file_path)
                                })
                                sample_count += 1

                    except Exception as e:
                        logger.warning(f"Error processing sample: {str(e)}")
                        continue

        except Exception as e:
            logger.error(f"Error processing dataset {dataset_name}: {str(e)}")

    def _get_file_hash(self, file_path: Path) -> str:
        """Calculate SHA-256 hash of a file."""
        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    def save_metadata(self) -> None:
        """Save metadata to CSV file."""
        if self.metadata:
            metadata_path = self.base_dir / "metadata.csv"
            pd.DataFrame(self.metadata).to_csv(metadata_path, index=False)
            logger.info(f"Metadata saved to {metadata_path}")
        else:
            logger.warning("No metadata to save")

    def collect_datasets(self, dataset_configs: List[Dict]) -> None:
        """
        Collect multiple datasets based on configuration.
        """
        for config in dataset_configs:
            self.process_dataset(**config)
        self.save_metadata()

def main():
    # Install required dependencies first
    !pip install -q datasets pandas pillow tqdm huggingface_hub decord
    !apt-get install -y ffmpeg

    # Dataset configurations - Updated with more reliable datasets
    dataset_configs = [
        {
            "dataset_name": "arnabpai/dfdc-videos",  # Alternative to mkhLlamaLearn/dfdc
            "category_name": "videos",
            "file_ext": "mp4",
            "key": "video"
        },
        {
            "dataset_name": "jonathansmith/image-deepfakes",  # Alternative to taohu/faceforensics_h5
            "category_name": "images",
            "file_ext": "jpg",
            "key": "image"
        }
    ]

    # Initialize collector with Colab-specific path
    collector = DeepfakeDataCollector(
        base_dir="/content/deepfake_dataset",
        max_samples=20
    )

    # Collect datasets
    collector.collect_datasets(dataset_configs)

In [5]:
if __name__ == "__main__":
    main()

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
ERROR:__main__:Error processing dataset arnabpai/dfdc-videos: Dataset 'arnabpai/dfdc-videos' doesn't exist on the Hub or cannot be accessed.
ERROR:__main__:Error processing dataset jonathansmith/image-deepfakes: Dataset 'jonathansmith/image-deepfakes' doesn't exist on the Hub or cannot be accessed.
