<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [1]:
# Install the Python packages this notebook relies on. `datasets`, `pandas`,
# `pillow` (PIL) and `decord` are imported directly in the cells below.
# NOTE(review): `tqdm` and `huggingface_hub` are not imported directly in the
# visible cells - presumably transitive requirements of `datasets`; confirm.
!pip install -q datasets pandas pillow tqdm huggingface_hub decord
# System ffmpeg package.
# NOTE(review): presumably needed for video decoding - confirm it is required.
!apt-get install -y ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.


In [2]:
import os
import logging
from pathlib import Path
import pandas as pd
from datasets import load_dataset
import hashlib
import requests
from decord import VideoReader
from PIL import Image
import wave

In [3]:
# Root logging setup: INFO and above, each record prefixed with its
# timestamp and severity. `logger` is the module-level logger used below.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [4]:
class DeepfakeDataCollector:
    """Download a capped number of media samples from streamed datasets.

    Each accepted sample is written below ``base_dir/<category_name>/``,
    validated for its modality, and recorded in ``self.metadata`` (one dict
    per file) so that ``save_metadata`` can export a CSV index.
    """

    # Seconds `requests` may wait for the connection / next chunk of data.
    DOWNLOAD_TIMEOUT = 30
    # Streamed rows examined per requested sample before giving up; keeps
    # process_dataset from walking an entire dataset whose rows never validate.
    MAX_ATTEMPTS_PER_SAMPLE = 10

    def __init__(self, base_dir: str, max_samples: int = 20):
        """
        Initialize the deepfake data collector.

        Args:
            base_dir: Base directory for storing downloaded data (created,
                including parents, if it does not exist).
            max_samples: Maximum number of valid samples to keep for each
                dataset passed to ``process_dataset``.
        """
        self.base_dir = Path(base_dir)
        self.max_samples = max_samples
        self.metadata = []
        self.base_dir.mkdir(parents=True, exist_ok=True)

    def validate_media_file(self, file_path: Path, media_type: str) -> bool:
        """Check that ``file_path`` exists and can be opened as ``media_type``.

        Args:
            file_path: File to check.
            media_type: One of ``"video"``, ``"image"`` or ``"audio"``; any
                other value is reported as invalid.

        Returns:
            True if the file opened and passed the modality check, False
            otherwise (including when opening raised an exception).
        """
        try:
            if not file_path.exists():
                return False

            if media_type == "video":
                # decord's VideoReader does not implement the context-manager
                # protocol, so it must not be used in a `with` statement.
                reader = VideoReader(str(file_path))
                try:
                    return len(reader) > 0 and reader[0] is not None
                finally:
                    # Drop the reference so the decoder is released promptly.
                    del reader

            if media_type == "image":
                with Image.open(file_path) as img:
                    img.verify()
                return True

            if media_type == "audio":
                with wave.open(str(file_path), "rb") as audio:
                    return audio.getnframes() > 0

            return False
        except Exception as e:
            # Validation is best-effort across several decoding libraries;
            # any failure simply means "not a usable file".
            logger.warning("Validation failed for %s: %s", file_path, e)
            return False

    def download_file(self, url: str, save_path: Path) -> bool:
        """Stream ``url`` to ``save_path``.

        Args:
            url: Location to fetch.
            save_path: Destination file (overwritten if present).

        Returns:
            True on success. On any failure the error is logged, a partially
            written file is removed, and False is returned.
        """
        try:
            # The `with` block closes the streamed connection even on error;
            # the timeout prevents an unresponsive server from hanging the run.
            with requests.get(url, stream=True, timeout=self.DOWNLOAD_TIMEOUT) as response:
                response.raise_for_status()
                with open(save_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            return True
        except Exception as e:
            logger.error("Download failed for %s: %s", url, e)
            self._discard(save_path)
            return False

    def _store_sample(self, value, file_path: Path) -> bool:
        """Write one dataset field to ``file_path``.

        Supports a dict carrying raw ``"bytes"`` and a plain string, which is
        treated as a URL to download. Returns False (writing nothing) for any
        other shape, including a dict whose ``"bytes"`` entry is ``None``.
        """
        if isinstance(value, dict):
            payload = value.get("bytes")
            if isinstance(payload, (bytes, bytearray)):
                with open(file_path, "wb") as f:
                    f.write(payload)
                return True
            return False
        if isinstance(value, str):
            return self.download_file(value, file_path)
        return False

    @staticmethod
    def _discard(file_path: Path) -> None:
        """Remove ``file_path`` if it exists; cleanup is best-effort."""
        try:
            file_path.unlink()
        except OSError:
            # Missing file or failed removal: nothing further to clean up.
            pass

    def process_dataset(self, dataset_name: str, category_name: str, file_ext: str, key: str) -> None:
        """Process and download a limited number of samples from a dataset.

        Streams the ``train`` split of ``dataset_name`` and keeps rows until
        ``max_samples`` valid files have been stored, or until
        ``max_samples * MAX_ATTEMPTS_PER_SAMPLE`` rows have been examined.

        Args:
            dataset_name: Dataset identifier passed to ``load_dataset``.
            category_name: Sub-directory and file-name prefix; it is also
                passed to ``validate_media_file`` as the media type.
            file_ext: Extension (without dot) for the stored files.
            key: Column of each row that holds the media payload or URL.

        Any exception raised while loading or iterating is logged and ends
        processing of this dataset; it is not propagated.
        """
        logger.info("Processing %s dataset: %s", category_name, dataset_name)

        try:
            dataset = load_dataset(dataset_name, split="train", streaming=True)
            save_dir = self.base_dir / category_name
            save_dir.mkdir(parents=True, exist_ok=True)

            sample_count = 0
            attempts = 0
            max_attempts = self.max_samples * self.MAX_ATTEMPTS_PER_SAMPLE
            for sample in dataset:
                if sample_count >= self.max_samples:
                    break
                if attempts >= max_attempts:
                    logger.warning(
                        "Stopped after %d rows of %s with only %d/%d valid samples",
                        attempts, dataset_name, sample_count, self.max_samples,
                    )
                    break
                attempts += 1

                file_path = save_dir / f"{category_name}_{sample_count:03d}.{file_ext}"

                # A missing column raises KeyError here on purpose: a wrong
                # `key` affects every row, so failing fast is the useful signal.
                stored = self._store_sample(sample[key], file_path)

                if stored and self.validate_media_file(file_path, category_name):
                    self.metadata.append({
                        "modality": category_name,
                        "filename": file_path.name,
                        "file_path": str(file_path),
                        "source_dataset": dataset_name,
                        "checksum": self._get_file_hash(file_path),
                    })
                    sample_count += 1
                else:
                    # Do not leave unusable files next to the valid ones.
                    self._discard(file_path)

        except Exception as e:
            logger.error("Error processing dataset %s: %s", dataset_name, e)

    def _get_file_hash(self, file_path: Path) -> str:
        """Return the hex SHA-256 digest of the file, read in 4 KiB blocks."""
        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    def save_metadata(self) -> None:
        """Write the collected metadata to ``base_dir/metadata.csv``.

        Logs a warning and writes nothing when no metadata was collected.
        """
        if self.metadata:
            metadata_path = self.base_dir / "metadata.csv"
            pd.DataFrame(self.metadata).to_csv(metadata_path, index=False)
            logger.info("Metadata saved to %s", metadata_path)
        else:
            logger.warning("No metadata to save")

    def collect_datasets(self, dataset_configs) -> None:
        """Process every configured dataset, then save the metadata CSV.

        Args:
            dataset_configs: Iterable of dicts whose keys match the keyword
                parameters of ``process_dataset``.
        """
        for config in dataset_configs:
            self.process_dataset(**config)
        self.save_metadata()

In [None]:
def main():
    """Collect up to 20 samples each from the audio, video and image datasets.

    Files and the metadata CSV are written under
    ``~/Downloads/deepfake_dataset``.
    """
    # (dataset repository, modality, file extension)
    sources = (
        ("thenewsupercell/fixed-fakeavceleb", "audio", "wav"),
        ("mkhLlamaLearn/dfdc", "video", "mp4"),
        ("taohu/faceforensics_h5", "image", "jpg"),
    )
    # In every configuration the dataset column name equals the modality name.
    dataset_configs = [
        {"dataset_name": repo, "category_name": modality, "file_ext": ext, "key": modality}
        for repo, modality, ext in sources
    ]

    target_dir = Path.home().joinpath("Downloads", "deepfake_dataset")
    DeepfakeDataCollector(base_dir=target_dir, max_samples=20).collect_datasets(dataset_configs)


# Run the collection only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
