<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [6]:
# Install required dependencies
%pip install -qU soundfile numpy datasets pandas pillow tqdm huggingface_hub decord

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m464.1/464.1 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
langchain 0.3.16 requires numpy<2,>=1.22.4; python_version < "3.12", but you have numpy 2.2.2 which is incompatible.
tensor

In [12]:
import os
import logging
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
import decord
from decord import VideoReader
import hashlib
import requests
from PIL import Image
import warnings
import soundfile as sf
import numpy as np
import urllib.request
import zipfile
import gdown
import json

In [16]:
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [17]:
class DeepfakeMediaCollector:
    def __init__(self, base_dir: str = "./deepfake_dataset", max_samples: int = 20):
        self.base_dir = Path(base_dir)
        self.max_samples = max_samples
        self.metadata = []
        self.temp_dir = self.base_dir / "temp"
        self.base_dir.mkdir(parents=True, exist_ok=True)
        self.temp_dir.mkdir(parents=True, exist_ok=True)

    def download_from_gdrive(self, file_id: str, output_path: Path) -> bool:
        """Download file from Google Drive"""
        try:
            url = f'https://drive.google.com/uc?id={file_id}'
            return gdown.download(url, str(output_path), quiet=False)
        except Exception as e:
            logger.error(f"Failed to download from Google Drive: {str(e)}")
            return False

    def download_from_url(self, url: str, output_path: Path) -> bool:
        """Download file from direct URL"""
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))

            with open(output_path, 'wb') as f, tqdm(
                desc=output_path.name,
                total=total_size,
                unit='iB',
                unit_scale=True
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    size = f.write(chunk)
                    pbar.update(size)
            return True
        except Exception as e:
            logger.error(f"Failed to download from URL: {str(e)}")
            return False

    def download_dfdc_sample(self):
        """Download sample from DeepFake Detection Challenge dataset"""
        logger.info("Downloading DFDC sample videos...")

        # Sample videos from DFDC preview set
        samples = {
            'real': 'https://github.com/ondyari/FaceForensics/raw/master/example_videos/original.mp4',
            'fake': 'https://github.com/ondyari/FaceForensics/raw/master/example_videos/manipulated.mp4'
        }

        for category, url in samples.items():
            save_dir = self.base_dir / 'video' / category
            save_dir.mkdir(parents=True, exist_ok=True)
            output_path = save_dir / f'dfdc_{category}_sample.mp4'

            if self.download_from_url(url, output_path):
                if self.validate_media_file(output_path, 'video'):
                    self.metadata.append({
                        'modality': 'video',
                        'category': category,
                        'filename': output_path.name,
                        'file_path': str(output_path),
                        'source': 'DFDC',
                        'manipulation': 'None' if category == 'real' else 'face_swap'
                    })

    def download_celeba_sample(self):
        """Download sample from CelebA dataset"""
        logger.info("Downloading CelebA sample images...")

        # Sample images from CelebA
        base_url = "https://mmlab.ie.cuhk.edu.hk/projects/CelebA/images"
        samples = {
            'real': [f"{base_url}/000001.jpg", f"{base_url}/000002.jpg"],
            'fake': []  # We'll generate manipulated versions
        }

        for category, urls in samples.items():
            save_dir = self.base_dir / 'image' / category
            save_dir.mkdir(parents=True, exist_ok=True)

            for idx, url in enumerate(urls):
                output_path = save_dir / f'celeba_{category}_{idx:03d}.jpg'

                if self.download_from_url(url, output_path):
                    if self.validate_media_file(output_path, 'image'):
                        self.metadata.append({
                            'modality': 'image',
                            'category': category,
                            'filename': output_path.name,
                            'file_path': str(output_path),
                            'source': 'CelebA',
                            'manipulation': 'None'
                        })

    def download_audio_deepfake_sample(self):
        """Download sample from Audio Deepfake Dataset"""
        logger.info("Downloading Audio Deepfake samples...")

        # Sample audio files (using Mozilla Common Voice samples for real)
        samples = {
            'real': 'https://github.com/mozilla/DeepSpeech/raw/master/data/smoke_test/smoke_test.wav',
            'fake': 'https://github.com/mozilla/DeepSpeech/raw/master/data/smoke_test/smoke_test_filtered.wav'
        }

        for category, url in samples.items():
            save_dir = self.base_dir / 'audio' / category
            save_dir.mkdir(parents=True, exist_ok=True)
            output_path = save_dir / f'audio_{category}_sample.wav'

            if self.download_from_url(url, output_path):
                if self.validate_media_file(output_path, 'audio'):
                    self.metadata.append({
                        'modality': 'audio',
                        'category': category,
                        'filename': output_path.name,
                        'file_path': str(output_path),
                        'source': 'AudioDeepfake',
                        'manipulation': 'None' if category == 'real' else 'voice_conversion'
                    })

    def validate_media_file(self, file_path: Path, media_type: str) -> bool:
        try:
            if not file_path.exists():
                return False

            if media_type == 'video':
                try:
                    with VideoReader(str(file_path)) as vr:
                        return vr[0] is not None
                except Exception as e:
                    logger.warning(f"Video validation failed: {str(e)}")
                    return False

            elif media_type == 'image':
                try:
                    with Image.open(file_path) as img:
                        img.verify()
                    return True
                except Exception as e:
                    logger.warning(f"Image validation failed: {str(e)}")
                    return False

            elif media_type == 'audio':
                try:
                    data, samplerate = sf.read(file_path)
                    return len(data) > 0 and samplerate > 0
                except Exception as e:
                    logger.warning(f"Audio validation failed: {str(e)}")
                    return False

            return False

        except Exception as e:
            logger.warning(f"Validation failed for {file_path}: {str(e)}")
            return False

    def save_metadata(self) -> None:
        if self.metadata:
            metadata_path = self.base_dir / "metadata.csv"
            pd.DataFrame(self.metadata).to_csv(metadata_path, index=False)
            logger.info(f"Metadata saved to {metadata_path}")

            # Save a summary
            summary = pd.DataFrame(self.metadata).groupby(['modality', 'category']).size()
            summary_path = self.base_dir / "summary.txt"
            with open(summary_path, 'w') as f:
                f.write("Dataset Summary:\n\n")
                f.write(str(summary))
        else:
            logger.warning("No metadata to save")

    def cleanup(self):
        """Clean up temporary files"""
        if self.temp_dir.exists():
            shutil.rmtree(self.temp_dir)
            logger.info("Cleaned up temporary files")

def main():
    # Initialize collector
    collector = DeepfakeMediaCollector(
        base_dir="./deepfake_dataset",
        max_samples=20
    )

    try:
        # Download samples from different sources
        collector.download_dfdc_sample()
        collector.download_celeba_sample()
        collector.download_audio_deepfake_sample()

        # Save metadata and summary
        collector.save_metadata()

    finally:
        # Clean up temporary files
        collector.cleanup()

In [18]:
if __name__ == "__main__":
    main()

ERROR:__main__:Failed to download from URL: 404 Client Error: Not Found for url: https://github.com/ondyari/FaceForensics/raw/master/example_videos/original.mp4
ERROR:__main__:Failed to download from URL: 404 Client Error: Not Found for url: https://github.com/ondyari/FaceForensics/raw/master/example_videos/manipulated.mp4
ERROR:__main__:Failed to download from URL: 404 Client Error: Not Found for url: https://mmlab.ie.cuhk.edu.hk/projects/CelebA/images/000001.jpg
ERROR:__main__:Failed to download from URL: 404 Client Error: Not Found for url: https://mmlab.ie.cuhk.edu.hk/projects/CelebA/images/000002.jpg
ERROR:__main__:Failed to download from URL: 404 Client Error: Not Found for url: https://github.com/mozilla/DeepSpeech/raw/master/data/smoke_test/smoke_test.wav
ERROR:__main__:Failed to download from URL: 404 Client Error: Not Found for url: https://github.com/mozilla/DeepSpeech/raw/master/data/smoke_test/smoke_test_filtered.wav


NameError: name 'shutil' is not defined