<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [6]:
# Install required dependencies
# NOTE(review): `requests` and `gdown` are imported further down but are not
# in this install list; presumably the runtime already ships them - confirm
# before running outside a hosted notebook.
%pip install -qU soundfile numpy datasets pandas pillow tqdm huggingface_hub decord

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m464.1/464.1 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
langchain 0.3.16 requires numpy<2,>=1.22.4; python_version < "3.12", but you have numpy 2.2.2 which is incompatible.
tensor

In [12]:
import hashlib
import json
import logging
import os
import shutil
import urllib.request
import warnings
import zipfile
from pathlib import Path

import decord
import gdown
import numpy as np
import pandas as pd
import requests
import soundfile as sf
from decord import VideoReader
from PIL import Image
from tqdm.auto import tqdm

In [13]:
# Configure the root logger: INFO and above, each record prefixed with a
# timestamp and its severity name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-scoped logger used by the collector code in this file.
logger = logging.getLogger(__name__)

In [14]:
class DeepfakeMediaCollector:
    """Collect a capped sample of real/fake media into one directory tree.

    Videos are copied to ``base_dir/video/<category>/`` and, for every file
    that passes validation, one metadata record (a dict) is appended to
    ``self.metadata``. ``save_metadata`` writes those records to
    ``base_dir/metadata.csv``.

    ``max_samples`` caps the number of records per category (``'real'`` /
    ``'fake'``); the count is taken over *all* records collected so far, so
    the cap is shared between datasets.
    """

    # Seconds allowed for connecting / for each read on HTTP downloads, so a
    # stalled server cannot hang the run indefinitely.
    _HTTP_TIMEOUT = 30

    def __init__(self, base_dir: str, max_samples: int = 20):
        """Create the output directory and start with empty metadata.

        Args:
            base_dir: Root directory for copied media and ``metadata.csv``.
            max_samples: Maximum number of records per category.
        """
        self.base_dir = Path(base_dir)
        self.max_samples = max_samples
        self.metadata = []
        self.base_dir.mkdir(parents=True, exist_ok=True)

    def download_file(self, url: str, save_path: Path, gdrive: bool = False) -> bool:
        """Download ``url`` to ``save_path``.

        Args:
            url: Location of the file.
            save_path: Destination file; parent directories are created.
            gdrive: When true, fetch through ``gdown`` instead of ``requests``.

        Returns:
            True on success. On any failure the error is logged, a file left
            at ``save_path`` is removed, and False is returned.
        """
        try:
            save_path.parent.mkdir(parents=True, exist_ok=True)

            if gdrive:
                # gdown hands back the output path, or None when the download
                # did not succeed; turn that into the documented bool and go
                # through the common logging/cleanup path on failure.
                if gdown.download(url, str(save_path), quiet=False) is None:
                    raise RuntimeError("gdown did not produce an output file")
                return True

            with requests.get(url, stream=True, timeout=self._HTTP_TIMEOUT) as response:
                response.raise_for_status()
                content_length = response.headers.get('content-length')
                # Unknown size -> None so tqdm shows a plain counter rather
                # than a bar with a total of zero.
                total_size = int(content_length) if content_length else None

                with open(save_path, 'wb') as f, tqdm(
                    desc=save_path.name,
                    total=total_size,
                    unit='iB',
                    unit_scale=True
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        pbar.update(f.write(chunk))
            return True

        except Exception as e:
            logger.error(f"Download failed for {url}: {str(e)}")
            if save_path.exists():
                save_path.unlink()
            return False

    def validate_media_file(self, file_path: Path, media_type: str) -> bool:
        """Return True when ``file_path`` exists and can be opened as media.

        Args:
            file_path: File to check.
            media_type: ``'video'``, ``'image'`` or ``'audio'``; any other
                value yields False.

        Returns:
            True if the file decodes (first video frame readable, image passes
            ``verify()``, audio has samples and a positive sample rate).
            Failures are logged as warnings and reported as False.
        """
        try:
            if not file_path.exists():
                return False
        except OSError as e:
            logger.warning(f"Validation failed for {file_path}: {str(e)}")
            return False

        try:
            if media_type == 'video':
                # VideoReader is not a context manager, so it must not be
                # used in a ``with`` statement; the reader is released when
                # the local reference goes away.
                reader = VideoReader(str(file_path))
                return len(reader) > 0 and reader[0] is not None

            if media_type == 'image':
                with Image.open(file_path) as img:
                    img.verify()
                return True

            if media_type == 'audio':
                data, samplerate = sf.read(file_path)
                return len(data) > 0 and samplerate > 0

        except Exception as e:
            # Only the three known media types can reach this handler.
            logger.warning(f"{media_type.capitalize()} validation failed: {str(e)}")
            return False

        return False

    def _category_count(self, category: str) -> int:
        """Return how many collected records already belong to ``category``."""
        return sum(1 for record in self.metadata if record['category'] == category)

    @staticmethod
    def _copy_file(src: Path, dst: Path) -> None:
        """Stream ``src`` to ``dst``, removing a partial ``dst`` on failure."""
        try:
            shutil.copyfile(src, dst)
        except Exception:
            # A truncated copy would otherwise be treated as "already copied"
            # on the next run.
            if dst.exists():
                dst.unlink()
            raise

    def _collect_videos(self, src_path: Path, category: str, source: str,
                        manipulation: str, tag: str) -> None:
        """Copy, validate and register ``*.mp4`` files from ``src_path``.

        Args:
            src_path: Directory holding the source videos; silently skipped
                when it does not exist.
            category: Destination category (``'real'`` or ``'fake'``).
            source: Dataset name stored in the metadata record.
            manipulation: Manipulation label stored in the metadata record.
            tag: Filename prefix used when the plain filename is already
                taken by a previously registered video.
        """
        if not src_path.exists():
            return

        collected = self._category_count(category)
        claimed = {record['file_path'] for record in self.metadata}
        dst_dir = self.base_dir / 'video' / category

        # Sorted so the selected sample (and any collision renaming) is the
        # same on every run and filesystem.
        for video_file in sorted(src_path.glob('*.mp4')):
            if collected >= self.max_samples:
                break

            dst_dir.mkdir(parents=True, exist_ok=True)
            dst_path = dst_dir / video_file.name
            if str(dst_path) in claimed:
                # Another source already registered a file with this name;
                # reusing the path would attach this record to the wrong video.
                dst_path = dst_dir / f"{tag}_{video_file.name}"
                if str(dst_path) in claimed:
                    logger.warning(f"Skipping {video_file}: destination name already in use")
                    continue

            try:
                if not dst_path.exists():
                    self._copy_file(video_file, dst_path)

                if self.validate_media_file(dst_path, 'video'):
                    self.metadata.append({
                        'modality': 'video',
                        'category': category,
                        'filename': dst_path.name,
                        'file_path': str(dst_path),
                        'source': source,
                        'manipulation': manipulation
                    })
                    claimed.add(str(dst_path))
                    collected += 1
            except Exception as e:
                logger.error(f"Error processing {video_file}: {str(e)}")

    def process_ff_dataset(self, source_dir: Path):
        """Process FaceForensics++ dataset structure.

        Reads videos from ``source_dir/<method>/c23/videos`` for each known
        method directory; ``original`` is recorded as real, every other
        method as fake with the method name as its manipulation label.
        """
        logger.info("Processing FaceForensics++ dataset")

        categories = {
            'original': 'real',
            'DeepFakeDetection': 'fake',
            'Face2Face': 'fake',
            'FaceSwap': 'fake',
            'Deepfakes': 'fake',
            'NeuralTextures': 'fake'
        }

        for src_category, dst_category in categories.items():
            self._collect_videos(
                source_dir / src_category / 'c23' / 'videos',
                category=dst_category,
                source='FaceForensics++',
                manipulation=src_category,
                tag=src_category,
            )

    def process_celeb_df(self, source_dir: Path):
        """Process Celeb-DF dataset structure.

        ``source_dir/Celeb-real`` is recorded as real and
        ``source_dir/Celeb-synthesis`` as fake (``face_swap``).
        """
        logger.info("Processing Celeb-DF dataset")

        for category, folder in [('real', 'Celeb-real'), ('fake', 'Celeb-synthesis')]:
            self._collect_videos(
                source_dir / folder,
                category=category,
                source='Celeb-DF',
                manipulation='None' if category == 'real' else 'face_swap',
                tag='Celeb-DF',
            )

    def process_wav2lip(self, source_dir: Path):
        """Process Wav2Lip dataset structure.

        ``source_dir/real_videos`` is recorded as real and
        ``source_dir/fake_videos`` as fake (``lip_sync``).
        """
        logger.info("Processing Wav2Lip dataset")

        categories = {
            'real_videos': 'real',
            'fake_videos': 'fake'
        }

        for src_category, dst_category in categories.items():
            self._collect_videos(
                source_dir / src_category,
                category=dst_category,
                source='Wav2Lip',
                manipulation='None' if dst_category == 'real' else 'lip_sync',
                tag='Wav2Lip',
            )

    def save_metadata(self) -> None:
        """Write collected records to ``base_dir/metadata.csv``.

        Logs a warning and writes nothing when no records were collected.
        """
        if self.metadata:
            metadata_path = self.base_dir / "metadata.csv"
            pd.DataFrame(self.metadata).to_csv(metadata_path, index=False)
            logger.info(f"Metadata saved to {metadata_path}")
        else:
            logger.warning("No metadata to save")

def main(base_dir="./deepfake_dataset", max_samples=20, dataset_paths=None):
    """Collect samples from locally available datasets and save the metadata.

    Args:
        base_dir: Output directory handed to ``DeepfakeMediaCollector``.
        max_samples: Per-category cap handed to ``DeepfakeMediaCollector``.
        dataset_paths: Optional mapping of dataset name (``'faceforensics'``,
            ``'celeb_df'``, ``'wav2lip'``) to its local directory. When
            omitted, the placeholder ``/path/to/...`` locations are used.

    Datasets whose directory does not exist, or whose name is not recognised,
    are skipped with a warning.
    """
    # Initialize collector
    collector = DeepfakeMediaCollector(
        base_dir=base_dir,
        max_samples=max_samples
    )

    # Process datasets from local directories (if available)
    if dataset_paths is None:
        dataset_paths = {
            'faceforensics': Path('/path/to/faceforensics'),
            'celeb_df': Path('/path/to/celeb-df'),
            'wav2lip': Path('/path/to/wav2lip')
        }

    processors = {
        'faceforensics': collector.process_ff_dataset,
        'celeb_df': collector.process_celeb_df,
        'wav2lip': collector.process_wav2lip
    }

    for dataset_name, dataset_path in dataset_paths.items():
        processor = processors.get(dataset_name)
        if processor is None:
            logging.getLogger(__name__).warning(
                f"Unknown dataset '{dataset_name}', skipping")
            continue

        dataset_path = Path(dataset_path)
        if not dataset_path.exists():
            # Surface the skip so an unedited placeholder path is noticed.
            logging.getLogger(__name__).warning(
                f"Dataset directory for '{dataset_name}' not found, skipping: {dataset_path}")
            continue

        processor(dataset_path)

    # Save metadata
    collector.save_metadata()

In [15]:
# Run the collection only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

