<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [None]:
# Install required dependencies
%pip install -qU soundfile numpy datasets pandas pillow tqdm huggingface_hub decord

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m464.1/464.1 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
langchain 0.3.16 requires numpy<2,>=1.22.4; python_version < "3.12", but you have numpy 2.2.2 which is incompatible.
tensor

In [23]:
import os
import logging
import shutil
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
import decord
from decord import VideoReader
import hashlib
import requests
from PIL import Image
import soundfile as sf
import numpy as np
import urllib.request
import zipfile
import json

In [24]:
# Root logging setup: INFO verbosity, each record rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Module-level logger used by everything defined below.
logger = logging.getLogger(__name__)

In [25]:
class DeepfakeMediaCollector:
    """Download sample real/fake media files and record metadata about them.

    Files are stored under ``<base_dir>/<modality>/<category>/`` where modality
    is ``video``, ``image`` or ``audio`` and category is ``real`` or ``fake``.
    Every file that downloads *and* validates is appended to ``self.metadata``;
    ``save_metadata`` writes that list to ``metadata.csv`` plus a text summary.
    """

    def __init__(self, base_dir: str = "./deepfake_dataset", max_samples: int = 20):
        """
        Initialize the DeepfakeMediaCollector and create its directory tree.

        Args:
            base_dir (str): Base directory for storing downloaded files
            max_samples (int): Maximum number of samples to collect per category
        """
        self.base_dir = Path(base_dir)
        self.max_samples = max_samples
        self.metadata = []
        self.temp_dir = self.base_dir / "temp"

        # Create directories
        for dir_type in ['video', 'image', 'audio']:
            for category in ['real', 'fake']:
                (self.base_dir / dir_type / category).mkdir(parents=True, exist_ok=True)
        self.temp_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _suffix_from_url(url: str, default: str) -> str:
        """Return the lower-cased file extension found in *url*'s path, or *default*."""
        from urllib.parse import urlparse

        suffix = Path(urlparse(url).path).suffix.lower()
        return suffix or default

    def _limit(self, urls) -> list:
        """Return at most ``max_samples`` items of *urls* (none if the limit is <= 0)."""
        return list(urls)[:max(self.max_samples, 0)]

    def _download_and_register(self, modality: str, category: str, url: str,
                               output_path: Path) -> bool:
        """Download *url*, validate it, and add it to the metadata.

        A file that fails validation is deleted so the dataset directory only
        contains files that are also listed in the metadata.

        Returns:
            bool: True if the file was downloaded, validated and recorded
        """
        if not self.download_with_progress(url, output_path):
            return False

        if not self.validate_media_file(output_path, modality):
            logger.warning(f"Discarding invalid {modality} file: {output_path}")
            if output_path.exists():
                output_path.unlink()
            return False

        self.add_to_metadata(modality, category, output_path)
        return True

    def download_with_progress(self, url: str, output_path: Path) -> bool:
        """
        Download file with progress bar and robust error handling.

        A partially written file is removed when the download fails.

        Args:
            url (str): URL to download from
            output_path (Path): Where to save the file

        Returns:
            bool: True if download successful, False otherwise
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            # Using the response as a context manager releases the streamed
            # connection even when writing to disk fails part-way.
            with requests.get(url, stream=True, headers=headers, timeout=30) as response:
                response.raise_for_status()

                try:
                    total_size = int(response.headers.get('content-length', 0))
                except (TypeError, ValueError):
                    total_size = 0
                block_size = 8192

                with open(output_path, 'wb') as f, tqdm(
                    desc=output_path.name,
                    total=total_size or None,  # None = unknown length
                    unit='iB',
                    unit_scale=True
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=block_size):
                        if chunk:
                            pbar.update(f.write(chunk))
            return True

        except (requests.RequestException, OSError) as e:
            # OSError covers local failures (disk full, permission denied).
            logger.error(f"Download failed for {url}: {str(e)}")
            if output_path.exists():
                output_path.unlink()
            return False

    def download_sample_video(self):
        """Download sample videos from more reliable sources"""
        logger.info("Downloading sample videos...")

        # Using Creative Commons videos as examples
        # NOTE(review): the 'fake' URL answered 404 in the recorded run of this
        # notebook - confirm or replace it.
        samples = {
            'real': 'https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/360/Big_Buck_Bunny_360_10s_1MB.mp4',
            'fake': 'https://storage.googleapis.com/deepfake-database/manipulated_videos/sample1.mp4'
        }

        for category, url in samples.items():
            save_dir = self.base_dir / 'video' / category
            output_path = save_dir / f'sample_{category}.mp4'

            for sample_url in self._limit([url]):
                self._download_and_register('video', category, sample_url, output_path)

    def download_sample_images(self):
        """Download sample images from reliable sources"""
        logger.info("Downloading sample images...")

        # Using Creative Commons images as examples
        # NOTE(review): the PNG URL answered 404 in the recorded run, and the
        # second entry is an SVG, which Pillow does not decode, so it is
        # expected to fail validation - consider replacing both.
        samples = {
            'real': [
                'https://upload.wikimedia.org/wikipedia/commons/e/ec/Person_icon.png',
                'https://upload.wikimedia.org/wikipedia/commons/7/7e/Circle-icons-profile.svg'
            ],
            'fake': []  # We'll generate manipulated versions later
        }

        for category, urls in samples.items():
            save_dir = self.base_dir / 'image' / category

            for idx, url in enumerate(self._limit(urls)):
                # Keep the real extension instead of labelling everything .jpg.
                suffix = self._suffix_from_url(url, '.jpg')
                output_path = save_dir / f'sample_{category}_{idx}{suffix}'
                self._download_and_register('image', category, url, output_path)

    def download_sample_audio(self):
        """Download sample audio from reliable sources"""
        logger.info("Downloading sample audio...")

        # Using Creative Commons audio as examples
        samples = {
            'real': 'https://www2.cs.uic.edu/~i101/SoundFiles/BabyElephantWalk60.wav',
            'fake': 'https://www2.cs.uic.edu/~i101/SoundFiles/CantinaBand60.wav'
        }

        for category, url in samples.items():
            save_dir = self.base_dir / 'audio' / category
            output_path = save_dir / f'sample_{category}.wav'

            for sample_url in self._limit([url]):
                self._download_and_register('audio', category, sample_url, output_path)

    def validate_media_file(self, file_path: Path, media_type: str) -> bool:
        """
        Validate media files with improved error handling.

        Args:
            file_path (Path): Path to media file
            media_type (str): Type of media ('video', 'image', or 'audio')

        Returns:
            bool: True if file is valid, False otherwise (including when the
                file is missing, empty, or ``media_type`` is not recognised)
        """
        if not file_path.exists() or file_path.stat().st_size == 0:
            return False

        try:
            if media_type == 'video':
                # VideoReader is constructed directly rather than in a `with`
                # block: it is not used as a context manager here because a
                # missing __enter__ would be swallowed by the except below and
                # make every video look invalid.
                vr = VideoReader(str(file_path))
                return len(vr) > 0 and vr[0] is not None

            if media_type == 'image':
                with Image.open(file_path) as img:
                    img.verify()
                return True

            if media_type == 'audio':
                data, samplerate = sf.read(str(file_path))
                return len(data) > 0 and samplerate > 0

        except Exception as e:
            # Best-effort check: any decoder error means "not valid".
            logger.warning(f"Validation failed for {file_path}: {str(e)}")
            return False

        return False

    def add_to_metadata(self, modality: str, category: str, file_path: Path):
        """Add entry to metadata with file hash

        The MD5 digest is computed in 1 MiB chunks so large media files are
        not loaded into memory at once. Failures are logged, not raised.
        """
        try:
            digest = hashlib.md5()
            with open(file_path, 'rb') as f:
                for block in iter(lambda: f.read(1024 * 1024), b''):
                    digest.update(block)

            self.metadata.append({
                'modality': modality,
                'category': category,
                'filename': file_path.name,
                'file_path': str(file_path),
                'file_hash': digest.hexdigest(),
                'file_size': file_path.stat().st_size,
                'manipulation': 'None' if category == 'real' else 'unknown'
            })
        except Exception as e:
            logger.error(f"Failed to add metadata for {file_path}: {str(e)}")

    def save_metadata(self):
        """Save metadata with improved formatting

        Writes ``metadata.csv`` (one row per file) and ``summary.txt``
        (per modality/category counts and sizes) into ``base_dir``.
        Does nothing but log a warning when no metadata was collected.
        """
        if not self.metadata:
            logger.warning("No metadata to save")
            return

        try:
            # Save detailed CSV
            metadata_df = pd.DataFrame(self.metadata)
            metadata_path = self.base_dir / "metadata.csv"
            metadata_df.to_csv(metadata_path, index=False)

            # Save summary report
            summary = metadata_df.groupby(['modality', 'category']).agg({
                'filename': 'count',
                'file_size': ['sum', 'mean']
            }).round(2)

            summary_path = self.base_dir / "summary.txt"
            with open(summary_path, 'w', encoding='utf-8') as f:
                f.write("Dataset Summary\n")
                f.write("=" * 50 + "\n\n")
                f.write(str(summary))
                f.write("\n\nTotal files: {}\n".format(len(metadata_df)))
                f.write("Total size: {:.2f} MB".format(metadata_df['file_size'].sum() / (1024 * 1024)))

            logger.info(f"Metadata saved to {metadata_path}")
            logger.info(f"Summary saved to {summary_path}")

        except Exception as e:
            logger.error(f"Failed to save metadata: {str(e)}")

    def cleanup(self):
        """Clean up temporary files by removing ``temp_dir`` if it exists"""
        try:
            if self.temp_dir.exists():
                shutil.rmtree(self.temp_dir)
                logger.info("Cleaned up temporary files")
        except Exception as e:
            logger.error(f"Cleanup failed: {str(e)}")

def main():
    """Run the full collection pipeline: download, save metadata, clean up.

    Errors are logged (with traceback) rather than raised. The temporary
    directory is removed even when a download or the metadata step fails.
    """
    collector = None
    try:
        collector = DeepfakeMediaCollector()

        # Download samples
        collector.download_sample_video()
        collector.download_sample_images()
        collector.download_sample_audio()

        # Save metadata
        collector.save_metadata()

    except Exception as e:
        logger.error(f"An error occurred in main: {str(e)}", exc_info=True)

    finally:
        # Runs on success and failure alike; skipped only if the collector
        # itself could not be constructed.
        if collector is not None:
            collector.cleanup()

In [None]:
# Entry point: run the pipeline only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

sample_real.mp4:   0%|          | 0.00/991k [00:00<?, ?iB/s]

ERROR:__main__:Download failed for https://storage.googleapis.com/deepfake-database/manipulated_videos/sample1.mp4: 404 Client Error: Not Found for url: https://storage.googleapis.com/deepfake-database/manipulated_videos/sample1.mp4
ERROR:__main__:Download failed for https://upload.wikimedia.org/wikipedia/commons/e/ec/Person_icon.png: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/e/ec/Person_icon.png


sample_real_1.jpg:   0%|          | 0.00/668 [00:00<?, ?iB/s]

