<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [29]:
!pip install -q datasets pandas pillow tqdm

In [30]:
import os
from pathlib import Path
import pandas as pd
from datasets import load_dataset
import shutil
from tqdm import tqdm
import numpy as np
from PIL import Image
import io

In [31]:
def _first_indices(dataset, limit, label_id=None):
    """Return indices of the first `limit` rows, optionally only rows whose 'label' equals `label_id`.

    Only the 'label' column is scanned, so image/video payloads are not touched
    while selecting, and fewer than `limit` matches simply yields a shorter list.
    """
    if limit <= 0:
        return []
    if label_id is None:
        return list(range(min(limit, len(dataset))))

    matches = []
    for index, label in enumerate(dataset['label']):
        if label == label_id:
            matches.append(index)
            if len(matches) == limit:
                break
    return matches


def _media_bytes(payload):
    """Turn a media field (raw bytes, a {'bytes', 'path'} dict, or a file path) into bytes.

    Raises:
        TypeError: if the payload is in none of the supported forms.
    """
    if isinstance(payload, (bytes, bytearray, memoryview)):
        return bytes(payload)
    if isinstance(payload, dict):
        if payload.get('bytes') is not None:
            return bytes(payload['bytes'])
        if payload.get('path'):
            # Undecoded media may reference a file instead of embedding bytes.
            return Path(payload['path']).read_bytes()
    if isinstance(payload, (str, os.PathLike)):
        return Path(payload).read_bytes()
    raise TypeError(f"Unsupported media payload of type {type(payload).__name__}")


def _load_audio_undecoded(dataset_id):
    """Load the train split; if 'audio' is an Audio feature, disable decoding to expose bytes/path."""
    # Local import keeps this cell self-contained; `datasets` is already a notebook dependency.
    from datasets import Audio

    dataset = load_dataset(dataset_id, split="train")
    if isinstance(dataset.features.get("audio"), Audio):
        # Decoded audio rows carry no 'bytes' entry; undecoded rows do.
        dataset = dataset.cast_column("audio", Audio(decode=False))
    return dataset


def _metadata_row(modality, category, path, source_dataset):
    """Build one metadata record for a file written to `path`."""
    return {
        'modality': modality,
        'category': category,
        'filename': path.name,
        'file_path': str(path),
        'source_dataset': source_dataset
    }


def download_subset(base_dir="./deepfake_test_dataset", samples_per_category=5):
    """Download a small real/fake sample of images, videos and audio and index it in a CSV.

    Files are written to ``<base_dir>/<modality>/<category>/`` and one metadata
    row per file is saved to ``<base_dir>/metadata.csv``.

    Args:
        base_dir: Output directory (created if missing). Defaults to the
            original hard-coded "./deepfake_test_dataset".
        samples_per_category: Maximum number of samples saved per modality and
            category. Fewer are saved when a dataset has fewer matching rows.

    Returns:
        A ``(base_dir, metadata_df)`` tuple: the output directory as a ``Path``
        and the metadata as a ``pandas.DataFrame``.

    Raises:
        Whatever ``load_dataset`` raises when a dataset cannot be fetched, and
        ``TypeError`` when a video/audio field is not bytes, a bytes/path dict
        or a path.
    """
    base_dir = Path(base_dir)
    base_dir.mkdir(parents=True, exist_ok=True)

    metadata = []
    label_ids = {'real': 0, 'fake': 1}
    # Modes other than these (e.g. RGBA, P) cannot be written as JPEG and are converted to RGB.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    # 1. Images - Using "DuckBill/DFImages" dataset
    # NOTE(review): the recorded run of this notebook reports this repo as missing or
    # inaccessible on the Hub - confirm the dataset ID.
    print("\nProcessing Images...")
    image_dataset = load_dataset("DuckBill/DFImages", split="train")

    image_dir = base_dir / "images"
    for category, label_id in label_ids.items():
        (image_dir / category).mkdir(parents=True, exist_ok=True)

        indices = _first_indices(image_dataset, samples_per_category, label_id)
        if len(indices) < samples_per_category:
            print(f"Only {len(indices)} '{category}' image sample(s) available.")
        if not indices:
            continue

        for i, sample in enumerate(image_dataset.select(indices)):
            img = sample['image']
            if img.mode not in jpeg_modes:
                img = img.convert("RGB")
            img_path = image_dir / category / f"{category}_{i:03d}.jpg"
            img.save(str(img_path))
            metadata.append(_metadata_row('image', category, img_path, 'DFImages'))

    # 2. Videos - Using "dawn-dai/deepfake-video" dataset
    print("\nProcessing Videos...")
    video_dataset = load_dataset("dawn-dai/deepfake-video", split="train")

    video_dir = base_dir / "videos"
    for category, label_id in label_ids.items():
        (video_dir / category).mkdir(parents=True, exist_ok=True)

        indices = _first_indices(video_dataset, samples_per_category, label_id)
        if len(indices) < samples_per_category:
            print(f"Only {len(indices)} '{category}' video sample(s) available.")
        if not indices:
            continue

        for i, sample in enumerate(video_dataset.select(indices)):
            video_path = video_dir / category / f"{category}_{i:03d}.mp4"
            # presumably 'video' holds the encoded file - TODO confirm against the dataset card
            video_path.write_bytes(_media_bytes(sample['video']))
            metadata.append(_metadata_row('video', category, video_path, 'deepfake-video'))

    # 3. Audio - real samples from one dataset, fake samples from another.
    # NOTE(review): "spraakbanken/wav2vec2-large-voxrex-swedish" looks like a model
    # checkpoint name rather than a dataset, and "google/fakeyou" is unverified -
    # confirm both IDs on the Hub.
    print("\nProcessing Audio...")
    audio_sources = [
        ('real', "spraakbanken/wav2vec2-large-voxrex-swedish", 'wav2vec2-large-voxrex-swedish'),
        ('fake', "google/fakeyou", 'fakeyou'),
    ]

    audio_dir = base_dir / "audio"
    for category, dataset_id, source_name in audio_sources:
        (audio_dir / category).mkdir(parents=True, exist_ok=True)

        audio_dataset = _load_audio_undecoded(dataset_id)
        indices = _first_indices(audio_dataset, samples_per_category)
        if len(indices) < samples_per_category:
            print(f"Only {len(indices)} '{category}' audio sample(s) available.")
        if not indices:
            continue

        for i, sample in enumerate(audio_dataset.select(indices)):
            audio_path = audio_dir / category / f"{category}_{i:03d}.wav"
            # Bytes are written as stored; assumes the source files are WAV - TODO confirm.
            audio_path.write_bytes(_media_bytes(sample['audio']))
            metadata.append(_metadata_row('audio', category, audio_path, source_name))

    # Save metadata (explicit columns keep the CSV header stable even with zero rows)
    metadata_df = pd.DataFrame(
        metadata,
        columns=['modality', 'category', 'filename', 'file_path', 'source_dataset'],
    )
    metadata_df.to_csv(base_dir / 'metadata.csv', index=False)

    # Print summary
    print("\nDataset Collection Summary:")
    if metadata_df.empty:
        print("No samples were collected.")
    else:
        summary = metadata_df.groupby(['modality', 'category']).size().unstack(fill_value=0)
        print(summary)
    print(f"\nDataset location: {base_dir.absolute()}")
    print(f"Metadata file: {base_dir/'metadata.csv'}")

    return base_dir, metadata_df

In [32]:
if __name__ == "__main__":
    import datasets
    datasets.logging.set_verbosity_error()  # Reduce logging noise

    # Pre-bind the results so later cells see None rather than a NameError
    # when the download fails.
    base_dir, metadata = None, None
    try:
        base_dir, metadata = download_subset()
    except Exception as e:
        # Best-effort run: report the failure without aborting the notebook.
        # The exception type is included because some messages (e.g. a
        # KeyError's) are just the bare key and unreadable on their own.
        print(f"Error: {type(e).__name__}: {e}")


Processing Images...
Error: Dataset 'DuckBill/DFImages' doesn't exist on the Hub or cannot be accessed.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
