<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [33]:
!pip install -q datasets pandas pillow tqdm huggingface_hub

In [34]:
import os
import shutil
import pandas as pd
from pathlib import Path
from datasets import load_dataset
from tqdm import tqdm
from PIL import Image
from huggingface_hub import login
import getpass

In [35]:
# 🛑 Manually Enter Hugging Face Token
HF_TOKEN = getpass.getpass("Enter your Hugging Face token: ")
login(HF_TOKEN)  # Logs in to Hugging Face

Enter your Hugging Face token: ··········


In [36]:
# Set Base Directory
BASE_DIR = Path("./deepfake_test_dataset")
BASE_DIR.mkdir(parents=True, exist_ok=True)

# Metadata Storage
metadata = []

In [37]:
def download_images():
    """Downloads 20 images (10 real, 10 fake)"""
    print("\n📷 Downloading Image Data...")
    dataset = load_dataset("KobiTraber/Deepfake-Image-Detection", split="train", use_auth_token=True)
    image_dir = BASE_DIR / "images"

    for category, label in [("real", 0), ("fake", 1)]:
        (image_dir / category).mkdir(parents=True, exist_ok=True)
        samples = dataset.filter(lambda x: x["label"] == label).select(range(10))  # 10 images each

        for i, sample in enumerate(tqdm(samples, desc=f"Saving {category} images")):
            img = sample["image"]
            img_path = image_dir / category / f"{category}_{i:03d}.jpg"
            img.save(str(img_path))

            metadata.append({
                "modality": "image",
                "category": category,
                "filename": img_path.name,
                "file_path": str(img_path),
                "source_dataset": "Deepfake-Image-Detection"
            })

def download_videos():
    """Downloads 20 videos (10 real, 10 fake)"""
    print("\n🎥 Downloading Video Data...")
    dataset = load_dataset("dawn-dai/deepfake-video", split="train", use_auth_token=True)
    video_dir = BASE_DIR / "videos"

    for category, label in [("real", 0), ("fake", 1)]:
        (video_dir / category).mkdir(parents=True, exist_ok=True)
        samples = dataset.filter(lambda x: x["label"] == label).select(range(10))  # 10 videos each

        for i, sample in enumerate(tqdm(samples, desc=f"Saving {category} videos")):
            video_path = video_dir / category / f"{category}_{i:03d}.mp4"
            with open(video_path, "wb") as f:
                f.write(sample["video"])

            metadata.append({
                "modality": "video",
                "category": category,
                "filename": video_path.name,
                "file_path": str(video_path),
                "source_dataset": "deepfake-video"
            })

def download_audio():
    """Downloads 20 audio samples (10 real, 10 fake)"""
    print("\n🔊 Downloading Audio Data...")
    dataset = load_dataset("m4r4b0u/FakeAVCeleb", split="train", use_auth_token=True)
    audio_dir = BASE_DIR / "audio"

    for category, label in [("real", 0), ("fake", 1)]:
        (audio_dir / category).mkdir(parents=True, exist_ok=True)
        samples = dataset.filter(lambda x: x["label"] == label).select(range(10))  # 10 audio clips each

        for i, sample in enumerate(tqdm(samples, desc=f"Saving {category} audio")):
            audio_path = audio_dir / category / f"{category}_{i:03d}.wav"
            with open(audio_path, "wb") as f:
                f.write(sample["audio"]["bytes"])

            metadata.append({
                "modality": "audio",
                "category": category,
                "filename": audio_path.name,
                "file_path": str(audio_path),
                "source_dataset": "FakeAVCeleb"
            })

def save_metadata():
    """Saves metadata as CSV"""
    metadata_df = pd.DataFrame(metadata)
    metadata_df.to_csv(BASE_DIR / "metadata.csv", index=False)
    print("\n✅ Metadata saved:", BASE_DIR / "metadata.csv")

In [38]:
def main():
    try:
        download_images()
        download_videos()
        download_audio()
        save_metadata()

        print("\n🎉 **Dataset Collection Complete!** 🎉")
        print(f"📂 Data stored in: {BASE_DIR.absolute()}")

    except Exception as e:
        print(f"❌ Error: {str(e)}")

if __name__ == "__main__":
    main()


📷 Downloading Image Data...
❌ Error: Dataset 'KobiTraber/Deepfake-Image-Detection' doesn't exist on the Hub or cannot be accessed.
