<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [20]:
!pip install -qU kaggle pandas requests tqdm

In [21]:
import os
import requests
from pathlib import Path
from tqdm import tqdm
from urllib.parse import urlparse

In [22]:
def download_file(url, dest_path, timeout=30):
    """Stream ``url`` to ``dest_path`` while showing a tqdm progress bar.

    The body is first written to a sibling ``<name>.part`` file and only
    renamed to ``dest_path`` once the whole stream has been read, so a failed
    transfer never leaves a truncated file at ``dest_path``.

    Args:
        url: Address fetched with an HTTP GET.
        dest_path: Target file, as a ``str`` or ``pathlib.Path``. Missing
            parent directories are created.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` so a
            stalled server cannot block the notebook indefinitely.

    Returns:
        ``True`` when the file was written completely, ``False`` on any
        error. Errors are printed rather than raised so that batch callers
        can continue with the next URL.
    """
    part_path = None
    try:
        dest_path = Path(dest_path)
        part_path = dest_path.with_name(dest_path.name + ".part")
        dest_path.parent.mkdir(parents=True, exist_ok=True)

        # The context manager releases the streamed connection even on failure.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            try:
                total_size = int(response.headers.get('content-length', 0))
            except (TypeError, ValueError):
                total_size = 0

            with open(part_path, 'wb') as f, tqdm(
                desc=f"Downloading {dest_path.name}",
                # None (unknown) instead of 0 when the server sends no length.
                total=total_size or None,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,  # 'iB' denotes binary multiples
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)
                        pbar.update(len(chunk))

        # Publish the file only after the stream finished without error.
        part_path.replace(dest_path)
        return True
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and carry on.
        print(f"Error downloading {url}: {str(e)}")
        if part_path is not None:
            try:
                part_path.unlink()
            except OSError:
                # The partial file may never have been created.
                pass
        return False

def _default_sample_urls():
    """Return the built-in ``{modality: {category: [url, ...]}}`` sample table."""
    # NOTE: these URLs are not guaranteed to resolve; the recorded run of this
    # notebook shows the FFHQ links answering with 404.
    return {
        "images": {
            "real": [
                # FFHQ real faces
                f"https://github.com/NVlabs/ffhq-dataset/raw/master/images1024x1024/{i:05d}.png"
                for i in range(10)
            ],
            "fake": [
                # StyleGAN-generated faces
                f"https://thispersondoesnotexist.com/image?v={i}"
                for i in range(10)
            ]
        },
        "videos": {
            "real": [
                # FaceForensics++ real videos
                f"https://github.com/ondyari/FaceForensics/raw/master/dataset/samples/original_sequences/youtube/raw/videos/{i}.mp4"
                for i in range(10)
            ],
            "fake": [
                # FaceForensics++ manipulated videos
                f"https://github.com/ondyari/FaceForensics/raw/master/dataset/samples/manipulated_sequences/DeepFakeDetection/raw/videos/{i}.mp4"
                for i in range(10)
            ]
        },
        "audio": {
            "real": [
                # LibriSpeech samples
                f"https://www.openslr.org/resources/12/train-clean-100/19/198/19-198-{i:04d}.flac"
                for i in range(100, 110)
            ],
            "fake": [
                # WaveFake samples
                f"https://github.com/RUB-SysSec/WaveFake/raw/main/data/ljspeech_fastspeech2_melgan/test_files/{i}.wav"
                for i in range(1, 11)
            ]
        }
    }


def _unique_filename(url, used_names):
    """Derive a file name from the URL path that is not yet in ``used_names``."""
    name = Path(urlparse(url).path).name or "download"
    stem, suffix = Path(name).stem, Path(name).suffix
    candidate, counter = name, 0
    # URLs that differ only in their query string share a path; number them.
    while candidate in used_names:
        counter += 1
        candidate = f"{stem}_{counter}{suffix}"
    used_names.add(candidate)
    return candidate


def download_multimodal_subset(base_dir="./multimodal_deepfake_data", datasets=None):
    """Download a small real/fake sample set for each modality.

    Files are stored under ``base_dir/<modality>/<category>/``. Each URL gets
    its own file name within that directory, so URLs that share a path (for
    example ``.../image?v=0`` and ``.../image?v=1``) no longer overwrite each
    other.

    Args:
        base_dir: Root directory for the downloads (``str`` or ``Path``).
        datasets: Optional ``{modality: {category: iterable_of_urls}}``
            mapping. When ``None`` the built-in sample URLs are used.

    Returns:
        ``{modality: {category: number_of_successful_downloads}}``.
    """
    base_dir = Path(base_dir)
    if datasets is None:
        datasets = _default_sample_urls()
    # Materialise the URL collections so they can be both iterated and counted.
    datasets = {
        modality: {category: list(urls) for category, urls in categories.items()}
        for modality, categories in datasets.items()
    }

    results = {
        modality: {category: 0 for category in categories}
        for modality, categories in datasets.items()
    }

    for modality, categories in datasets.items():
        print(f"\n{'-'*40}")
        print(f"Processing {modality.upper()} samples")
        print(f"{'-'*40}")

        for category, urls in categories.items():
            print(f"\nDownloading {category} {modality}...")
            modality_dir = base_dir / modality / category
            modality_dir.mkdir(parents=True, exist_ok=True)

            used_names = set()
            for url in urls:
                dest_path = modality_dir / _unique_filename(url, used_names)

                if download_file(url, dest_path):
                    results[modality][category] += 1

    # Print summary; the denominators come from the URL lists, not a constant.
    print("\n\nDownload Summary:")
    for modality, categories in datasets.items():
        parts = [
            f"{category.capitalize()}: {results[modality][category]}/{len(urls)}"
            for category, urls in categories.items()
        ]
        print(f"\n{modality.upper():<8} | " + " | ".join(parts))

    print(f"\nTotal dataset size: {sum(sum(v.values()) for v in results.values())} files")
    print(f"Data location: {base_dir.absolute()}")
    return results

In [23]:
# Entry point for this cell: start the sample download. The call is guarded
# so it only runs when this code executes as the main module.
if __name__ == "__main__":
    download_multimodal_subset()


----------------------------------------
Processing IMAGES samples
----------------------------------------

Downloading real images...
Error downloading https://github.com/NVlabs/ffhq-dataset/raw/master/images1024x1024/00000.png: 404 Client Error: Not Found for url: https://github.com/NVlabs/ffhq-dataset/raw/master/images1024x1024/00000.png
Error downloading https://github.com/NVlabs/ffhq-dataset/raw/master/images1024x1024/00001.png: 404 Client Error: Not Found for url: https://github.com/NVlabs/ffhq-dataset/raw/master/images1024x1024/00001.png
Error downloading https://github.com/NVlabs/ffhq-dataset/raw/master/images1024x1024/00002.png: 404 Client Error: Not Found for url: https://github.com/NVlabs/ffhq-dataset/raw/master/images1024x1024/00002.png
Error downloading https://github.com/NVlabs/ffhq-dataset/raw/master/images1024x1024/00003.png: 404 Client Error: Not Found for url: https://github.com/NVlabs/ffhq-dataset/raw/master/images1024x1024/00003.png
Error downloading https://githu