<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [17]:
!pip install -qU kaggle pandas requests tqdm

In [18]:
import os
import requests
from pathlib import Path
from tqdm import tqdm
import time

In [19]:
def download_file(url, dest_path, timeout=30):
    """Stream ``url`` to ``dest_path`` while showing a tqdm progress bar.

    Args:
        url: Address of the resource to fetch.
        dest_path: Target location (``pathlib.Path`` or path string). For
            URLs containing "thispersondoesnotexist" only the parent
            directory is used; the file is named
            ``generated_face_<unix-seconds>.jpg`` instead.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the run forever.

    Returns:
        True when the file was fully written, False on any failure (the
        error is printed rather than raised).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    dest_path = Path(dest_path)
    # Data is written to a ".part" sibling first, so an interrupted transfer
    # never leaves a truncated file that callers would treat as complete.
    tmp_path = None
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, headers=headers, timeout=timeout) as response:
            response.raise_for_status()

            if "thispersondoesnotexist" in url:
                # Each request returns a new image, so pick a fresh name.
                # Second-resolution timestamps can repeat within one run;
                # add a counter instead of overwriting an earlier face.
                stamp = int(time.time())
                candidate = dest_path.parent / f"generated_face_{stamp}.jpg"
                suffix = 1
                while candidate.exists():
                    candidate = dest_path.parent / f"generated_face_{stamp}_{suffix}.jpg"
                    suffix += 1
                dest_path = candidate

            # A missing or zero Content-Length means "unknown"; tqdm expects
            # None for that case rather than a total of 0.
            total_size = int(response.headers.get('content-length', 0)) or None

            dest_path.parent.mkdir(parents=True, exist_ok=True)
            tmp_path = dest_path.with_name(dest_path.name + ".part")

            with open(tmp_path, 'wb') as f, tqdm(
                desc=f"Downloading {dest_path.name}",
                total=total_size,
                unit='iB',
                unit_scale=True,
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))

        # Publish the finished download under its real name.
        tmp_path.replace(dest_path)
        return True
    except Exception as e:
        # Deliberately broad: this is the best-effort boundary, and callers
        # rely on the boolean result instead of handling exceptions.
        print(f"Error downloading {url}: {str(e)}")
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                # Nothing to clean up (or cleanup impossible); the failure
                # itself has already been reported above.
                pass
        return False

def download_multimodal_subset():
    """Download a small real/fake sample set of images, videos and audio.

    Files are stored under ``./multimodal_deepfake_data/<modality>/<category>/``.
    A file that already exists at its destination is not fetched again but
    is still counted. A per-modality summary is printed at the end.

    Returns:
        None. Progress and the final report are written to stdout.
    """
    base_dir = Path("./multimodal_deepfake_data")

    datasets = {
        "images": {
            "real": [
                # Wikimedia portrait images
                # NOTE(review): several of these returned 404 in the recorded
                # run of this notebook -- re-verify the URLs.
                "https://upload.wikimedia.org/wikipedia/commons/3/3a/Marcus_Aurelius_Louvre.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/4/49/Elon_Musk_2015.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/a/a8/Bill_Gates_2018.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/6/6d/Greta_Thunberg_2019.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/d/d9/Margaret_Hamilton_1995.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/7/79/Taylor_Swift_2007.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/8/8d/Barack_Obama_2012.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/e/eb/Malala_Yousafzai_2015.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/6/69/Emma_Watson_2013.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/5/5d/Angela_Merkel_2014.jpg"
            ],
            # The same URL is requested ten times; download_file stores each
            # response under its own generated name, so the existence check
            # below never skips these.
            "fake": ["https://thispersondoesnotexist.com" for _ in range(10)]
        },
        "videos": {
            "real": [
                # Short videos from Wikimedia Commons
                # NOTE(review): the recorded run shows 404s for the first
                # entries of this list -- re-verify the URLs.
                "https://upload.wikimedia.org/wikipedia/commons/transcoded/4/47/George_W_Bush_Announces_Start_of_Iraq_War.ogv/George_W_Bush_Announces_Start_of_Iraq_War.ogv.240p.vp9.webm",
                "https://upload.wikimedia.org/wikipedia/commons/transcoded/2/2f/BBC_News_Countdown.ogv/BBC_News_Countdown.ogv.240p.vp9.webm",
                "https://upload.wikimedia.org/wikipedia/commons/transcoded/7/73/Neutral_News.ogv/Neutral_News.ogv.240p.vp9.webm",
                "https://upload.wikimedia.org/wikipedia/commons/transcoded/0/0f/Obama_speaks_on_NSA_reforms.ogv/Obama_speaks_on_NSA_reforms.ogv.240p.vp9.webm",
                "https://upload.wikimedia.org/wikipedia/commons/transcoded/3/34/Scene_from_Shakespeares_The_Tempest.ogv/Scene_from_Shakespeares_The_Tempest.ogv.240p.vp9.webm",
                "https://upload.wikimedia.org/wikipedia/commons/transcoded/6/61/Tech_News_CNN.ogv/Tech_News_CNN.ogv.240p.vp9.webm",
                "https://upload.wikimedia.org/wikipedia/commons/transcoded/5/54/Truman.announcestheatomicbomb.ogv/Truman.announcestheatomicbomb.ogv.240p.vp9.webm",
                "https://upload.wikimedia.org/wikipedia/commons/transcoded/a/a4/WikiNews-20070503.ogv/WikiNews-20070503.ogv.240p.vp9.webm",
                "https://upload.wikimedia.org/wikipedia/commons/transcoded/8/8b/En-uk_hello.ogg/En-uk_hello.ogg.240p.vp9.webm",
                "https://upload.wikimedia.org/wikipedia/commons/transcoded/f/f9/Glow2.ogv/Glow2.ogv.240p.vp9.webm"
            ],
            "fake": [
                # Deepfake samples from FaceForensics++
                "https://github.com/ondyari/FaceForensics/raw/master/dataset/samples/manipulated_sequences/Deepfakes/c23/videos/000_003.mp4",
                "https://github.com/ondyari/FaceForensics/raw/master/dataset/samples/manipulated_sequences/Deepfakes/c23/videos/001_003.mp4",
                "https://github.com/ondyari/FaceForensics/raw/master/dataset/samples/manipulated_sequences/Deepfakes/c23/videos/002_003.mp4",
                "https://github.com/ondyari/FaceForensics/raw/master/dataset/samples/manipulated_sequences/Deepfakes/c23/videos/003_003.mp4",
                "https://github.com/ondyari/FaceForensics/raw/master/dataset/samples/manipulated_sequences/Deepfakes/c23/videos/004_003.mp4",
                "https://github.com/ondyari/FaceForensics/raw/master/dataset/samples/manipulated_sequences/Deepfakes/c23/videos/005_003.mp4",
                "https://github.com/ondyari/FaceForensics/raw/master/dataset/samples/manipulated_sequences/Deepfakes/c23/videos/006_003.mp4",
                "https://github.com/ondyari/FaceForensics/raw/master/dataset/samples/manipulated_sequences/Deepfakes/c23/videos/007_003.mp4",
                "https://github.com/ondyari/FaceForensics/raw/master/dataset/samples/manipulated_sequences/Deepfakes/c23/videos/008_003.mp4",
                "https://github.com/ondyari/FaceForensics/raw/master/dataset/samples/manipulated_sequences/Deepfakes/c23/videos/009_003.mp4"
            ]
        },
        "audio": {
            "real": [
                # Audio samples from Freesound (CC licensed)
                "https://freesound.org/data/previews/511/511919_919548-lq.mp3",
                "https://freesound.org/data/previews/511/511912_919548-lq.mp3",
                "https://freesound.org/data/previews/511/511908_919548-lq.mp3",
                "https://freesound.org/data/previews/511/511907_919548-lq.mp3",
                "https://freesound.org/data/previews/511/511906_919548-lq.mp3",
                "https://freesound.org/data/previews/511/511905_919548-lq.mp3",
                "https://freesound.org/data/previews/511/511903_919548-lq.mp3",
                "https://freesound.org/data/previews/511/511902_919548-lq.mp3",
                "https://freesound.org/data/previews/511/511901_919548-lq.mp3",
                "https://freesound.org/data/previews/511/511900_919548-lq.mp3"
            ],
            "fake": [
                # Synthetic audio from ESPNet
                "https://github.com/espnet/espnet/raw/master/egs2/ljspeech/tts1/audio.wav",
                "https://github.com/espnet/espnet/raw/master/egs2/vctk/tts1/audio.wav",
                "https://github.com/espnet/espnet/raw/master/egs2/libritts/tts1/audio.wav",
                "https://github.com/espnet/espnet/raw/master/egs2/jsut/tts1/audio.wav",
                "https://github.com/espnet/espnet/raw/master/egs2/aishell3/tts1/audio.wav",
                "https://github.com/espnet/espnet/raw/master/egs2/csmsc/tts1/audio.wav",
                "https://github.com/espnet/espnet/raw/master/egs2/jvs/tts1/audio.wav",
                "https://github.com/espnet/espnet/raw/master/egs2/naturalspeech/audio.wav",
                "https://github.com/espnet/espnet/raw/master/egs2/arctic/tts1/audio.wav",
                "https://github.com/espnet/espnet/raw/master/egs2/biaobei/tts1/audio.wav"
            ]
        }
    }

    results = {"images": 0, "videos": 0, "audio": 0}

    for modality, categories in datasets.items():
        print(f"\n{'='*40}\nDownloading {modality.upper()} samples\n{'='*40}")
        for category, urls in categories.items():
            print(f"\n{category.capitalize()} samples:")
            modality_dir = base_dir / modality / category

            # Last path segment without any query string.
            basenames = [url.split("/")[-1].split("?")[0] for url in urls]

            for index, (url, filename) in enumerate(zip(urls, basenames)):
                # Several URLs can share a basename (every ESPNet sample is
                # "audio.wav"). Without a distinguishing prefix they would
                # all map to one path, so only the first would be fetched
                # and the rest would be reported as already existing.
                if basenames.count(filename) > 1:
                    filename = f"{index:02d}_{filename}"
                dest_path = modality_dir / filename
                if not dest_path.exists():
                    if download_file(url, dest_path):
                        results[modality] += 1
                else:
                    print(f"Skipping existing file: {dest_path.name}")
                    results[modality] += 1

    print("\nFinal Report:")
    print(f"Images downloaded: {results['images']}")
    print(f"Videos downloaded: {results['videos']}")
    print(f"Audio downloaded: {results['audio']}")
    print(f"Total dataset size: {sum(results.values())} files")
    print(f"Data location: {base_dir.absolute()}")

In [20]:
# Start the download when this cell/script is executed directly
# (not when the code is imported as a module).
if __name__ == "__main__":
    download_multimodal_subset()


Downloading IMAGES samples

Real samples:
Error downloading https://upload.wikimedia.org/wikipedia/commons/3/3a/Marcus_Aurelius_Louvre.jpg: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/3/3a/Marcus_Aurelius_Louvre.jpg


Downloading Elon_Musk_2015.jpg: 100%|██████████| 406k/406k [00:00<00:00, 2.17MiB/s]


Error downloading https://upload.wikimedia.org/wikipedia/commons/a/a8/Bill_Gates_2018.jpg: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/a/a8/Bill_Gates_2018.jpg
Error downloading https://upload.wikimedia.org/wikipedia/commons/6/6d/Greta_Thunberg_2019.jpg: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/6/6d/Greta_Thunberg_2019.jpg
Error downloading https://upload.wikimedia.org/wikipedia/commons/d/d9/Margaret_Hamilton_1995.jpg: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/d/d9/Margaret_Hamilton_1995.jpg
Error downloading https://upload.wikimedia.org/wikipedia/commons/7/79/Taylor_Swift_2007.jpg: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/7/79/Taylor_Swift_2007.jpg
Error downloading https://upload.wikimedia.org/wikipedia/commons/8/8d/Barack_Obama_2012.jpg: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commo

Downloading generated_face_1738220709.jpg: 100%|██████████| 555k/555k [00:00<00:00, 1.00MiB/s]
Downloading generated_face_1738220710.jpg: 100%|██████████| 516k/516k [00:00<00:00, 994kiB/s] 
Downloading generated_face_1738220711.jpg: 100%|██████████| 547k/547k [00:00<00:00, 1.00MiB/s]
Downloading generated_face_1738220712.jpg: 100%|██████████| 588k/588k [00:00<00:00, 1.13MiB/s]
Downloading generated_face_1738220714.jpg: 100%|██████████| 561k/561k [00:00<00:00, 1.06MiB/s]
Downloading generated_face_1738220715.jpg: 100%|██████████| 530k/530k [00:00<00:00, 953kiB/s] 
Downloading generated_face_1738220716.jpg: 100%|██████████| 595k/595k [00:00<00:00, 1.12MiB/s]
Downloading generated_face_1738220717.jpg: 100%|██████████| 514k/514k [00:00<00:00, 968kiB/s] 
Downloading generated_face_1738220718.jpg: 100%|██████████| 577k/577k [00:00<00:00, 1.06MiB/s]
Downloading generated_face_1738220719.jpg: 100%|██████████| 525k/525k [00:00<00:00, 1.01MiB/s]



Downloading VIDEOS samples

Real samples:
Error downloading https://upload.wikimedia.org/wikipedia/commons/transcoded/4/47/George_W_Bush_Announces_Start_of_Iraq_War.ogv/George_W_Bush_Announces_Start_of_Iraq_War.ogv.240p.vp9.webm: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/transcoded/4/47/George_W_Bush_Announces_Start_of_Iraq_War.ogv/George_W_Bush_Announces_Start_of_Iraq_War.ogv.240p.vp9.webm
Error downloading https://upload.wikimedia.org/wikipedia/commons/transcoded/2/2f/BBC_News_Countdown.ogv/BBC_News_Countdown.ogv.240p.vp9.webm: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/transcoded/2/2f/BBC_News_Countdown.ogv/BBC_News_Countdown.ogv.240p.vp9.webm
Error downloading https://upload.wikimedia.org/wikipedia/commons/transcoded/7/73/Neutral_News.ogv/Neutral_News.ogv.240p.vp9.webm: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/transcoded/7/73/Neutral_News.ogv/Neutral_News.o