<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [1]:
!pip install -qU kaggle pandas requests tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.[0m[31m
[0m

In [14]:
import os
import time
from pathlib import Path
from urllib.parse import urlparse

import requests
from tqdm import tqdm

In [15]:
def download_file(url, dest_path, timeout=30):
    """Stream ``url`` to disk with a tqdm progress bar.

    The body is first written to a ``.part`` file next to the target and only
    renamed into place once the whole stream has been received, so a failed or
    interrupted transfer never leaves a truncated file at ``dest_path``.

    Args:
        url: Address fetched with an HTTP GET.
        dest_path: Target location (``str`` or ``Path``). For URLs containing
            "thispersondoesnotexist" only its parent directory is used: the
            file is saved as ``generated_face_<unix-seconds>.jpg``, with a
            ``_<n>`` suffix appended when that name is already taken.
        timeout: Seconds handed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the notebook indefinitely.

    Returns:
        True when the file was saved, False on any error (the error is
        printed rather than raised).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    dest_path = Path(dest_path)
    tmp_path = None
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, headers=headers, timeout=timeout) as response:
            response.raise_for_status()

            # Generate unique filenames for dynamic content
            if "thispersondoesnotexist" in url:
                stamp = int(time.time())
                candidate = dest_path.parent / f"generated_face_{stamp}.jpg"
                attempt = 1
                # Two downloads within the same second would otherwise
                # overwrite each other.
                while candidate.exists():
                    candidate = dest_path.parent / f"generated_face_{stamp}_{attempt}.jpg"
                    attempt += 1
                dest_path = candidate

            # A missing header becomes None so tqdm shows an open-ended bar
            # instead of a bogus "x/0" total.
            total_size = int(response.headers.get('content-length', 0)) or None

            dest_path.parent.mkdir(parents=True, exist_ok=True)
            tmp_path = dest_path.with_name(dest_path.name + ".part")

            with open(tmp_path, 'wb') as f, tqdm(
                desc=f"Downloading {dest_path.name}",
                total=total_size,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,  # binary prefixes to match the 'iB' unit
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))

        # Publish the file only after the full body has been written.
        tmp_path.replace(dest_path)
        return True
    except Exception as e:
        print(f"Error downloading {url}: {str(e)}")
        return False
    finally:
        # Runs for failures and interrupts alike; after a successful rename
        # the .part file no longer exists, so this is a no-op.
        if tmp_path is not None and tmp_path.exists():
            tmp_path.unlink()

def _default_sample_urls():
    """Return the built-in ``{modality: {category: [url, ...]}}`` catalogue."""
    return {
        "images": {
            "real": [
                # FIXME: both Wikimedia URLs returned 404 in the recorded run
                # of this notebook; replace them with current file URLs.
                "https://upload.wikimedia.org/wikipedia/commons/1/18/Thomas_Edison2.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/d/dc/Steve_Jobs_Headshot_2010-CROP.jpg",
            ],
            # Each request to this site is saved as a separate generated face.
            "fake": ["https://thispersondoesnotexist.com"] * 5,
        },
        "videos": {
            "real": [
                # Shorter test video from Wikimedia
                "https://upload.wikimedia.org/wikipedia/commons/transcoded/c/c0/Big_Buck_Bunny_4K.webm/Big_Buck_Bunny_4K.webm.240p.vp9.webm",
            ],
            "fake": [
                # DFDC samples from Kaggle
                # FIXME: this is a signed URL with a placeholder signature and
                # an Expires timestamp that had already passed in the recorded
                # run (400 Bad Request); it needs a freshly signed link.
                "https://storage.googleapis.com/kaggle-datasets/25814/33244/dfdc_train_part_0.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1725580486&Signature=...",
            ]
        },
        "audio": {
            "real": [
                # Common Voice sample
                # FIXME: the host name failed to resolve in the recorded run.
                "https://cdn.commonvoice.mozilla.org/training-datasets/cv-corpus-15.0-2024-02-05/en/clips/common_voice_en_38308318.mp3",
            ],
            "fake": [
                # ESPNet sample
                # FIXME: this download also errored in the recorded run.
                "https://github.com/espnet/espnet/raw/master/egs2/TEMPLATE/tts1/audio.wav",
            ]
        }
    }


def _filename_from_url(url):
    """Derive a local file name from the last path segment of ``url``."""
    parsed = urlparse(url)
    # Only the path component is inspected, so query strings (even ones that
    # contain "/") and trailing slashes cannot leak into the name.
    last_segment = parsed.path.rstrip("/").rsplit("/", 1)[-1]
    return last_segment or parsed.netloc or "download"


def download_multimodal_subset(base_dir="./multimodal_deepfake_data", datasets=None):
    """Download real/fake samples for every modality and print a summary.

    Files land in ``<base_dir>/<modality>/<category>/``. Files that are already
    present are not fetched again but still count towards the tallies.

    Args:
        base_dir: Root output directory (``str`` or ``Path``).
        datasets: Optional ``{modality: {category: [url, ...]}}`` mapping;
            the built-in sample catalogue is used when omitted.

    Returns:
        dict mapping each modality to the number of its files available
        locally after the run (newly downloaded plus already present).
    """
    base_dir = Path(base_dir)
    if datasets is None:
        datasets = _default_sample_urls()

    # Keyed from the catalogue so custom modalities are tallied as well.
    results = {modality: 0 for modality in datasets}

    for modality, categories in datasets.items():
        print(f"\n{'='*40}\nDownloading {modality.upper()} samples\n{'='*40}")
        for category, urls in categories.items():
            print(f"\n{category.capitalize()} samples:")
            modality_dir = base_dir / modality / category

            # download_file stores generated faces as generated_face_*.jpg, not
            # under the URL-derived name, so a plain existence check on that
            # name never matches. Reuse faces from earlier runs instead of
            # growing the folder on every execution.
            reusable_faces = (
                sorted(modality_dir.glob("generated_face_*.jpg"))
                if modality_dir.is_dir()
                else []
            )

            for url in urls:
                if "thispersondoesnotexist" in url and reusable_faces:
                    existing = reusable_faces.pop(0)
                    print(f"Skipping existing file: {existing.name}")
                    results[modality] += 1
                    continue

                dest_path = modality_dir / _filename_from_url(url)
                if dest_path.is_file():  # Skip existing files
                    print(f"Skipping existing file: {dest_path.name}")
                    results[modality] += 1
                elif download_file(url, dest_path):
                    results[modality] += 1

    print("\nFinal Report:")
    for modality, count in results.items():
        print(f"{modality.capitalize()} downloaded: {count}")
    print(f"Total dataset size: {sum(results.values())} files")
    print(f"Data location: {base_dir.absolute()}")
    return results

In [16]:
# Kick off the download when this cell is executed; the guard keeps the call
# from firing if this code is ever imported as a module instead.
if __name__ == "__main__":
    download_multimodal_subset()


Downloading IMAGES samples

Real samples:
Error downloading https://upload.wikimedia.org/wikipedia/commons/1/18/Thomas_Edison2.jpg: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/1/18/Thomas_Edison2.jpg
Error downloading https://upload.wikimedia.org/wikipedia/commons/d/dc/Steve_Jobs_Headshot_2010-CROP.jpg: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/d/dc/Steve_Jobs_Headshot_2010-CROP.jpg

Fake samples:


Downloading generated_face_1738219268.jpg: 100%|██████████| 568k/568k [00:00<00:00, 1.02MiB/s]
Downloading generated_face_1738219269.jpg: 100%|██████████| 568k/568k [00:00<00:00, 1.01MiB/s]
Downloading generated_face_1738219270.jpg: 100%|██████████| 567k/567k [00:00<00:00, 976kiB/s]
Downloading generated_face_1738219271.jpg: 100%|██████████| 515k/515k [00:00<00:00, 938kiB/s] 
Downloading generated_face_1738219272.jpg: 100%|██████████| 544k/544k [00:00<00:00, 978kiB/s] 



Downloading VIDEOS samples

Real samples:


Downloading Big_Buck_Bunny_4K.webm.240p.vp9.webm: 100%|██████████| 30.4M/30.4M [00:01<00:00, 25.2MiB/s]



Fake samples:
Error downloading https://storage.googleapis.com/kaggle-datasets/25814/33244/dfdc_train_part_0.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1725580486&Signature=...: 400 Client Error: Bad Request for url: https://storage.googleapis.com/kaggle-datasets/25814/33244/dfdc_train_part_0.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1725580486&Signature=...

Downloading AUDIO samples

Real samples:
Error downloading https://cdn.commonvoice.mozilla.org/training-datasets/cv-corpus-15.0-2024-02-05/en/clips/common_voice_en_38308318.mp3: HTTPSConnectionPool(host='cdn.commonvoice.mozilla.org', port=443): Max retries exceeded with url: /training-datasets/cv-corpus-15.0-2024-02-05/en/clips/common_voice_en_38308318.mp3 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7d6746083d50>: Failed to resolve 'cdn.commonvoice.mozilla.org' ([Errno -2] Name or service not known)"))

Fake samples:
Error downloadi