<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [1]:
!pip install -qU kaggle pandas requests tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.[0m[31m
[0m

In [2]:
import os
import requests
from pathlib import Path
from tqdm import tqdm

In [3]:
def download_file(url, dest_path):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))

        dest_path.parent.mkdir(parents=True, exist_ok=True)

        with open(dest_path, 'wb') as f, tqdm(
            desc=f"Downloading {dest_path.name}",
            total=total_size,
            unit='iB',
            unit_scale=True,
        ) as pbar:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))
        return True
    except Exception as e:
        print(f"Error downloading {url}: {str(e)}")
        return False

def download_multimodal_subset():
    base_dir = Path("./multimodal_deepfake_data")

    # Updated URLs using more stable sources
    datasets = {
        "images": {
            "real": [
                # Using Flickr API for real face images
                "https://live.staticflickr.com/65535/faces/sample_{}.jpg".format(i)
                for i in range(1, 11)
            ],
            "fake": [
                # Using ThisPersonDoesNotExist API for generated faces
                "https://thispersondoesnotexist.com/image"
                for _ in range(10)
            ]
        },
        "videos": {
            "real": [
                # Using Creative Commons videos from Archive.org
                f"https://archive.org/download/facial_video_sample_{i}/video.mp4"
                for i in range(1, 11)
            ],
            "fake": [
                # Using DeepFake detection challenge sample videos
                f"https://ai.facebook.com/datasets/dfdc/samples/fake_{i}.mp4"
                for i in range(1, 11)
            ]
        },
        "audio": {
            "real": [
                # Using Mozilla Common Voice samples
                f"https://commonvoice.mozilla.org/data/sample_{i}.wav"
                for i in range(1, 11)
            ],
            "fake": [
                # Using synthetic voice samples
                f"https://sample-videos.com/audio/synthetic/voice_{i}.wav"
                for i in range(1, 11)
            ]
        }
    }

    results = {"images": 0, "videos": 0, "audio": 0}

    for modality, categories in datasets.items():
        print(f"\n{'='*40}\nDownloading {modality.upper()} samples\n{'='*40}")
        for category, urls in categories.items():
            print(f"\n{category.capitalize()} samples:")
            modality_dir = base_dir / modality / category

            # Special handling for ThisPersonDoesNotExist
            if category == "fake" and modality == "images":
                for i in range(10):
                    dest_path = modality_dir / f"generated_face_{i}.jpg"
                    if download_file(urls[0], dest_path):
                        results[modality] += 1
            else:
                for url in urls:
                    filename = url.split("/")[-1]
                    dest_path = modality_dir / filename
                    if download_file(url, dest_path):
                        results[modality] += 1

    print("\nFinal Report:")
    print(f"Images downloaded: {results['images']}")
    print(f"Videos downloaded: {results['videos']}")
    print(f"Audio downloaded: {results['audio']}")
    print(f"Total dataset size: {sum(results.values())} files")
    print(f"Data location: {base_dir.absolute()}")

In [4]:
if __name__ == "__main__":
    download_multimodal_subset()


Downloading IMAGES samples

Real samples:
Error downloading https://live.staticflickr.com/65535/faces/sample_1.jpg: 404 Client Error: Not Found for url: https://live.staticflickr.com/65535/faces/sample_1.jpg
Error downloading https://live.staticflickr.com/65535/faces/sample_2.jpg: 404 Client Error: Not Found for url: https://live.staticflickr.com/65535/faces/sample_2.jpg
Error downloading https://live.staticflickr.com/65535/faces/sample_3.jpg: 404 Client Error: Not Found for url: https://live.staticflickr.com/65535/faces/sample_3.jpg
Error downloading https://live.staticflickr.com/65535/faces/sample_4.jpg: 404 Client Error: Not Found for url: https://live.staticflickr.com/65535/faces/sample_4.jpg
Error downloading https://live.staticflickr.com/65535/faces/sample_5.jpg: 404 Client Error: Not Found for url: https://live.staticflickr.com/65535/faces/sample_5.jpg
Error downloading https://live.staticflickr.com/65535/faces/sample_6.jpg: 404 Client Error: Not Found for url: https://live.sta

Downloading fake_1.mp4: 179kiB [00:00, 8.21MiB/s]
Downloading fake_2.mp4: 179kiB [00:00, 7.67MiB/s]
Downloading fake_3.mp4: 179kiB [00:00, 7.46MiB/s]
Downloading fake_4.mp4: 179kiB [00:00, 7.37MiB/s]
Downloading fake_5.mp4: 179kiB [00:00, 7.58MiB/s]
Downloading fake_6.mp4: 179kiB [00:00, 8.16MiB/s]
Downloading fake_7.mp4: 179kiB [00:00, 3.49MiB/s]
Downloading fake_8.mp4: 179kiB [00:00, 7.53MiB/s]
Downloading fake_9.mp4: 179kiB [00:00, 7.39MiB/s]
Downloading fake_10.mp4: 179kiB [00:00, 8.07MiB/s]



Downloading AUDIO samples

Real samples:


Downloading sample_1.wav: 6.21kiB [00:00, 10.5MiB/s]
Downloading sample_2.wav: 6.21kiB [00:00, 12.4MiB/s]
Downloading sample_3.wav: 6.21kiB [00:00, 11.4MiB/s]
Downloading sample_4.wav: 6.21kiB [00:00, 12.5MiB/s]
Downloading sample_5.wav: 6.21kiB [00:00, 11.2MiB/s]
Downloading sample_6.wav: 6.21kiB [00:00, 11.5MiB/s]
Downloading sample_7.wav: 6.21kiB [00:00, 9.53MiB/s]
Downloading sample_8.wav: 6.21kiB [00:00, 12.2MiB/s]
Downloading sample_9.wav: 6.21kiB [00:00, 894kiB/s]
Downloading sample_10.wav: 6.21kiB [00:00, 9.08MiB/s]



Fake samples:
Error downloading https://sample-videos.com/audio/synthetic/voice_1.wav: 404 Client Error: Not Found for url: https://sample-videos.com/audio/synthetic/voice_1.wav
Error downloading https://sample-videos.com/audio/synthetic/voice_2.wav: 404 Client Error: Not Found for url: https://sample-videos.com/audio/synthetic/voice_2.wav
Error downloading https://sample-videos.com/audio/synthetic/voice_3.wav: 404 Client Error: Not Found for url: https://sample-videos.com/audio/synthetic/voice_3.wav
Error downloading https://sample-videos.com/audio/synthetic/voice_4.wav: 404 Client Error: Not Found for url: https://sample-videos.com/audio/synthetic/voice_4.wav
Error downloading https://sample-videos.com/audio/synthetic/voice_5.wav: 404 Client Error: Not Found for url: https://sample-videos.com/audio/synthetic/voice_5.wav
Error downloading https://sample-videos.com/audio/synthetic/voice_6.wav: 404 Client Error: Not Found for url: https://sample-videos.com/audio/synthetic/voice_6.wav
E