<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [21]:
!pip install -qU kaggle pandas requests tqdm gtts

In [22]:
import os
import requests
from pathlib import Path
from tqdm import tqdm
import time
from gtts import gTTS

In [23]:
def download_file(url, dest_path, timeout=30):
    """Stream ``url`` to ``dest_path`` with a progress bar.

    The payload is first written to a sibling ``<name>.part`` file and only
    moved onto ``dest_path`` once the whole body has been received, so an
    interrupted transfer never leaves a truncated file that a later
    ``dest_path.exists()`` check would mistake for a finished download.

    Args:
        url: HTTP(S) URL to fetch.
        dest_path: Target location (``pathlib.Path`` or path string). For
            URLs containing ``"thispersondoesnotexist"`` only the parent
            directory is used; the file is named
            ``generated_face_<unix-seconds>.jpg`` (with a numeric suffix if
            that name is already taken).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument.

    Returns:
        True if the file was fully written, False if any step failed (the
        error is printed, not raised).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    tmp_path = None
    try:
        dest_path = Path(dest_path)

        # The context manager closes the streamed response on every exit
        # path, releasing the underlying connection.
        with requests.get(url, stream=True, headers=headers, timeout=timeout) as response:
            response.raise_for_status()

            if "thispersondoesnotexist" in url:
                # The URL carries no file name, so derive one from the clock.
                # Two requests can land in the same second; add a counter
                # rather than overwrite the earlier image.
                stamp = int(time.time())
                candidate = dest_path.parent / f"generated_face_{stamp}.jpg"
                suffix = 1
                while candidate.exists():
                    candidate = dest_path.parent / f"generated_face_{stamp}_{suffix}.jpg"
                    suffix += 1
                dest_path = candidate

            # A missing content-length header yields None so tqdm shows an
            # open-ended counter instead of a 0-byte total.
            total_size = int(response.headers.get('content-length', 0)) or None

            dest_path.parent.mkdir(parents=True, exist_ok=True)
            tmp_path = dest_path.with_name(dest_path.name + ".part")

            with open(tmp_path, 'wb') as f, tqdm(
                desc=f"Downloading {dest_path.name}",
                total=total_size,
                unit='iB',
                unit_scale=True,
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))

        # Only a complete body is promoted to the final name.
        tmp_path.replace(dest_path)
        return True
    except Exception as e:
        print(f"Error downloading {url}: {str(e)}")
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                # Nothing to clean up (or it cannot be removed); the failure
                # has already been reported above.
                pass
        return False

def generate_tts(text, filename):
    """Synthesize English speech for ``text`` with gTTS and save it to ``filename``.

    Returns:
        True when the clip was saved; False when gTTS raised (the error is
        printed rather than propagated).
    """
    try:
        gTTS(text=text, lang='en').save(filename)
    except Exception as e:
        print(f"Error generating TTS: {str(e)}")
        return False
    return True

def download_multimodal_subset(base_dir="./multimodal_deepfake_data"):
    """Assemble a small real/fake sample set of images, videos and audio.

    Files are laid out as ``<base_dir>/<modality>/<category>/<file>``. Real
    samples and fake videos are fetched with ``download_file``; fake images
    come from thispersondoesnotexist.com; fake audio is synthesized locally
    with ``generate_tts``.

    The function is safe to re-run: anything already on disk is counted and
    skipped instead of being fetched or synthesized again.

    Args:
        base_dir: Root directory for the dataset (path string or
            ``pathlib.Path``).

    Returns:
        dict mapping each modality name to the number of samples present
        after the run (newly fetched plus already existing).
    """
    base_dir = Path(base_dir)

    datasets = {
        "images": {
            "real": [
                # Wikimedia Commons portraits
                "https://upload.wikimedia.org/wikipedia/commons/3/3a/Marcus_Aurelius_Louvre.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/d/dc/Elon_Musk_Royal_Society_%28crop2%29.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/a/a8/Bill_Gates_2017.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/6/64/Greta_Thunberg_2019.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/a/a6/Margaret_Hamilton_1995.jpg"
            ],
            "fake": ["https://thispersondoesnotexist.com" for _ in range(5)]
        },
        "videos": {
            "real": [
                # Video samples from Wikimedia Commons
                # NOTE(review): the first entry is a .jpg image, not a video —
                # confirm whether it belongs in this list.
                "https://upload.wikimedia.org/wikipedia/commons/4/48/Grumpy_Cat_%2816516024762%29_%28cropped%29.jpg",
                "https://upload.wikimedia.org/wikipedia/commons/transcoded/8/8e/Sample_webm_video.webm/Sample_webm_video.webm.480p.vp9.webm",
                "https://upload.wikimedia.org/wikipedia/commons/transcoded/c/c0/NASA_Highlights_2023_%28Video_Only%29.webm/NASA_Highlights_2023_%28Video_Only%29.webm.480p.vp9.webm"
            ],
            "fake": [
                # Deepfake samples from public datasets
                "https://github.com/microsoft/DeepfakeDetection/releases/download/v1.0/fakenews_video_sample.mp4",
                "https://github.com/microsoft/DeepfakeDetection/releases/download/v1.0/deepfake_video_sample.mp4"
            ]
        },
        "audio": {
            "real": [
                # Audio samples from Wikimedia Commons
                "https://upload.wikimedia.org/wikipedia/commons/b/b5/George_W_Bush_Columbia_FINAL.ogg",
                "https://upload.wikimedia.org/wikipedia/commons/d/d9/Barack_Obama_addresses_LLNL_employees.oga"
            ],
            "fake": []
        }
    }

    # Generate synthetic audio; the resulting local paths are recorded in the
    # "fake" audio list so the loop below can count them.
    audio_dir = base_dir / "audio" / "fake"
    audio_dir.mkdir(parents=True, exist_ok=True)
    fake_audio = [
        ("This is a synthetic voice generated by text to speech technology.", "fake_audio_1.mp3"),
        ("Deep learning can generate artificial human voices with high accuracy.", "fake_audio_2.mp3")
    ]
    for text, filename in fake_audio:
        audio_path = audio_dir / filename
        # Reuse a non-empty clip from an earlier run instead of calling the
        # TTS service again.
        already_there = audio_path.exists() and audio_path.stat().st_size > 0
        if already_there or generate_tts(text, str(audio_path)):
            datasets["audio"]["fake"].append(str(audio_path))

    results = {modality: 0 for modality in datasets}

    for modality, categories in datasets.items():
        print(f"\n{'='*40}\nDownloading {modality.upper()} samples\n{'='*40}")
        for category, urls in categories.items():
            print(f"\n{category.capitalize()} samples:")
            modality_dir = base_dir / modality / category

            # Generated faces are saved by download_file under
            # generated_face_*.jpg, never under the URL's last path segment,
            # so the dest_path.exists() test below cannot detect them. Count
            # the ones already on disk and consume that budget first.
            reusable_faces = len(list(modality_dir.glob("generated_face_*.jpg")))

            for url in urls:
                if modality == "audio" and category == "fake" and os.path.exists(url):
                    print(f"Skipping existing generated audio: {url}")
                    results["audio"] += 1
                    continue

                if "thispersondoesnotexist" in url and reusable_faces > 0:
                    reusable_faces -= 1
                    print("Skipping existing generated face")
                    results[modality] += 1
                    continue

                filename = url.split("/")[-1].split("?")[0]
                dest_path = modality_dir / filename
                if dest_path.exists():
                    print(f"Skipping existing file: {dest_path.name}")
                    results[modality] += 1
                elif download_file(url, dest_path):
                    results[modality] += 1

    print("\nFinal Report:")
    print(f"Images downloaded: {results['images']}")
    print(f"Videos downloaded: {results['videos']}")
    print(f"Audio downloaded: {results['audio']}")
    print(f"Total dataset size: {sum(results.values())} files")
    print(f"Data location: {base_dir.absolute()}")

    return results

In [24]:
# Entry point: build the sample dataset when this cell/script runs as the
# main program (the guard is true inside a notebook kernel as well).
if __name__ == "__main__":
    download_multimodal_subset()


Downloading IMAGES samples

Real samples:
Error downloading https://upload.wikimedia.org/wikipedia/commons/3/3a/Marcus_Aurelius_Louvre.jpg: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/3/3a/Marcus_Aurelius_Louvre.jpg
Error downloading https://upload.wikimedia.org/wikipedia/commons/d/dc/Elon_Musk_Royal_Society_%28crop2%29.jpg: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/d/dc/Elon_Musk_Royal_Society_%28crop2%29.jpg
Error downloading https://upload.wikimedia.org/wikipedia/commons/a/a8/Bill_Gates_2017.jpg: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/a/a8/Bill_Gates_2017.jpg
Error downloading https://upload.wikimedia.org/wikipedia/commons/6/64/Greta_Thunberg_2019.jpg: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/6/64/Greta_Thunberg_2019.jpg
Error downloading https://upload.wikimedia.org/wikipedia/commons/a/a6/Margaret_Hamilton_1995.jp

Downloading generated_face_1738222793.jpg: 100%|██████████| 623k/623k [00:00<00:00, 1.19MiB/s]
Downloading generated_face_1738222794.jpg: 100%|██████████| 467k/467k [00:00<00:00, 840kiB/s] 
Downloading generated_face_1738222795.jpg: 100%|██████████| 528k/528k [00:00<00:00, 962kiB/s] 
Downloading generated_face_1738222796.jpg: 100%|██████████| 549k/549k [00:00<00:00, 1.04MiB/s]
Downloading generated_face_1738222797.jpg: 100%|██████████| 577k/577k [00:00<00:00, 1.11MiB/s]



Downloading VIDEOS samples

Real samples:
Error downloading https://upload.wikimedia.org/wikipedia/commons/4/48/Grumpy_Cat_%2816516024762%29_%28cropped%29.jpg: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/4/48/Grumpy_Cat_%2816516024762%29_%28cropped%29.jpg
Error downloading https://upload.wikimedia.org/wikipedia/commons/transcoded/8/8e/Sample_webm_video.webm/Sample_webm_video.webm.480p.vp9.webm: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/transcoded/8/8e/Sample_webm_video.webm/Sample_webm_video.webm.480p.vp9.webm
Error downloading https://upload.wikimedia.org/wikipedia/commons/transcoded/c/c0/NASA_Highlights_2023_%28Video_Only%29.webm/NASA_Highlights_2023_%28Video_Only%29.webm.480p.vp9.webm: 404 Client Error: Not Found for url: https://upload.wikimedia.org/wikipedia/commons/transcoded/c/c0/NASA_Highlights_2023_%28Video_Only%29.webm/NASA_Highlights_2023_%28Video_Only%29.webm.480p.vp9.webm

Fake samples:
Erro