<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
!pip install -qU kaggle pandas tqdm

In [22]:
import os
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import urllib.request
import hashlib

In [30]:
class MediaDownloader:
    """Download a small, fixed set of real and manipulated media samples.

    Samples are grouped by media type (``images``, ``videos``, ``audio``) and
    by category (``real``, ``manipulated``). Each file is stored as
    ``<base_dir>/<media_type>/<category>/<category>_<index>.<ext>``.
    """

    # Extension used when the URL's last path segment carries no usable suffix.
    DEFAULT_EXTENSIONS = {
        'images': 'png',
        'videos': 'mp4',
        'audio': 'mp3'
    }

    # Number of bytes requested per streamed chunk.
    CHUNK_SIZE = 8192

    def __init__(self, base_dir="./test_samples"):
        """Store the target directory and the table of sample URLs.

        Args:
            base_dir: Directory under which all downloads are placed.
        """
        self.base_dir = Path(base_dir)
        self.sources = {
            'images': {
                'real': [
                    # FFHQ Dataset samples (Creative Commons BY-NC-SA 4.0)
                    # NOTE(review): the run recorded in this notebook shows these
                    # thumbnail URLs returning 404 - confirm the current location.
                    'https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00000.png',
                    'https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00001.png',
                    'https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00002.png',
                    'https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00003.png',
                    'https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00004.png'
                ],
                'manipulated': [
                    # This Person Does Not Exist samples (free for research)
                    # The same URL is listed five times; presumably each request
                    # yields a different generated face - TODO confirm.
                    'https://thispersondoesnotexist.com/image',
                    'https://thispersondoesnotexist.com/image',
                    'https://thispersondoesnotexist.com/image',
                    'https://thispersondoesnotexist.com/image',
                    'https://thispersondoesnotexist.com/image'
                ]
            },
            'videos': {
                'real': [
                    # FaceForensics++ sample videos (research license)
                    'https://github.com/ondyari/FaceForensics/raw/master/dataset/sample_videos/original_sequences/youtube/c23/videos/000.mp4',
                    'https://github.com/ondyari/FaceForensics/raw/master/dataset/sample_videos/original_sequences/youtube/c23/videos/001.mp4',
                    'https://github.com/ondyari/FaceForensics/raw/master/dataset/sample_videos/original_sequences/youtube/c23/videos/002.mp4',
                    'https://github.com/ondyari/FaceForensics/raw/master/dataset/sample_videos/original_sequences/youtube/c23/videos/003.mp4',
                    'https://github.com/ondyari/FaceForensics/raw/master/dataset/sample_videos/original_sequences/youtube/c23/videos/004.mp4'
                ],
                'manipulated': [
                    # FaceForensics++ manipulated samples
                    'https://github.com/ondyari/FaceForensics/raw/master/dataset/sample_videos/manipulated_sequences/Deepfakes/c23/videos/000.mp4',
                    'https://github.com/ondyari/FaceForensics/raw/master/dataset/sample_videos/manipulated_sequences/Deepfakes/c23/videos/001.mp4',
                    'https://github.com/ondyari/FaceForensics/raw/master/dataset/sample_videos/manipulated_sequences/Deepfakes/c23/videos/002.mp4',
                    'https://github.com/ondyari/FaceForensics/raw/master/dataset/sample_videos/manipulated_sequences/Deepfakes/c23/videos/003.mp4',
                    'https://github.com/ondyari/FaceForensics/raw/master/dataset/sample_videos/manipulated_sequences/Deepfakes/c23/videos/004.mp4'
                ]
            },
            'audio': {
                'real': [
                    # Common Voice samples (Creative Commons)
                    'https://common-voice-data-download.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/en/clips/common_voice_en_18885784.mp3',
                    'https://common-voice-data-download.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/en/clips/common_voice_en_18885785.mp3',
                    'https://common-voice-data-download.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/en/clips/common_voice_en_18885786.mp3',
                    'https://common-voice-data-download.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/en/clips/common_voice_en_18885787.mp3',
                    'https://common-voice-data-download.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/en/clips/common_voice_en_18885788.mp3'
                ],
                'manipulated': [
                    # Synthetic speech samples
                    'https://github.com/mozilla/TTS/raw/master/samples/ljspeech/glow-tts/sample_1.wav',
                    'https://github.com/mozilla/TTS/raw/master/samples/ljspeech/glow-tts/sample_2.wav',
                    'https://github.com/mozilla/TTS/raw/master/samples/ljspeech/glow-tts/sample_3.wav',
                    'https://github.com/mozilla/TTS/raw/master/samples/ljspeech/glow-tts/sample_4.wav',
                    'https://github.com/mozilla/TTS/raw/master/samples/ljspeech/glow-tts/sample_5.wav'
                ]
            }
        }

    def create_directories(self):
        """Create ``<base_dir>/<media_type>/<category>`` for every configured source.

        Existing directories are left untouched.
        """
        for media_type, categories in self.sources.items():
            # Follow the categories actually configured so this stays in step
            # with download_all() if the source table changes.
            for category in categories:
                path = self.base_dir / media_type / category
                path.mkdir(parents=True, exist_ok=True)

    def download_file(self, url, dest_path, desc="", timeout=30):
        """Download a single file with a progress bar.

        The body is streamed into ``<dest_path>.part`` and moved onto
        ``dest_path`` only after the transfer completes, so a failed download
        never leaves a truncated file at the final location.

        Args:
            url: Address to fetch.
            dest_path: Final location of the downloaded file (str or Path).
            desc: Label shown next to the progress bar.
            timeout: Seconds passed to ``requests.get`` as its ``timeout``
                so a stalled server cannot block the run indefinitely.

        Returns:
            True when the file was fully written, False on any error (the
            error is printed rather than raised).
        """
        dest_path = Path(dest_path)
        partial_path = Path(str(dest_path) + '.part')
        try:
            # The context manager releases the connection even on early exit.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                # A missing or zero content-length means "unknown"; tqdm shows
                # a plain byte counter when total is None.
                total_size = int(response.headers.get('content-length', 0)) or None

                with open(partial_path, 'wb') as f, tqdm(
                    desc=desc,
                    total=total_size,
                    unit='iB',
                    unit_scale=True,
                    unit_divisor=1024,
                ) as pbar:
                    for data in response.iter_content(chunk_size=self.CHUNK_SIZE):
                        size = f.write(data)
                        pbar.update(size)

            os.replace(partial_path, dest_path)
            return True
        except Exception as e:
            # Deliberately best-effort: one failed sample must not abort the
            # rest of the batch, so report it and carry on.
            print(f"Error downloading {url}: {str(e)}")
            try:
                os.remove(partial_path)
            except OSError:
                # Nothing was written (or it is already gone); nothing to clean.
                pass
            return False

    def generate_filename(self, url, media_type, category, index):
        """Build the local filename ``<category>_<index>.<ext>`` for a URL.

        The extension comes from the last segment of the URL *path* only, so
        dots in the host name and any query string or fragment are ignored.
        When that segment has no alphanumeric suffix, the default extension
        for ``media_type`` is used.

        Args:
            url: Source URL of the sample.
            media_type: Key of ``DEFAULT_EXTENSIONS`` ('images', 'videos', 'audio').
            category: Prefix for the filename (e.g. 'real').
            index: Position of the sample, zero-padded to three digits.

        Returns:
            The filename, never containing a path separator from the URL.

        Raises:
            KeyError: If no extension is found and ``media_type`` has no default.
        """
        from urllib.parse import urlsplit

        last_segment = urlsplit(url).path.rsplit('/', 1)[-1]
        ext = os.path.splitext(last_segment)[1].lstrip('.')
        if not ext.isalnum():
            ext = self.DEFAULT_EXTENSIONS[media_type]

        return f"{category}_{index:03d}.{ext}"

    def download_all(self):
        """Download every configured sample.

        Returns:
            A nested dict ``{media_type: {category: count}}`` holding the
            number of successfully downloaded files per category.
        """
        self.create_directories()
        summary = {
            media_type: {category: 0 for category in categories}
            for media_type, categories in self.sources.items()
        }

        for media_type, categories in self.sources.items():
            print(f"\nDownloading {media_type}...")
            for category, urls in categories.items():
                for i, url in enumerate(urls):
                    filename = self.generate_filename(url, media_type, category, i)
                    dest_path = self.base_dir / media_type / category / filename

                    if self.download_file(url, dest_path, f"{media_type}/{category}/{filename}"):
                        summary[media_type][category] += 1

        return summary

    def generate_report(self, summary):
        """Render the summary returned by ``download_all`` as plain text.

        Args:
            summary: Nested dict ``{media_type: {category: count}}``.

        Returns:
            A multi-line string with one section per media type and one
            ``<Category>: <count> files`` line per category.
        """
        lines = ["Download Summary\n", "=" * 20 + "\n"]
        for media_type, categories in summary.items():
            lines.append(f"\n{media_type.title()}:\n")
            for category, count in categories.items():
                lines.append(f"  {category.title()}: {count} files\n")
        return "".join(lines)

In [31]:
def main():
    """Run the full sample download and print a summary report.

    Builds a MediaDownloader with its default target directory, downloads
    every configured sample, then prints the per-category counts followed by
    the directory the files were written to.
    """
    media_downloader = MediaDownloader()
    print("Starting media sample downloads...")

    # download_all() returns the counts that generate_report() renders.
    report_text = media_downloader.generate_report(media_downloader.download_all())
    print("\n" + report_text)
    print(f"\nFiles downloaded to: {media_downloader.base_dir}")


if __name__ == "__main__":
    main()

Starting media sample downloads...

Downloading images...
Error downloading https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00000.png: 404 Client Error: Not Found for url: https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00000.png
Error downloading https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00001.png: 404 Client Error: Not Found for url: https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00001.png
Error downloading https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00002.png: 404 Client Error: Not Found for url: https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00002.png
Error downloading https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00003.png: 404 Client Error: Not Found for url: https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00003.png
Error downloading https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails/00004.png: 404 Client Error: Not Found for url: https://github.com/NVla