<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qU kaggle pandas tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
cudf-cu12 24.10.1 requires pandas<2.2.3dev0,>=2.0, but you have pandas 2.2.3 which is incompatible.[0m[31m
[0m

In [2]:
import os
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import urllib.request
import hashlib
import zipfile
import tarfile

In [3]:
class MediaDownloader:
    def __init__(self, base_dir="./research_datasets"):
        self.base_dir = Path(base_dir)
        self.sources = {
            'images': {
                'real': [
                    # ImageNet-1k validation set samples (research/academic use)
                    'https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar',
                    # COCO 2017 validation set (research/academic use)
                    'http://images.cocodataset.org/zips/val2017.zip',
                    # Open Images V7 (CC BY 4.0)
                    'https://storage.googleapis.com/openimages/v7/validation/validation.zip'
                ],
                'manipulated': [
                    # GANForensics dataset (research use)
                    'https://github.com/peterwang512/GANFingerprints/releases/download/v1.0/GANFingerprints.zip',
                    # CNN-generated images dataset (MIT License)
                    'https://github.com/cc-hpc-itwm/DeepGenerator/raw/master/CNN_Generated_Images_Dataset.zip'
                ]
            },
            'videos': {
                'real': [
                    # Kinetics-400 validation subset (research use)
                    'https://storage.googleapis.com/deepmind-media/Datasets/kinetics400_val.zip',
                    # UCF101 dataset (research use)
                    'https://www.crcv.ucf.edu/data/UCF101/UCF101.rar'
                ],
                'manipulated': [
                    # DeepFake Detection Challenge preview set
                    'https://dfdc.ai/preview',
                    # FaceForensics++ sample set (research license required)
                    'https://github.com/ondyari/FaceForensics/tree/master/dataset/preview'
                ]
            },
            'audio': {
                'real': [
                    # VoxCeleb2 dev set (research use)
                    'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox2_dev_aac.zip',
                    # Mozilla Common Voice validated clips (CC0)
                    'https://commonvoice.mozilla.org/api/v1/bucket/dataset/cv-corpus-15.0-2023-09-08/en'
                ],
                'manipulated': [
                    # ASVspoof 2021 LA evaluation set
                    'https://datashare.ed.ac.uk/handle/10283/3336',
                    # Fake or Real (FOR) dataset
                    'https://ieee-dataport.org/documents/fake-or-real-for-dataset'
                ]
            }
        }

        # Dataset-specific authentication credentials
        self.auth_headers = {
            'imagenet': {'username': '', 'password': ''},  # Requires registration
            'voxceleb': {'username': '', 'password': ''},  # Requires registration
            'asvspoof': {'username': '', 'password': ''}   # Requires registration
        }

    def create_directories(self):
        """Create necessary directories for storing downloads"""
        for media_type in self.sources.keys():
            for category in ['real', 'manipulated']:
                path = self.base_dir / media_type / category
                path.mkdir(parents=True, exist_ok=True)

    def handle_authentication(self, url):
        """Handle authentication for restricted datasets"""
        if 'image-net.org' in url and all(self.auth_headers['imagenet'].values()):
            return self.auth_headers['imagenet']
        elif 'robots.ox.ac.uk' in url and all(self.auth_headers['voxceleb'].values()):
            return self.auth_headers['voxceleb']
        elif 'datashare.ed.ac.uk' in url and all(self.auth_headers['asvspoof'].values()):
            return self.auth_headers['asvspoof']
        return None

    def download_file(self, url, dest_path, desc="", retries=3):
        """Download a single file with progress bar, retry logic, and authentication"""
        for attempt in range(retries):
            try:
                auth = self.handle_authentication(url)
                with requests.get(url, stream=True, timeout=30, auth=auth) as response:
                    response.raise_for_status()
                    total_size = int(response.headers.get('content-length', 0))

                    with open(dest_path, 'wb') as f, tqdm(
                        desc=f"{desc} (Attempt {attempt + 1}/{retries})",
                        total=total_size,
                        unit='iB',
                        unit_scale=True,
                        unit_divisor=1024,
                    ) as pbar:
                        for data in response.iter_content(chunk_size=8192):
                            size = f.write(data)
                            pbar.update(size)

                # Handle compressed files
                if str(dest_path).endswith(('.zip', '.tar', '.rar')):
                    self.extract_archive(dest_path)
                return True

            except requests.exceptions.RequestException as e:
                if attempt == retries - 1:
                    print(f"Error downloading {url} after {retries} attempts: {str(e)}")
                    if response.status_code == 403:
                        print("Authentication may be required. Please check dataset access requirements.")
                    return False
                print(f"Attempt {attempt + 1} failed, retrying...")
                continue
            except Exception as e:
                print(f"Unexpected error downloading {url}: {str(e)}")
                return False

    def extract_archive(self, archive_path):
        """Extract downloaded archives"""
        extract_path = archive_path.parent
        print(f"Extracting {archive_path} to {extract_path}")

        try:
            if archive_path.suffix == '.zip':
                with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_path)
            elif archive_path.suffix in ['.tar', '.gz']:
                with tarfile.open(archive_path, 'r:*') as tar_ref:
                    tar_ref.extractall(extract_path)

            # Remove archive after extraction
            archive_path.unlink()
            print(f"Successfully extracted and removed {archive_path}")
        except Exception as e:
            print(f"Error extracting {archive_path}: {str(e)}")

    def verify_download(self, file_path, expected_hash=None):
        """Verify downloaded file integrity"""
        if not expected_hash:
            return True

        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest() == expected_hash

    def download_all(self, skip_existing=True):
        """Download all media samples with improved error handling"""
        self.create_directories()
        summary = {media_type: {'real': 0, 'manipulated': 0} for media_type in self.sources.keys()}

        print("Note: Some datasets require registration and authentication.")
        print("Please visit dataset websites to obtain necessary credentials.\n")

        for media_type, categories in self.sources.items():
            print(f"\nDownloading {media_type}...")
            for category, urls in categories.items():
                for i, url in enumerate(urls):
                    filename = f"{category}_{i:03d}{Path(url).suffix}"
                    dest_path = self.base_dir / media_type / category / filename

                    if skip_existing and dest_path.exists():
                        print(f"Skipping existing file: {filename}")
                        summary[media_type][category] += 1
                        continue

                    if self.download_file(url, dest_path, f"{media_type}/{category}/{filename}"):
                        summary[media_type][category] += 1

        return summary

    def generate_report(self, summary):
        """Generate a detailed summary report of downloaded files"""
        report = "Download Summary\n" + "="*20 + "\n"
        total_files = 0

        for media_type, categories in summary.items():
            report += f"\n{media_type.title()}:\n"
            type_total = categories['real'] + categories['manipulated']
            report += f"  Real: {categories['real']} files\n"
            report += f"  Manipulated: {categories['manipulated']} files\n"
            report += f"  Total: {type_total} files\n"
            total_files += type_total

        report += f"\nTotal files downloaded: {total_files}"
        report += f"\nStorage location: {self.base_dir}"
        report += "\n\nNote: Some datasets may require registration and authentication."
        report += "\nPlease visit the respective dataset websites for access:"
        report += "\n- ImageNet: https://image-net.org/"
        report += "\n- VoxCeleb: https://www.robots.ox.ac.uk/~vgg/data/voxceleb/"
        report += "\n- ASVspoof: https://www.asvspoof.org/"
        return report

In [4]:
def main():
    downloader = MediaDownloader()
    print("Starting media sample downloads...")
    summary = downloader.download_all(skip_existing=True)
    print("\n" + downloader.generate_report(summary))

if __name__ == "__main__":
    main()

Starting media sample downloads...
Note: Some datasets require registration and authentication.
Please visit dataset websites to obtain necessary credentials.


Downloading images...


images/real/real_000.tar (Attempt 1/3): 100%|██████████| 6.28G/6.28G [02:00<00:00, 56.1MiB/s]


Extracting research_datasets/images/real/real_000.tar to research_datasets/images/real
Successfully extracted and removed research_datasets/images/real/real_000.tar


images/real/real_001.zip (Attempt 1/3): 100%|██████████| 778M/778M [00:08<00:00, 98.1MiB/s]


Extracting research_datasets/images/real/real_001.zip to research_datasets/images/real
Successfully extracted and removed research_datasets/images/real/real_001.zip
Attempt 1 failed, retrying...
Attempt 2 failed, retrying...
Error downloading https://storage.googleapis.com/openimages/v7/validation/validation.zip after 3 attempts: 403 Client Error: Forbidden for url: https://storage.googleapis.com/openimages/v7/validation/validation.zip
Authentication may be required. Please check dataset access requirements.
Attempt 1 failed, retrying...
Attempt 2 failed, retrying...
Error downloading https://github.com/peterwang512/GANFingerprints/releases/download/v1.0/GANFingerprints.zip after 3 attempts: 404 Client Error: Not Found for url: https://github.com/peterwang512/GANFingerprints/releases/download/v1.0/GANFingerprints.zip
Attempt 1 failed, retrying...
Attempt 2 failed, retrying...
Error downloading https://github.com/cc-hpc-itwm/DeepGenerator/raw/master/CNN_Generated_Images_Dataset.zip afte

UnboundLocalError: cannot access local variable 'response' where it is not associated with a value