<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qU kaggle pandas tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
cudf-cu12 24.10.1 requires pandas<2.2.3dev0,>=2.0, but you have pandas 2.2.3 which is incompatible.[0m[31m
[0m

In [5]:
import os
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import urllib.request
import hashlib
import zipfile
import tarfile
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [6]:
class DeepFakeDownloader:
    """Download (and unpack) deepfake test sets grouped by media type.

    Attributes:
        base_dir: Root directory; each media type gets its own sub-directory.
        session: ``requests.Session`` with a retry policy for 5xx responses.
        sources: Mapping ``media_type -> ordered list of URLs``.
        credentials: Per-provider credentials. Values default to environment
            variables (``DFDC_API_KEY``, ``ASVSPOOF_USERNAME``,
            ``ASVSPOOF_PASSWORD``, ``CELEB_DF_ACCESS_TOKEN``) and may be
            overwritten on the instance; avoid typing secrets into the source.
    """

    # File suffixes that trigger extraction after a successful download.
    ARCHIVE_SUFFIXES = ('.zip', '.tar', '.gz')

    def __init__(self, base_dir="./deepfake_test_data"):
        """Set up the target directory, HTTP session, sources and credentials.

        Args:
            base_dir: Directory under which all downloads are stored.
        """
        self.base_dir = Path(base_dir)

        # Configure session with retry strategy
        self.session = requests.Session()
        retries = Retry(
            total=3,
            backoff_factor=0.5,
            status_forcelist=[500, 502, 503, 504]
        )
        self.session.mount('http://', HTTPAdapter(max_retries=retries))
        self.session.mount('https://', HTTPAdapter(max_retries=retries))

        # Focus only on manipulated media test sets.
        # Lists (not sets): the position of a URL determines its on-disk
        # filename, so the order must be stable across runs.
        self.sources = {
            'images': [
                # FaceForensics++ test set samples
                'https://github.com/ondyari/FaceForensics/raw/master/dataset/test_set.zip',
                # CelebDF-v2 test set
                'https://github.com/yuezunli/celeb-deepfakeforensics/raw/master/test_release.zip',
                # DFDC Preview Test Set
                'https://dfdc-preview-test.s3.amazonaws.com/dfdc_test_set.zip',
            ],
            'videos': [
                # DeepFake Detection Challenge test samples
                'https://dfdc.ai/api/get/test_sample.zip',
                # FakeAVCeleb test set
                'https://fakeavceleb.com/download/test_set.zip',
                # DeeperForensics-1.0 test set
                'https://github.com/EndlessSora/DeeperForensics-1.0/raw/master/test.zip',
            ],
            'audio': [
                # ASVspoof 2021 LA evaluation set
                'https://datashare.ed.ac.uk/bitstream/handle/10283/3336/LA_test.zip',
                # FakeVoice test set
                'https://github.com/fakevoice/dataset/raw/main/test_set.zip',
            ],
        }

        # Dataset access credentials: taken from the environment when set,
        # otherwise empty (the user may still assign them on the instance).
        self.credentials = {
            'dfdc': {'api_key': os.environ.get('DFDC_API_KEY', '')},
            'asvspoof': {
                'username': os.environ.get('ASVSPOOF_USERNAME', ''),
                'password': os.environ.get('ASVSPOOF_PASSWORD', ''),
            },
            'celeb_df': {'access_token': os.environ.get('CELEB_DF_ACCESS_TOKEN', '')},
        }

    def _request_auth(self, url):
        """Return ``(headers, auth)`` to use when requesting ``url``.

        ``auth`` is always bound (``None`` unless basic-auth credentials are
        configured for the host), and empty credentials are never sent.
        """
        headers = {}
        auth = None
        if 'dfdc.ai' in url:
            api_key = self.credentials['dfdc']['api_key']
            if api_key:
                headers['Authorization'] = f"Bearer {api_key}"
        elif 'datashare.ed.ac.uk' in url:
            username = self.credentials['asvspoof']['username']
            if username:
                auth = (username, self.credentials['asvspoof']['password'])
        return headers, auth

    def _is_present(self, dest_path):
        """Return True if ``dest_path`` or its extracted directory already exists."""
        if dest_path.exists():
            return True
        # Archives are deleted after extraction, so also look for the directory
        # that extract_archive() would have produced.
        return (dest_path.suffix in self.ARCHIVE_SUFFIXES
                and (dest_path.parent / dest_path.stem).is_dir())

    def download_file(self, url, dest_path, desc="", chunk_size=8192):
        """Download ``url`` to ``dest_path`` with progress tracking.

        The body is streamed into ``<dest_path>.part`` and renamed only once
        the transfer completed, so an interrupted download never leaves a
        truncated file at ``dest_path``. Archives are extracted afterwards.

        Args:
            url: Source URL.
            dest_path: Target file path (parents are created as needed).
            desc: Label shown on the progress bar.
            chunk_size: Number of bytes read per iteration.

        Returns:
            True if the file was downloaded (and, for archives, extracted);
            False otherwise. Errors are printed, not raised.
        """
        dest_path = Path(dest_path)
        part_path = dest_path.with_name(dest_path.name + '.part')
        try:
            headers, auth = self._request_auth(url)

            # Context manager releases the connection even if streaming fails.
            with self.session.get(url,
                                  stream=True,
                                  timeout=30,
                                  headers=headers,
                                  auth=auth) as response:
                response.raise_for_status()

                total_size = int(response.headers.get('content-length', 0))
                dest_path.parent.mkdir(parents=True, exist_ok=True)

                with open(part_path, 'wb') as f, tqdm(
                    desc=desc,
                    total=total_size or None,  # unknown length -> open-ended bar
                    unit='iB',
                    unit_scale=True,
                    unit_divisor=1024,
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        if chunk:  # skip keep-alive chunks
                            pbar.update(f.write(chunk))

            part_path.replace(dest_path)

            # Extract if it's a compressed file
            if dest_path.suffix in self.ARCHIVE_SUFFIXES:
                return self.extract_archive(dest_path)
            return True

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {url}: {str(e)}")
            status = getattr(getattr(e, 'response', None), 'status_code', None)
            if status in (401, 403):
                print("Authentication required. Please check credentials.")
            return False
        except Exception as e:
            print(f"Unexpected error: {str(e)}")
            return False
        finally:
            # No-op after a successful rename; removes leftovers on failure.
            part_path.unlink(missing_ok=True)

    def extract_archive(self, archive_path):
        """Extract a downloaded archive next to itself and delete the archive.

        The target directory is ``<parent>/<archive stem>``. Files with an
        unsupported suffix are left untouched.

        Args:
            archive_path: Path to a ``.zip``, ``.tar`` or ``.gz`` (tar) file.

        Returns:
            True on success, False if the suffix is unsupported or extraction
            failed (the error is printed and the archive is kept).
        """
        archive_path = Path(archive_path)
        try:
            print(f"Extracting {archive_path}")
            extract_path = archive_path.parent / archive_path.stem

            if archive_path.suffix == '.zip':
                with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_path)
            elif archive_path.suffix in ['.tar', '.gz']:
                with tarfile.open(archive_path, 'r:*') as tar_ref:
                    if hasattr(tarfile, 'data_filter'):
                        # Reject absolute paths, '..' traversal, device files...
                        tar_ref.extractall(extract_path, filter='data')
                    else:
                        # NOTE(review): interpreter lacks extraction filters;
                        # members of an untrusted tar are not validated here.
                        tar_ref.extractall(extract_path)
            else:
                print(f"Error extracting {archive_path}: unsupported archive type")
                return False

            archive_path.unlink()
            print(f"Successfully extracted to {extract_path}")
            return True

        except Exception as e:
            print(f"Error extracting {archive_path}: {str(e)}")
            return False

    def download_all(self, skip_existing=True):
        """Download all test datasets.

        Files are stored as ``<base_dir>/<media_type>/test_set_<i><suffix>``,
        where ``i`` is the position of the URL within its media type.

        Args:
            skip_existing: When True, a URL whose file (or extracted
                directory) is already present is counted without downloading.

        Returns:
            Dict mapping media type to the number of available test sets.
        """
        from urllib.parse import urlparse

        summary = {media_type: 0 for media_type in self.sources}

        for media_type, urls in self.sources.items():
            print(f"\nDownloading {media_type} test sets...")
            media_dir = self.base_dir / media_type

            # Sets have no stable order; sort them so indices are reproducible.
            ordered_urls = sorted(urls) if isinstance(urls, (set, frozenset)) else list(urls)

            for i, url in enumerate(ordered_urls):
                # Take the suffix from the URL path only (ignore ?query/#fragment).
                filename = f"test_set_{i}{Path(urlparse(url).path).suffix}"
                dest_path = media_dir / filename

                if skip_existing and self._is_present(dest_path):
                    print(f"Skipping existing file: {filename}")
                    summary[media_type] += 1
                    continue

                if self.download_file(url, dest_path, f"{media_type}/{filename}"):
                    summary[media_type] += 1

        return summary

    def generate_report(self, summary):
        """Generate download summary.

        Args:
            summary: Dict mapping media type to a count, as returned by
                ``download_all``.

        Returns:
            A multi-line, human-readable report string.
        """
        report = "DeepFake Test Data Download Summary\n"
        report += "=" * 35 + "\n\n"

        for media_type, count in summary.items():
            report += f"{media_type.title()}: {count} test sets\n"

        report += f"\nTotal test sets downloaded: {sum(summary.values())}"
        report += f"\nStorage location: {self.base_dir}"
        return report

In [7]:
def main():
    """Run the full download and print the resulting summary report."""
    fetcher = DeepFakeDownloader()
    print("Starting deepfake test data download...")
    counts_by_type = fetcher.download_all()
    report_text = fetcher.generate_report(counts_by_type)
    print("\n" + report_text)


# Entry point: executes when the cell/script is run directly.
if __name__ == "__main__":
    main()

Starting deepfake test data download...

Downloading images test sets...
Error downloading https://github.com/yuezunli/celeb-deepfakeforensics/raw/master/test_release.zip: 404 Client Error: Not Found for url: https://github.com/yuezunli/celeb-deepfakeforensics/raw/master/test_release.zip
Error downloading https://dfdc-preview-test.s3.amazonaws.com/dfdc_test_set.zip: 404 Client Error: Not Found for url: https://dfdc-preview-test.s3.amazonaws.com/dfdc_test_set.zip




Error downloading https://github.com/ondyari/FaceForensics/raw/master/dataset/test_set.zip: 404 Client Error: Not Found for url: https://github.com/ondyari/FaceForensics/raw/master/dataset/test_set.zip

Downloading videos test sets...




Error downloading https://dfdc.ai/api/get/test_sample.zip: HTTPSConnectionPool(host='dfdc.ai', port=443): Max retries exceeded with url: /api/get/test_sample.zip (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1006)')))




Error downloading https://github.com/EndlessSora/DeeperForensics-1.0/raw/master/test.zip: 404 Client Error: Not Found for url: https://github.com/EndlessSora/DeeperForensics-1.0/raw/master/test.zip




Error downloading https://fakeavceleb.com/download/test_set.zip: HTTPSConnectionPool(host='fakeavceleb.com', port=443): Max retries exceeded with url: /download/test_set.zip (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7e4d31507710>: Failed to resolve 'fakeavceleb.com' ([Errno -2] Name or service not known)"))

Downloading audio test sets...
Error downloading https://datashare.ed.ac.uk/bitstream/handle/10283/3336/LA_test.zip: 404 Client Error: 404 for url: https://datashare.ed.ac.uk/bitstream/handle/10283/3336/LA_test.zip
Error downloading https://github.com/fakevoice/dataset/raw/main/test_set.zip: 404 Client Error: Not Found for url: https://github.com/fakevoice/dataset/raw/main/test_set.zip

DeepFake Test Data Download Summary

Images: 0 test sets
Videos: 0 test sets
Audio: 0 test sets

Total test sets downloaded: 0
Storage location: deepfake_test_data
