<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qU kaggle pandas tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
cudf-cu12 24.10.1 requires pandas<2.2.3dev0,>=2.0, but you have pandas 2.2.3 which is incompatible.[0m[31m
[0m

In [8]:
import os
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import zipfile
import tarfile
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import json
from datetime import datetime
import logging

In [9]:
class PublicDeepFakeDownloader:
    def __init__(self, base_dir="./deepfake_test_data"):
        self.base_dir = Path(base_dir)
        self.setup_logging()

        # Configure session with robust retry strategy
        self.session = self.create_robust_session()

        # Only publicly available datasets
        self.sources = {
            'images': {
                'Deepfake Detection Challenge Sample': {
                    'url': 'https://github.com/selimsef/dfdc_deepfake_challenge/releases/download/0.0.1/example_videos.zip',
                    'description': 'Sample videos from the DFDC dataset'
                },
                'FaceForensics++ Sample': {
                    'url': 'https://github.com/ondyari/FaceForensics/blob/master/dataset/sample_videos.zip?raw=true',
                    'description': 'Sample videos from FaceForensics++'
                }
            },
            'videos': {
                'Celeb-DF Sample': {
                    'url': 'https://github.com/yuezunli/celeb-deepfakeforensics/blob/master/sample_videos.zip?raw=true',
                    'description': 'Sample videos from Celeb-DF dataset'
                }
            }
        }

    def setup_logging(self):
        """Configure logging with detailed formatting"""
        self.base_dir.mkdir(parents=True, exist_ok=True)
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(self.base_dir / 'download.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def create_robust_session(self):
        """Create a session with comprehensive retry strategy"""
        session = requests.Session()
        retries = Retry(
            total=3,
            backoff_factor=0.5,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        session.mount('http://', HTTPAdapter(max_retries=retries))
        session.mount('https://', HTTPAdapter(max_retries=retries))
        return session

    def check_url_availability(self, url):
        """Verify if URL is accessible"""
        try:
            response = self.session.head(url, timeout=10)
            return response.status_code == 200
        except Exception as e:
            self.logger.warning(f"URL check failed for {url}: {str(e)}")
            return False

    def download_file(self, url, dest_path, desc=""):
        """Download file with progress tracking"""
        try:
            # First check if URL is accessible
            if not self.check_url_availability(url):
                self.logger.error(f"URL not accessible: {url}")
                return False

            response = self.session.get(url, stream=True, timeout=30)
            response.raise_for_status()

            total_size = int(response.headers.get('content-length', 0))
            dest_path.parent.mkdir(parents=True, exist_ok=True)

            temp_path = dest_path.with_suffix('.temp')
            with open(temp_path, 'wb') as f, tqdm(
                desc=desc,
                total=total_size,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    size = f.write(chunk)
                    pbar.update(size)

            # Move temporary file to final destination
            temp_path.rename(dest_path)

            # Extract if it's a compressed file
            if dest_path.suffix in ['.zip', '.tar', '.gz']:
                self.extract_archive(dest_path)

            self.logger.info(f"Successfully downloaded: {dest_path}")
            return True

        except Exception as e:
            self.logger.error(f"Error downloading {url}: {str(e)}")
            if hasattr(e, 'response'):
                self.logger.error(f"Response status code: {e.response.status_code}")
            return False

    def extract_archive(self, archive_path):
        """Extract downloaded archives"""
        try:
            self.logger.info(f"Extracting {archive_path}")
            extract_path = archive_path.parent / archive_path.stem

            if archive_path.suffix == '.zip':
                with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_path)
            elif archive_path.suffix in ['.tar', '.gz']:
                with tarfile.open(archive_path, 'r:*') as tar_ref:
                    tar_ref.extractall(extract_path)

            # Remove the archive after successful extraction
            archive_path.unlink()
            self.logger.info(f"Successfully extracted to {extract_path}")

        except Exception as e:
            self.logger.error(f"Error extracting {archive_path}: {str(e)}")

    def download_all(self, skip_existing=True):
        """Download all public datasets"""
        summary = {media_type: {'success': 0, 'failed': 0, 'skipped': 0}
                  for media_type in self.sources.keys()}

        for media_type, datasets in self.sources.items():
            self.logger.info(f"\nProcessing {media_type} datasets...")

            for dataset_name, info in datasets.items():
                dest_path = self.base_dir / media_type / f"{dataset_name}.zip"

                if skip_existing and dest_path.exists():
                    self.logger.info(f"Skipping existing dataset: {dataset_name}")
                    summary[media_type]['skipped'] += 1
                    continue

                success = self.download_file(
                    info['url'],
                    dest_path,
                    f"Downloading {dataset_name}"
                )

                if success:
                    summary[media_type]['success'] += 1
                else:
                    summary[media_type]['failed'] += 1

        self.save_summary(summary)
        return summary

    def save_summary(self, summary):
        """Save download summary to JSON"""
        summary_path = self.base_dir / 'download_summary.json'
        with open(summary_path, 'w') as f:
            json.dump({
                'timestamp': datetime.now().isoformat(),
                'summary': summary
            }, f, indent=2)

    def generate_report(self, summary):
        """Generate detailed download report"""
        report = "Public DeepFake Test Data Download Summary\n"
        report += "=" * 40 + "\n\n"

        total_success = 0
        total_failed = 0
        total_skipped = 0

        for media_type, counts in summary.items():
            success = counts['success']
            failed = counts['failed']
            skipped = counts['skipped']
            total = success + failed + skipped

            report += f"{media_type.title()}:\n"
            report += f"  - Successfully downloaded: {success}\n"
            report += f"  - Failed downloads: {failed}\n"
            report += f"  - Skipped (already exists): {skipped}\n"
            report += f"  - Total datasets: {total}\n\n"

            total_success += success
            total_failed += failed
            total_skipped += skipped

        report += f"Overall Statistics:\n"
        report += f"  - Total successful downloads: {total_success}\n"
        report += f"  - Total failed downloads: {total_failed}\n"
        report += f"  - Total skipped: {total_skipped}\n"
        if total_success + total_failed > 0:
            success_rate = (total_success/(total_success+total_failed)*100)
            report += f"  - Success rate: {success_rate:.1f}%\n\n"

        report += f"Storage location: {self.base_dir}\n"
        report += f"Detailed logs available at: {self.base_dir}/download.log"

        return report

In [10]:
def main():
    downloader = DeepFakeDownloader()
    print("Starting deepfake test data download...")
    summary = downloader.download_all()
    print("\n" + downloader.generate_report(summary))

if __name__ == "__main__":
    main()

Starting deepfake test data download...

Downloading images test sets...
Error downloading https://github.com/yuezunli/celeb-deepfakeforensics/raw/master/test_release.zip: 404 Client Error: Not Found for url: https://github.com/yuezunli/celeb-deepfakeforensics/raw/master/test_release.zip
Error downloading https://dfdc-preview-test.s3.amazonaws.com/dfdc_test_set.zip: 404 Client Error: Not Found for url: https://dfdc-preview-test.s3.amazonaws.com/dfdc_test_set.zip




Error downloading https://github.com/ondyari/FaceForensics/raw/master/dataset/test_set.zip: 404 Client Error: Not Found for url: https://github.com/ondyari/FaceForensics/raw/master/dataset/test_set.zip

Downloading videos test sets...




Error downloading https://dfdc.ai/api/get/test_sample.zip: HTTPSConnectionPool(host='dfdc.ai', port=443): Max retries exceeded with url: /api/get/test_sample.zip (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1006)')))




Error downloading https://github.com/EndlessSora/DeeperForensics-1.0/raw/master/test.zip: 404 Client Error: Not Found for url: https://github.com/EndlessSora/DeeperForensics-1.0/raw/master/test.zip




Error downloading https://fakeavceleb.com/download/test_set.zip: HTTPSConnectionPool(host='fakeavceleb.com', port=443): Max retries exceeded with url: /download/test_set.zip (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7e4d315058d0>: Failed to resolve 'fakeavceleb.com' ([Errno -2] Name or service not known)"))

Downloading audio test sets...
Error downloading https://datashare.ed.ac.uk/bitstream/handle/10283/3336/LA_test.zip: 404 Client Error: 404 for url: https://datashare.ed.ac.uk/bitstream/handle/10283/3336/LA_test.zip
Error downloading https://github.com/fakevoice/dataset/raw/main/test_set.zip: 404 Client Error: Not Found for url: https://github.com/fakevoice/dataset/raw/main/test_set.zip

DeepFake Test Data Download Summary

Images: 0 test sets
Videos: 0 test sets
Audio: 0 test sets

Total test sets downloaded: 0
Storage location: deepfake_test_data
