<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deepfake and Manipulated Media Analysis Data Download**

In [None]:
!pip install -qU kaggle pandas tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
cudf-cu12 24.10.1 requires pandas<2.2.3dev0,>=2.0, but you have pandas 2.2.3 which is incompatible.[0m[31m
[0m

In [12]:
import os
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import zipfile
import tarfile
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import json
from datetime import datetime

In [13]:
def create_robust_session():
    """Build a ``requests.Session`` that retries transient HTTP failures.

    The same retry policy is mounted for both ``http://`` and ``https://``:
    up to 3 retries with a 0.5 backoff factor, triggered by the status codes
    429, 500, 502, 503 and 504, and limited to HEAD, GET and OPTIONS requests.

    Returns:
        requests.Session: The configured session.
    """
    retry_policy = Retry(
        total=3,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"],
    )

    http_session = requests.Session()
    # One adapter per scheme, both sharing the retry policy defined above.
    for scheme in ('http://', 'https://'):
        http_session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return http_session

In [14]:
def check_url_availability(session, url):
    """Probe ``url`` with a HEAD request and report whether it answers 200.

    Args:
        session: Object exposing a requests-style ``head(url, **kwargs)``
            method (normally a ``requests.Session``).
        url: Address to probe.

    Returns:
        bool: True when the final response status is 200. False for any
        other status, or when the request raises (the error is printed).
    """
    try:
        # requests does not follow redirects for HEAD unless asked to, so a
        # download link that answers with a 3xx redirect would otherwise be
        # reported as unavailable even though a GET on it would succeed.
        response = session.head(url, timeout=10, allow_redirects=True)
        return response.status_code == 200
    except Exception as e:
        # Best-effort probe: any failure is logged and treated as "not available".
        print(f"URL check failed for {url}: {str(e)}")
        return False

In [15]:
def download_file(session, url, dest_path, desc=""):
    """Stream ``url`` to ``dest_path`` and unpack it when it is an archive.

    The payload is first written to a sibling ``<name>.temp`` file and only
    moved onto ``dest_path`` once the whole body has been received, so an
    interrupted transfer never leaves a truncated file under the final name.

    Args:
        session: requests-style session used for the HEAD probe and the GET.
        url: Address of the file to fetch.
        dest_path: Final location of the file (``Path`` or ``str``).
        desc: Label shown on the progress bar.

    Returns:
        bool: True when the file was downloaded; False when the URL probe
        fails or any error occurs (the error is printed, not raised).
    """
    temp_path = None
    try:
        if not check_url_availability(session, url):
            print(f"URL not accessible: {url}")
            return False

        # Accept plain strings as well as Path objects.
        dest_path = Path(dest_path)

        # The context manager releases the streamed connection back to the
        # pool even if writing the body fails part-way through.
        with session.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))

            os.makedirs(dest_path.parent, exist_ok=True)
            # Append to the full name (rather than swapping the suffix) so
            # "a.zip" and "a.tar" in the same folder get distinct temp files.
            temp_path = dest_path.with_name(dest_path.name + '.temp')

            with open(temp_path, 'wb') as f, tqdm(
                desc=desc,
                total=total_size,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    size = f.write(chunk)
                    pbar.update(size)

        # replace() overwrites an existing destination on every platform;
        # rename() raises on Windows when the target already exists.
        temp_path.replace(dest_path)

        if dest_path.suffix in ['.zip', '.tar', '.gz']:
            extract_archive(dest_path)

        print(f"Successfully downloaded: {dest_path}")
        return True
    except Exception as e:
        print(f"Error downloading {url}: {str(e)}")
        # Best-effort cleanup of a partially written temp file.
        if temp_path is not None and temp_path.exists():
            try:
                temp_path.unlink()
            except OSError as cleanup_error:
                print(f"Could not remove partial file {temp_path}: {cleanup_error}")
        return False

In [16]:
def extract_archive(archive_path):
    """Unpack a ``.zip``, ``.tar`` or ``.gz`` archive next to itself.

    The contents are extracted into ``<parent>/<stem>`` (for ``x.tar.gz`` the
    stem is ``x.tar``) and the archive file is deleted after a successful
    extraction. Files with any other suffix are left untouched.

    Args:
        archive_path: Location of the archive (``Path`` or ``str``).

    Returns:
        None. Errors are printed rather than raised, and the archive is kept
        when extraction fails.
    """
    try:
        # Accept plain strings as well as Path objects.
        archive_path = Path(archive_path)
        print(f"Extracting {archive_path}")
        extract_path = archive_path.parent / archive_path.stem

        if archive_path.suffix == '.zip':
            with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                zip_ref.extractall(extract_path)
        elif archive_path.suffix in ['.tar', '.gz']:
            with tarfile.open(archive_path, 'r:*') as tar_ref:
                if hasattr(tarfile, 'data_filter'):
                    # Archives come from the network: the 'data' filter
                    # rejects members that would land outside extract_path.
                    tar_ref.extractall(extract_path, filter='data')
                else:
                    # NOTE(review): this Python has no extraction filters, so
                    # members are not checked for path traversal here.
                    tar_ref.extractall(extract_path)
        else:
            # Nothing was extracted, so the file must not be deleted below.
            print(f"Unsupported archive type, leaving file in place: {archive_path}")
            return

        archive_path.unlink()
        print(f"Successfully extracted to {extract_path}")
    except Exception as e:
        print(f"Error extracting {archive_path}: {str(e)}")

In [17]:
def download_all(base_dir, sources=None):
    """Download every configured dataset into ``base_dir`` and tally results.

    Each dataset is saved as ``<base_dir>/<media_type>/<dataset_name>.zip``;
    the ``.zip`` suffix is applied regardless of what the URL points to.

    Args:
        base_dir: Root folder for the downloads.
        sources: Optional mapping ``{media_type: {dataset_name: {'url': ...}}}``.
            When omitted, the built-in list of sample datasets is used.

    Returns:
        dict: ``{media_type: {'success': int, 'failed': int}}`` with one entry
        per media type in ``sources``.
    """
    if sources is None:
        sources = {
            'images': {
                'Deepfake Detection Challenge Sample': {
                    'url': 'https://github.com/selimsef/dfdc_deepfake_challenge/releases/download/0.0.1/example_videos.zip'
                },
                'FaceForensics++ Sample': {
                    'url': 'https://github.com/ondyari/FaceForensics/blob/master/dataset/sample_videos.zip?raw=true'
                }
            },
            'videos': {
                'Celeb-DF Sample': {
                    'url': 'https://github.com/yuezunli/celeb-deepfakeforensics/blob/master/sample_videos.zip?raw=true'
                }
            }
        }
    summary = {media_type: {'success': 0, 'failed': 0} for media_type in sources}

    # The context manager closes the session (and its pooled connections)
    # once all downloads are done, including when an error propagates.
    with create_robust_session() as session:
        for media_type, datasets in sources.items():
            for dataset_name, info in datasets.items():
                dest_path = Path(base_dir) / media_type / f"{dataset_name}.zip"
                success = download_file(session, info['url'], dest_path, f"Downloading {dataset_name}")
                outcome = 'success' if success else 'failed'
                summary[media_type][outcome] += 1

    return summary

In [18]:
def generate_report(summary, base_dir):
    """Render the download tallies as a plain-text report.

    Args:
        summary: Mapping of media type to a dict holding ``'success'`` and
            ``'failed'`` counts.
        base_dir: Storage location echoed on the last line of the report.

    Returns:
        str: Title, a 40-character rule, one section per media type, the
        overall totals and the storage location, newline-terminated.
    """
    ok_total = sum(tally['success'] for tally in summary.values())
    failed_total = sum(tally['failed'] for tally in summary.values())

    pieces = [
        "Public DeepFake Test Data Download Summary\n",
        "=" * 40 + "\n\n",
    ]

    # One section per media type, in the mapping's own order.
    for category, tally in summary.items():
        pieces.extend([
            f"{category.title()}:\n",
            f"  - Successfully downloaded: {tally['success']}\n",
            f"  - Failed downloads: {tally['failed']}\n\n",
        ])

    pieces.extend([
        "Overall Statistics:\n",
        f"  - Total successful downloads: {ok_total}\n",
        f"  - Total failed downloads: {failed_total}\n\n",
        f"Storage location: {base_dir}\n",
    ])
    return "".join(pieces)

In [19]:
def main():
    """Download the datasets into ``./deepfake_test_data`` and print the report."""
    target_dir = "./deepfake_test_data"
    print("Starting deepfake test data download...")
    report = generate_report(download_all(target_dir), target_dir)
    print(report)


# Run the download when this cell/module is executed directly.
if __name__ == "__main__":
    main()

Starting deepfake test data download...
URL not accessible: https://github.com/selimsef/dfdc_deepfake_challenge/releases/download/0.0.1/example_videos.zip
URL not accessible: https://github.com/ondyari/FaceForensics/blob/master/dataset/sample_videos.zip?raw=true
URL not accessible: https://github.com/yuezunli/celeb-deepfakeforensics/blob/master/sample_videos.zip?raw=true
Public DeepFake Test Data Download Summary

Images:
  - Successfully downloaded: 0
  - Failed downloads: 2

Videos:
  - Successfully downloaded: 0
  - Failed downloads: 1

Overall Statistics:
  - Total successful downloads: 0
  - Total failed downloads: 3

Storage location: ./deepfake_test_data

