In [None]:
# Setup Kaggle authentication
import os
import shutil

# Keep the credentials in a project-local .kaggle/ directory.
KAGGLE_DIR = os.path.abspath(".kaggle")
CREDENTIALS_PATH = os.path.join(KAGGLE_DIR, "kaggle.json")
os.makedirs(KAGGLE_DIR, exist_ok=True)

# Move the uploaded credentials only when they are still in the working
# directory, so re-running this cell after a successful first run does not fail.
if os.path.exists(".kaggle.json"):
    shutil.move(".kaggle.json", CREDENTIALS_PATH)
elif not os.path.exists(CREDENTIALS_PATH):
    raise FileNotFoundError(
        "Kaggle credentials not found: expected '.kaggle.json' in the working "
        f"directory or an existing '{CREDENTIALS_PATH}'."
    )

# The Kaggle client warns when the token is readable by other users.
os.chmod(CREDENTIALS_PATH, 0o600)

# Point both the Python client and the `kaggle` CLI (child processes inherit
# the environment) at the local credentials directory.
os.environ['KAGGLE_CONFIG_DIR'] = KAGGLE_DIR

# Import only AFTER the environment variable is set: the kaggle package
# resolves its config directory (and authenticates) when it is first imported,
# so importing earlier would ignore KAGGLE_CONFIG_DIR.
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()
print("Authenticated successfully!")

In [None]:
import os
import subprocess
import zipfile
from tqdm import tqdm

def download_dataset(dataset_id, download_path):
    """Fetch a Kaggle dataset archive with the CLI and unpack it in place.

    Args:
        dataset_id: Kaggle dataset identifier in ``owner/slug`` form.
        download_path: Directory that receives the archive and its contents;
            created if it does not exist.

    The archive is expected at ``<download_path>/<slug>.zip`` once the CLI
    finishes. It is extracted member by member (with a progress bar) and
    then deleted.

    Raises:
        subprocess.CalledProcessError: if the ``kaggle`` command exits non-zero.
    """
    os.makedirs(download_path, exist_ok=True)

    # The archive path is derived from the part of the id after the last '/'.
    zip_path = os.path.join(download_path, f"{dataset_id.split('/')[-1]}.zip")

    print(f"\n⬇Downloading: {dataset_id}")
    cli_command = ["kaggle", "datasets", "download", "-d", dataset_id, "-p", download_path]
    subprocess.run(cli_command, check=True)
    print(f"Download completed: {os.path.basename(zip_path)}")

    print("Extracting contents...")
    with zipfile.ZipFile(zip_path, 'r') as archive:
        members = archive.infolist()
        progress = tqdm(members, desc="Extracting", unit="file", total=len(members))
        for member in progress:
            archive.extract(member, path=download_path)

    # The archive is no longer needed once its contents are on disk.
    os.remove(zip_path)
    print(f"Dataset ready at: {download_path}")

# Fetch each dataset into its own folder under ./datasets
for dataset_id, target_dir in (
    ("alaaeddineayadi/real-vs-ai-generated-faces", "./datasets/real_vs_ai_faces"),
    ("xhlulu/140k-real-and-fake-faces", "./datasets/140k_faces"),
):
    download_dataset(dataset_id, target_dir)

In [None]:
import os

def print_folder_tree(start_path, prefix=""):
    """Recursively print the sub-directory tree below ``start_path``.

    Only directories are listed; regular files are ignored. Entries are
    printed in sorted order so the output is the same on every run
    (``os.listdir`` itself returns names in arbitrary order).

    Args:
        start_path: Directory whose sub-directories are printed.
        prefix: Text prepended to every line; used internally to draw the
            indentation of nested levels.
    """
    items = sorted(
        item for item in os.listdir(start_path)
        if os.path.isdir(os.path.join(start_path, item))
    )

    last_index = len(items) - 1
    for index, item in enumerate(items):
        is_last = index == last_index
        pointer = '└── ' if is_last else '├── '
        print(prefix + pointer + item)
        # Keep the vertical guide only while more siblings follow at this level.
        extension = '    ' if is_last else '│   '
        print_folder_tree(os.path.join(start_path, item), prefix + extension)

# Show the directory layout of each downloaded dataset
for heading, dataset_root in (
    ("Folder structure of 'real_vs_ai_faces':", 'datasets/real_vs_ai_faces'),
    ("\nFolder structure of '140k_faces':", 'datasets/140k_faces'),
):
    print(heading)
    print_folder_tree(dataset_root)

In [None]:
import os
import shutil
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def prepare_dataset():
    """Merge the real/fake image folders of both datasets into ``data/``.

    Every regular file found directly inside the source folders listed below
    is copied to ``data/real`` or ``data/fake``. The class is taken from the
    LAST path component of the source folder, not from a substring match on
    the whole path: every source path contains "real" somewhere
    (``real_vs_fake``, ``real_vs_ai_faces``), so a substring test would send
    the fake folders to ``data/real`` as well.

    File name clashes are resolved by appending ``_1``, ``_2``, ... to the
    base name. Source folders that do not exist are reported and skipped.

    Note: this function only combines the datasets; it does not balance the
    classes. Running it again copies every file a second time under a
    suffixed name.
    """
    # Output folders
    output_real = 'data/real'
    output_fake = 'data/fake'
    os.makedirs(output_real, exist_ok=True)
    os.makedirs(output_fake, exist_ok=True)

    # Source directories from both datasets
    input_dirs = [
        # From 140k_faces dataset
        'datasets/140k_faces/real_vs_fake/real-vs-fake/valid/real',
        'datasets/140k_faces/real_vs_fake/real-vs-fake/valid/fake',

        # From real_vs_ai_faces dataset
        'datasets/real_vs_ai_faces/dataset/train/real',
        'datasets/real_vs_ai_faces/dataset/val/real',
        'datasets/real_vs_ai_faces/dataset/test/real',
        'datasets/real_vs_ai_faces/dataset/train/fake',
        'datasets/real_vs_ai_faces/dataset/val/fake',
        'datasets/real_vs_ai_faces/dataset/test/fake',
    ]

    def copy_file(src_path, dst_folder):
        """Copy one file into ``dst_folder`` under a name nobody else holds."""
        filename = os.path.basename(src_path)
        base, ext = os.path.splitext(filename)
        dst_path = os.path.join(dst_folder, filename)

        counter = 1
        while True:
            try:
                # Exclusive-create reserves the name atomically, so two worker
                # threads can never settle on the same destination (a plain
                # exists()-then-copy check is racy).
                with open(dst_path, 'xb'):
                    pass
                break
            except FileExistsError:
                dst_path = os.path.join(dst_folder, f"{base}_{counter}{ext}")
                counter += 1

        try:
            shutil.copy2(src_path, dst_path)
        except Exception:
            # Do not leave the empty placeholder behind if the copy failed.
            os.remove(dst_path)
            raise

    def process_folder(folder, class_type):
        """Copy every regular file of ``folder`` into the folder of its class."""
        if not os.path.exists(folder):
            print(f"Folder not found: {folder}")
            return

        dst_folder = output_real if class_type == 'real' else output_fake
        images = [
            os.path.join(folder, f)
            for f in os.listdir(folder)
            if os.path.isfile(os.path.join(folder, f))
        ]

        with ThreadPoolExecutor() as executor:
            # list() drains the lazy map so worker exceptions surface here.
            list(tqdm(executor.map(lambda img: copy_file(img, dst_folder), images),
                      total=len(images),
                      desc=f'Copying {class_type} from {os.path.basename(folder)}',
                      unit='img'))

    # Process all folders; the class label is the final path component.
    for folder in input_dirs:
        class_type = os.path.basename(os.path.normpath(folder))
        if class_type in ('real', 'fake'):
            process_folder(folder, class_type)
        else:
            print(f"Skipping folder with unrecognised class: {folder}")

    print("\nAll images successfully combined into 'data/real' and 'data/fake'.")

# Run the merge step: copies the source images into the data/ folders
prepare_dataset()

In [None]:
import os
import random
import shutil
from tqdm import tqdm

def balance_dataset(max_keep=5000, seed=42):
    """Cap each class folder at ``max_keep`` files by moving the surplus aside.

    For ``data/real`` and ``data/fake``, a random selection of the files
    beyond ``max_keep`` is moved to ``extra/real`` and ``extra/fake``
    respectively. Nothing is deleted.

    Args:
        max_keep: Maximum number of files left in each class folder.
        seed: Seed for the random selection. Together with the sorted
            directory listing it makes the kept subset reproducible across
            runs; pass ``None`` for a different selection every time.
    """
    # A private generator keeps the selection independent of global random state.
    rng = random.Random(seed)

    def move_extras(src_folder, max_keep, dest_folder):
        """Move randomly chosen surplus files from ``src_folder`` to ``dest_folder``."""
        os.makedirs(dest_folder, exist_ok=True)
        # os.listdir returns names in arbitrary order; sort them so a given
        # seed always selects the same files.
        files = sorted(
            f for f in os.listdir(src_folder)
            if os.path.isfile(os.path.join(src_folder, f))
        )

        if len(files) <= max_keep:
            print(f"No extra files to move from {src_folder}. Total: {len(files)}")
            return

        extra_files = rng.sample(files, len(files) - max_keep)

        for f in tqdm(extra_files, desc=f"Moving extras from {src_folder} to {dest_folder}", unit="file"):
            src_path = os.path.join(src_folder, f)
            dest_path = os.path.join(dest_folder, f)
            shutil.move(src_path, dest_path)

        print(f"Moved {len(extra_files)} files to {dest_folder}")

    # Balance both classes to the same cap
    move_extras("data/real", max_keep=max_keep, dest_folder="extra/real")
    move_extras("data/fake", max_keep=max_keep, dest_folder="extra/fake")

# Run the balancing step: surplus files are moved from data/ to extra/
balance_dataset()

In [1]:
import os

def get_dataset_stats():
    """Print the file count and total size of ``data/real`` and ``data/fake``.

    Each folder is walked recursively. Sizes are reported in GB and MB
    using binary multiples (1 MB = 1024 * 1024 bytes).
    """
    def count_files_and_size(folder):
        """Return ``(file_count, size_mb, size_gb)`` for everything under ``folder``."""
        file_count = 0
        byte_count = 0
        for dirpath, _dirnames, filenames in os.walk(folder):
            file_count += len(filenames)
            candidates = (os.path.join(dirpath, name) for name in filenames)
            # Only entries that resolve to a regular file contribute to the size.
            byte_count += sum(os.path.getsize(p) for p in candidates if os.path.isfile(p))
        return (
            file_count,
            byte_count / (1024 * 1024),
            byte_count / (1024 * 1024 * 1024),
        )

    # Gather the numbers for both class folders
    n_real, mb_real, gb_real = count_files_and_size('data/real')
    n_fake, mb_fake, gb_fake = count_files_and_size('data/fake')

    # Report
    print(f"data/real: {n_real} files, {gb_real:.2f} GB ({mb_real:.2f} MB)")
    print(f"data/fake: {n_fake} files, {gb_fake:.2f} GB ({mb_fake:.2f} MB)")

# Print file counts and on-disk sizes for data/real and data/fake
get_dataset_stats()

data/real: 5000 files, 1.29 GB (1319.19 MB)
data/fake: 5000 files, 6.16 GB (6302.88 MB)
