**1. INSTALL DEPENDENCIES**

In [None]:

!pip install PyWavelets scikit-image pandas tenacity numpy scipy tqdm

Collecting PyWavelets
  Downloading pywavelets-1.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Downloading pywavelets-1.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyWavelets
Successfully installed PyWavelets-1.8.0


**2. IMPORTS**

In [None]:
"""Import all necessary libraries"""
import os
import pandas as pd
import numpy as np
import shutil
import logging
import time
from skimage import io as skio
from skimage import color, measure, feature
from skimage.util import random_noise, img_as_float
from scipy import ndimage as ndi
from skimage.restoration import estimate_sigma
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
import glob
from google.colab import drive
from IPython.display import Javascript, display

**3. MOUNT GOOGLE DRIVE**

In [3]:
"""Mount Google Drive to access image folders"""
# Skip mounting when the mount point already exists; otherwise mount Drive there.
if os.path.exists('/content/drive'):
    print("Drive already mounted.")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


**4. CONFIGURATION**

In [4]:
# Input folders that are scanned for images (update these paths)
IMAGE_FOLDERS = [
    '/content/drive/MyDrive/Deepfake_Images/real_train',
    '/content/drive/MyDrive/Deepfake_Images/real_test',
    '/content/drive/MyDrive/Deepfake_Images/fake_train',
    '/content/drive/MyDrive/Deepfake_Images/fake_test',
]

# --- Output locations -------------------------------------------------------
# Root directory that receives every result file
OUTPUT_DIR = "/content/drive/MyDrive/Deepfake_Dataset"
# Sub-directory reserved for backups
BACKUP_DIR = os.path.join(OUTPUT_DIR, "backups")
# One processed image path per line; used to skip work on later runs
PROCESSED_FILES_LOG = os.path.join(OUTPUT_DIR, "processed_files.log")
# Master CSV built from the per-folder CSVs
COMBINED_OUTPUT = os.path.join(OUTPUT_DIR, "all_features_combined.csv")

# Make sure both output directories exist before anything is written
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(BACKUP_DIR, exist_ok=True)

# --- Logging ----------------------------------------------------------------
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

**5. ANTI-DISCONNECT**

In [5]:
"""Prevent Colab from disconnecting during long runs"""
# Browser-side script: every 60 seconds KeepAlive logs a message and calls
# google.colab.kernel.proxyPort, which is intended to keep the session from idling out.
_keep_alive_script = Javascript('''
function KeepAlive(){
    console.log("Session active");
    google.colab.kernel.proxyPort(5000, {})
}
setInterval(KeepAlive, 60*1000);
''')
display(_keep_alive_script)

<IPython.core.display.Javascript object>

**6. FEATURE EXTRACTION**

In [15]:
def extract_features(img_path):
    """
    Extract a dictionary of features from a single image file.

    Features:
    - entropy:   Shannon entropy of the pixel array
    - wrapped:   range (max - min) of the pixel values after phase wrapping
                 via ``angle(exp(1j * x))``
    - noise:     mean estimated noise sigma of a sampled region after random
                 noise (var=0.08**2) has been added to it
    - blur:      mean over uniform-filtered copies of the image
                 (window sizes 2, 4, ..., 30)
    - keypoints: number of CENSURE keypoints on the grayscale image
    - blobs:     number of Difference-of-Gaussian blobs on the grayscale image
    - label:     'real' if the lower-cased path contains 'real', else 'fake'

    Grayscale images and images carrying an alpha channel are handled
    explicitly; previously they raised inside the colour conversion / noise
    estimation and were silently dropped (returned as None).

    Args:
        img_path: Path of the image file to read.

    Returns:
        Dict with the keys 'file_path', 'entropy', 'wrapped', 'noise', 'blur',
        'keypoints', 'blobs' and 'label', or None if any step raised (the
        error is logged).
    """
    try:
        # Read image
        img = skio.imread(img_path)
        if img is None:
            logging.warning(f"Could not read image: {img_path}")
            return None

        # Normalise the channel layout so every later step sees either a 2-D
        # grayscale array or a 3-channel colour array.
        if img.ndim == 3 and img.shape[-1] in (2, 4):
            img = img[..., :-1]  # discard the trailing alpha channel
        if img.ndim == 3 and img.shape[-1] == 1:
            img = img[..., 0]    # single channel -> plain 2-D array
        is_color = img.ndim == 3

        # Grayscale version for the detectors (rgb2gray only accepts colour input)
        gray_img = color.rgb2gray(img) if is_color else img_as_float(img)

        # 1. Calculate entropy
        entropy = measure.shannon_entropy(img)

        # 2. Calculate wrapped phase range
        image_wrapped = np.angle(np.exp(1j * img))
        wrapped = np.max(image_wrapped) - np.min(image_wrapped)

        # 3. Estimate noise level on a fixed sample region.
        #    The fixed window is empty for images smaller than it, so fall
        #    back to the whole image in that case.
        #    NOTE(review): random_noise is called without a seed, so this
        #    value is presumably not reproducible from run to run - confirm
        #    whether that is intended.
        float_img = img_as_float(img)
        patch = float_img[30:180, 150:300]
        if patch.size == 0:
            patch = float_img
        noisy = random_noise(patch, var=0.08**2)
        noise = np.mean(
            estimate_sigma(noisy, channel_axis=-1 if is_color else None)
        )

        # 4. Measure blur (average of multiple filter sizes)
        blur = np.mean([ndi.uniform_filter(img, size=k) for k in range(2, 32, 2)])

        # 5. Count keypoints using CENSURE detector
        detector = feature.CENSURE()
        detector.detect(gray_img)
        keypoints = len(detector.keypoints)

        # 6. Count blobs using Difference of Gaussian
        blobs = len(feature.blob_dog(gray_img, max_sigma=1, threshold=0.1))

        # 7. Determine label from path.
        #    NOTE: the substring is matched anywhere in the path, file name included.
        label = 'real' if 'real' in img_path.lower() else 'fake'

        return {
            'file_path': img_path,
            'entropy': entropy,
            'wrapped': wrapped,
            'noise': noise,
            'blur': blur,
            'keypoints': keypoints,
            'blobs': blobs,
            'label': label
        }

    except Exception as e:
        # Best-effort: one bad image must not abort the batch
        logging.error(f"Error processing {img_path}: {str(e)}")
        return None

**7. HELPER FUNCTIONS**

In [16]:

"""Supporting functions for processing and tracking"""

def get_folder_features_path(folder_path):
    """Return the per-folder features CSV path inside OUTPUT_DIR.

    The file is named ``<folder name>_features.csv``. The path is normalised
    first so that a trailing separator (e.g. ``.../real_train/``) still yields
    the folder name instead of an empty string.

    Args:
        folder_path: Path of the image folder.

    Returns:
        Path of the CSV file for that folder.
    """
    # basename('a/b/') is '', so strip trailing separators via normpath first
    folder_name = os.path.basename(os.path.normpath(folder_path)) if folder_path else ""
    return os.path.join(OUTPUT_DIR, f"{folder_name}_features.csv")

def load_processed_files():
    """Return the set of paths recorded in the processed-files log.

    Each line of PROCESSED_FILES_LOG is stripped of surrounding whitespace.
    An empty set is returned when the log file does not exist.
    """
    if not os.path.exists(PROCESSED_FILES_LOG):
        return set()

    with open(PROCESSED_FILES_LOG, 'r') as log_file:
        return {entry.strip() for entry in log_file}

def save_folder_results(folder_path, results):
    """Write feature rows for a folder to its CSV, keeping rows saved earlier.

    Rows already present in the folder's CSV are preserved and merged with
    ``results``; when a ``file_path`` occurs more than once the most recent
    row wins. This matters on resumed runs, where ``results`` only contains
    the newly processed images - overwriting would discard earlier rows.
    The ``file_path`` column is kept in the output.

    Args:
        folder_path: Image folder the results belong to (selects the CSV).
        results: List of feature dicts. An empty list is a no-op so that an
            existing CSV is never replaced by an empty one.
    """
    if not results:
        return

    output_path = get_folder_features_path(folder_path)
    df = pd.DataFrame(results)

    if os.path.exists(output_path):
        try:
            previous = pd.read_csv(output_path)
        except pd.errors.EmptyDataError:
            # A zero-byte / header-less file holds nothing worth merging
            previous = None
        if previous is not None and not previous.empty:
            df = pd.concat([previous, df], ignore_index=True)

    # Re-saving the same image must not create duplicate rows
    if 'file_path' in df.columns:
        df = df.drop_duplicates(subset='file_path', keep='last')

    df.to_csv(output_path, index=False)

def update_processed_log(results):
    """Append the 'file_path' of every result to the processed-files log, one per line."""
    with open(PROCESSED_FILES_LOG, 'a') as log_file:
        log_file.writelines(f"{entry['file_path']}\n" for entry in results)

def combine_all_results():
    """Concatenate the per-folder feature CSVs into COMBINED_OUTPUT.

    Folders from IMAGE_FOLDERS whose CSV does not exist are skipped. When no
    CSV exists at all, a warning is logged and nothing is written.
    """
    candidate_paths = (get_folder_features_path(folder) for folder in IMAGE_FOLDERS)
    frames = [pd.read_csv(path) for path in candidate_paths if os.path.exists(path)]

    if not frames:
        logging.warning("No individual folder results found to combine")
        return

    pd.concat(frames, ignore_index=True).to_csv(COMBINED_OUTPUT, index=False)
    logging.info(f"Combined results saved to {COMBINED_OUTPUT}")

**8. PARALLEL PROCESSING**

In [17]:
def process_folder(folder_path):
    """Extract features for every not-yet-processed image in one folder.

    Images are discovered recursively (.png / .jpg / .jpeg, matched
    case-insensitively), filtered against the processed-files log and handled
    in batches by a process pool. After every batch the results gathered so
    far are written to the folder's CSV *before* the files are added to the
    processed log, so an interrupted session cannot mark images as done
    without their features having been saved.

    Args:
        folder_path: Folder to scan for images.

    Returns:
        List of feature dicts produced during this call; an empty list when
        the folder is missing, holds no images, or everything was already
        processed.
    """
    import gc  # only needed for the per-batch cleanup below

    if not os.path.exists(folder_path):
        logging.warning(f"Folder not found: {folder_path}")
        return []

    # One recursive walk; compare extensions case-insensitively so files such
    # as IMG_0001.JPG are not silently skipped. Sorted for a stable batch order.
    extensions = ('.png', '.jpg', '.jpeg')
    image_paths = sorted(
        path
        for path in glob.glob(os.path.join(folder_path, '**/*'), recursive=True)
        if path.lower().endswith(extensions) and os.path.isfile(path)
    )

    if not image_paths:
        logging.warning(f"No images found in {folder_path}")
        return []

    # Filter out already processed images
    processed = load_processed_files()
    to_process = [img for img in image_paths if img not in processed]

    if not to_process:
        logging.info(f"All images in {folder_path} already processed")
        return []

    # Process in batches
    batch_size = 500
    all_results = []
    for i in range(0, len(to_process), batch_size):
        batch = to_process[i:i+batch_size]
        logging.info(f"Processing batch {i//batch_size + 1} in {folder_path} ({len(batch)} images)")

        with Pool(cpu_count()) as pool:
            results = list(tqdm(pool.imap(extract_features, batch), total=len(batch)))

        # Failed images (None) are not logged, so they are retried on the next run
        valid_results = [r for r in results if r is not None]
        if valid_results:
            all_results.extend(valid_results)
            # Persist first, log second: a file must never be marked as
            # processed while its features exist only in memory.
            save_folder_results(folder_path, all_results)
            update_processed_log(valid_results)

        # Clean up memory
        del results
        gc.collect()

    return all_results

def process_all_folders():
    """Run feature extraction over every folder in IMAGE_FOLDERS.

    Each folder's results are saved to its own CSV when any were produced;
    afterwards the per-folder CSVs are merged into the combined output.
    """
    for folder in IMAGE_FOLDERS:
        logging.info(f"Starting processing for {folder}")
        folder_results = process_folder(folder)

        # Nothing new for this folder: leave its CSV untouched
        if not folder_results:
            continue
        save_folder_results(folder, folder_results)

    # Merge step runs once, after every folder has been handled
    combine_all_results()
    logging.info("All processing complete!")

**9. EXECUTION**

In [18]:
# Entry point: process every folder in IMAGE_FOLDERS and build the combined CSV.
# The guard keeps the pipeline from running when this code is imported as a module.
if __name__ == "__main__":
    process_all_folders()

100%|██████████| 500/500 [02:03<00:00,  4.05it/s]
100%|██████████| 500/500 [02:06<00:00,  3.95it/s]
100%|██████████| 500/500 [01:13<00:00,  6.81it/s]
100%|██████████| 500/500 [01:15<00:00,  6.61it/s]
100%|██████████| 500/500 [01:11<00:00,  6.96it/s]
100%|██████████| 500/500 [01:09<00:00,  7.16it/s]
100%|██████████| 500/500 [01:11<00:00,  7.04it/s]
100%|██████████| 500/500 [01:12<00:00,  6.94it/s]
100%|██████████| 500/500 [01:11<00:00,  7.01it/s]
100%|██████████| 500/500 [01:09<00:00,  7.15it/s]
100%|██████████| 500/500 [01:12<00:00,  6.93it/s]
100%|██████████| 500/500 [01:12<00:00,  6.88it/s]
100%|██████████| 500/500 [01:14<00:00,  6.70it/s]
100%|██████████| 500/500 [01:10<00:00,  7.13it/s]
100%|██████████| 500/500 [01:11<00:00,  6.95it/s]
100%|██████████| 500/500 [01:11<00:00,  6.95it/s]
100%|██████████| 500/500 [01:11<00:00,  6.98it/s]
100%|██████████| 500/500 [01:10<00:00,  7.14it/s]
100%|██████████| 500/500 [01:10<00:00,  7.13it/s]
100%|██████████| 500/500 [01:11<00:00,  7.02it/s]
