Extract 60 Frames from Face Forensics and Pika Dataset

In [None]:
import cv2
import os
from pathlib import Path
import numpy as np
import glob
import random
import time
import shutil

def extract_fixed_frames_from_video(video_path, output_dir, num_frames_to_extract=60, frame_prefix="frame"):
    """
    Extract a fixed number of evenly spaced frames from a video as JPEG files.

    Frames are written to ``output_dir`` as
    ``<video stem>_<frame_prefix><index:03d>.jpg``. Frames that already exist
    on disk are left untouched, so an interrupted run can be resumed.

    Args:
        video_path: Source video, as a ``str`` or ``pathlib.Path``.
        output_dir: Directory that receives the frames; created if missing.
        num_frames_to_extract: Number of evenly spaced frames to save.
        frame_prefix: Text placed between the video stem and the frame index.

    Returns:
        True if every expected frame already existed, or every missing frame
        was decoded and written. False if the video could not be opened, has
        fewer than ``num_frames_to_extract`` frames, or at least one frame
        could not be decoded or written.
    """
    video_path = Path(video_path)  # accept plain strings as well as Path objects
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    base_name = video_path.stem

    # 3-digit padding supports up to 999 frames per video.
    expected_frames = [output_dir / f"{base_name}_{frame_prefix}{i:03d}.jpg" for i in range(num_frames_to_extract)]

    # Fast path: nothing to do when every frame of this video is already on disk.
    if all(f.exists() for f in expected_frames):
        return True

    cap = None
    try:
        try:
            cap = cv2.VideoCapture(str(video_path))
            if not cap.isOpened():
                print(f"[ERROR] {base_name}: Could not open video.")
                return False
            total_frames_in_video = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        except Exception as e:
            print(f"[ERROR] {base_name}: Error opening video: {e}")
            return False

        if total_frames_in_video < num_frames_to_extract:
            print(f"[SKIP] {video_path.name}: Too short ({total_frames_in_video} frames) for {num_frames_to_extract} required.")
            return False

        frame_indices = np.linspace(0, total_frames_in_video - 1, num_frames_to_extract, dtype=int)
        unreadable_frames = 0
        for out_path, frame_idx in zip(expected_frames, frame_indices):
            # Resume support: keep frames written by an earlier, interrupted run.
            if out_path.exists():
                continue

            cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
            ret, frame = cap.read()
            if not ret:
                # The container's reported frame count can exceed what is
                # actually decodable; remember the gap instead of hiding it.
                unreadable_frames += 1
                continue

            try:
                written = cv2.imwrite(str(out_path), frame)
            except Exception as e_write:
                print(f"  [ERROR] Could not write frame {out_path.name}: {e_write}")
                # Don't continue if we can't write, might be a disk issue
                return False
            if not written:
                # cv2.imwrite reports most failures through its return value.
                print(f"  [ERROR] Could not write frame {out_path.name}: cv2.imwrite returned False")
                return False

        if unreadable_frames:
            print(f"  [WARN] {base_name}: {unreadable_frames}/{num_frames_to_extract} frames could not be read.")
            return False

        print(f"  Extracted frames for {base_name}")
        return True
    finally:
        # Single release point for every exit path above.
        if cap is not None:
            cap.release()

# === Main Execution Block ===
# Balances the two classes by number of source videos, then extracts a fixed
# number of frames from every selected video into per-class output folders.

# Fixed seed: the same videos must be selected on every run. The extraction is
# resumable per video, so a different random subset on a re-run would add frames
# from extra videos to the output folders and silently break the class balance.
RANDOM_SEED = 42

# === PATHS ===
real_youtube_dir = Path("/content/drive/MyDrive/Faceforensics/FaceForensics++/original_sequences/youtube/c23/videos")
real_actors_dir = Path("/content/drive/MyDrive/Faceforensics/FaceForensics++/original_sequences/actors/c23/videos")
fake_dir = Path("/content/drive/MyDrive/pika_dataset")

# === VIDEO FILES ===
print("Gathering source video paths...")
# Sorted because glob order is not guaranteed; a stable order is required for
# the seeded sampling below to be reproducible.
real_videos = sorted(real_youtube_dir.glob("*.mp4")) + sorted(real_actors_dir.glob("*.mp4"))
fake_videos = sorted(fake_dir.glob("*.mp4"))
print(f"Found {len(real_videos)} real videos and {len(fake_videos)} fake videos.")

# === OUTPUT DIRECTORIES ===
# Use new, clear names for this extraction strategy
real_output_dir = Path("/content/drive/MyDrive/final_balanced_data/real_frames")
fake_output_dir = Path("/content/drive/MyDrive/final_balanced_data/fake_frames")
real_output_dir.mkdir(parents=True, exist_ok=True)
fake_output_dir.mkdir(parents=True, exist_ok=True)

# === CONFIGURATION FOR EXTRACTION ===
NUM_FRAMES_TO_EXTRACT_PER_VIDEO = 60 # A much more representative number

# === STEP 1: Balance by number of videos, then extract ===
print("\n--- Balancing by number of source videos ---")
num_real_videos = len(real_videos)
num_fake_videos = len(fake_videos)

if num_real_videos == 0 or num_fake_videos == 0:
    print("One of the video sources is empty. Cannot balance. Exiting.")
    # Raising stops only this cell; calling exit() in a notebook can tear down
    # the whole kernel.
    raise SystemExit(1)

min_video_count = min(num_real_videos, num_fake_videos)
print(f"Balancing to {min_video_count} videos per class.")

# Randomly (but reproducibly) sample from the larger set to match the smaller set
sampling_rng = random.Random(RANDOM_SEED)
if num_real_videos > min_video_count:
    real_videos_sampled = sampling_rng.sample(real_videos, min_video_count)
else:
    real_videos_sampled = real_videos

if num_fake_videos > min_video_count:
    fake_videos_sampled = sampling_rng.sample(fake_videos, min_video_count)
else:
    fake_videos_sampled = fake_videos

print(f"Final video counts for processing: Real={len(real_videos_sampled)}, Fake={len(fake_videos_sampled)}")


# === STEP 2: Extract frames from the balanced video lists ===
def _extract_frames_for_class(videos, class_output_dir, class_label, frame_prefix):
    """Extract frames for every video of one class; return how many videos succeeded."""
    print(f"\n--- Starting {class_label.upper()} frame extraction ---")
    videos_processed = 0
    for i, video_path in enumerate(videos):
        print(f"Processing {class_label} video {i+1}/{len(videos)}: {video_path.name}")
        if extract_fixed_frames_from_video(video_path, class_output_dir, NUM_FRAMES_TO_EXTRACT_PER_VIDEO, frame_prefix):
            videos_processed += 1
    return videos_processed


# The two classes intentionally use different frame prefixes ("frame" / "f").
real_videos_processed = _extract_frames_for_class(real_videos_sampled, real_output_dir, "real", "frame")
fake_videos_processed = _extract_frames_for_class(fake_videos_sampled, fake_output_dir, "fake", "f")

# === Final Summary ===
print("\n--- Frame Extraction Summary ---")
# Counts every JPEG in the folders, including frames left by earlier runs.
final_real_frame_count = len(list(real_output_dir.glob('*.jpg')))
final_fake_frame_count = len(list(fake_output_dir.glob('*.jpg')))

print(f"Total REAL videos successfully processed: {real_videos_processed}/{len(real_videos_sampled)}")
print(f"Total REAL frames in {real_output_dir}: {final_real_frame_count}")
print("---")
print(f"Total FAKE videos successfully processed: {fake_videos_processed}/{len(fake_videos_sampled)}")
print(f"Total FAKE frames in {fake_output_dir}: {final_fake_frame_count}")
print("Extraction process complete.")

Gathering source video paths...


New fake dataset: download, extract, and zip

In [None]:
# === SCRIPT TO PROCESS, CHUNK, AND ADD-ON FRAMES MODEL-BY-MODEL ===

import os
import glob
import time
import shutil
import cv2
import numpy as np
import zipfile
import re
import math
from pathlib import Path
from google.colab import drive

# --- Configuration ---
# Drive folder holding one zip archive of generated videos per model.
DRIVE_SOURCE_ZIPS_DIR = "/content/drive/MyDrive/Cloned_DeepAction_Models_Zipped"
# Archive file names inside DRIVE_SOURCE_ZIPS_DIR. Each entry is copied and
# unzipped by exact name, so the ".zip" extension is required on every entry.
# NOTE(review): "VideoPoet" was listed without the extension; confirm the file
# on Drive is really named "VideoPoet.zip" like its siblings.
MODELS_TO_PROCESS = ["StableDiffusion.zip", "RunwayML.zip", "Veo.zip", "VideoPoet.zip"]
# Drive folder that receives the zipped frame chunks.
DRIVE_OUTPUT_DIR = "/content/drive/MyDrive/zipped"
# Frames sampled per video, and maximum number of frames per output zip.
NUM_FRAMES_TO_EXTRACT = 60
CHUNK_SIZE = 10000

# --- Temporary Local Directories ---
LOCAL_TEMP_ZIPS_DIR = "/content/temp_source_zips"
LOCAL_TEMP_UNZIPPED_DIR = "/content/temp_unzipped_models"
LOCAL_EXTRACTED_FRAMES_DIR = "/content/temp_extracted_frames"

# --- Helper Function (Same as before) ---
def extract_evenly_spaced_frames(video_path, num_frames, output_folder, model_prefix, base_video_dir):
    """
    Save ``num_frames`` evenly spaced frames of one video as JPEG files.

    Output files are named
    ``<model_prefix>_<relative path parts joined by "_">_frame_<index:03d>.jpg``,
    where the relative path is ``video_path`` relative to ``base_video_dir``
    with its ``.mp4`` extension removed. Including the sub-directories keeps
    names unique when different folders contain videos with the same name.

    Args:
        video_path: Path of the video to read.
        num_frames: Number of frames to sample; shorter videos are skipped.
        output_folder: Existing directory that receives the JPEG files.
        model_prefix: Prefix identifying the generating model.
        base_video_dir: Directory that ``video_path`` is made relative to.

    Returns:
        List of the frame paths that were actually written (empty when the
        video cannot be opened or is too short).

    Raises:
        ValueError: If ``video_path`` is not located under ``base_video_dir``.
    """
    saved_frame_paths = []
    cap = None
    try:
        cap = cv2.VideoCapture(str(video_path))
        if not cap.isOpened():
            print(f"    - Warning: Could not open {video_path}. Skipping.")
            return saved_frame_paths
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames < num_frames:
            return saved_frame_paths
        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)

        relative_path = Path(video_path).relative_to(base_video_dir)
        # Strip only the file extension; a plain str.replace would also remove
        # ".mp4" from directory names and could make two videos collide.
        if relative_path.suffix == ".mp4":
            relative_path = relative_path.with_suffix("")
        unique_suffix = "_".join(relative_path.parts)
        frame_prefix = f"{model_prefix}_{unique_suffix}"

        for i, frame_idx in enumerate(frame_indices):
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
            ret, frame = cap.read()
            if not ret:
                continue
            output_filename = f"{frame_prefix}_frame_{i:03d}.jpg"
            output_path = os.path.join(output_folder, output_filename)
            # cv2.imwrite signals most failures through its return value.
            if cv2.imwrite(output_path, frame):
                saved_frame_paths.append(output_path)
            else:
                print(f"    - Warning: Could not write {output_path}.")
        return saved_frame_paths
    finally:
        if cap is not None:
            cap.release()

# --- Main Logic ---
# For every model archive: unzip it locally, extract frames from each video,
# pack the frames into numbered zip chunks and store the chunks on Drive.
# Chunk numbering continues after the highest chunk already present on Drive.
if __name__ == '__main__':
    print("--- Starting Model-by-Model Frame Extraction and Add-On Chunking ---")
    drive.mount('/content/drive', force_remount=True)

    # --- Setup ---
    os.makedirs(DRIVE_OUTPUT_DIR, exist_ok=True)
    os.makedirs(LOCAL_TEMP_ZIPS_DIR, exist_ok=True)

    # --- Step 1: Bulk Copy source zips ---
    print("\n--- Step 1: Copying all source zips from Drive... ---")
    for zip_name in MODELS_TO_PROCESS:
        source_path = os.path.join(DRIVE_SOURCE_ZIPS_DIR, zip_name)
        local_copy_path = os.path.join(LOCAL_TEMP_ZIPS_DIR, zip_name)
        if not os.path.isfile(source_path):
            print(f"    WARNING: Source zip not found on Drive: {source_path}")
        elif not os.path.exists(local_copy_path):
            shutil.copy(source_path, local_copy_path)

    # Chunk files are named "<class>_frames_chunk_<num>_<first idx>-<last idx>.zip".
    chunk_name_pattern = re.compile(r'chunk_(\d+)_(\d+)-(\d+)\.zip')

    # --- Step 2: Process each model completely, one at a time ---
    for zip_name in MODELS_TO_PROCESS:
        model_name = Path(zip_name).stem
        print(f"\n\n{'='*20} PROCESSING MODEL: {model_name.upper()} {'='*20}")
        model_zip_path = os.path.join(LOCAL_TEMP_ZIPS_DIR, zip_name)
        if not os.path.isfile(model_zip_path):
            print(f"    WARNING: {model_zip_path} is missing. Skipping {model_name}.")
            continue

        # --- A. Extract frames for the current model ---
        print(f"\n  A) Extracting frames for {model_name}...")

        temp_unzip_target = os.path.join(LOCAL_TEMP_UNZIPPED_DIR, model_name)
        temp_extract_target = os.path.join(LOCAL_EXTRACTED_FRAMES_DIR, model_name)
        # Start from empty directories so leftovers of a previous run are not mixed in.
        shutil.rmtree(temp_unzip_target, ignore_errors=True)
        shutil.rmtree(temp_extract_target, ignore_errors=True)
        os.makedirs(temp_unzip_target)
        os.makedirs(temp_extract_target)

        try:
            with zipfile.ZipFile(model_zip_path) as model_zip:
                model_zip.extractall(temp_unzip_target)
        except zipfile.BadZipFile as e:
            print(f"    WARNING: {zip_name} is not a valid zip archive ({e}). Skipping.")
            shutil.rmtree(temp_unzip_target, ignore_errors=True)
            shutil.rmtree(temp_extract_target, ignore_errors=True)
            continue

        video_files = sorted(glob.glob(os.path.join(temp_unzip_target, '**', '*.mp4'), recursive=True))
        print(f"    Found {len(video_files)} videos. Starting frame extraction...")
        for video_path in video_files:
            extract_evenly_spaced_frames(video_path, NUM_FRAMES_TO_EXTRACT, temp_extract_target, model_name, base_video_dir=temp_unzip_target)

        all_new_frame_paths = sorted(glob.glob(os.path.join(temp_extract_target, "*.jpg")))
        total_new_frames = len(all_new_frame_paths)
        if total_new_frames == 0:
            print(f"    WARNING: No frames extracted for {model_name}. Skipping.")
            shutil.rmtree(temp_unzip_target, ignore_errors=True)
            shutil.rmtree(temp_extract_target, ignore_errors=True)
            continue
        print(f"    -> Extracted {total_new_frames:,} frames for this model.")

        # --- B. Determine Starting Point for chunking ---
        print("\n  B) Determining chunking start point from Drive...")
        # Compare parsed chunk numbers, not file names: a lexicographic sort
        # would rank "chunk_100" before "chunk_99".
        parsed_chunks = []
        for chunk_path in glob.glob(os.path.join(DRIVE_OUTPUT_DIR, "fake_frames_chunk_*.zip")):
            match = chunk_name_pattern.search(Path(chunk_path).name)
            if match:
                chunk_num, _, chunk_last_index = map(int, match.groups())
                parsed_chunks.append((chunk_num, chunk_last_index))
        start_chunk_num = 1
        start_file_index = 0
        if parsed_chunks:
            last_chunk_num, last_file_index = max(parsed_chunks)
            start_chunk_num = last_chunk_num + 1
            start_file_index = last_file_index + 1
        print(f"    -> Will start with NEW Chunk #{start_chunk_num}, indexing from file #{start_file_index}.")

        # --- C. Zip frames and upload chunks ---
        total_chunks_for_this_model = math.ceil(total_new_frames / CHUNK_SIZE)
        print(f"\n  C) Zipping and uploading {total_chunks_for_this_model} new chunk(s)...")

        for i in range(total_chunks_for_this_model):
            current_chunk_num = start_chunk_num + i
            start_slice = i * CHUNK_SIZE
            chunk_of_paths = all_new_frame_paths[start_slice : start_slice + CHUNK_SIZE]
            current_start_idx = start_file_index + (i * CHUNK_SIZE)
            current_end_idx = current_start_idx + len(chunk_of_paths) - 1
            final_zip_name = f"fake_frames_chunk_{current_chunk_num:02d}_{current_start_idx:06d}-{current_end_idx:06d}.zip"
            final_zip_path_on_drive = os.path.join(DRIVE_OUTPUT_DIR, final_zip_name)

            print(f"    --- Processing NEW FAKE Chunk #{current_chunk_num}: {final_zip_name} ---")
            if os.path.exists(final_zip_path_on_drive):
                print(f"    ✅ SKIPPING: Chunk already exists on Google Drive.")
                continue

            # Build the archive on local disk first, then move it to Drive in one
            # step. Frames are stored flat (file name only), as `zip -j` would do.
            staging_zip_path = f"/content/{final_zip_name}"
            try:
                with zipfile.ZipFile(staging_zip_path, "w", zipfile.ZIP_DEFLATED) as chunk_zip:
                    for frame_path in chunk_of_paths:
                        chunk_zip.write(frame_path, arcname=os.path.basename(frame_path))
                shutil.move(staging_zip_path, final_zip_path_on_drive)
            except OSError as e:
                if os.path.exists(staging_zip_path):
                    os.remove(staging_zip_path)
                # Stop here: continuing would report success for a chunk that is
                # not on Drive and leave a gap in the chunk numbering.
                raise RuntimeError(f"Failed to save chunk #{current_chunk_num} to Drive: {e}") from e
            print(f"    ✅ SUCCESS: Chunk #{current_chunk_num} saved to Drive.")

        # --- D. Clean up temporary files for this model ---
        print(f"\n  D) Cleaning up temporary files for {model_name}...")
        shutil.rmtree(temp_unzip_target, ignore_errors=True)
        shutil.rmtree(temp_extract_target, ignore_errors=True)
        print(f"  --- Finished processing {model_name} ---")

    # --- Final Cleanup ---
    print("\n\n--- All models processed. Final cleanup... ---")
    shutil.rmtree(LOCAL_TEMP_ZIPS_DIR, ignore_errors=True)
    print(f"Your new fake frame chunks have been added to: '{DRIVE_OUTPUT_DIR}'")

New real dataset: extract, zip, and process

In [None]:
# === SCRIPT TO EXTRACT FRAMES FROM UCF101 AND ADD-ON TO EXISTING CHUNKS (COMPLETE VERSION) ===

import os
import glob
import time
import shutil
import cv2
import numpy as np
import math
import re
from pathlib import Path
from google.colab import drive

# --- Configuration ---
# Google Drive locations: the UCF101 archive to read and the folder that
# receives the zipped frame chunks.
DRIVE_OUTPUT_DIR = "/content/drive/MyDrive/zipped"
DRIVE_UCF101_ZIP_PATH = "/content/drive/MyDrive/archive.zip"

# Local scratch space on the Colab VM (removed again in the final cleanup step).
LOCAL_EXTRACTED_FRAMES_DIR = "/content/extracted_ucf101_frames"
LOCAL_UNZIPPED_VIDEOS_DIR = "/content/UCF101_videos"

# Frames sampled per video, and maximum number of frames per output zip.
CHUNK_SIZE = 10_000
NUM_FRAMES_TO_EXTRACT = 60
# --- End of Configuration ---

# --- Helper Function to Extract Evenly Spaced Frames ---
def extract_evenly_spaced_frames(video_path, num_frames, output_folder):
    """
    Save ``num_frames`` evenly spaced frames of one video as JPEG files.

    Files are written to ``output_folder`` as
    ``ucf101_<video stem>_frame_<index:03d>.jpg``; the ``ucf101_`` prefix avoids
    name collisions with frames from other datasets.

    Args:
        video_path: Path of the video to read.
        num_frames: Number of frames to sample; shorter videos are skipped.
        output_folder: Existing directory that receives the JPEG files.

    Returns:
        List of the frame paths that were actually written. Empty when the
        video cannot be opened, is too short, or an error occurs (errors are
        printed, not raised).
    """
    cap = None
    try:
        cap = cv2.VideoCapture(str(video_path))
        if not cap.isOpened():
            print(f"    - Warning: Could not open {video_path}. Skipping.")
            return []

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames < num_frames:
            # Too short to sample num_frames distinct frames; skip silently.
            return []

        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)

        saved_frame_paths = []
        video_name = Path(video_path).stem
        # Add a prefix to frame names to avoid any potential name collisions with other datasets
        video_prefix = f"ucf101_{video_name}"

        for i, frame_idx in enumerate(frame_indices):
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
            ret, frame = cap.read()
            if not ret:
                continue
            output_filename = f"{video_prefix}_frame_{i:03d}.jpg"
            output_path = os.path.join(output_folder, output_filename)
            # Report only frames that really reached the disk: cv2.imwrite
            # signals most failures through its return value.
            if cv2.imwrite(output_path, frame):
                saved_frame_paths.append(output_path)

        return saved_frame_paths
    except Exception as e:
        print(f"    - ERROR processing {video_path}: {e}")
        return []
    finally:
        # Release on every exit path; the early returns and the error path
        # would otherwise leak one capture handle per skipped video.
        if cap is not None:
            cap.release()

# --- Main Execution Block ---
# Copies and unzips the UCF101 archive, extracts frames from every AVI video,
# packs the frames into numbered zip chunks on Drive (continuing after the
# highest existing "real_frames" chunk), then deletes the local scratch data.
if __name__ == '__main__':
    # Imported here because this cell's import list does not include zipfile.
    import zipfile

    print("--- Starting UCF101 Frame Extraction and Zipping (Add-On Mode) ---")
    start_time_total = time.time()
    drive.mount('/content/drive', force_remount=True)

    local_ucf101_zip = f"/content/{Path(DRIVE_UCF101_ZIP_PATH).name}"

    # --- Step 1: Copy and Unzip the Dataset (Resumable) ---
    print("\n--- Step 1: Preparing Local UCF101 Video Dataset ---")
    if not os.path.isdir(LOCAL_UNZIPPED_VIDEOS_DIR):
        if not os.path.exists(local_ucf101_zip):
            print(f"  Copying {DRIVE_UCF101_ZIP_PATH} to local storage...")
            # Copy under a temporary name so an interrupted copy is never
            # mistaken for a complete local archive on the next run.
            partial_zip_copy = local_ucf101_zip + ".partial"
            shutil.copy(DRIVE_UCF101_ZIP_PATH, partial_zip_copy)
            os.replace(partial_zip_copy, local_ucf101_zip)
        else:
            print("  Local zip file found. Skipping copy.")

        print(f"  Unzipping dataset to '{LOCAL_UNZIPPED_VIDEOS_DIR}'...")
        # Unzip into a side directory and rename at the end, so the existence
        # of LOCAL_UNZIPPED_VIDEOS_DIR always means "fully unzipped".
        partial_unzip_dir = LOCAL_UNZIPPED_VIDEOS_DIR.rstrip("/") + "_partial"
        shutil.rmtree(partial_unzip_dir, ignore_errors=True)
        with zipfile.ZipFile(local_ucf101_zip) as dataset_zip:
            dataset_zip.extractall(partial_unzip_dir)
        os.rename(partial_unzip_dir, LOCAL_UNZIPPED_VIDEOS_DIR)
        print("  Unzipping complete.")
    else:
        print("✅ SKIPPING: Local unzipped directory already exists.")

    # --- Step 2: Extract 60 Frames from Each Video (Resumable) ---
    print("\n--- Step 2: Extracting Frames ---")
    os.makedirs(LOCAL_EXTRACTED_FRAMES_DIR, exist_ok=True)

    all_video_files = sorted(glob.glob(os.path.join(LOCAL_UNZIPPED_VIDEOS_DIR, '**', '*.avi'), recursive=True))
    print(f"Found {len(all_video_files)} AVI video files to process.")

    # Resume per video: a video is skipped only when all of its frames are
    # already on disk. (A global "more frames than videos" check would accept
    # an extraction that was interrupted after a few hundred videos.)
    # The names below mirror the naming used by extract_evenly_spaced_frames.
    existing_frame_names = set(os.listdir(LOCAL_EXTRACTED_FRAMES_DIR))
    all_extracted_frame_paths = []
    videos_reused = 0
    for i, video_path in enumerate(all_video_files):
        if (i + 1) % 500 == 0:
            print(f"  Processing video {i+1}/{len(all_video_files)}...")
        video_stem = Path(video_path).stem
        expected_names = [f"ucf101_{video_stem}_frame_{k:03d}.jpg" for k in range(NUM_FRAMES_TO_EXTRACT)]
        if existing_frame_names.issuperset(expected_names):
            frame_paths = [os.path.join(LOCAL_EXTRACTED_FRAMES_DIR, name) for name in expected_names]
            videos_reused += 1
        else:
            frame_paths = extract_evenly_spaced_frames(video_path, NUM_FRAMES_TO_EXTRACT, LOCAL_EXTRACTED_FRAMES_DIR)
        all_extracted_frame_paths.extend(frame_paths)
    all_extracted_frame_paths = sorted(all_extracted_frame_paths)
    print(f"-> Frame extraction complete. Total frames available: {len(all_extracted_frame_paths)} "
          f"({videos_reused} videos reused from a previous run)")

    if not all_extracted_frame_paths:
        print("❌ ERROR: Frame extraction resulted in zero files. Cannot proceed. Please check video paths and formats.")
        raise SystemExit(1)

    # --- MODIFIED STEP 3: Determine Starting Point and Zip in Add-On Chunks ---
    print("\n--- Step 3: Zipping and Uploading in Add-On Chunks ---")
    os.makedirs(DRIVE_OUTPUT_DIR, exist_ok=True)

    print("  Scanning existing 'real_frames' chunks on Drive to determine starting point...")
    existing_real_chunks = glob.glob(os.path.join(DRIVE_OUTPUT_DIR, "real_frames_chunk_*.zip"))

    start_chunk_num = 1
    start_file_index = 0

    if existing_real_chunks:
        # Compare parsed chunk numbers, not file names: a lexicographic sort
        # would rank "chunk_100" before "chunk_99".
        chunk_name_pattern = re.compile(r'chunk_(\d+)_(\d+)-(\d+)\.zip')
        parsed_chunks = []
        for chunk_path in existing_real_chunks:
            match = chunk_name_pattern.search(Path(chunk_path).name)
            if match:
                parsed_chunks.append((int(match.group(1)), int(match.group(3)), Path(chunk_path).name))
        if parsed_chunks:
            last_chunk_num, last_file_index, last_chunk_filename = max(parsed_chunks)
            print(f"  Last existing chunk found: {last_chunk_filename}")
            start_chunk_num = last_chunk_num + 1
            start_file_index = last_file_index + 1
            print(f"  --> Will start with NEW Chunk #{start_chunk_num}, indexing from file #{start_file_index}.")
        else:
            print("  Warning: Could not parse existing chunk names. Starting from chunk 1.")
    else:
        print("  No existing 'real_frames' chunks found. Starting from chunk 1.")

    total_new_frames = len(all_extracted_frame_paths)
    total_new_chunks = math.ceil(total_new_frames / CHUNK_SIZE)
    print(f"  Will create {total_new_chunks} new chunk(s) for the {total_new_frames} new frames.")

    for i in range(total_new_chunks):
        current_chunk_num = start_chunk_num + i
        start_slice_index = i * CHUNK_SIZE
        end_slice_index = start_slice_index + CHUNK_SIZE
        chunk_of_paths = all_extracted_frame_paths[start_slice_index:end_slice_index]

        current_start_file_index = start_file_index + (i * CHUNK_SIZE)
        current_end_file_index = current_start_file_index + len(chunk_of_paths) - 1

        final_zip_name = f"real_frames_chunk_{current_chunk_num:02d}_{current_start_file_index:06d}-{current_end_file_index:06d}.zip"
        final_zip_path_on_drive = os.path.join(DRIVE_OUTPUT_DIR, final_zip_name)

        print(f"\n--- Processing NEW Chunk #{current_chunk_num}: {final_zip_name} ---")

        if os.path.exists(final_zip_path_on_drive):
            print(f"✅ SKIPPING: This chunk already exists on Google Drive.")
            continue

        # Build the archive on local disk first, then move it to Drive in one
        # step. Frames are stored flat (file name only), as `zip -j` would do.
        print("  Zipping chunk locally...")
        local_zip_path = f"/content/{final_zip_name}"
        try:
            with zipfile.ZipFile(local_zip_path, "w", zipfile.ZIP_DEFLATED) as chunk_zip:
                for frame_path in chunk_of_paths:
                    chunk_zip.write(frame_path, arcname=os.path.basename(frame_path))

            print("  Moving zip file to Google Drive...")
            shutil.move(local_zip_path, final_zip_path_on_drive)
        except OSError as e:
            if os.path.exists(local_zip_path):
                os.remove(local_zip_path)
            # Stop before the cleanup step: the extracted frames are still
            # needed to retry, and continuing would report a false success.
            raise RuntimeError(f"Failed to save chunk #{current_chunk_num} to Drive: {e}") from e

        print(f"✅ SUCCESS: Chunk #{current_chunk_num} saved to Drive.")

    # --- Step 4: Final Cleanup ---
    print("\n--- Step 4: Final Cleanup of Large Local Files ---")
    print(f"  Deleting local unzipped videos: {LOCAL_UNZIPPED_VIDEOS_DIR}")
    shutil.rmtree(LOCAL_UNZIPPED_VIDEOS_DIR, ignore_errors=True)
    print(f"  Deleting local extracted frames: {LOCAL_EXTRACTED_FRAMES_DIR}")
    shutil.rmtree(LOCAL_EXTRACTED_FRAMES_DIR, ignore_errors=True)
    if os.path.exists(local_ucf101_zip):
        print(f"  Deleting local zip file: {local_ucf101_zip}")
        os.remove(local_ucf101_zip)

    end_time_total = time.time()
    print(f"\n\n--- All tasks completed in {end_time_total - start_time_total:.2f} seconds. ---")

--- Starting UCF101 Frame Extraction and Zipping (Add-On Mode) ---
Mounted at /content/drive

--- Step 1: Preparing Local UCF101 Video Dataset ---
  Copying /content/drive/MyDrive/archive.zip to local storage...
  Unzipping dataset to '/content/UCF101_videos'...
  Unzipping complete.

--- Step 2: Extracting Frames ---
Found 13451 AVI video files to process.
  Processing video 500/13451...
  Processing video 1000/13451...
  Processing video 1500/13451...
  Processing video 2000/13451...
  Processing video 2500/13451...
  Processing video 3000/13451...
  Processing video 3500/13451...
  Processing video 4000/13451...
  Processing video 4500/13451...
  Processing video 5000/13451...
  Processing video 5500/13451...
  Processing video 6000/13451...
  Processing video 6500/13451...
  Processing video 7000/13451...
  Processing video 7500/13451...
  Processing video 8000/13451...
  Processing video 8500/13451...
  Processing video 9000/13451...
  Processing video 9500/13451...
  Processing v

In [None]:
!fusermount -u /content/drive
!rm -rf /content/drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


fusermount: failed to unmount /content/drive: Invalid argument
Mounted at /content/drive


In [None]:
import time
import os
import shutil # Import shutil for rmtree

# In a Colab cell, before your main script
from google.colab import drive

# Attempt to unmount and flush. If it fails, it means it wasn't mounted
# or there was an issue. We'll handle the directory cleanup next.
try:
    drive.flush_and_unmount()
    print("Attempted Drive flush and unmount.")
except ValueError:
    print("Drive was not mounted or could not be unmounted. Proceeding to directory cleanup.")
except Exception as e:
    print(f"An error occurred during flush/unmount: {e}. Proceeding to directory cleanup.")


time.sleep(2) # Short pause after unmounting attempt

# Remove leftover local files under the mountpoint so the next mount starts
# from an empty directory.
mountpoint = '/content/drive'
if os.path.ismount(mountpoint):
    # Safety guard: the unmount above did not take effect. Deleting the tree
    # now would recurse into the live Drive mount and delete the user's files.
    print(f"{mountpoint} is still mounted; skipping directory cleanup to protect Drive contents.")
elif os.path.exists(mountpoint):
    print(f"Cleaning up mountpoint directory: {mountpoint}")
    try:
        # Use shutil.rmtree to remove directory and its contents recursively
        shutil.rmtree(mountpoint)
        print(f"Successfully removed {mountpoint}")
    except OSError as e:
        # Best effort: report and still try to mount below (it may fail).
        print(f"Error removing mountpoint {mountpoint}: {e}")

# Recreate the mountpoint directory as an empty directory for mounting
os.makedirs(mountpoint, exist_ok=True)
print(f"Ensured mountpoint directory exists and is empty: {mountpoint}")

time.sleep(2) # Short pause before mounting

# Now attempt to mount
try:
    drive.mount(mountpoint, force_remount=True)
    print("Drive remounted successfully.")
except Exception as e:
    print(f"FATAL: Could not mount Google Drive after cleanup. Error: {e}. Please check permissions and try again.")

Attempted Drive flush and unmount.
Ensured mountpoint directory exists and is empty: /content/drive
Mounted at /content/drive
Drive remounted successfully.


Computing Optical Flow

In [None]:
import cv2
import numpy as np
import os
from pathlib import Path
import torch
import glob
from collections import defaultdict
import time
import re
import gc
import random
from google.colab import drive

# --- Configuration ---
# Target sequence length for this preprocessing step
SEQUENCE_LENGTH = 30
# Step size for creating sequences. Set to SEQUENCE_LENGTH for non-overlapping.
STEP_SIZE = 30

# --- PATHS ---
# INPUT: The directories with your extracted 60 frames per video
INPUT_REAL_FRAMES_DIR = '/content/drive/MyDrive/final_balanced_data/real_frames'
INPUT_FAKE_FRAMES_DIR = '/content/drive/MyDrive/final_balanced_data/fake_frames'

# OUTPUT: A directory for the new flow features
DRIVE_PRECOMPUTED_FLOW_DIR = f'/content/drive/MyDrive/final_balanced_data/precomputed_flow{SEQUENCE_LENGTH}_SS{STEP_SIZE}'

# --- Mount Google Drive ---
# Mount BEFORE creating anything under /content/drive: creating the output
# directory first would put local folders inside the mount point whenever Drive
# is not mounted yet, and the mount then fails on the non-empty mount point.
try:
    drive.mount('/content/drive', force_remount=True)
    print("Google Drive mounted.")
except Exception as e:
    print(f"FATAL: Could not mount Google Drive. Error: {e}. Exiting.")
    # Raising stops only this cell; calling exit() in a notebook can tear down
    # the whole kernel.
    raise SystemExit(1) from e

os.makedirs(DRIVE_PRECOMPUTED_FLOW_DIR, exist_ok=True)
print(f"Precomputed optical flow sequences will be saved to: {DRIVE_PRECOMPUTED_FLOW_DIR}")



# --- Helper Functions ---
def group_frames_by_video(frame_paths, class_label_for_debug=""):
    """
    Group frame image paths by the video they were extracted from.

    A frame file stem is expected to look like ``<video stem>_frame<NNN>`` or
    ``<video stem>_f<NNN>`` (an underscore before the number, as in
    ``_frame_<NNN>``, is accepted too). Stems that do not follow this pattern
    are placed in a group named after the whole stem and a warning is printed.

    Args:
        frame_paths: Iterable of frame paths (``str`` or path-like).
        class_label_for_debug: Label used only in the summary message.

    Returns:
        ``defaultdict(list)`` mapping each video stem to its frame paths (as
        strings), sorted by frame number. A group containing a path whose
        frame number could not be parsed is sorted alphabetically instead.
    """
    # Greedy prefix: the frame suffix is the LAST "_frame<N>" / "_f<N>" in the
    # stem, so video names that themselves contain e.g. "_f2" stay intact.
    frame_name_pattern = re.compile(r'^(.*)_(?:frame|f)_?(\d+)')

    video_groups = defaultdict(list)
    frame_numbers = {}  # path string -> parsed frame number
    for path_str in frame_paths:
        path_text = str(path_str)
        stem = Path(path_text).stem
        match = frame_name_pattern.match(stem)
        if match:
            video_stem = match.group(1)
            frame_numbers[path_text] = int(match.group(2))
        else:
            # Fallback if the naming convention is ever different
            print(f"WARN: Could not parse video stem from filename '{stem}'. Using stem as fallback.")
            video_stem = stem
        video_groups[video_stem].append(path_text)

    # Sort frames within each group by their frame number
    for group_paths in video_groups.values():
        if all(path_text in frame_numbers for path_text in group_paths):
            group_paths.sort(key=frame_numbers.__getitem__)
        else:
            # Fallback to alphanumeric sort when a frame number is unknown
            group_paths.sort()

    print(f"  Grouped {len(frame_paths)} frames into {len(video_groups)} video groups for '{class_label_for_debug}'.")
    return video_groups

def compute_optical_flow_for_one_sequence(bgr_frames_list, target_sequence_length):
    """
    Summarise the motion between consecutive frames of one sequence.

    For each pair of neighbouring frames, dense Farneback optical flow is
    computed on the grayscale images and reduced to two numbers: the mean and
    the variance of the flow magnitude. The first row is always ``[0, 0]``
    (the first frame has no predecessor). A pair also yields ``[0, 0]`` when a
    frame is ``None``, the two frame shapes differ, no flow is returned, or any
    error occurs during the computation.

    Args:
        bgr_frames_list: Sequence of BGR frames (entries may be ``None``).
        target_sequence_length: Not used by this function.

    Returns:
        ``torch.float32`` tensor with one ``[mean, variance]`` row per frame.
    """
    def _no_motion():
        # Fresh array each time so rows never share memory.
        return np.array([0.0, 0.0], dtype=np.float32)

    per_frame_features = [_no_motion()]  # First flow is 0
    for previous_bgr, current_bgr in zip(bgr_frames_list[:-1], bgr_frames_list[1:]):
        if previous_bgr is None or current_bgr is None:
            per_frame_features.append(_no_motion())
            continue

        try:
            previous_gray = cv2.cvtColor(previous_bgr, cv2.COLOR_BGR2GRAY)
            current_gray = cv2.cvtColor(current_bgr, cv2.COLOR_BGR2GRAY)
            if previous_gray.shape != current_gray.shape:
                pair_feature = _no_motion()
            else:
                dense_flow = cv2.calcOpticalFlowFarneback(
                    previous_gray, current_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0
                )
                if dense_flow is None:
                    pair_feature = _no_motion()
                else:
                    flow_x = dense_flow[..., 0]
                    flow_y = dense_flow[..., 1]
                    magnitude = np.sqrt(flow_x**2 + flow_y**2)
                    if magnitude.size > 0:
                        pair_feature = np.array(
                            [np.mean(magnitude), np.var(magnitude)], dtype=np.float32
                        )
                    else:
                        pair_feature = _no_motion()
        except Exception:
            # Best effort: a failing pair contributes a zero row.
            pair_feature = _no_motion()

        per_frame_features.append(pair_feature)

    return torch.tensor(np.array(per_frame_features), dtype=torch.float32)

# --- Main Preprocessing Function ---
# --- Main Preprocessing Function (Modified for better logging) ---
def preprocess_and_save_flow(all_sequences_to_process, save_dir, sequence_len):
    """Compute and persist optical-flow features for a list of frame sequences.

    For every ``(frame_paths, sequence_identifier)`` pair the frames are read
    with OpenCV, reduced to a ``(sequence_len, 2)`` tensor by
    ``compute_optical_flow_for_one_sequence`` and saved as
    ``<save_dir>/<sequence_identifier>_flow.pt``.

    The function is resumable: a sequence whose output file already exists is
    skipped. To keep that check trustworthy, each tensor is first written to a
    ``.tmp`` sibling and then renamed into place, so an interrupted run cannot
    leave a truncated ``*_flow.pt`` that a later run would mistake for a
    finished one.

    Args:
        all_sequences_to_process: Iterable-with-length of 2-tuples
            ``(frame_paths, sequence_identifier)``. The identifier is expected
            to look like ``"<class>_<video stem>_seq<N>"``; the middle part is
            only used for log headers.
        save_dir: Directory that receives the ``.pt`` files (created if missing).
        sequence_len: Number of frames every sequence must contain; sequences
            with unreadable frames are skipped and reported.

    Returns:
        None. Progress and a final summary are printed.
    """
    # torch.save does not create parent directories.
    os.makedirs(save_dir, exist_ok=True)

    total_sequences = len(all_sequences_to_process)
    total_sequences_generated_this_run = 0
    sequences_skipped_incomplete = 0
    print(f"Attempting to generate flow for {total_sequences} sequences...")

    last_video_stem_processed = None  # Used to print one header per source video

    for seq_idx, (frame_paths, sequence_identifier) in enumerate(all_sequences_to_process):
        # Identifier format "<class>_<VIDEOSTEM>_seqX": drop the first and last token.
        current_video_stem = "_".join(sequence_identifier.split('_')[1:-1])

        if current_video_stem != last_video_stem_processed:
            print(f"\n  Now processing sequences from video: '{current_video_stem}'")
            last_video_stem_processed = current_video_stem

        # Log the first 5 sequences and then every 10th to keep the output readable.
        if (seq_idx + 1) % 10 == 0 or seq_idx < 5:
            print(f"    - Processing sequence {seq_idx + 1}/{total_sequences} ('{sequence_identifier}')...")

        save_path = os.path.join(save_dir, f"{sequence_identifier}_flow.pt")
        if os.path.exists(save_path):
            continue  # Resume logic: this sequence was finished by an earlier run

        # Load the BGR frames for this sequence, dropping unreadable files.
        seq_bgr_frames = []
        for frame_path in frame_paths:
            frame = cv2.imread(frame_path)
            if frame is not None:
                seq_bgr_frames.append(frame)

        if len(seq_bgr_frames) == sequence_len:
            flow_tensor = compute_optical_flow_for_one_sequence(seq_bgr_frames, sequence_len)
            if flow_tensor is not None and flow_tensor.shape == (sequence_len, 2):
                # Write-then-rename so a crash never leaves a partial file at save_path.
                tmp_path = f"{save_path}.tmp"
                torch.save(flow_tensor, tmp_path)
                os.replace(tmp_path, save_path)
                total_sequences_generated_this_run += 1
        else:
            sequences_skipped_incomplete += 1
            print(f"    [WARN] '{sequence_identifier}': read {len(seq_bgr_frames)}/{sequence_len} frames, skipping.")

        del seq_bgr_frames
        if (seq_idx + 1) % 200 == 0:
            gc.collect()

    print(f"--- Finished precomputing. Generated {total_sequences_generated_this_run} new flow sequences. ---")
    if sequences_skipped_incomplete:
        print(f"--- Skipped {sequences_skipped_incomplete} sequences with missing or unreadable frames. ---")

# --- Main Execution Block ---
# Driver for the precompute step: discover frame images, group them per video,
# cut each video into fixed-length sequences, balance the two classes at the
# sequence level and write one optical-flow tensor per sequence.
# Relies on names defined outside this block: SEQUENCE_LENGTH, STEP_SIZE,
# INPUT_REAL_FRAMES_DIR, INPUT_FAKE_FRAMES_DIR and DRIVE_PRECOMPUTED_FLOW_DIR.
if __name__ == '__main__':
    print("********************************************************************************")
    print("*** Optical Flow Preprocessing for Balanced, Fixed-Frame-Count Datasets      ***")
    print(f"*** Target Sequence Length for Flow: {SEQUENCE_LENGTH}, Step Size: {STEP_SIZE}            ***")
    print("********************************************************************************")

    print(f"\n--- Loading Frame Paths from balanced frame directories ---")
    # Case-insensitive .jpg / .jpeg / .png patterns, searched recursively below each class directory.
    img_patterns = ["*.[jJ][pP][gG]", "*.[jJ][pP][eE][gG]", "*.[pP][nN][gG]"]
    all_real_frame_paths = sorted([p for pattern in img_patterns for p in glob.glob(os.path.join(INPUT_REAL_FRAMES_DIR, '**', pattern), recursive=True)])
    all_fake_frame_paths = sorted([p for pattern in img_patterns for p in glob.glob(os.path.join(INPUT_FAKE_FRAMES_DIR, '**', pattern), recursive=True)])
    print(f"Found {len(all_real_frame_paths)} total real frames and {len(all_fake_frame_paths)} total fake frames.")

    print("\n--- Grouping frames and creating sequences ---")
    real_video_groups = group_frames_by_video(all_real_frame_paths, "real")
    fake_video_groups = group_frames_by_video(all_fake_frame_paths, "fake")

    # Each entry is (frame paths of the window, sequence id, class label).
    all_sequences_to_process_info = []
    # For REAL videos: slide a SEQUENCE_LENGTH-frame window with stride STEP_SIZE.
    for video_stem, frames_list in real_video_groups.items():
        if len(frames_list) < SEQUENCE_LENGTH: continue  # too few frames for even one window
        for i in range(0, len(frames_list) - SEQUENCE_LENGTH + 1, STEP_SIZE):
            seq_paths = frames_list[i : i + SEQUENCE_LENGTH]
            if len(seq_paths) == SEQUENCE_LENGTH:
                # The id becomes part of a file name, so replace characters that are unsafe in paths.
                safe_stem = re.sub(r'[\\/*?:"<>|]',"_", str(video_stem))
                seq_id = f"real_{safe_stem}_seq{i//STEP_SIZE}"
                all_sequences_to_process_info.append((seq_paths, seq_id, "real"))
    # For FAKE videos: identical windowing, ids prefixed with "fake_".
    for video_stem, frames_list in fake_video_groups.items():
        if len(frames_list) < SEQUENCE_LENGTH: continue
        for i in range(0, len(frames_list) - SEQUENCE_LENGTH + 1, STEP_SIZE):
            seq_paths = frames_list[i : i + SEQUENCE_LENGTH]
            if len(seq_paths) == SEQUENCE_LENGTH:
                safe_stem = re.sub(r'[\\/*?:"<>|]',"_", str(video_stem))
                seq_id = f"fake_{safe_stem}_seq{i//STEP_SIZE}"
                all_sequences_to_process_info.append((seq_paths, seq_id, "fake"))

    print("\n--- Balancing at the sequence level ---")
    real_sequences = [item for item in all_sequences_to_process_info if item[2] == "real"]
    fake_sequences = [item for item in all_sequences_to_process_info if item[2] == "fake"]
    print(f"Generated {len(real_sequences)} real sequences and {len(fake_sequences)} fake sequences.")

    # Lists of (frame_paths, sequence_id) pairs, the shape preprocess_and_save_flow unpacks.
    final_real_sequences_to_process = []
    final_fake_sequences_to_process = []

    if not real_sequences or not fake_sequences:
        # Nothing to balance against: keep whatever exists for the non-empty class.
        print("One class has zero sequences. Cannot balance. Processing all available.")
        final_real_sequences_to_process = [(item[0], item[1]) for item in real_sequences]
        final_fake_sequences_to_process = [(item[0], item[1]) for item in fake_sequences]
    else:
        # Downsample both classes to the size of the smaller one.
        min_seq_count = min(len(real_sequences), len(fake_sequences))
        print(f"Balancing to {min_seq_count} sequences per class.")

        # Seeding here (and sampling real before fake) fixes which sequences are drawn on every run.
        random.seed(42) # For reproducible sampling
        real_sequences_sampled = random.sample(real_sequences, min_seq_count)
        fake_sequences_sampled = random.sample(fake_sequences, min_seq_count)

        final_real_sequences_to_process = [(item[0], item[1]) for item in real_sequences_sampled]
        final_fake_sequences_to_process = [(item[0], item[1]) for item in fake_sequences_sampled]

    # --- MODIFICATION: Sort the lists before processing ---
    # Sorting by the sequence identifier (the second element of the tuple)
    # This will group '..._seq0', '..._seq1', etc. for the same video together.
    # NOTE: this is a plain string sort, so e.g. '..._seq10' orders before '..._seq2'.
    final_real_sequences_to_process.sort(key=lambda x: x[1])
    final_fake_sequences_to_process.sort(key=lambda x: x[1])

    print(f"Total REAL sequences to process (sorted): {len(final_real_sequences_to_process)}")
    print(f"Total FAKE sequences to process (sorted): {len(final_fake_sequences_to_process)}")

    # --- Run the main processing function SEQUENTIALLY for each class ---
    # Both classes write into the same directory; the "real_"/"fake_" id prefix keeps file names apart.
    if final_real_sequences_to_process:
        print("\n--- Starting Optical Flow Computation for REAL sequences ---")
        preprocess_and_save_flow(
            final_real_sequences_to_process,
            DRIVE_PRECOMPUTED_FLOW_DIR,
            SEQUENCE_LENGTH
        )
    else:
        print("\nNo REAL sequences to process.")

    gc.collect() # Garbage collect between the two large processing tasks

    if final_fake_sequences_to_process:
        print("\n--- Starting Optical Flow Computation for FAKE sequences ---")
        preprocess_and_save_flow(
            final_fake_sequences_to_process,
            DRIVE_PRECOMPUTED_FLOW_DIR,
            SEQUENCE_LENGTH
        )
    else:
        print("\nNo FAKE sequences to process.")

    print("\n--- Optical Flow Preprocessing is DONE ---")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

  Now processing sequences from video: '330'

  Now processing sequences from video: '331'
    - Processing sequence 1390/2726 ('real_331_seq1')...

  Now processing sequences from video: '332'

  Now processing sequences from video: '333'

  Now processing sequences from video: '334'

  Now processing sequences from video: '335'

  Now processing sequences from video: '336'
    - Processing sequence 1400/2726 ('real_336_seq1')...

  Now processing sequences from video: '337'

  Now processing sequences from video: '338'

  Now processing sequences from video: '339'

  Now processing sequences from video: '340'

  Now processing sequences from video: '341'
    - Processing sequence 1410/2726 ('real_341_seq1')...

  Now processing sequences from video: '342'

  Now processing sequences from video: '343'

  Now processing sequences from video: '344'

  Now processing sequences from video: '345'

  Now processing sequences 

Processing Flow for New dataset

In [None]:
# === SCRIPT TO PROCESS A SUBSET OF DATA: SELECT FIRST 10 CHUNKS, BALANCE, AND COMPUTE FLOW ===

import os
import glob
import time
import shutil
import cv2
import numpy as np
import math
import re
import gc
import random
from pathlib import Path
from google.colab import drive
from collections import defaultdict
import torch

# --- Configuration ---
# Everything a reader is expected to tune for this cell lives in this section.
# 1. The Drive folder where ALL your frame chunks are stored
#    (searched for "real_frames_chunk_*.zip" / "fake_frames_chunk_*.zip").
DRIVE_ZIPPED_FRAMES_DIR = "/content/drive/MyDrive/New_Dataset"

# 2. Number of chunks to select PER CLASS
NUM_CHUNKS_TO_SELECT = 10

# 3. Parameters for sequence creation
#    SEQUENCE_LENGTH frames per sequence; STEP_SIZE is the stride between window starts
#    (equal values give non-overlapping sequences).
SEQUENCE_LENGTH = 30
STEP_SIZE = 30

# 4. The output directory on Drive for this smaller experiment
#    NOTE(review): this is the same path as DRIVE_ZIPPED_FRAMES_DIR, so the "*_flow.pt"
#    outputs are written next to the input chunk archives. Confirm that is intended.
DRIVE_OUTPUT_DIR = "/content/drive/MyDrive/New_Dataset"

# 5. Local directories
#    Scratch space on the Colab VM: copied archives and the frames extracted from them.
LOCAL_TEMP_ZIPS_DIR = "/content/temp_selected_zips"
LOCAL_FRAMES_DIR = "/content/local_selected_frames"
# --- End of Configuration ---


# --- Mount Drive and Setup Directories ---
# force_remount=True re-mounts even if Drive is already mounted in this session.
drive.mount('/content/drive', force_remount=True)
os.makedirs(DRIVE_OUTPUT_DIR, exist_ok=True)
os.makedirs(LOCAL_TEMP_ZIPS_DIR, exist_ok=True)
# One sub-folder per class below LOCAL_FRAMES_DIR.
LOCAL_REAL_FRAMES_DIR = os.path.join(LOCAL_FRAMES_DIR, "real_frames")
LOCAL_FAKE_FRAMES_DIR = os.path.join(LOCAL_FRAMES_DIR, "fake_frames")
os.makedirs(LOCAL_REAL_FRAMES_DIR, exist_ok=True)
os.makedirs(LOCAL_FAKE_FRAMES_DIR, exist_ok=True)
print(f"Precomputed optical flow will be saved to: {DRIVE_OUTPUT_DIR}")


# --- Helper Functions (Identical to your original script) ---
def group_frames_by_video(frame_paths, class_label_for_debug=""):
    """Group frame image paths by the video they belong to.

    The video key is taken from the file stem: everything before the first
    ``_frame_<digits>`` marker. Stems without that marker fall back to the stem
    with its last two ``_``-separated tokens removed.

    Within each group the paths are ordered by the integer after the last
    ``_`` of the stem. If any stem in a group does not end in an integer, that
    group is ordered alphabetically instead.

    Args:
        frame_paths: Iterable of frame file paths (``str`` or path-like).
        class_label_for_debug: Label used only in the summary log line.

    Returns:
        ``defaultdict(list)`` mapping video key -> ordered list of path strings.
    """
    def _trailing_frame_number(path_text):
        # Raises ValueError when the last "_" token is not an integer.
        return int(Path(path_text).stem.split('_')[-1])

    video_groups = defaultdict(list)
    for path_str in frame_paths:
        stem = Path(str(path_str)).stem
        match = re.match(r'^(.*?)_frame_(\d+)', stem)
        if match:
            video_stem = match.group(1)
        else:  # Fallback for different naming: drop the last two "_" tokens
            video_stem = "_".join(stem.split('_')[:-2])
        video_groups[video_stem].append(str(path_str))

    for stem_key in list(video_groups.keys()):
        try:
            video_groups[stem_key] = sorted(video_groups[stem_key], key=_trailing_frame_number)
        except ValueError:
            # Only the expected parse failure is handled; a bare except would
            # also have swallowed KeyboardInterrupt / SystemExit.
            video_groups[stem_key] = sorted(video_groups[stem_key])

    print(f"  Grouped {len(frame_paths)} frames into {len(video_groups)} video groups for '{class_label_for_debug}'.")
    return video_groups

def compute_optical_flow_for_one_sequence(bgr_frames_list, target_sequence_length):
    """Reduce a sequence of BGR frames to per-frame optical-flow statistics.

    Row 0 is ``[0, 0]``. Every following row holds the mean and variance of
    the Farneback flow magnitude between that frame and its predecessor. Any
    exception raised while handling a pair (for example an unreadable frame)
    turns that row into ``[0, 0]`` instead of aborting the sequence.

    Args:
        bgr_frames_list: Indexable sequence of BGR images.
        target_sequence_length: Not used by this function; kept so existing
            callers keep working.

    Returns:
        A ``torch.float32`` tensor with one ``[mean, variance]`` row per frame
        (a single zero row for an empty input).
    """
    stat_rows = [np.array([0.0, 0.0], dtype=np.float32)]
    for later_idx in range(1, len(bgr_frames_list)):
        try:
            gray_before = cv2.cvtColor(bgr_frames_list[later_idx - 1], cv2.COLOR_BGR2GRAY)
            gray_after = cv2.cvtColor(bgr_frames_list[later_idx], cv2.COLOR_BGR2GRAY)
            dense_flow = cv2.calcOpticalFlowFarneback(
                gray_before, gray_after, None, 0.5, 3, 15, 3, 5, 1.2, 0
            )
            flow_x = dense_flow[..., 0]
            flow_y = dense_flow[..., 1]
            magnitude = np.sqrt(flow_x**2 + flow_y**2)
            row = np.array([np.mean(magnitude), np.var(magnitude)], dtype=np.float32)
        except Exception:
            # Best effort: an unprocessable pair is recorded as "no motion".
            row = np.array([0.0, 0.0], dtype=np.float32)
        stat_rows.append(row)
    return torch.tensor(np.array(stat_rows), dtype=torch.float32)

def preprocess_and_save_flow(all_sequences_to_process, save_dir, sequence_len):
    """Compute and persist optical-flow features for a list of frame sequences.

    For every ``(frame_paths, sequence_identifier)`` pair the frames are read
    with OpenCV, reduced to a ``(sequence_len, 2)`` tensor by
    ``compute_optical_flow_for_one_sequence`` and saved as
    ``<save_dir>/<sequence_identifier>_flow.pt``. Sequences whose output file
    already exists are skipped, so the function can resume an earlier run.

    Args:
        all_sequences_to_process: Iterable-with-length of 2-tuples
            ``(frame_paths, sequence_identifier)``; identifiers are expected
            to look like ``"<class>_<video stem>_seq<N>"``.
        save_dir: Directory that receives the ``.pt`` files (created if missing).
        sequence_len: Number of frames every sequence must contain.

    Returns:
        None. Progress and a final summary are printed.
    """
    # torch.save does not create parent directories.
    os.makedirs(save_dir, exist_ok=True)

    total_sequences_generated_this_run = 0
    sequences_skipped_incomplete = 0
    print(f"Attempting to generate flow for {len(all_sequences_to_process)} sequences...")
    last_video_stem_processed = None
    for seq_idx, (frame_paths, sequence_identifier) in enumerate(all_sequences_to_process):
        # Identifier format "<class>_<VIDEOSTEM>_seqX": drop the first and last token.
        current_video_stem = "_".join(sequence_identifier.split('_')[1:-1])
        if current_video_stem != last_video_stem_processed:
            print(f"\n  Now processing sequences from video: '{current_video_stem}'")
            last_video_stem_processed = current_video_stem

        save_path = os.path.join(save_dir, f"{sequence_identifier}_flow.pt")
        if os.path.exists(save_path):
            continue  # Resume logic: finished by an earlier run

        # Decode every image exactly once; the previous comprehension called
        # cv2.imread twice per frame (once to test, once to keep).
        seq_bgr_frames = []
        for frame_path in frame_paths:
            frame = cv2.imread(frame_path)
            if frame is not None:
                seq_bgr_frames.append(frame)

        if len(seq_bgr_frames) != sequence_len:
            sequences_skipped_incomplete += 1
            print(f"    [WARN] '{sequence_identifier}': read {len(seq_bgr_frames)}/{sequence_len} frames, skipping.")
            continue

        flow_tensor = compute_optical_flow_for_one_sequence(seq_bgr_frames, sequence_len)
        if flow_tensor is not None and flow_tensor.shape == (sequence_len, 2):
            # Write-then-rename so an interrupted run never leaves a truncated
            # file that the resume check above would treat as complete.
            tmp_path = f"{save_path}.tmp"
            torch.save(flow_tensor, tmp_path)
            os.replace(tmp_path, save_path)
            total_sequences_generated_this_run += 1

    print(f"--- Finished precomputing. Generated {total_sequences_generated_this_run} new flow sequences. ---")
    if sequences_skipped_incomplete:
        print(f"--- Skipped {sequences_skipped_incomplete} sequences with missing or unreadable frames. ---")


# --- Main Execution Block ---
def _chunk_sort_key(zip_path):
    """Order chunk archives by their numeric chunk index (chunk_2 before chunk_10)."""
    file_name = Path(zip_path).name
    match = re.search(r'chunk_(\d+)', file_name)
    # Archives without a chunk number sort last; the name breaks ties deterministically.
    return (int(match.group(1)) if match else float('inf'), file_name)


def _stage_first_chunks(class_name, local_frames_dir):
    """Copy the first NUM_CHUNKS_TO_SELECT '<class_name>_frames' archives from Drive and extract them.

    Returns the list of Drive archive paths that were selected.
    """
    print(f"\n  Processing {class_name.upper()} frame chunks...")
    pattern = os.path.join(DRIVE_ZIPPED_FRAMES_DIR, f"{class_name}_frames_chunk_*.zip")
    # A plain sorted() is lexicographic and would put chunk_10..19 ahead of chunk_1..9.
    all_chunks = sorted(glob.glob(pattern), key=_chunk_sort_key)
    if not all_chunks:
        print(f"  Warning: No '{class_name}_frames' chunks found.")
    selected_chunks = all_chunks[:NUM_CHUNKS_TO_SELECT]

    print(f"  Selected the following {len(selected_chunks)} chunks to process:")
    for path in selected_chunks:
        print(f"    - {Path(path).name}")

    os.makedirs(LOCAL_TEMP_ZIPS_DIR, exist_ok=True)
    os.makedirs(local_frames_dir, exist_ok=True)
    for chunk_path in selected_chunks:
        # Copy to local disk first, extract (overwriting existing files), then
        # drop the local copy so only one archive occupies scratch space at a time.
        local_zip_path = shutil.copy(chunk_path, LOCAL_TEMP_ZIPS_DIR)
        shutil.unpack_archive(local_zip_path, local_frames_dir, 'zip')
        os.remove(local_zip_path)
    shutil.rmtree(LOCAL_TEMP_ZIPS_DIR, ignore_errors=True)
    return selected_chunks


def _build_sequences(video_groups, class_name):
    """Cut each video's ordered frame list into SEQUENCE_LENGTH windows with stride STEP_SIZE."""
    sequences = []
    for video_stem, frames_list in video_groups.items():
        # The range bound guarantees every slice holds exactly SEQUENCE_LENGTH frames;
        # videos that are too short yield an empty range.
        for start in range(0, len(frames_list) - SEQUENCE_LENGTH + 1, STEP_SIZE):
            seq_paths = frames_list[start : start + SEQUENCE_LENGTH]
            seq_id = f"{class_name}_{video_stem}_seq{start // STEP_SIZE}"
            sequences.append((seq_paths, seq_id, class_name))
    return sequences


if __name__ == '__main__':
    print("--- Starting Data Subsetting and Optical Flow Preprocessing ---")

    # --- Step 1: Select, Copy, and Unzip the FIRST N Chunks of Data ---
    print(f"\n--- Step 1: Preparing Local Dataset from the first {NUM_CHUNKS_TO_SELECT} chunks per class ---")
    selected_real_chunks = _stage_first_chunks("real", LOCAL_REAL_FRAMES_DIR)
    selected_fake_chunks = _stage_first_chunks("fake", LOCAL_FAKE_FRAMES_DIR)

    # --- Step 2: Load Frames and Create Sequences ---
    print("\n--- Step 2: Loading frames and creating sequences from the local subset ---")
    all_real_frame_paths = sorted(glob.glob(os.path.join(LOCAL_REAL_FRAMES_DIR, '*.jpg')))
    all_fake_frame_paths = sorted(glob.glob(os.path.join(LOCAL_FAKE_FRAMES_DIR, '*.jpg')))

    real_video_groups = group_frames_by_video(all_real_frame_paths, "real")
    fake_video_groups = group_frames_by_video(all_fake_frame_paths, "fake")

    real_sequences = _build_sequences(real_video_groups, "real")
    fake_sequences = _build_sequences(fake_video_groups, "fake")
    all_sequences_info = real_sequences + fake_sequences

    # --- Step 3: Balance and Compute Flow ---
    print("\n--- Step 3: Balancing sequences and preparing for flow computation ---")
    min_seq_count = min(len(real_sequences), len(fake_sequences))
    print(f"Balancing to {min_seq_count} sequences per class.")
    if min_seq_count == 0:
        print("  Warning: at least one class has no sequences; nothing will be computed.")

    # Seed once, then sample real before fake, so the drawn subset is reproducible.
    random.seed(42)
    real_sequences_sampled = random.sample(real_sequences, min_seq_count)
    fake_sequences_sampled = random.sample(fake_sequences, min_seq_count)

    # preprocess_and_save_flow expects (frame_paths, sequence_id) pairs, ordered by id
    # so that sequences of the same video are processed together.
    final_real_sequences = sorted([(item[0], item[1]) for item in real_sequences_sampled], key=lambda x: x[1])
    final_fake_sequences = sorted([(item[0], item[1]) for item in fake_sequences_sampled], key=lambda x: x[1])

    print("\n--- Starting Optical Flow Computation for REAL sequences ---")
    preprocess_and_save_flow(final_real_sequences, DRIVE_OUTPUT_DIR, SEQUENCE_LENGTH)

    gc.collect()

    print("\n--- Starting Optical Flow Computation for FAKE sequences ---")
    preprocess_and_save_flow(final_fake_sequences, DRIVE_OUTPUT_DIR, SEQUENCE_LENGTH)

    print("\n--- Optical Flow Preprocessing is DONE ---")

Mounted at /content/drive
Precomputed optical flow will be saved to: /content/drive/MyDrive/New_Dataset
--- Starting Data Subsetting and Optical Flow Preprocessing ---

--- Step 1: Preparing Local Dataset from the first 10 chunks per class ---

  Processing REAL frame chunks...
  Selected the following 10 chunks to process:
    - real_frames_chunk_10_090000-099999.zip
    - real_frames_chunk_11_100000-109999.zip
    - real_frames_chunk_12_110000-119999.zip
    - real_frames_chunk_13_120000-129999.zip
    - real_frames_chunk_14_130000-139999.zip
    - real_frames_chunk_15_140000-149999.zip
    - real_frames_chunk_16_150000-159999.zip
    - real_frames_chunk_17_160000-169999.zip
    - real_frames_chunk_18_170000-179999.zip
    - real_frames_chunk_19_180000-189999.zip

  Processing FAKE frame chunks...
  Selected the following 8 chunks to process:
    - fake_frames_chunk_12_110000-119999.zip
    - fake_frames_chunk_13_120000-129999.zip
    - fake_frames_chunk_14_130000-133999.zip
    - fa

Adding New Flow to Old Flow File


In [None]:
# === SCRIPT TO RECOMBINE, FLATTEN, AND RE-ZIP OLD AND NEW FLOW FILES ===

import os
import glob
import time
import shutil
import zipfile
from pathlib import Path
from google.colab import drive

# --- Configuration ---
# 1. Path to your OLD flow zip file
DRIVE_OLD_FLOW_ZIP_PATH = "/content/drive/MyDrive/Old_Flow_Might_Use/precomputed_flow30_SS30.zip"

# 2. Path to your NEW flow zip file
DRIVE_NEW_FLOW_ZIP_PATH = "/content/drive/MyDrive/New_Dataset/new_precomputed_flow.zip"

# 3. The destination on Drive for the FINAL MERGED zip file
DRIVE_FINAL_OUTPUT_DIR = "/content/drive/MyDrive/zipped"

# 4. The name for the final, combined zip file
FINAL_ZIP_NAME = "Combined_Precomputed_Flow.zip"

# 5. Temporary local directories
LOCAL_TEMP_ZIPS_DIR = "/content/temp_zips_for_recombine"
LOCAL_TEMP_UNZIPPED_DIR = "/content/temp_unzipped_for_recombine"
LOCAL_FINAL_MERGED_DIR = "/content/Final_Combined_Flow"
# --- End of Configuration ---


# --- Main Logic ---
print("--- Starting Robust Re-Combination Process for Flow Files ---")
drive.mount('/content/drive', force_remount=True)

# Below this many merged files the run is reported as suspicious (sanity check only).
MIN_EXPECTED_FLOW_FILES = 9000

final_zip_path_on_drive = os.path.join(DRIVE_FINAL_OUTPUT_DIR, FINAL_ZIP_NAME)
# Create the destination folder up front so the final upload cannot fail on a missing directory.
os.makedirs(DRIVE_FINAL_OUTPUT_DIR, exist_ok=True)

# --- Pre-computation Check ---
# Ask to delete the old file if it exists, to ensure a fresh start.
if os.path.exists(final_zip_path_on_drive):
    print(f"\nWARNING: The destination file '{FINAL_ZIP_NAME}' already exists.")
    user_input = input("Do you want to delete it and re-create it? (yes/no): ")
    if user_input.lower() == 'yes':
        print(f"Deleting old file: {final_zip_path_on_drive}")
        os.remove(final_zip_path_on_drive)
    else:
        print("Operation cancelled by user. Exiting.")
        import sys; sys.exit()

# --- Step 1: Create local directories ---
# Start from empty scratch folders so files left behind by an earlier (possibly
# failed) run can neither leak into the merged archive nor collide with new ones.
for local_dir in (LOCAL_TEMP_ZIPS_DIR, LOCAL_TEMP_UNZIPPED_DIR, LOCAL_FINAL_MERGED_DIR):
    shutil.rmtree(local_dir, ignore_errors=True)
    os.makedirs(local_dir, exist_ok=True)


# --- Step 2: Copy both source zips locally ---
print("\n--- Step 2: Copying source zip files locally... ---")
source_zips = [DRIVE_OLD_FLOW_ZIP_PATH, DRIVE_NEW_FLOW_ZIP_PATH]
copied_zips = []
for path in source_zips:
    if os.path.exists(path):
        # shutil.copy returns the path of the copy inside LOCAL_TEMP_ZIPS_DIR.
        copied_zips.append(shutil.copy(path, LOCAL_TEMP_ZIPS_DIR))
    else:
        print(f"  WARNING: Source zip not found, skipping: {path}")


# --- Step 3: Unzip SEPARATELY and then Consolidate ---
print(f"\n--- Step 3: Unzipping, finding, and consolidating all .pt files... ---")
all_found_pt_files = []
for zip_file_path in copied_zips:
    model_name = Path(zip_file_path).stem
    temp_extract_path = os.path.join(LOCAL_TEMP_UNZIPPED_DIR, model_name)
    os.makedirs(temp_extract_path, exist_ok=True)

    print(f"  Unzipping '{Path(zip_file_path).name}' into a temporary folder...")
    with zipfile.ZipFile(zip_file_path, 'r') as source_archive:
        source_archive.extractall(temp_extract_path)

    # Recursively find all .pt files inside this specific extraction
    search_pattern = os.path.join(temp_extract_path, '**', '*.pt')
    found_files = sorted(glob.glob(search_pattern, recursive=True))
    print(f"    -> Found {len(found_files):,} files in this archive.")
    all_found_pt_files.extend(found_files)

# Now, move all found files into the single, flat, final directory
print("\n  Moving all found files into the final consolidated directory...")
duplicate_file_names = []
for file_path in all_found_pt_files:
    destination_path = os.path.join(LOCAL_FINAL_MERGED_DIR, os.path.basename(file_path))
    if os.path.exists(destination_path):
        # shutil.move into a directory raises when the name is already taken;
        # keep the first copy and report the clash instead of crashing mid-merge.
        duplicate_file_names.append(os.path.basename(file_path))
        continue
    shutil.move(file_path, destination_path)
if duplicate_file_names:
    print(f"  WARNING: {len(duplicate_file_names):,} file name(s) occurred more than once; kept the first copy of each.")

# --- Step 4: Verify the consolidated folder ---
print("\n--- Step 4: Verifying the final merged folder... ---")
final_file_count = len(glob.glob(os.path.join(LOCAL_FINAL_MERGED_DIR, '*.pt')))
print("\n==============================================")
print(f"  ✅ VERIFICATION COMPLETE")
print(f"  Total files in the consolidated folder: {final_file_count:,}")
print("==============================================")
if final_file_count < MIN_EXPECTED_FLOW_FILES: # Sanity check
    print("WARNING: Final count is lower than expected. Please check the source zips.")


# --- Step 5: Zip the final, clean, consolidated folder ---
print(f"\n--- Step 5: Creating the final merged zip file '{FINAL_ZIP_NAME}'... ---")
local_final_zip_path = f"/content/{FINAL_ZIP_NAME}"
with zipfile.ZipFile(local_final_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Sorted so the archive's member order is the same on every run.
    for file in sorted(os.listdir(LOCAL_FINAL_MERGED_DIR)):
        zipf.write(os.path.join(LOCAL_FINAL_MERGED_DIR, file), arcname=file)
print("-> Final zip file created.")


# --- Step 6: Upload the final zip to Google Drive ---
print(f"\n--- Step 6: Uploading '{FINAL_ZIP_NAME}' to Google Drive... ---")
shutil.move(local_final_zip_path, final_zip_path_on_drive)
print("-> Upload complete.")


# --- Step 7: Final Cleanup ---
print("\n--- Step 7: Cleaning up all temporary local files... ---")
shutil.rmtree(LOCAL_TEMP_ZIPS_DIR, ignore_errors=True)
shutil.rmtree(LOCAL_TEMP_UNZIPPED_DIR, ignore_errors=True)
shutil.rmtree(LOCAL_FINAL_MERGED_DIR, ignore_errors=True)

print(f"\n\n--- Process finished successfully! ---")
print(f"✅ Your new, clean, and complete flow archive is saved at: '{final_zip_path_on_drive}'")

--- Starting Robust Re-Combination Process for Flow Files ---
Mounted at /content/drive

--- Step 2: Copying source zip files locally... ---

--- Step 3: Unzipping, finding, and consolidating all .pt files... ---
  Unzipping 'precomputed_flow30_SS30.zip' into a temporary folder...
    -> Found 5,452 files in this archive.
  Unzipping 'new_precomputed_flow.zip' into a temporary folder...
    -> Found 3,820 files in this archive.

  Moving all found files into the final consolidated directory...

--- Step 4: Verifying the final merged folder... ---

  ✅ VERIFICATION COMPLETE
  Total files in the consolidated folder: 9,272

--- Step 5: Creating the final merged zip file 'Combined_Precomputed_Flow.zip'... ---
-> Final zip file created.

--- Step 6: Uploading 'Combined_Precomputed_Flow.zip' to Google Drive... ---
-> Upload complete.

--- Step 7: Cleaning up all temporary local files... ---


--- Process finished successfully! ---
✅ Your new, clean, and complete flow archive is saved at: '/c

In [None]:
!pip install huggingface_hub




In [None]:
# === CROSS-DATASET PREPROCESSING: REAL + FAKE (FRAMES + FLOW, SEPARATE ZIPS) ===
import os
import cv2
import numpy as np
import torch
import random
import glob
import zipfile
import shutil
from pathlib import Path
from google.colab import drive

# --- Config ---
DRIVE_REAL_ZIP = "/content/drive/MyDrive/Test_Dataset/Real/Real.zip"
DRIVE_FAKE_ZIP = "/content/drive/MyDrive/Test_Dataset/Fake/Fake.zip"
LOCAL_REAL_DIR = "/content/Real_Unzipped"
LOCAL_FAKE_DIR = "/content/Fake_Unzipped"

OUTPUT_FRAMES_DIR = "/content/Frames_Output"
OUTPUT_FLOW_DIR = "/content/Flow_Output"

FINAL_REAL_FRAMES_ZIP = "/content/drive/MyDrive/Test_Dataset/Real/Frames_Real.zip"
FINAL_FAKE_FRAMES_ZIP = "/content/drive/MyDrive/Test_Dataset/Fake/Frames_Fake.zip"
FINAL_REAL_FLOW_ZIP   = "/content/drive/MyDrive/Test_Dataset/Real/Flow_Real.zip"
FINAL_FAKE_FLOW_ZIP   = "/content/drive/MyDrive/Test_Dataset/Fake/Flow_Fake.zip"

NUM_VIDEOS = 250
FRAMES_PER_VIDEO = 60
SEQ_LENGTH = 30
STEP_SIZE = 30

# --- Mount Drive ---
drive.mount('/content/drive', force_remount=True)

# --- Prepare Output Dirs ---
shutil.rmtree(OUTPUT_FRAMES_DIR, ignore_errors=True)
shutil.rmtree(OUTPUT_FLOW_DIR, ignore_errors=True)
os.makedirs(os.path.join(OUTPUT_FRAMES_DIR, "Real"), exist_ok=True)
os.makedirs(os.path.join(OUTPUT_FRAMES_DIR, "Fake"), exist_ok=True)
os.makedirs(os.path.join(OUTPUT_FLOW_DIR, "Real"), exist_ok=True)
os.makedirs(os.path.join(OUTPUT_FLOW_DIR, "Fake"), exist_ok=True)

# --- Helper: Unzip dataset ---
def unzip_to_local(zip_path, extract_dir):
    """Extract ``zip_path`` into ``extract_dir`` unless that folder already holds content.

    A non-empty ``extract_dir`` is treated as an earlier, finished extraction
    and reused as-is. A missing or empty folder is (re)filled from the
    archive. If extraction fails, the folder is removed again so the next call
    retries instead of reusing a partial result.

    Args:
        zip_path: Path to the ``.zip`` archive.
        extract_dir: Destination directory.

    Raises:
        Whatever ``zipfile.ZipFile`` / ``extractall`` raises (for example
        ``FileNotFoundError`` or ``zipfile.BadZipFile``).
    """
    if os.path.isdir(extract_dir) and os.listdir(extract_dir):
        print(f"ℹ️ Using existing extracted folder: {extract_dir}")
        return

    os.makedirs(extract_dir, exist_ok=True)
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
    except Exception:
        # Do not leave a half-filled folder behind; it would be mistaken for a
        # completed extraction on the next run.
        shutil.rmtree(extract_dir, ignore_errors=True)
        raise
    print(f"✅ Extracted {zip_path} to {extract_dir}")

# --- Helper: Extract frames ---
def extract_and_save_frames(video_path, save_folder, num_frames=60):
    """Save ``num_frames`` evenly spaced frames of a video as JPEG files.

    Frames are written to ``save_folder`` as
    ``<video stem>_frame_<NNN>.jpg`` where ``NNN`` is the position in the
    sampled sequence (not the index inside the video).

    Args:
        video_path: Path to the video file.
        save_folder: Output directory (created if missing).
        num_frames: Number of frames to sample.

    Returns:
        List of paths that were actually written, in sampling order. The list
        is empty when the video cannot be opened, reports no frames, or
        reports fewer than ``num_frames`` frames. It is shorter than
        ``num_frames`` when individual frames could not be read or written.
    """
    os.makedirs(save_folder, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return []

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # With fewer source frames than requested, linspace would repeat
        # indices and the same image would be saved several times, producing
        # artificial zero-motion pairs downstream. Report "not enough frames"
        # instead so the caller's length check skips the video.
        if frame_count <= 0 or frame_count < num_frames:
            return []

        indices = np.linspace(0, frame_count - 1, num=num_frames, dtype=int)
        video_stem = Path(video_path).stem
        saved_paths = []
        for idx_i, idx in enumerate(indices):
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret or frame is None:
                continue
            frame_path = os.path.join(save_folder, f"{video_stem}_frame_{idx_i:03d}.jpg")
            # imwrite reports failure through its return value; only list
            # files that really exist so later reads do not yield None.
            if cv2.imwrite(frame_path, frame):
                saved_paths.append(frame_path)
        return saved_paths
    finally:
        # Release the capture on every exit path, including exceptions.
        cap.release()

# --- Helper: Optical Flow ---
def compute_optical_flow(frames):
    """Reduce a list of BGR frames to per-frame optical-flow statistics.

    Row 0 is ``[0, 0]``; row ``k`` (k >= 1) holds the mean and the variance of
    the Farneback flow magnitude between frame ``k - 1`` and frame ``k``.
    Errors raised by OpenCV (for example for a ``None`` frame) propagate to
    the caller.

    Args:
        frames: Sequence of BGR images.

    Returns:
        A ``torch.float32`` tensor with one ``[mean, variance]`` row per frame
        (a single zero row for an empty input).
    """
    feature_rows = [np.array([0.0, 0.0], dtype=np.float32)]
    for previous_bgr, following_bgr in zip(frames, frames[1:]):
        previous_gray = cv2.cvtColor(previous_bgr, cv2.COLOR_BGR2GRAY)
        following_gray = cv2.cvtColor(following_bgr, cv2.COLOR_BGR2GRAY)
        dense_flow = cv2.calcOpticalFlowFarneback(
            previous_gray, following_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0
        )
        horizontal = dense_flow[..., 0]
        vertical = dense_flow[..., 1]
        magnitude = np.sqrt(horizontal**2 + vertical**2)
        feature_rows.append(
            np.array([np.mean(magnitude), np.var(magnitude)], dtype=np.float32)
        )
    stacked_rows = np.array(feature_rows)
    return torch.tensor(stacked_rows, dtype=torch.float32)

# --- Process videos (Real or Fake) ---
def process_videos(extract_dir, label, seed=None):
    """Sample videos of one class, extract their frames and save optical-flow tensors.

    Up to ``NUM_VIDEOS`` ``.mp4`` files found recursively below
    ``extract_dir`` are selected at random. For each one,
    ``FRAMES_PER_VIDEO`` frames are written to
    ``OUTPUT_FRAMES_DIR/<label>/<video stem>/`` and every ``SEQ_LENGTH``-frame
    window (stride ``STEP_SIZE``) is saved as
    ``OUTPUT_FLOW_DIR/<label>/<label lower>_<video stem>_seq<N>.pt``.

    Args:
        extract_dir: Directory containing the extracted videos.
        label: Class label, used for folder names and (lower-cased) file prefixes.
        seed: Optional seed for the video selection. ``None`` (default) keeps
            using the global ``random`` state; an integer makes the selection
            reproducible without touching that global state.

    Returns:
        None.
    """
    video_paths = sorted(glob.glob(os.path.join(extract_dir, "**", "*.mp4"), recursive=True))
    sampler = random if seed is None else random.Random(seed)
    selected = sampler.sample(video_paths, min(NUM_VIDEOS, len(video_paths)))
    print(f"Processing {len(selected)} {label} videos...")

    # torch.save does not create directories; do not rely on a setup cell having run.
    flow_save_dir = os.path.join(OUTPUT_FLOW_DIR, label)
    os.makedirs(flow_save_dir, exist_ok=True)

    for vid_idx, vid_path in enumerate(selected):
        vid_name = Path(vid_path).stem
        frame_save_dir = os.path.join(OUTPUT_FRAMES_DIR, label, vid_name)
        frame_paths = extract_and_save_frames(vid_path, frame_save_dir, FRAMES_PER_VIDEO)

        if len(frame_paths) < FRAMES_PER_VIDEO:
            print(f"⚠️ Skipping {vid_name}, not enough frames.")
            continue

        # Load frames for flow
        frames = [cv2.imread(fp) for fp in frame_paths]
        if any(frame is None for frame in frames):
            # A frame that cannot be read back would crash the flow computation.
            print(f"⚠️ Skipping {vid_name}, could not read back all frames.")
            continue

        # Non-overlapping SEQ_LENGTH-frame sequences when STEP_SIZE == SEQ_LENGTH;
        # the range bound guarantees every slice is full length.
        for i in range(0, len(frames) - SEQ_LENGTH + 1, STEP_SIZE):
            seq_frames = frames[i:i+SEQ_LENGTH]
            flow_tensor = compute_optical_flow(seq_frames)
            save_name = f"{label.lower()}_{vid_name}_seq{i//STEP_SIZE}.pt"
            torch.save(flow_tensor, os.path.join(flow_save_dir, save_name))

        if (vid_idx + 1) % 20 == 0:
            print(f"  Processed {vid_idx+1}/{len(selected)} {label} videos")

# --- Main ---
# Driver: extract both source archives, process each class, then archive the
# generated frames and flow tensors and move the four zips to Drive.
print("=== Starting Preprocessing for Real + Fake Videos ===")
for source_zip, local_dir in ((DRIVE_REAL_ZIP, LOCAL_REAL_DIR), (DRIVE_FAKE_ZIP, LOCAL_FAKE_DIR)):
    unzip_to_local(source_zip, local_dir)

for local_dir, class_label in ((LOCAL_REAL_DIR, "Real"), (LOCAL_FAKE_DIR, "Fake")):
    process_videos(local_dir, class_label)

# (local archive base name, folder to archive, final location on Drive)
archive_plan = [
    ("/content/Frames_Real", os.path.join(OUTPUT_FRAMES_DIR, "Real"), FINAL_REAL_FRAMES_ZIP),
    ("/content/Frames_Fake", os.path.join(OUTPUT_FRAMES_DIR, "Fake"), FINAL_FAKE_FRAMES_ZIP),
    ("/content/Flow_Real", os.path.join(OUTPUT_FLOW_DIR, "Real"), FINAL_REAL_FLOW_ZIP),
    ("/content/Flow_Fake", os.path.join(OUTPUT_FLOW_DIR, "Fake"), FINAL_FAKE_FLOW_ZIP),
]

# --- Zip frames and flow separately ---
# make_archive appends ".zip" to the base name it is given.
for archive_base, folder_to_zip, _ in archive_plan:
    shutil.make_archive(archive_base, 'zip', folder_to_zip)

# --- Move zips to Drive ---
for archive_base, _, drive_target in archive_plan:
    shutil.move(archive_base + ".zip", drive_target)

print("✅ Done! Saved zips to Drive:")
for _, _, drive_target in archive_plan:
    print("   ", drive_target)


Mounted at /content/drive
=== Starting Preprocessing for Real + Fake Videos ===
ℹ️ Using existing extracted folder: /content/Real_Unzipped
ℹ️ Using existing extracted folder: /content/Fake_Unzipped
Processing 250 Real videos...
  Processed 20/250 Real videos
  Processed 40/250 Real videos
  Processed 60/250 Real videos
  Processed 80/250 Real videos
  Processed 100/250 Real videos
  Processed 120/250 Real videos
  Processed 140/250 Real videos
  Processed 160/250 Real videos
  Processed 180/250 Real videos
  Processed 200/250 Real videos
  Processed 220/250 Real videos
  Processed 240/250 Real videos
Processing 250 Fake videos...
  Processed 20/250 Fake videos
  Processed 40/250 Fake videos
  Processed 60/250 Fake videos
  Processed 80/250 Fake videos
  Processed 100/250 Fake videos
  Processed 120/250 Fake videos
  Processed 140/250 Fake videos
  Processed 160/250 Fake videos
  Processed 180/250 Fake videos
  Processed 200/250 Fake videos
  Processed 220/250 Fake videos
  Processed 24

In [None]:
import os
import cv2
import numpy as np
import torch
import random
import glob
import zipfile
import shutil
from pathlib import Path
from google.colab import drive

# --- Config ---
# Source archive on Drive and the local folder it is extracted into.
DRIVE_FAKE_ZIP = "/content/drive/MyDrive/Test_Dataset/Fake/Fake.zip"
LOCAL_FAKE_DIR = "/content/Fake_Unzipped"

# Local working folders: one sub-folder of JPEG frames per video, and a flat
# folder of optical-flow tensors (.pt files).
OUTPUT_FRAMES_DIR = "/content/Frames_Output/Fake"
OUTPUT_FLOW_DIR = "/content/Flow_Output/Fake"

# Where the zipped results are moved to on Drive.
FINAL_FAKE_FRAMES_ZIP = "/content/drive/MyDrive/Test_Dataset/Fake/Frames_Fake.zip"
FINAL_FAKE_FLOW_ZIP   = "/content/drive/MyDrive/Test_Dataset/Fake/Flow_Fake.zip"

NUM_VIDEOS = 250        # upper bound on how many videos are sampled
FRAMES_PER_VIDEO = 60   # evenly spaced frames extracted from each video
SEQ_LENGTH = 30         # frames per optical-flow sequence
STEP_SIZE = 30          # stride between sequence starts (== SEQ_LENGTH, so windows do not overlap)
SEED = 42               # makes the random video selection repeatable across runs

# Seed the global RNG used by random.sample() so re-running this cell picks
# the same subset of videos.
random.seed(SEED)

# --- Mount Drive ---
drive.mount('/content/drive', force_remount=True)

# --- Prepare Output Dirs ---
# Start from empty output folders so leftovers from an earlier run are not zipped.
for output_dir in (OUTPUT_FRAMES_DIR, OUTPUT_FLOW_DIR):
    shutil.rmtree(output_dir, ignore_errors=True)
for output_dir in (OUTPUT_FRAMES_DIR, OUTPUT_FLOW_DIR):
    os.makedirs(output_dir, exist_ok=True)

# --- Helper: Unzip dataset ---
def unzip_to_local(zip_path, extract_dir):
    """Extract ``zip_path`` into ``extract_dir`` unless that folder already exists.

    An existing ``extract_dir`` is treated as a finished earlier extraction
    and is reused without opening the archive.

    Args:
        zip_path: Path of the zip archive to extract.
        extract_dir: Folder that receives the archive contents.

    Raises:
        Whatever ``zipfile.ZipFile`` / ``extractall`` raise (e.g.
        ``FileNotFoundError`` or ``zipfile.BadZipFile``). In that case the
        partially created ``extract_dir`` is removed first, so a later call
        retries the extraction instead of reusing an incomplete folder.
    """
    if os.path.exists(extract_dir):
        print(f"ℹ️ Using existing extracted folder: {extract_dir}")
        return

    os.makedirs(extract_dir, exist_ok=True)
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
    except BaseException:
        # Includes KeyboardInterrupt (an interrupted notebook cell): a
        # half-filled folder would otherwise pass the existence check above
        # on the next run. The exception is re-raised unchanged.
        shutil.rmtree(extract_dir, ignore_errors=True)
        raise
    print(f"✅ Extracted {zip_path} to {extract_dir}")

# --- Helper: Extract frames ---
def extract_and_save_frames(video_path, save_folder, num_frames=60):
    """Save evenly spaced frames of a video as JPEG files.

    Frame positions are ``num_frames`` values spread with ``np.linspace``
    between the first and last frame index and truncated to integers, so a
    video with fewer than ``num_frames`` frames yields repeated positions.

    Args:
        video_path: Path of the video file.
        save_folder: Folder for the JPEGs; created if missing.
        num_frames: Number of frame positions to sample.

    Returns:
        Paths of the frames that were both read and written successfully,
        named ``<video stem>_frame_<NNN>.jpg`` where ``NNN`` is the position
        in the sampling order. The list is shorter than ``num_frames`` when
        reads or writes fail, and empty when the video cannot be opened or
        reports no frames.
    """
    os.makedirs(save_folder, exist_ok=True)
    saved_paths = []
    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            return saved_paths
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if frame_count <= 0:
            return saved_paths

        video_stem = Path(video_path).stem
        indices = np.linspace(0, frame_count - 1, num=num_frames, dtype=int)
        for idx_i, idx in enumerate(indices):
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret:
                continue
            frame_path = os.path.join(save_folder, f"{video_stem}_frame_{idx_i:03d}.jpg")
            # imwrite signals failure through its return value; only report
            # paths that actually exist so callers can re-read them safely.
            if cv2.imwrite(frame_path, frame):
                saved_paths.append(frame_path)
    finally:
        # Release the capture even if reading or writing raises.
        cap.release()
    return saved_paths

# --- Helper: Optical Flow ---
def compute_optical_flow(frames):
    """Summarise dense optical flow between consecutive frames.

    Each consecutive pair of frames is converted to grayscale and passed to
    ``cv2.calcOpticalFlowFarneback``; the per-pixel flow magnitude is reduced
    to its mean and variance.

    Args:
        frames: Sequence of images accepted by
            ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.

    Returns:
        A ``torch.float32`` tensor with one ``[mean, variance]`` row per
        frame. Row 0 is ``[0, 0]`` because the first frame has no
        predecessor; that row is present even when ``frames`` is empty.
    """
    flows = [np.zeros(2, dtype=np.float32)]
    if len(frames) >= 2:
        # Convert each frame once and carry it forward as the next pair's
        # "previous" image instead of converting interior frames twice.
        prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY)
        for frame in frames[1:]:
            next_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            # Positional args after `None` (the flow buffer) follow OpenCV's
            # order: pyr_scale, levels, winsize, iterations, poly_n,
            # poly_sigma, flags.
            flow_cv = cv2.calcOpticalFlowFarneback(prev_gray, next_gray, None,
                                                   0.5, 3, 15, 3, 5, 1.2, 0)
            mag = np.sqrt(flow_cv[..., 0]**2 + flow_cv[..., 1]**2)
            flows.append(np.array([np.mean(mag), np.var(mag)], dtype=np.float32))
            prev_gray = next_gray
    return torch.tensor(np.array(flows), dtype=torch.float32)

# --- Process videos ---
def process_fake_videos(extract_dir):
    """Extract frames and optical-flow tensors for a random sample of videos.

    Recursively collects every ``*.mp4`` under ``extract_dir`` (sorted), draws
    up to ``NUM_VIDEOS`` of them with ``random.sample`` (global ``random``
    state) and, for each selected video:

    1. saves ``FRAMES_PER_VIDEO`` frames into
       ``OUTPUT_FRAMES_DIR/<idx>_<stem>/``;
    2. re-reads the frames and cuts them into windows of ``SEQ_LENGTH``
       frames starting every ``STEP_SIZE`` frames;
    3. stores each window's flow tensor in ``OUTPUT_FLOW_DIR`` as
       ``fake_<idx>_<stem>_seq<k>.pt``.

    A video is skipped when fewer than ``FRAMES_PER_VIDEO`` frames were saved
    or when any saved frame cannot be read back. Its frame folder is deleted
    so the frames output only contains videos that also have flow tensors.

    Args:
        extract_dir: Root folder searched for ``.mp4`` files.
    """
    video_paths = sorted(glob.glob(os.path.join(extract_dir, "**", "*.mp4"), recursive=True))
    selected = random.sample(video_paths, min(NUM_VIDEOS, len(video_paths)))
    print(f"Processing {len(selected)} Fake videos...")

    for vid_idx, vid_path in enumerate(selected):
        # Prefix with the running index so videos sharing a stem don't collide.
        unique_vid_name = f"{vid_idx:03d}_{Path(vid_path).stem}"

        frame_save_dir = os.path.join(OUTPUT_FRAMES_DIR, unique_vid_name)
        frame_paths = extract_and_save_frames(vid_path, frame_save_dir, FRAMES_PER_VIDEO)

        complete = len(frame_paths) >= FRAMES_PER_VIDEO
        frames = [cv2.imread(fp) for fp in frame_paths] if complete else []

        # cv2.imread returns None for an unreadable file; such a frame would
        # crash the grayscale conversion during flow computation.
        if not complete or any(frame is None for frame in frames):
            shutil.rmtree(frame_save_dir, ignore_errors=True)
            print(f"⚠️ Skipping {unique_vid_name}, not enough frames.")
        else:
            # The range bound guarantees every slice holds exactly SEQ_LENGTH
            # frames; start == seq_idx * STEP_SIZE.
            window_starts = range(0, len(frames) - SEQ_LENGTH + 1, STEP_SIZE)
            for seq_idx, start in enumerate(window_starts):
                flow_tensor = compute_optical_flow(frames[start:start + SEQ_LENGTH])
                save_name = f"fake_{unique_vid_name}_seq{seq_idx}.pt"
                torch.save(flow_tensor, os.path.join(OUTPUT_FLOW_DIR, save_name))

        # Reported for skipped videos too, so progress lines are never dropped.
        if (vid_idx + 1) % 20 == 0:
            print(f"  Processed {vid_idx+1}/{len(selected)} Fake videos")

# --- Main ---
print("=== Starting Preprocessing for Fake Videos ===")
unzip_to_local(DRIVE_FAKE_ZIP, LOCAL_FAKE_DIR)
process_fake_videos(LOCAL_FAKE_DIR)

# --- Zip frames and flow separately ---
# One row per archive: (archive path without ".zip", folder to zip, Drive destination).
export_plan = (
    ("/content/Frames_Fake", OUTPUT_FRAMES_DIR, FINAL_FAKE_FRAMES_ZIP),
    ("/content/Flow_Fake", OUTPUT_FLOW_DIR, FINAL_FAKE_FLOW_ZIP),
)
for archive_base, source_dir, drive_path in export_plan:
    # make_archive appends ".zip" to archive_base itself.
    shutil.make_archive(archive_base, 'zip', source_dir)

# --- Move zips to Drive ---
# Moving starts only after both archives have been built.
for archive_base, source_dir, drive_path in export_plan:
    shutil.move(archive_base + ".zip", drive_path)

print("✅ Done! Saved zips to Drive:")
for archive_base, source_dir, drive_path in export_plan:
    print("   ", drive_path)


Mounted at /content/drive
=== Starting Preprocessing for Fake Videos ===
✅ Extracted /content/drive/MyDrive/Test_Dataset/Fake/Fake.zip to /content/Fake_Unzipped
Processing 250 Fake videos...
  Processed 20/250 Fake videos
  Processed 40/250 Fake videos
  Processed 60/250 Fake videos
  Processed 80/250 Fake videos
  Processed 100/250 Fake videos
  Processed 120/250 Fake videos
  Processed 140/250 Fake videos
  Processed 160/250 Fake videos
  Processed 180/250 Fake videos
  Processed 200/250 Fake videos
  Processed 220/250 Fake videos
  Processed 240/250 Fake videos
✅ Done! Saved zips to Drive:
    /content/drive/MyDrive/Test_Dataset/Fake/Frames_Fake.zip
    /content/drive/MyDrive/Test_Dataset/Fake/Flow_Fake.zip
