# 01 - Data Preprocessing (Local)
This notebook performs dataset splitting, frame sampling (8 frames/video), and face cropping. It uses helper scripts in src/data/.
Before running, select the conda environment that has PyTorch with CUDA support as the notebook kernel.


In [1]:
import torch
# Sanity check: show the installed PyTorch build and whether CUDA is usable from this kernel.
print(f"{torch.__version__} {torch.cuda.is_available()}")


2.9.1+cu128 True


In [2]:
import json
import os
import random
import shutil
import sys
from pathlib import Path
from pprint import pprint

import yaml

def find_project_root(start, marker=("src", "data")):
    """Return the closest directory at or above ``start`` that contains ``marker``.

    Args:
        start: Directory to begin the upward search from.
        marker: Path components that must exist (as a directory) directly under
            the project root. Defaults to ``src/data``, where this notebook's
            helper scripts are expected to live.

    Returns:
        The resolved root directory, or the resolved ``start`` itself when no
        ancestor contains the marker (i.e. the previous cwd-based behaviour).
    """
    start = Path(start).resolve()
    for candidate in (start, *start.parents):
        if candidate.joinpath(*marker).is_dir():
            return candidate
    return start


# Jupyter usually starts the kernel in the notebook's own folder (e.g. notebooks/),
# not in the repo root, so locate the root instead of trusting the cwd.
PROJECT_ROOT = find_project_root(Path.cwd())
if PROJECT_ROOT != Path.cwd().resolve():
    # Later cells use cwd-relative paths such as "data/train.txt"; keep them
    # anchored at the project root as well.
    os.chdir(PROJECT_ROOT)
print("Project root:", PROJECT_ROOT)

# Paths (edit if your raw videos live elsewhere)
RAW_DATA_DIR = PROJECT_ROOT / "data_raw"   # create this and put video subsets inside
PREPROC_FRAMES_DIR = PROJECT_ROOT / "preprocessed" / "frames"
PREPROC_FACES_DIR = PROJECT_ROOT / "preprocessed" / "faces"
EMB_DIR = PROJECT_ROOT / "embeddings"
CKPT_DIR = PROJECT_ROOT / "checkpoints"
LOG_DIR = PROJECT_ROOT / "logs"

# Output directories are created up front; RAW_DATA_DIR is intentionally not
# created here because it must be populated by the user.
for p in [PREPROC_FRAMES_DIR, PREPROC_FACES_DIR, EMB_DIR, CKPT_DIR, LOG_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Example: expected structure under data_raw:
# data_raw/dfdc_subset/*.mp4
# data_raw/ffpp_c23/*.mp4


Project root: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC\notebooks


In [3]:
# Deterministic split script (similar to your Colab split)
import glob, math, hashlib

def deterministic_split(video_paths, seed=42, train_frac=0.80, val_frac=0.15, reserve_count=200):
    """Split videos into train / val / internal-test / reserved lists reproducibly.

    The input is sorted before shuffling, so the result depends only on the set
    of paths and ``seed``, not on the order in which the paths were discovered.

    Args:
        video_paths: Iterable of mutually comparable items (e.g. ``Path`` or ``str``).
        seed: Seed for the shuffle.
        train_frac: Fraction of the non-reserved videos assigned to train.
        val_frac: Fraction of the non-reserved videos assigned to val.
        reserve_count: Number of videos held out first. If it is greater than
            or equal to the number of videos, every video ends up in
            ``reserved`` and the other three lists are empty. Values <= 0
            disable the reserve.

    Returns:
        Tuple ``(train, val, test_internal, reserved)`` of lists;
        ``test_internal`` receives whatever is left after train and val.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to more than 1.
    """
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1.0 + 1e-9:
        raise ValueError(
            f"train_frac and val_frac must be non-negative and sum to at most 1 "
            f"(got {train_frac} and {val_frac})"
        )

    # A private generator yields the same permutation as random.seed(seed) +
    # random.shuffle(), but does not reseed the process-wide RNG as a side effect.
    rng = random.Random(seed)
    vids = sorted(video_paths)
    rng.shuffle(vids)

    # Hold out the tail of the shuffled list before computing the fractions.
    if reserve_count > 0:
        reserved = vids[-reserve_count:]
        vids = vids[:-reserve_count]
    else:
        reserved = []

    n = len(vids)
    n_train = int(n * train_frac)
    n_val = int(n * val_frac)
    train = vids[:n_train]
    val = vids[n_train:n_train + n_val]
    test_internal = vids[n_train + n_val:]
    return train, val, test_internal, reserved

# Collect raw videos from RAW_DATA_DIR, so editing that one path in the config
# cell is enough to point the notebook at a different dataset location.
dfdc_videos = sorted((RAW_DATA_DIR / "dfdc_subset").glob("*.mp4"))
ffpp_videos = sorted((RAW_DATA_DIR / "ffpp_c23").glob("*.mp4"))
all_videos = dfdc_videos + ffpp_videos
print("Found total videos:", len(all_videos))
if not all_videos:
    # Make the empty case visible here instead of producing empty splits silently.
    print("WARNING: no .mp4 files found under", RAW_DATA_DIR, "- all splits will be empty")

train, val, test_internal, reserved = deterministic_split(all_videos, seed=42, reserve_count=200)
print("train:", len(train), "val:", len(val), "test_internal:", len(test_internal), "reserved:", len(reserved))
if all_videos and not train:
    # Happens when the number of videos does not exceed reserve_count.
    print("WARNING: every video went to the reserved set; lower reserve_count or add more videos")

# Save lists
Path("data").mkdir(exist_ok=True)
def save_list(paths, out_file):
    """Write one resolved (absolute) path per line to ``out_file``.

    Args:
        paths: Iterable of objects exposing ``.resolve()`` (e.g. ``pathlib.Path``).
        out_file: Destination text file; it is overwritten if it already exists.
    """
    lines = (f"{entry.resolve()}\n" for entry in paths)
    with open(out_file, "w") as handle:
        handle.writelines(lines)

# Persist every split as a newline-delimited list of absolute video paths.
split_files = {
    "data/train.txt": train,
    "data/val.txt": val,
    "data/test_internal.txt": test_internal,
    "data/reserved_200.txt": reserved,
}
for list_path, split_paths in split_files.items():
    save_list(split_paths, list_path)

# Overwrite data_manifest.yaml with the dataset locations and split sizes.
# Sources are derived from RAW_DATA_DIR so the manifest stays correct when the
# raw-data location is changed in the config cell.
manifest = {
    "datasets": {
        "dfdc_subset": {"source": str((RAW_DATA_DIR / "dfdc_subset").resolve()), "videos": len(dfdc_videos)},
        "ffpp_c23": {"source": str((RAW_DATA_DIR / "ffpp_c23").resolve()), "videos": len(ffpp_videos)},
    },
    "splits": {
        "train": len(train),
        "val": len(val),
        "test_internal": len(test_internal),
        "reserved": len(reserved)
    }
}
with open("data_manifest.yaml", "w") as f:
    yaml.dump(manifest, f)
print("Wrote data_manifest.yaml")
pprint(manifest)


Found total videos: 0
train: 0 val: 0 test_internal: 0 reserved: 0
Wrote data_manifest.yaml
{'datasets': {'dfdc_subset': {'source': 'C:\\Users\\lkmah\\OneDrive\\Desktop\\Lokesh\\VS '
                                        'Code\\DeepFake_Detection_SIC\\notebooks\\data_raw\\dfdc_subset',
                              'videos': 0},
              'ffpp_c23': {'source': 'C:\\Users\\lkmah\\OneDrive\\Desktop\\Lokesh\\VS '
                                     'Code\\DeepFake_Detection_SIC\\notebooks\\data_raw\\ffpp_c23',
                           'videos': 0}},
 'splits': {'reserved': 0, 'test_internal': 0, 'train': 0, 'val': 0}}


In [4]:
# Use helper script extract_frames.py for each split. We'll call it programmatically here.
from subprocess import run
from pathlib import Path

# Location of the frame-extraction helper; stop here if it is not present.
EXTRACT_SCRIPT = PROJECT_ROOT.joinpath("src", "data", "extract_frames.py")
assert EXTRACT_SCRIPT.exists(), "Missing extract_frames.py in src/data"

def run_extract(list_file, out_dir, n=8):
    """Invoke the frame-extraction script once per video listed in ``list_file``.

    Args:
        list_file: Text file with one video path per line; blank lines are skipped.
        out_dir: Root output directory. Each video gets a sub-directory named
            after its file stem (so two videos sharing a stem share a directory).
        n: Value forwarded to the script's ``--n`` option.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
    """
    with open(list_file) as f:
        vids = [line.strip() for line in f if line.strip()]
    print("Extracting frames for", len(vids), "videos to", out_dir)

    # Run the helper with the interpreter backing this kernel, so it sees the
    # same environment (a bare "python" may resolve to a different install).
    python_exe = sys.executable or "python"
    for v in vids:
        video = Path(v)
        # NOTE(review): the script is given a single file as --src; the original
        # comment suggests it may expect a folder - confirm against extract_frames.py.
        cmd = [
            python_exe, str(EXTRACT_SCRIPT),
            "--src", str(v),
            "--out", str(Path(out_dir) / video.stem),
            "--n", str(n),
            # Extension of the file name only; unaffected by dots in directory names.
            "--ext", video.suffix.lstrip("."),
        ]
        run(cmd, check=True)

# Smoke test on the train list first (this processes every video in data/train.txt).
run_extract(list_file="data/train.txt", out_dir=PREPROC_FRAMES_DIR / "train", n=8)
# Optionally run for val/test when smoke-test OK


AssertionError: Missing extract_frames.py in src/data

In [None]:
# Crop faces from the extracted frames with the face_crop.py helper
# (MTCNN-based according to the original note - not verifiable from this notebook).
CROP_SCRIPT = PROJECT_ROOT / "src" / "data" / "face_crop.py"
assert CROP_SCRIPT.exists(), "Missing face_crop.py in src/data"
# Crop faces for the subset we created in PREPROC_FRAMES_DIR / "train".
# sys.executable keeps the helper in the kernel's environment; check=True makes
# a failing crop raise instead of being ignored.
run(
    [
        sys.executable or "python", str(CROP_SCRIPT),
        "--frames_root", str(PREPROC_FRAMES_DIR / "train"),
        "--out_root", str(PREPROC_FACES_DIR / "train"),
        "--size", "224",
    ],
    check=True,
)
# Repeat for val/test when everything ok


In [None]:
# Check how many faces per video and write small manifest
from pathlib import Path
faces_root = PREPROC_FACES_DIR / "train"
# Sort so the printed sample and the manifest order are reproducible;
# iterdir() yields entries in arbitrary order.
video_dirs = sorted(p for p in faces_root.iterdir() if p.is_dir())

# Count the cropped-face images once per video directory and reuse the result.
summary = {v.name: len(list(v.glob("*_face.jpg"))) for v in video_dirs}
print("Videos with faces:", len(video_dirs))
# print first 10 entries
for name, count in list(summary.items())[:10]:
    print(name, count)

# Save small manifest ("<video dir name>,<face count>" per line) next to the
# faces directory, independent of the current working directory.
manifest_path = PREPROC_FACES_DIR.parent / "face_manifest_train.txt"
with open(manifest_path, "w") as f:
    for name, count in summary.items():
        f.write(f"{name},{count}\n")
print("Wrote preprocessed/face_manifest_train.txt")
