# 01 - Simple dataset split
This notebook creates deterministic train/val/test splits and a labels.json mapping.
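It expects your preprocessed face-only folders to be present in the `PREPROC` directory configured in the first code cell (currently `<project root>/Dataset`; the `preprocessed/` prefix in the list below stands for that directory):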
- preprocessed/DFDC_REAL_Face_only_data/
- preprocessed/DFDC_FAKE_Face_only_data/
- preprocessed/FF_Face_only_data/  (optional metadata.csv)
- preprocessed/Celeb_real_face_only/  (optional: not used for training)
- preprocessed/Celeb_fake_face_only/  (optional: not used for training)


In [None]:
from pathlib import Path
import random
import json
import yaml
import pandas as pd   # used only if ffpp metadata present

# --------- USER EDIT ---------
ROOT = Path.cwd().parent        # parent of the current working directory
print("ROOT:", ROOT)
PREPROC = ROOT / "Dataset"      # where your preprocessed face-folders live
print("PREPROC:", PREPROC)
OUT_DIR = ROOT / "data"         # outputs written here
RESERVE_COUNT = 200             # internal reserved set
SEED = 42                       # deterministic seed
TRAIN_FRAC = 0.80               # fraction of the non-reserved items used for training
VAL_FRAC = 0.15                 # fraction of the non-reserved items used for validation
# -----------------------------

# Fail fast on fractions that cannot form a valid split; whatever is left after
# train + val becomes the internal test split. The small tolerance absorbs
# floating-point rounding when the two fractions are meant to sum to exactly 1.
if TRAIN_FRAC < 0 or VAL_FRAC < 0 or TRAIN_FRAC + VAL_FRAC > 1.0 + 1e-9:
    raise ValueError(
        f"TRAIN_FRAC ({TRAIN_FRAC}) and VAL_FRAC ({VAL_FRAC}) must be "
        "non-negative and sum to at most 1.0"
    )

# dataset folders (update names only if your folders are different)
DFDC_REAL = PREPROC / "DFDC_REAL_Face_only_data"
DFDC_FAKE = PREPROC / "DFDC_FAKE_Face_only_data"
FFPP_DIR = PREPROC / "FF_Face_only_data"          # optional
FFPP_META = FFPP_DIR / "metadata.csv"             # optional file
CELEB_REAL = PREPROC / "Celeb_real_face_only"     # optional (held-out)
CELEB_FAKE = PREPROC / "Celeb_fake_face_only"     # optional (held-out)

# parents=True so that a nested OUT_DIR chosen in the USER EDIT section is
# created as well instead of raising FileNotFoundError.
OUT_DIR.mkdir(parents=True, exist_ok=True)


ROOT: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC
PREPROC: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC\Dataset


In [None]:
def list_videos(folder: Path):
    """Return the per-video entries found directly inside ``folder``.

    If ``folder`` contains at least one subfolder, every subfolder is treated
    as one video (a preprocessed face folder) and only the subfolders are
    returned; loose files next to them are ignored. Otherwise the files
    directly inside ``folder`` are returned. The scan is not recursive.

    Args:
        folder: Directory to scan.

    Returns:
        A list of Path objects sorted by path. The list is empty when
        ``folder`` does not exist or is not a directory.
    """
    # is_dir() is False for a missing path *and* for a regular file; a plain
    # exists() check would let a file through and iterdir() would then raise.
    if not folder.is_dir():
        return []
    # One directory scan, sorted so the result does not depend on the order in
    # which the filesystem happens to enumerate entries.
    children = sorted(folder.iterdir())
    subdirs = [p for p in children if p.is_dir()]
    if subdirs:
        return subdirs
    # No subfolders: fall back to the files that sit directly in the folder.
    return [p for p in children if p.is_file()]

# Accumulator for every discovered video.
# Each entry is a dict: { "path": str, "stem": str, "label": 0/1, "source": name }
entries = []

# DFDC: everything in the real folder gets label 0, everything in the fake
# folder gets label 1. Real is processed first, then fake.
for _folder, _label, _source in (
    (DFDC_REAL, 0, "dfdc_real"),
    (DFDC_FAKE, 1, "dfdc_fake"),
):
    entries.extend(
        {"path": str(_p), "stem": _p.stem, "label": _label, "source": _source}
        for _p in list_videos(_folder)
    )

# FF++ (try metadata first if present, else infer from parent or filename)

# Label cell values (stripped, lower-cased) that mark a video as fake.
_FF_FAKE_TOKENS = ('1', 'true', 'fake', 't', 'y', 'yes')
# Substrings that identify the label column when there is no 'label' column.
_FF_LABEL_HINTS = ('label', 'fake', 'class', 'manipulated')


def _ff_parse_label(value):
    """Convert one metadata label cell to 1 (fake), 0 (real) or None (missing).

    Known "fake" tokens map to 1. Any other numeric value maps to 1 when it is
    non-zero and to 0 otherwise, so a column that pandas loaded as floats
    (``1.0`` / ``0.0``) is still read correctly. Any other non-empty text maps
    to 0. Missing cells (NaN / empty) return None so the caller can skip the
    row instead of silently labelling it as real.
    """
    if pd.isna(value):
        return None
    text = str(value).strip().lower()
    if not text:
        return None
    if text in _FF_FAKE_TOKENS:
        return 1
    try:
        return 1 if float(text) != 0 else 0
    except ValueError:
        return 0


# Maps video stem -> label (0/1) for every usable row of metadata.csv.
ff_meta = {}
if FFPP_META.exists():
    try:
        df = pd.read_csv(FFPP_META)
        cols = list(df.columns)
        # prefer columns named 'video' and 'label' (if they exist)
        if 'video' in cols and 'label' in cols:
            keycol, labcol = 'video', 'label'
        else:
            # simple heuristic: the first column is the key; the label is the
            # first later column whose name looks label-like, else the second
            # column.
            keycol = cols[0] if cols else None
            labcol = next(
                (c for c in cols[1:]
                 if any(hint in str(c).lower() for hint in _FF_LABEL_HINTS)),
                None,
            )
            if labcol is None and len(cols) > 1:
                labcol = cols[1]
        if keycol is not None and labcol is not None:
            # Rows are handled one by one so a single unusable row is skipped
            # rather than aborting the loop with a half-filled mapping.
            for raw_key, raw_label in zip(df[keycol], df[labcol]):
                parsed = _ff_parse_label(raw_label)
                if parsed is None or pd.isna(raw_key):
                    continue
                # str() so keys that pandas parsed as numbers still work.
                ff_meta[Path(str(raw_key)).stem] = parsed
    except Exception as e:
        # Deliberate best-effort: metadata is optional, so warn and continue
        # with whatever was collected.
        print("Warning: unable to parse FF++ metadata.csv:", e)

# Now list FF++ videos/folders and map labels
for p in list_videos(FFPP_DIR):
    stem = p.stem
    label = ff_meta.get(stem)
    if label is None:
        # Fallback: infer from the entry's own name, then from the name of the
        # folder that directly contains it. Only these two names are checked:
        # searching the whole absolute path would match unrelated ancestor
        # folders (e.g. a project directory called "DeepFake_...") and label
        # every entry as fake.
        for hint in (p.name.lower(), p.parent.name.lower()):
            if 'fake' in hint:
                label = 1
                break
            if 'real' in hint:
                label = 0
                break
        if label is None:
            # if cannot determine label, skip (safer)
            print(f"Skipping FF++ entry (no label): {p}")
            continue
    entries.append({"path": str(p), "stem": stem, "label": int(label), "source": "ffpp"})

# Celeb-DF (optional): real folder -> label 0, fake folder -> label 1.
# NOTE(review): these entries are appended to the same `entries` list as the
# DFDC / FF++ data although the folders are described as held-out; confirm that
# whatever consumes `entries` keeps them out of training.
for _folder, _label, _source in (
    (CELEB_REAL, 0, "celeb_real"),
    (CELEB_FAKE, 1, "celeb_fake"),
):
    entries.extend(
        {"path": str(_p), "stem": _p.stem, "label": _label, "source": _source}
        for _p in list_videos(_folder)
    )

print("Collected entries:", len(entries))
# peek at the first 6 entries (displayed as the cell result)
entries[:6]


In [None]:
# Deterministic shuffle + split.
# Sorting by path first gives a stable starting order, so the seeded shuffle
# produces the same permutation regardless of the order entries were collected.
# NOTE(review): every source in `entries` is shuffled into one pool here,
# including the Celeb-DF folders described as held-out -- confirm this is
# intended.
random.seed(SEED)
entries_sorted = list(entries)
entries_sorted.sort(key=lambda item: item['path'])
random.shuffle(entries_sorted)

# The reserved set is the tail of the shuffled list; a non-positive
# RESERVE_COUNT reserves nothing, and a count larger than the list reserves
# everything.
_reserve = RESERVE_COUNT if RESERVE_COUNT > 0 else 0
_cut = max(len(entries_sorted) - _reserve, 0)
remaining = entries_sorted[:_cut]
reserved = entries_sorted[_cut:]

# Train and val sizes are truncated fractions of the non-reserved items;
# whatever is left over forms the internal test split.
n = len(remaining)
n_train = int(n * TRAIN_FRAC)
n_val = int(n * VAL_FRAC)
_val_end = n_train + n_val
train = remaining[:n_train]
val = remaining[n_train:_val_end]
test_internal = remaining[_val_end:]

print(
    "Split sizes -> train:", len(train),
    "val:", len(val),
    "test_internal:", len(test_internal),
    "reserved:", len(reserved),
)


In [None]:
# Make sure the output directory (and any missing parents) exists.
OUT_DIR = Path(OUT_DIR)
OUT_DIR.mkdir(parents=True, exist_ok=True)


def write_list(items, filename):
    """Write the 'path' of each entry in ``items`` to OUT_DIR/filename, one per line."""
    target = OUT_DIR / filename
    with target.open("w", encoding="utf-8") as handle:
        handle.writelines(entry['path'] + "\n" for entry in items)


# One list file per split; the reserved file name is fixed and does not follow
# RESERVE_COUNT.
for _items, _filename in (
    (train, "train.txt"),
    (val, "val.txt"),
    (test_internal, "test_internal.txt"),
    (reserved, "reserved_200.txt"),
):
    write_list(_items, _filename)

# labels.json: map unique key -> label
# The key is the entry's stem. When a stem is already taken, the source tag is
# appended ("stem__source"); if that is taken too (a third duplicate, or two
# duplicates from the same source) a running number is added, so an earlier
# label is never silently overwritten.
labels = {}
for it in (train + val + test_internal + reserved):
    key = it['stem']
    if key in labels:
        base = f"{key}__{it['source']}"
        key = base
        suffix = 2
        while key in labels:
            key = f"{base}__{suffix}"
            suffix += 1
    labels[key] = it['label']

# Explicit encoding, matching the split list files written with utf-8.
with open(OUT_DIR / "labels.json", "w", encoding="utf-8") as f:
    json.dump(labels, f, indent=2)

# manifest: total number of collected entries plus the size of every split
_split_sizes = {
    _name: len(_items)
    for _name, _items in (
        ("train", train),
        ("val", val),
        ("test_internal", test_internal),
        ("reserved", reserved),
    )
}
manifest = {"total_videos": len(entries), "splits": _split_sizes}
_manifest_path = OUT_DIR / "data_manifest.yaml"
with _manifest_path.open("w") as f:
    yaml.dump(manifest, f)

print("Wrote files to", OUT_DIR)
print("Sample labels (first 8):")
# displayed as the cell result
list(labels.items())[:8]


In [None]:
from collections import Counter
def counts(list_items):
    """Return a Counter of the 'label' values found in ``list_items``."""
    tally = Counter(entry['label'] for entry in list_items)
    return tally

# Per-split label distribution, followed by the overall number of entries.
for _title, _items in (
    ("Train label counts:", train),
    ("Val label counts:", val),
    ("Test_internal label counts:", test_internal),
    ("Reserved label counts:", reserved),
):
    print(_title, counts(_items))
print("Total videos:", len(entries))
