# 01 - Simple dataset split
This notebook creates deterministic train/val/test splits and a labels.json mapping.
It expects your preprocessed face-only folders to be present under the project root:
- preprocessed/DFDC_REAL_Face_only_data/
- preprocessed/DFDC_FAKE_Face_only_data/
- preprocessed/FF_Face_only_data/  (optional metadata.csv)
- preprocessed/Celeb_real_face_only/  (optional: not used for training)
- preprocessed/Celeb_fake_face_only/  (optional: not used for training)


In [3]:
from pathlib import Path
import random
import json
import yaml
import pandas as pd   # used only if ffpp metadata present

# --------- USER EDIT ---------
ROOT = Path.cwd().parent
print("ROOT:", ROOT)
PREPROC = ROOT / "Dataset"
print("PREPROC:", PREPROC) # where your preprocessed face-folders live
OUT_DIR = ROOT / "data"         # outputs written here
RESERVE_COUNT = 200             # internal reserved set
SEED = 42                       # deterministic seed
TRAIN_FRAC = 0.80
VAL_FRAC = 0.15
# -----------------------------

# dataset folders (update names only if your folders are different)
DFDC_REAL = PREPROC / "DFDC_REAL_Face_only_data"
DFDC_FAKE = PREPROC / "DFDC_FAKE_Face_only_data"
FFPP_DIR = PREPROC / "FF_Face_only_data"          # optional
FFPP_META = FFPP_DIR / "metadata.csv"             # optional file
#CELEB_REAL = PREPROC / "Celeb_real_face_only"     # optional (held-out)
#CELEB_FAKE = PREPROC / "Celeb_fake_face_only"     # optional (held-out)

OUT_DIR.mkdir(exist_ok=True)


ROOT: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC
PREPROC: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC\Dataset


In [4]:
def list_videos(folder: Path):
    """Return list of Path objects. If folder contains subfolders per video, prefer those (use subfolder name)."""
    if not folder.exists():
        return []
    # if subfolders exist, treat each subfolder as one 'video' (preprocessed face folder)
    subdirs = [p for p in folder.iterdir() if p.is_dir()]
    if len(subdirs) > 0:
        # use subdir path as representative; label mapping will use subdir.name as key
        return [p for p in subdirs]
    # else fallback: list files directly
    files = [p for p in folder.glob("*") if p.is_file()]
    return files

entries = []  # each entry: dict { "path": str, "stem": str, "label": 0/1, "source": name }

# DFDC real
for p in list_videos(DFDC_REAL):
    entries.append({"path": str(p), "stem": p.stem, "label": 0, "source": "dfdc_real"})

# DFDC fake
for p in list_videos(DFDC_FAKE):
    entries.append({"path": str(p), "stem": p.stem, "label": 1, "source": "dfdc_fake"})

# FF++ (try metadata first if present, else infer from parent or filename)
ff_meta = {}
if FFPP_META.exists():
    try:
        df = pd.read_csv(FFPP_META)
        # try commonly named columns
        # prefer columns named 'video' and 'label' (if they exist)
        if 'video' in df.columns and 'label' in df.columns:
            for _, row in df.iterrows():
                ff_meta[str(Path(row['video']).stem)] = int(row['label'])
        else:
            # build a flexible mapping: find first two meaningful columns
            # (simple heuristic)
            cols = list(df.columns)
            keycol = cols[0]
            labcol = None
            for c in cols[1:]:
                if any(x in c.lower() for x in ['label','fake','class','manipulated']):
                    labcol = c
                    break
            if labcol is None:
                labcol = cols[1] if len(cols) > 1 else None
            if labcol:
                for _, row in df.iterrows():
                    ff_meta[str(Path(row[keycol]).stem)] = 1 if str(row[labcol]).strip().lower() in ('1','true','fake','t','y','yes') else 0
    except Exception as e:
        print("Warning: unable to parse FF++ metadata.csv:", e)

# Now list FF++ videos/folders and map labels
for p in list_videos(FFPP_DIR):
    stem = p.stem
    if stem in ff_meta:
        label = ff_meta[stem]
    else:
        # fallback: try to infer from foldername or parent folder
        parent = str(p.parent).lower()
        if 'fake' in parent:
            label = 1
        elif 'real' in parent:
            label = 0
        else:
            # if cannot determine label, skip (safer)
            print(f"Skipping FF++ entry (no label): {p}")
            continue
    entries.append({"path": str(p), "stem": stem, "label": int(label), "source": "ffpp"})

'''# Celeb-DF (optional) - we include in entries but you'll not use them for training
for p in list_videos(CELEB_REAL):
    entries.append({"path": str(p), "stem": p.stem, "label": 0, "source": "celeb_real"})
for p in list_videos(CELEB_FAKE):
    entries.append({"path": str(p), "stem": p.stem, "label": 1, "source": "celeb_fake"})
'''
print("Collected entries:", len(entries))
# simple inspect first 6
entries[:6]


Collected entries: 5283


[{'path': 'c:\\Users\\lkmah\\OneDrive\\Desktop\\Lokesh\\VS Code\\DeepFake_Detection_SIC\\Dataset\\DFDC_REAL_Face_only_data\\aabqyygbaa.mp4',
  'stem': 'aabqyygbaa',
  'label': 0,
  'source': 'dfdc_real'},
 {'path': 'c:\\Users\\lkmah\\OneDrive\\Desktop\\Lokesh\\VS Code\\DeepFake_Detection_SIC\\Dataset\\DFDC_REAL_Face_only_data\\aajsqyyjni.mp4',
  'stem': 'aajsqyyjni',
  'label': 0,
  'source': 'dfdc_real'},
 {'path': 'c:\\Users\\lkmah\\OneDrive\\Desktop\\Lokesh\\VS Code\\DeepFake_Detection_SIC\\Dataset\\DFDC_REAL_Face_only_data\\aayfryxljh.mp4',
  'stem': 'aayfryxljh',
  'label': 0,
  'source': 'dfdc_real'},
 {'path': 'c:\\Users\\lkmah\\OneDrive\\Desktop\\Lokesh\\VS Code\\DeepFake_Detection_SIC\\Dataset\\DFDC_REAL_Face_only_data\\abbgqbrdiz.mp4',
  'stem': 'abbgqbrdiz',
  'label': 0,
  'source': 'dfdc_real'},
 {'path': 'c:\\Users\\lkmah\\OneDrive\\Desktop\\Lokesh\\VS Code\\DeepFake_Detection_SIC\\Dataset\\DFDC_REAL_Face_only_data\\abmjszfycr.mp4',
  'stem': 'abmjszfycr',
  'label': 0,
 

In [5]:
# Deterministic shuffle + split
random.seed(SEED)
entries_sorted = sorted(entries, key=lambda x: x['path'])  # stable order before shuffle
random.shuffle(entries_sorted)

# Reserve last RESERVE_COUNT items
if RESERVE_COUNT > 0:
    reserved = entries_sorted[-RESERVE_COUNT:]
    remaining = entries_sorted[:-RESERVE_COUNT]
else:
    reserved = []
    remaining = entries_sorted

n = len(remaining)
n_train = int(n * TRAIN_FRAC)
n_val = int(n * VAL_FRAC)
train = remaining[:n_train]
val = remaining[n_train:n_train+n_val]
test_internal = remaining[n_train+n_val:]

print("Split sizes -> train:", len(train), "val:", len(val), "test_internal:", len(test_internal), "reserved:", len(reserved))


Split sizes -> train: 4066 val: 762 test_internal: 255 reserved: 200


In [6]:
OUT_DIR = Path(OUT_DIR)
OUT_DIR.mkdir(parents=True, exist_ok=True)

def write_list(items, filename):
    with open(OUT_DIR / filename, "w", encoding="utf-8") as f:
        for it in items:
            f.write(it['path'] + "\n")

write_list(train, "train.txt")
write_list(val, "val.txt")
write_list(test_internal, "test_internal.txt")
write_list(reserved, "reserved_200.txt")

# labels.json: map unique key -> label
labels = {}
for it in (train + val + test_internal + reserved):
    key = it['stem']
    # ensure uniqueness: if duplicate stem, append source tag
    if key in labels:
        key = f"{key}__{it['source']}"
    labels[key] = it['label']

with open(OUT_DIR / "labels.json", "w") as f:
    json.dump(labels, f, indent=2)

# manifest
manifest = {
    "total_videos": len(entries),
    "splits": {
        "train": len(train),
        "val": len(val),
        "test_internal": len(test_internal),
        "reserved": len(reserved)
    }
}
with open(OUT_DIR / "data_manifest.yaml", "w") as f:
    yaml.dump(manifest, f)

print("Wrote files to", OUT_DIR)
print("Sample labels (first 8):")
list(labels.items())[:8]


Wrote files to c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC\data
Sample labels (first 8):


[('aauhqwwncp', 1),
 ('670_661', 1),
 ('uyfdoedjoj', 0),
 ('dbrpqjttey', 1),
 ('215_208', 1),
 ('846_845', 1),
 ('450_533', 1),
 ('901', 0)]

In [7]:
from collections import Counter
def counts(list_items):
    return Counter([it['label'] for it in list_items])

print("Train label counts:", counts(train))
print("Val label counts:", counts(val))
print("Test_internal label counts:", counts(test_internal))
print("Reserved label counts:", counts(reserved))
print("Total videos:", len(entries))


Train label counts: Counter({0: 2085, 1: 1981})
Val label counts: Counter({0: 404, 1: 358})
Test_internal label counts: Counter({0: 131, 1: 124})
Reserved label counts: Counter({1: 101, 0: 99})
Total videos: 5283


In [None]:
'''# DEBUG: show exactly what entries were collected and whether they are files or dirs
from pathlib import Path
from collections import Counter, defaultdict
import json

# re-use 'entries' from your notebook if available, otherwise rebuild minimal list
if "entries" not in globals():
    print("`entries` not found in notebook namespace. Re-run the collection cell first.")
else:
    print("Total raw entries collected (len(entries)):", len(entries))

    # count by whether path is file or dir (on disk)
    type_counts = Counter()
    per_source_counts = defaultdict(lambda: Counter())
    stem_counts = Counter()
    seen_paths = []
    detailed = []

    for it in entries:
        p = Path(it['path'])
        is_file = p.is_file()
        is_dir  = p.is_dir()
        t = "file" if is_file else ("dir" if is_dir else "missing")
        type_counts[t] += 1
        per_source_counts[it['source']][t] += 1
        stem_counts[it['stem']] += 1
        seen_paths.append(str(p))
        detailed.append({"path": str(p), "exists": p.exists(), "is_file": is_file, "is_dir": is_dir,
                         "stem": it['stem'], "label": it['label'], "source": it['source']})

    print("\nType counts (file/dir/missing):", dict(type_counts))
    print("\nPer-source breakdown (file/dir/missing):")
    for src, cnt in per_source_counts.items():
        print("  ", src, dict(cnt))

    # find stems that appear more than once (possible duplicates)
    dup_stems = [s for s,c in stem_counts.items() if c > 1]
    print("\nNumber of duplicate stems (same stem appears multiple times across entries):", len(dup_stems))
    if len(dup_stems) > 0:
        print("Sample duplicate stems and their entries (first 10):")
        sample = dup_stems[:10]
        for s in sample:
            print("  STEM:", s)
            for it in [d for d in detailed if d['stem']==s]:
                print("    ", it)

    # show first 12 raw detailed entries to inspect
    print("\nFirst 12 detailed entries (inspect):")
    from pprint import pprint
    pprint(detailed[:12])

    # write debug dump to file for inspection
    with open("debug_entries.json","w") as f:
        json.dump(detailed, f, indent=2)
    print("\nWrote debug_entries.json (first 100 entries) for further inspection.")'''


Total raw entries collected (len(entries)): 5283

Type counts (file/dir/missing): {'file': 5283}

Per-source breakdown (file/dir/missing):
   dfdc_real {'file': 1727}
   dfdc_fake {'file': 1566}
   ffpp {'file': 1990}

Number of duplicate stems (same stem appears multiple times across entries): 0

First 12 detailed entries (inspect):
[{'exists': True,
  'is_dir': False,
  'is_file': True,
  'label': 0,
  'path': 'c:\\Users\\lkmah\\OneDrive\\Desktop\\Lokesh\\VS '
          'Code\\DeepFake_Detection_SIC\\Dataset\\DFDC_REAL_Face_only_data\\aabqyygbaa.mp4',
  'source': 'dfdc_real',
  'stem': 'aabqyygbaa'},
 {'exists': True,
  'is_dir': False,
  'is_file': True,
  'label': 0,
  'path': 'c:\\Users\\lkmah\\OneDrive\\Desktop\\Lokesh\\VS '
          'Code\\DeepFake_Detection_SIC\\Dataset\\DFDC_REAL_Face_only_data\\aajsqyyjni.mp4',
  'source': 'dfdc_real',
  'stem': 'aajsqyyjni'},
 {'exists': True,
  'is_dir': False,
  'is_file': True,
  'label': 0,
  'path': 'c:\\Users\\lkmah\\OneDrive\\Desktop