In [1]:
# Cell 1 — imports & config
import os
from pathlib import Path
import json
import pickle
from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split

ROOT = Path.cwd().parent  # adjust if you open notebook elsewhere
CHECKPOINT_DIR = ROOT / "checkpoints" / "ensemble"
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Paths to feature CSVs OR where they'll be created
SPATIAL_STATS_CSV = ROOT / "features" / "spatial_summary_per_video.csv"
TEMPORAL_LOGITS_CSV = ROOT / "features" / "temporal_logits_per_video.csv"

# If your project stores everything at different place, edit above paths.
print("ROOT:", ROOT)
print("Spatial CSV:", SPATIAL_STATS_CSV)
print("Temporal CSV:", TEMPORAL_LOGITS_CSV)

ROOT: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC
Spatial CSV: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC\features\spatial_summary_per_video.csv
Temporal CSV: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC\features\temporal_logits_per_video.csv


In [2]:
# Cell 2 — load the per-video tables (or build them if missing)

def safe_load_csv(p):
    if not p.exists():
        print(f"Missing file: {p}")
        return None
    df = pd.read_csv(p)
    print(f"Loaded {p} shape={df.shape}")
    return df

spatial_df = safe_load_csv(SPATIAL_STATS_CSV)
temporal_df = safe_load_csv(TEMPORAL_LOGITS_CSV)

# If either is None, try to build from available artifacts.
# Example building logic (uncomment and adapt if needed):
#
# If you have a per-video dataframe already in memory or produced earlier,
# save it as CSV at paths above and re-run this cell.
#
# For now we assume the CSVs exist because your runs earlier produced DataFrames.
#
if spatial_df is None or temporal_df is None:
    raise RuntimeError("One of the required input CSVs is missing. Create them or tell me and I will generate them from embeddings/logits.")

Missing file: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC\features\spatial_summary_per_video.csv
Missing file: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC\features\temporal_logits_per_video.csv


RuntimeError: One of the required input CSVs is missing. Create them or tell me and I will generate them from embeddings/logits.

In [None]:
# Cell 3 — inspect and merge by (split, stem)
display(spatial_df.head())
display(temporal_df.head())

# ensure consistent column names; expected at least:
# spatial_df: columns ['split','stem','label','sp_mean_prob','sp_max_prob','sp_median_prob','sp_std_prob','sp_mean_logit', ...]
# temporal_df: columns ['split','stem','tmp_logit', 'tmp_prob'] or similar

# rename common columns (if needed)
# temporal_df = temporal_df.rename(columns={'tmp_logit': 'tmp_logit', 'tmp_prob': 'tmp_prob'})

# Merge on split + stem
ensemble_df = pd.merge(spatial_df, temporal_df, on=['split', 'stem'], how='inner')
print("Ensemble DF shape:", ensemble_df.shape)
display(ensemble_df.head())

In [None]:
# Cell 4 — prepare X, y for each split
feature_cols = [c for c in ensemble_df.columns if c not in ('split','stem','label')]
print("Features used:", feature_cols)

dfs = {}
for split in ['train', 'val', 'test']:
    dfs[split] = ensemble_df[ensemble_df['split']==split].reset_index(drop=True)
    print(split, "shape:", dfs[split].shape, "label dist:", dfs[split]['label'].value_counts().to_dict())

X_train = dfs['train'][feature_cols].values
y_train = dfs['train']['label'].values
X_val = dfs['val'][feature_cols].values
y_val = dfs['val']['label'].values
X_test = dfs['test'][feature_cols].values
y_test = dfs['test']['label'].values

In [None]:
# Cell 5 — train base logistic regression (L2 default)
base_clf = LogisticRegression(max_iter=2000, solver='lbfgs')
base_clf.fit(X_train, y_train)
# evaluate
def eval_clf(clf, X, y, split_name):
    probs = clf.predict_proba(X)[:,1]
    auc_v = roc_auc_score(y, probs)
    print(f"{split_name} AUC: {auc_v:.4f}")
    return probs, auc_v

print("Base classifier trained.")
bp_train, train_auc_base = eval_clf(base_clf, X_train, y_train, "Train (base)")
bp_val, val_auc_base     = eval_clf(base_clf, X_val, y_val, "Val   (base)")
bp_test, test_auc_base   = eval_clf(base_clf, X_test, y_test, "Test  (base)")

In [None]:
# Cell 6 — calibrate using validation set (Platt)
calibrator = CalibratedClassifierCV(base_clf, method='sigmoid', cv='prefit')
calibrator.fit(X_val, y_val)   # fit on val
print("Calibrator (Platt) fitted on val set.")
cp_train, train_auc_cal = eval_clf(calibrator, X_train, y_train, "Train (calibrated)")
cp_val, val_auc_cal     = eval_clf(calibrator, X_val, y_val, "Val   (calibrated)")
cp_test, test_auc_cal   = eval_clf(calibrator, X_test, y_test, "Test  (calibrated)")

In [None]:
# Cell 7 — plot ROC curves
plt.figure(figsize=(8,6))
fpr, tpr, _ = roc_curve(y_test, bp_test)
plt.plot(fpr, tpr, label=f"Base test AUC={test_auc_base:.4f}")
fpr2, tpr2, _ = roc_curve(y_test, cp_test)
plt.plot(fpr2, tpr2, label=f"Calibrated test AUC={test_auc_cal:.4f}")
plt.plot([0,1],[0,1],'k--', alpha=0.4)
plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
plt.title("Ensemble ROC (Test)")
plt.legend(loc='lower right')
plt.grid(alpha=0.3)
plt.show()

In [None]:
# Cell 8 — persist artifacts
MODEL_BASE_PKL = CHECKPOINT_DIR / "ensemble_base_logreg.pkl"
MODEL_CAL_PKL  = CHECKPOINT_DIR / "ensemble_calibrated.pkl"
PRED_CSV = CHECKPOINT_DIR / "ensemble_per_video_predictions.csv"

with open(MODEL_BASE_PKL, "wb") as f:
    pickle.dump(base_clf, f)
with open(MODEL_CAL_PKL, "wb") as f:
    pickle.dump(calibrator, f)
print("Saved models to:", MODEL_BASE_PKL, MODEL_CAL_PKL)

# save per-video predictions for further analysis / plotting
dfs['train']['pred_base'] = bp_train
dfs['val']['pred_base']   = bp_val
dfs['test']['pred_base']  = bp_test
dfs['train']['pred_cal'] = cp_train
dfs['val']['pred_cal']   = cp_val
dfs['test']['pred_cal']  = cp_test

pd.concat([dfs['train'], dfs['val'], dfs['test']], ignore_index=True).to_csv(PRED_CSV, index=False)
print("Saved per-video predictions to:", PRED_CSV)

In [None]:
# Cell 9 — interpret coefficients (for linear logistic)
coef = base_clf.coef_.ravel()
feat_imp = pd.Series(coef, index=feature_cols).sort_values(key=lambda x: np.abs(x), ascending=False)
display(feat_imp.to_frame("coef"))
# Quick bar plot
feat_imp.abs().sort_values(ascending=True).plot.barh(figsize=(6,6), title="|coef| importance")
plt.show()

In [None]:
# Cell 10 — diagnostics (choose threshold 0.5 for now)
from sklearn.metrics import confusion_matrix, classification_report
preds = (cp_test >= 0.5).astype(int)
print("Confusion matrix (test) at 0.5 threshold")
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))