# 07 - Train Ensemble (Spatial + Temporal)
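Trains the final video-level classifier: a logistic regression over spatial and temporal model scores, calibrated with Platt scaling (sigmoid calibration).

Outputs the fitted ensemble and an AUC report for the train, validation and test splits.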

In [3]:
from pathlib import Path
import json
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import timm

from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
import joblib

In [4]:
# ---------------- CONFIG ----------------
# The project root is the parent of the directory the kernel was started in.
ROOT = Path.cwd().parent

# Inputs: per-video embeddings and the label lookup file.
EMB_ROOT = ROOT.joinpath("embeddings")         # embeddings/<split>/<video>.npy
LABELS_JSON = ROOT.joinpath("data", "labels.json")

# Checkpoints of the two base models that feed the ensemble.
SPATIAL_CKPT = ROOT.joinpath("checkpoints", "spatial", "spatial_best_valAUC.pth")
TEMPORAL_CKPT = ROOT.joinpath("checkpoints", "temporal", "temporal_best_valAUC.pth")

# Outputs: the saved ensemble/report and the cached per-split feature matrices.
OUT_DIR = ROOT.joinpath("checkpoints", "ensemble")
FEAT_CACHE = ROOT.joinpath("ensemble_features")

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the output directories up front so later cells can write unconditionally.
OUT_DIR.mkdir(parents=True, exist_ok=True)
FEAT_CACHE.mkdir(parents=True, exist_ok=True)

print(f"Device: {DEVICE}")
print(f"Spatial ckpt: {SPATIAL_CKPT}")
print(f"Temporal ckpt: {TEMPORAL_CKPT}")

Device: cuda
Spatial ckpt: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC\checkpoints\spatial\spatial_best_valAUC.pth
Temporal ckpt: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC\checkpoints\temporal\temporal_best_valAUC.pth


In [5]:
# Load the label lookup consulted by get_label.
# The encoding is given explicitly so that parsing does not depend on the
# platform's default text encoding (JSON files are UTF-8 by convention).
with open(LABELS_JSON, "r", encoding="utf-8") as f:
    labels_map = json.load(f)

def get_label(stem):
    """Return the integer label of the video whose file stem is ``stem``.

    Lookup order in ``labels_map``:
      1. a key equal to ``stem``;
      2. a key whose file name, or file name without its extension, equals
         ``stem`` (directories separated by ``/`` or ``\\`` are ignored);
      3. the first key that merely contains ``stem`` as a substring.

    Step 2 exists because a pure substring search is ambiguous: ``"video_1"``
    is contained in ``"video_10.mp4"``, so the first-substring rule alone can
    return another video's label depending on key order.

    Args:
        stem: file stem of an embedding file (e.g. ``Path(p).stem``).

    Returns:
        The matching label converted with ``int``.

    Raises:
        KeyError: if no key matches by any of the rules above.
    """
    if stem in labels_map:
        return int(labels_map[stem])

    missing = object()          # sentinel: label values themselves are never inspected
    first_substring_hit = missing

    for key, value in labels_map.items():
        if stem not in key:
            continue

        # Strip directories (either separator style) and then the extension.
        file_name = key.replace("\\", "/").rsplit("/", 1)[-1]
        file_stem = file_name.rsplit(".", 1)[0]
        if stem == file_name or stem == file_stem:
            return int(value)

        # Remember the legacy answer in case no exact stem match exists.
        if first_substring_hit is missing:
            first_substring_hit = value

    if first_substring_hit is not missing:
        return int(first_substring_hit)

    raise KeyError(f"Label not found for {stem}")

In [6]:
class SpatialHead(nn.Module):
    """Classification head of the spatial model, applied to pre-computed embeddings.

    An EfficientNet-B3 model is instantiated only to read its feature width
    (``num_features``). It is not stored on the module, so the module's
    parameters are those of ``head`` alone.
    """

    def __init__(self):
        super().__init__()
        # Built without pretrained weights; used here solely for num_features.
        feature_extractor = timm.create_model(
            "efficientnet_b3", pretrained=False, num_classes=0
        )
        self.feat_dim = feature_extractor.num_features

        layers = [
            nn.Linear(self.feat_dim, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.4),
            nn.Linear(512, 1),
        ]
        self.head = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``head`` on ``x`` and squeeze dimension 1 of the result."""
        logits = self.head(x)
        return logits.squeeze(1)

spatial_head = SpatialHead().to(DEVICE)

# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
ck = torch.load(SPATIAL_CKPT, map_location=DEVICE)
# Accept either {"model_state": state_dict, ...} or a bare state_dict.
state = ck.get("model_state", ck)

# load only head weights (other checkpoint entries are dropped here)
head_state = {k: v for k, v in state.items() if k.startswith("head.")}

# SpatialHead registers only the `head` submodule, so the filtered dict must
# match it exactly. Loading strictly makes a key mismatch (which would leave
# the head randomly initialised) fail loudly instead of passing silently.
spatial_head.load_state_dict(head_state, strict=True)
spatial_head.eval()

print("Spatial head loaded. feat_dim =", spatial_head.feat_dim)

Spatial head loaded. feat_dim = 1536


In [7]:
class TemporalModel(nn.Module):
    """Two-layer bidirectional LSTM with masked attention pooling and a one-logit head."""

    def __init__(self, feat_dim):
        """Create the layers; ``feat_dim`` is the size of each input frame vector."""
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=feat_dim,
            hidden_size=512,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.3,
        )
        # 1024 = 2 directions x 512 hidden units per direction.
        self.att = nn.Linear(1024, 1)
        self.head = nn.Sequential(
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1),
        )

    def forward(self, x, lengths):
        """Return one logit per sequence.

        ``x`` is batch-first (batch, time, feat_dim). ``lengths`` holds the
        number of valid time steps for each sequence; steps at or beyond that
        index are excluded from the attention softmax.
        """
        hidden, _ = self.lstm(x)
        n_steps = hidden.size(1)

        # True where a time step lies beyond the sequence's valid length.
        positions = torch.arange(n_steps, device=x.device)
        is_padding = positions[None, :] >= lengths[:, None]

        # A large negative score drives the padded steps' softmax weight to ~0.
        att_logits = self.att(hidden).squeeze(-1)
        att_logits = att_logits.masked_fill(is_padding, -1e9)
        weights = torch.softmax(att_logits, dim=1)

        # Attention-weighted sum over time.
        pooled = (hidden * weights.unsqueeze(-1)).sum(dim=1)
        return self.head(pooled).squeeze(1)

# infer feat_dim from embeddings (second axis of any train embedding file)
sample_path = next((EMB_ROOT / "train").glob("*.npy"), None)
if sample_path is None:
    # A bare next() would raise an uninformative StopIteration here.
    raise FileNotFoundError(f"No .npy embeddings found in {EMB_ROOT / 'train'}")
sample = np.load(sample_path)
FEAT_DIM = sample.shape[1]

temporal_model = TemporalModel(FEAT_DIM).to(DEVICE)

# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
ck = torch.load(TEMPORAL_CKPT, map_location=DEVICE)
# Accept either {"model_state": state_dict, ...} or a bare state_dict,
# matching how the spatial checkpoint is read.
temporal_state = ck.get("model_state", ck)

# strict=False tolerates extra checkpoint entries, but a *missing* weight would
# leave that layer randomly initialised, so it is treated as an error.
load_result = temporal_model.load_state_dict(temporal_state, strict=False)
if load_result.missing_keys:
    raise RuntimeError(
        f"Temporal checkpoint is missing weights: {load_result.missing_keys}"
    )
if load_result.unexpected_keys:
    print("Warning: ignored unexpected checkpoint keys:", load_result.unexpected_keys)
temporal_model.eval()

print("Temporal model loaded.")

Temporal model loaded.


In [8]:
def spatial_stats_from_embeddings(emb):
    """Summarise the spatial head's per-row probabilities for one embedding array.

    ``emb`` is a NumPy array that is converted to a float tensor, passed
    through ``spatial_head`` and a sigmoid. The returned dict holds the mean,
    max and standard deviation of the resulting probabilities, plus ``top3``:
    the mean of the three largest values (or the plain mean when fewer than
    three are available).
    """
    frames = torch.from_numpy(emb).float().to(DEVICE)
    with torch.no_grad():
        logits = spatial_head(frames)
        frame_probs = torch.sigmoid(logits).cpu().numpy()

    if len(frame_probs) >= 3:
        top3 = np.sort(frame_probs)[-3:].mean()
    else:
        top3 = frame_probs.mean()

    return {
        "mean": frame_probs.mean(),
        "max": frame_probs.max(),
        "std": frame_probs.std(),
        "top3": top3,
    }

def temporal_score_from_embeddings(emb):
    """Return the temporal model's sigmoid probability for one embedding array.

    ``emb`` is treated as a single sequence: a batch dimension of size one is
    added and its first-axis size is passed as the sequence length.
    """
    n_frames = emb.shape[0]
    sequence = torch.from_numpy(emb).float().unsqueeze(0).to(DEVICE)
    lengths = torch.tensor([n_frames], device=DEVICE)

    with torch.no_grad():
        logit = temporal_model(sequence, lengths)
        score = torch.sigmoid(logit).item()
    return score

In [9]:
def build_features(split, overwrite=False):
    """Build, or load from cache, the ensemble feature matrix for one split.

    Every ``*.npy`` file under ``EMB_ROOT / split`` becomes one row of five
    features: the spatial ``mean``, ``max``, ``std`` and ``top3`` statistics
    followed by the temporal score. Files whose first dimension is 0 are
    skipped.

    The result is cached at ``FEAT_CACHE / f"{split}.npz"``. The cache is keyed
    by split name only, so pass ``overwrite=True`` after changing the
    embeddings or either checkpoint.

    Args:
        split: name of the sub-directory of ``EMB_ROOT`` to process.
        overwrite: recompute even when a cache file exists.

    Returns:
        ``(X, y)``: ``X`` is float32 with shape ``(n_videos, 5)`` (also when
        ``n_videos`` is 0) and ``y`` is int64 with shape ``(n_videos,)``.

    Raises:
        KeyError: propagated from ``get_label`` when a video has no label.
    """
    n_features = 5  # mean, max, std, top3, temporal score

    cache = FEAT_CACHE / f"{split}.npz"
    if cache.exists() and not overwrite:
        print(f"Loaded cached features for {split}")
        # The cache holds plain numeric arrays, so pickle support is not needed.
        # The context manager closes the archive's file handle, which otherwise
        # stays open and can block overwriting the cache on Windows.
        with np.load(cache) as d:
            return d["X"], d["y"]

    X, y = [], []
    files = sorted((EMB_ROOT / split).glob("*.npy"))

    print(f"Building features for {split} ({len(files)} videos)")
    for p in tqdm(files):
        emb = np.load(p)
        if emb.shape[0] == 0:
            # No rows to score for this video.
            continue

        s = spatial_stats_from_embeddings(emb)
        t = temporal_score_from_embeddings(emb)

        X.append([s["mean"], s["max"], s["std"], s["top3"], t])
        y.append(get_label(p.stem))

    # reshape keeps X two-dimensional even when no video produced a row.
    X = np.asarray(X, dtype=np.float32).reshape(-1, n_features)
    y = np.asarray(y, dtype=np.int64)

    np.savez(cache, X=X, y=y)
    print("Saved cache:", cache)

    return X, y

# Build (or load from cache) the feature matrix and labels for each split, in order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    build_features(split_name) for split_name in ("train", "val", "test")
)

print("Feature shape:", X_train.shape)

Building features for train (4066 videos)


100%|██████████| 4066/4066 [00:06<00:00, 640.80it/s]


Saved cache: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC\ensemble_features\train.npz
Building features for val (761 videos)


100%|██████████| 761/761 [00:01<00:00, 702.66it/s]


Saved cache: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC\ensemble_features\val.npz
Building features for test (255 videos)


100%|██████████| 255/255 [00:00<00:00, 671.75it/s]

Saved cache: c:\Users\lkmah\OneDrive\Desktop\Lokesh\VS Code\DeepFake_Detection_SIC\ensemble_features\test.npz
Feature shape: (4066, 5)





In [10]:
# Logistic regression over the per-video features, wrapped in 3-fold
# sigmoid (Platt) calibration and fitted on the training split.
base_lr = LogisticRegression(max_iter=1000)
ensemble = CalibratedClassifierCV(base_lr, method="sigmoid", cv=3)
ensemble = ensemble.fit(X_train, y_train)  # fit() returns the estimator itself

print("Ensemble trained.")

Ensemble trained.


In [11]:
def eval_auc(X, y, name):
    """Print and return the ROC AUC of ``ensemble`` on features ``X`` and labels ``y``.

    The score passed to ``roc_auc_score`` is column 1 of
    ``ensemble.predict_proba(X)``; ``name`` only labels the printed line.
    """
    positive_scores = ensemble.predict_proba(X)[:, 1]
    score = roc_auc_score(y, positive_scores)
    print(f"{name} AUC: {score:.4f}")
    return score

# Score every split in train -> val -> test order (one printed line per split).
train_auc, val_auc, test_auc = (
    eval_auc(features, targets, split_label)
    for features, targets, split_label in (
        (X_train, y_train, "Train"),
        (X_val, y_val, "Val"),
        (X_test, y_test, "Test"),
    )
)

Train AUC: 1.0000
Val AUC: 0.9916
Test AUC: 0.9900


In [12]:
# Persist the fitted ensemble next to a plain-text AUC report.
joblib.dump(ensemble, OUT_DIR / "ensemble_best.pkl")

with open(OUT_DIR / "ensemble_results.txt", "w") as f:
    f.writelines(
        (
            f"Train AUC: {train_auc:.4f}\n",
            f"Val AUC:   {val_auc:.4f}\n",
            f"Test AUC:  {test_auc:.4f}\n",
        )
    )

print("Saved ensemble model and report.")

Saved ensemble model and report.
