# Cross-Dataset Evaluation: LAV-DF

Evaluate the FakeAVCeleb-trained audio and video models on the LAV-DF dataset (no retraining).

**Label Convention:**
- `1 = REAL`
- `0 = FAKE`

In [None]:
# CELL 1: SETUP & IMPORTS

import os
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import librosa
import timm
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from tqdm.auto import tqdm
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, auc
)
import torchvision.transforms as transforms
import warnings
import subprocess
import json

# Suppress all Python warnings for the rest of the session to keep cell output short.
warnings.filterwarnings('ignore')

# Run on the GPU when CUDA is available, otherwise on the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f"Device: {device}")
print(f"PyTorch: {torch.__version__}")

Device: cuda
PyTorch: 2.9.0+cu126


In [None]:
# CELL 2: MOUNT DRIVE & SET MODEL PATHS

# Colab-only: mount Google Drive so the checkpoint files are reachable as local paths.
from google.colab import drive
drive.mount('/content/drive')

# Checkpoint locations inside the mounted Drive.
PROJECT_DIR = "/content/drive/MyDrive/deepfake_checkpoints/Multi Modal"
AUDIO_MODEL_PATH = os.path.join(PROJECT_DIR, "best_audio_xception.pth")
VIDEO_MODEL_PATH = os.path.join(PROJECT_DIR, "epoch_007.pt")

# Report whether each checkpoint file exists.
audio_status = 'FOUND' if os.path.exists(AUDIO_MODEL_PATH) else 'NOT FOUND'
video_status = 'FOUND' if os.path.exists(VIDEO_MODEL_PATH) else 'NOT FOUND'

print(f"Project dir: {PROJECT_DIR}")
print(f"Audio model: {audio_status}")
print(f"Video model: {video_status}")

Mounted at /content/drive
Project dir: /content/drive/MyDrive/deepfake_checkpoints/Multi Modal
Audio model: FOUND
Video model: FOUND


In [None]:
# CELL 3: DOWNLOAD LAV-DF DATASET

import kagglehub

banner_rule = "="*60
print(banner_rule)
print(" DOWNLOADING LAV-DF DATASET")
print(banner_rule)

DATASET = "elin75/localized-audio-visual-deepfake-dataset-lav-df"
lavdf_download_path = kagglehub.dataset_download(DATASET)
print(f"Downloaded to: {lavdf_download_path}")

# The archive may unpack into a nested "LAV-DF" folder; fall back to the download root otherwise.
_nested_dir = os.path.join(lavdf_download_path, "LAV-DF")
lavdf_path = _nested_dir if os.path.exists(_nested_dir) else lavdf_download_path

print(f"LAV-DF path: {lavdf_path}")

# One line per top-level entry: file count for directories, size in MB for files.
print("\nDataset structure:")
for item in os.listdir(lavdf_path):
    item_path = os.path.join(lavdf_path, item)
    if not os.path.isdir(item_path):
        size_mb = os.path.getsize(item_path) / (1024*1024)
        print(f"  {item} ({size_mb:.2f} MB)")
        continue
    n_files = len(os.listdir(item_path))
    print(f"  {item}/ ({n_files} files)")

 DOWNLOADING LAV-DF DATASET
Using Colab cache for faster access to the 'localized-audio-visual-deepfake-dataset-lav-df' dataset.
Downloaded to: /kaggle/input/localized-audio-visual-deepfake-dataset-lav-df
LAV-DF path: /kaggle/input/localized-audio-visual-deepfake-dataset-lav-df/LAV-DF

Dataset structure:
  README.md (0.00 MB)
  metadata.json (131.19 MB)
  metadata.min.json (32.27 MB)
  test/ (26100 files)
  train/ (78703 files)
  dev/ (31501 files)


In [None]:
# CELL 4: LOAD METADATA

# metadata.json sits at the dataset root and holds one record per video.
meta_path = os.path.join(lavdf_path, "metadata.json")
print(f"Loading metadata from: {meta_path}")

with open(meta_path, "r") as meta_file:
    metadata = json.load(meta_file)

n_entries = len(metadata)
print(f"Total entries: {n_entries}")
print(f"Type: {type(metadata)}")

# Pretty-print the first record to show the schema.
first_entry = metadata[0]
print("\nSample entry:")
print(json.dumps(first_entry, indent=2))

Loading metadata from: /kaggle/input/localized-audio-visual-deepfake-dataset-lav-df/LAV-DF/metadata.json
Total entries: 136304
Type: <class 'list'>

Sample entry:
{
  "file": "test/000001.mp4",
  "n_fakes": 0,
  "fake_periods": [],
  "timestamps": [
    [
      "not",
      0.0,
      0.2
    ],
    [
      "the",
      0.2,
      0.4
    ],
    [
      "point",
      0.4,
      0.8
    ],
    [
      "the",
      0.8,
      1.7
    ],
    [
      "point",
      1.7,
      1.7
    ],
    [
      "is",
      1.7,
      2.1
    ],
    [
      "that",
      2.1,
      2.3
    ],
    [
      "she",
      2.3,
      2.5
    ],
    [
      "died",
      2.5,
      2.7
    ],
    [
      "satisfied",
      2.7,
      3.6
    ],
    [
      "with",
      3.6,
      3.8
    ],
    [
      "the",
      3.8,
      3.8
    ],
    [
      "life",
      3.8,
      4.0
    ]
  ],
  "duration": 4.224,
  "transcript": "not the point the point is that she died satisfied with the life",
  "original": nul

In [None]:
# CELL 5: CREATE TEST SET
# Label convention: 1 = REAL, 0 = FAKE (sama dengan FakeAVCeleb)

def _modify_type_of(is_fake, video_modified, audio_modified):
    """Name the manipulation category of one metadata entry (used for per-type analysis)."""
    if not is_fake:
        return "real"
    if video_modified and audio_modified:
        return "both_modified"
    if video_modified:
        return "visual_modified"
    if audio_modified:
        return "audio_modified"
    # Flagged as fake only through n_fakes > 0.
    return "fake_segments"


print("="*60)
print(" CREATING LAV-DF TEST SET")
print("="*60)

rows = []
for item in metadata:
    # Only the official test split is evaluated.
    if item["split"] != "test":
        continue

    n_fakes = item["n_fakes"]
    video_modified = item["modify_video"]
    audio_modified = item["modify_audio"]

    # An entry is fake if it has fake segments or either modality was modified.
    is_fake = (n_fakes > 0) or video_modified or audio_modified

    # Label convention: 1 = REAL, 0 = FAKE (same as FakeAVCeleb).
    label = 0 if is_fake else 1

    video_path = os.path.join(lavdf_path, item["file"])
    if not os.path.exists(video_path):
        continue

    rows.append({
        "path": video_path,
        "label": label,
        "modify_type": _modify_type_of(is_fake, video_modified, audio_modified),
    })

lavdf_test_df = pd.DataFrame(rows)

n_real = (lavdf_test_df['label'] == 1).sum()
n_fake = (lavdf_test_df['label'] == 0).sum()

print(f"\nTest set statistics:")
print(f"  Total videos: {len(lavdf_test_df)}")
print(f"  REAL (label=1): {n_real}")
print(f"  FAKE (label=0): {n_fake}")

print(f"\nBy modify_type:")
print(lavdf_test_df['modify_type'].value_counts())

 CREATING LAV-DF TEST SET

Test set statistics:
  Total videos: 26100
  REAL (label=1): 6906
  FAKE (label=0): 19194

By modify_type:
modify_type
real               6906
visual_modified    6452
audio_modified     6373
both_modified      6369
Name: count, dtype: int64


In [None]:
# CELL 6: OPTIONAL SUBSAMPLING UNTUK CEPAT

MAX_SAMPLES = 150  # set to None to evaluate every test video

if MAX_SAMPLES and len(lavdf_test_df) > MAX_SAMPLES:
    # Class-balanced subsample: up to MAX_SAMPLES // 2 videos per label.
    # Each group is sampled explicitly instead of via groupby(...).apply(...):
    # apply() passing the grouping column to the function is deprecated in
    # recent pandas, and once that column is excluded the 'label' column used
    # below would be lost. groupby() iterates labels in sorted order and the
    # same random_state is used per group, so the same rows are selected.
    per_class = MAX_SAMPLES // 2
    lavdf_test_df = pd.concat(
        [
            group.sample(n=min(len(group), per_class), random_state=42)
            for _, group in lavdf_test_df.groupby('label')
        ]
    ).reset_index(drop=True)

print(f"\nFinal test set: {len(lavdf_test_df)} videos")
print(f" REAL: {(lavdf_test_df['label'] == 1).sum()}")
print(f" FAKE: {(lavdf_test_df['label'] == 0).sum()}")

# Per-manipulation-type subsets of the (possibly subsampled) test set.
real_df   = lavdf_test_df[lavdf_test_df["modify_type"] == "real"].copy()
audio_df  = lavdf_test_df[lavdf_test_df["modify_type"] == "audio_modified"].copy()
video_df  = lavdf_test_df[lavdf_test_df["modify_type"] == "visual_modified"].copy()
both_df   = lavdf_test_df[lavdf_test_df["modify_type"] == "both_modified"].copy()
seg_df    = lavdf_test_df[lavdf_test_df["modify_type"] == "fake_segments"].copy()

print("\nSubset sizes:")
print(" REAL          :", len(real_df))
print(" AUDIO_MOD     :", len(audio_df))
print(" VISUAL_MOD    :", len(video_df))
print(" BOTH_MOD      :", len(both_df))
print(" FAKE_SEGMENTS :", len(seg_df))



Final test set: 150 videos
 REAL: 75
 FAKE: 75

Subset sizes:
 REAL          : 75
 AUDIO_MOD     : 24
 VISUAL_MOD    : 32
 BOTH_MOD      : 19
 FAKE_SEGMENTS : 0


In [None]:
# CELL 7: MODEL ARCHITECTURE

class AudioXceptionClassifier(nn.Module):
    """Xception backbone followed by an MLP head, used here for the audio branch.

    The attribute names (``base``, ``classifier``) and the order of layers
    inside ``classifier`` determine the state_dict keys, so they must stay in
    sync with the checkpoint that is loaded into this model.
    """
    def __init__(self, num_classes=2, dropout=0.5):
        # num_classes: width of the final Linear layer.
        # dropout: rate of the first Dropout; the second Dropout is fixed at 0.3.
        super().__init__()
        # num_classes=0 requests the timm model without its own classifier;
        # the head below assumes it yields 2048 features per sample — TODO confirm against timm.
        self.base = timm.create_model('xception', pretrained=False, num_classes=0)
        self.classifier = nn.Sequential(
            nn.Linear(2048, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Run the backbone, then the MLP head; returns the head's raw outputs (logits)."""
        features = self.base(x)
        return self.classifier(features)


class VideoXceptionModel(nn.Module):
    """Thin wrapper around a timm Xception classifier, used here for face crops.

    The wrapped network is stored as ``self.model``, so every state_dict key of
    this module carries a ``model.`` prefix.
    """
    def __init__(self, num_classes=2):
        # num_classes is forwarded to timm as the size of the classification layer.
        super().__init__()
        self.model = timm.create_model('xception', pretrained=False, num_classes=num_classes)

    def forward(self, x):
        """Return the wrapped model's output for ``x`` unchanged."""
        return self.model(x)


def load_checkpoint_auto(model, ckpt_path, is_video_model=False):
    """Load a checkpoint into ``model``, remapping key prefixes to this notebook's wrappers.

    Args:
        model: Module that receives the weights (modified in place).
        ckpt_path: Path to a file written by ``torch.save``. Either a dict with
            a ``"model_state_dict"`` entry or a bare state dict.
        is_video_model: If True, every key that does not already start with
            ``"model."`` gets that prefix. If False, a leading ``"backbone."``
            is renamed to ``"base."``.

    Returns:
        The loaded checkpoint object (tensors are kept on the CPU).

    Notes:
        Loading is non-strict, so key mismatches do not raise. They are printed
        instead, because a wrong remapping would otherwise leave layers at their
        random initialisation without any visible sign.
    """
    # NOTE(security): weights_only=False unpickles arbitrary objects — only load trusted files.
    # Loading onto the CPU is enough: load_state_dict copies values into the model's own
    # parameters, and the returned checkpoint then does not hold a second copy on the GPU.
    ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
    state = ckpt["model_state_dict"] if isinstance(ckpt, dict) and "model_state_dict" in ckpt else ckpt

    if is_video_model:
        def remap(key):
            return key if key.startswith("model.") else "model." + key
    else:
        def remap(key):
            # count=1 limits the rename to the leading prefix.
            return key.replace("backbone.", "base.", 1) if key.startswith("backbone.") else key

    new_state = {remap(key): value for key, value in state.items()}
    load_result = model.load_state_dict(new_state, strict=False)

    # print() rather than warnings.warn(): this notebook silences all warnings.
    for label, keys in (("missing", load_result.missing_keys),
                        ("unexpected", load_result.unexpected_keys)):
        if keys:
            preview = ", ".join(keys[:5]) + (" ..." if len(keys) > 5 else "")
            print(f"  WARNING: {len(keys)} {label} key(s) when loading {ckpt_path}: {preview}")

    return ckpt


# Load models
# Audio branch: build the model, move it to the device, load weights, switch to eval mode.
print("Loading Audio model...")
audio_model = AudioXceptionClassifier(num_classes=2)
audio_model = audio_model.to(device)
audio_ckpt = load_checkpoint_auto(audio_model, AUDIO_MODEL_PATH, is_video_model=False)
audio_model.eval()
audio_epoch = audio_ckpt.get('epoch', 'N/A')
print(f"  Loaded! Epoch: {audio_epoch}")

# Video branch: same steps, using the video key remapping.
print("\nLoading Video model...")
video_model = VideoXceptionModel(num_classes=2)
video_model = video_model.to(device)
video_ckpt = load_checkpoint_auto(video_model, VIDEO_MODEL_PATH, is_video_model=True)
video_model.eval()
video_epoch = video_ckpt.get('epoch', 'N/A')
print(f"  Loaded! Epoch: {video_epoch}")

print("\nAll models ready!")

Loading Audio model...
  Loaded! Epoch: 2

Loading Video model...
  Loaded! Epoch: 7

All models ready!


In [None]:
# CELL 8: PREPROCESSING (AUDIO & FACE)

import urllib.request

# Download face detection models (OpenCV)
# Face-detector files (Haar cascade, SSD prototxt, SSD weights): each is downloaded
# only if it is not already present in the working directory.
_DETECTOR_FILES = (
    ('haarcascade_frontalface_default.xml',
     'https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml'),
    ('deploy.prototxt',
     'https://raw.githubusercontent.com/opencv/opencv/master/samples/dnn/face_detector/deploy.prototxt'),
    ('res10_300x300_ssd_iter_140000.caffemodel',
     'https://raw.githubusercontent.com/opencv/opencv_3rdparty/dnn_samples_face_detector_20170830/res10_300x300_ssd_iter_140000.caffemodel'),
)

for _filename, url in _DETECTOR_FILES:
    if not os.path.exists(_filename):
        urllib.request.urlretrieve(url, _filename)

haar_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
dnn_net = cv2.dnn.readNetFromCaffe('deploy.prototxt', 'res10_300x300_ssd_iter_140000.caffemodel')

# Face crops: resize to 299x299, convert to a tensor, normalise each channel with mean 0.5 / std 0.5.
_channel_mean = [0.5, 0.5, 0.5]
_channel_std = [0.5, 0.5, 0.5]
video_transform = transforms.Compose([
    transforms.Resize((299, 299)),
    transforms.ToTensor(),
    transforms.Normalize(_channel_mean, _channel_std),
])

def extract_audio_from_video(video_path, output_path="temp_audio.wav", sr=16000):
    """Extract the audio track of a video to a mono 16-bit PCM WAV file using ffmpeg.

    Args:
        video_path: Path of the input video.
        output_path: Where to write the WAV file (overwritten if it exists).
        sr: Target sample rate in Hz.

    Returns:
        ``output_path`` on success, otherwise ``None`` (ffmpeg unavailable,
        non-zero exit status, timeout after 60 s, or no output file produced).
    """
    cmd = ['ffmpeg', '-i', video_path, '-vn', '-acodec', 'pcm_s16le',
           '-ar', str(sr), '-ac', '1', output_path, '-y']
    try:
        result = subprocess.run(
            cmd,
            stdin=subprocess.DEVNULL,   # never let ffmpeg wait on or consume the kernel's stdin
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=60,
        )
    except (OSError, ValueError):
        # ffmpeg could not be started at all, so nothing was written.
        return None
    except subprocess.SubprocessError:
        # Timed out: the process was killed and may have left a truncated file.
        result = None

    if result is not None and result.returncode == 0 and os.path.exists(output_path):
        return output_path

    # Failed run: remove any partial output so it is never mistaken for valid audio.
    try:
        os.remove(output_path)
    except OSError:
        pass
    return None


def preprocess_audio(audio_path, sr=16000, window_size=2.0, overlap=0.5):
    """Cut an audio file into overlapping windows and convert each to a mel-spectrogram tensor.

    Args:
        audio_path: Path of the audio file to load.
        sr: Sample rate passed to ``librosa.load``.
        window_size: Window length in seconds.
        overlap: Fraction of each window shared with the next one.

    Returns:
        A list of tensors, one per window, each of shape (3, 299, 299): the
        standardised dB mel-spectrogram resized to 299x299 and repeated over
        three channels. Empty if the file cannot be loaded or no window is at
        least 80% of ``window_size`` long.

    Raises:
        ValueError: If ``window_size`` and ``overlap`` give a non-positive hop.
    """
    try:
        y, sr = librosa.load(audio_path, sr=sr)
    except Exception:
        # Best effort: unreadable audio just means "no audio windows".
        # (Not a bare except, so KeyboardInterrupt still interrupts the run.)
        return []

    win_len = int(window_size * sr)
    hop_len = int(win_len * (1 - overlap))
    if hop_len <= 0:
        raise ValueError(
            f"window_size={window_size} and overlap={overlap} give a non-positive hop ({hop_len} samples)"
        )

    mel_spectrograms = []

    for start in range(0, len(y), hop_len):
        end = start + win_len
        y_slice = y[start:end]

        # A tail shorter than 80% of a window ends the loop.
        if len(y_slice) < int(0.8 * win_len):
            break

        # A slightly short tail is zero-padded up to the full window length.
        if len(y_slice) < win_len:
            y_slice = np.pad(y_slice, (0, win_len - len(y_slice)))

        mel = librosa.feature.melspectrogram(
            y=y_slice, sr=sr,
            n_fft=2048, hop_length=512,
            n_mels=128, fmin=20, fmax=8000
        )
        mel_db = librosa.power_to_db(mel, ref=1.0)
        # Per-window standardisation; the epsilon avoids division by zero on constant input.
        mel_db = (mel_db - mel_db.mean()) / (mel_db.std() + 1e-8)

        # Resize the 2-D spectrogram to the 299x299 input size.
        mel_resized = F.interpolate(
            torch.from_numpy(mel_db).unsqueeze(0).unsqueeze(0).float(),
            size=(299, 299), mode='bilinear', align_corners=False
        ).squeeze()

        # Repeat the single channel three times to get a 3-channel image tensor.
        mel_rgb = mel_resized.repeat(3, 1, 1)
        mel_spectrograms.append(mel_rgb)

    return mel_spectrograms


def detect_face_hybrid(frame, min_size=80):
    """Locate one face in a BGR frame: DNN detector first, Haar cascade as fallback.

    Args:
        frame: Image array; its first two dimensions are height and width.
        min_size: Minimum accepted face width and height in pixels.

    Returns:
        ``(x, y, width, height)`` of the selected face, or ``None`` if neither
        detector finds a face of at least ``min_size``.
    """
    frame_h, frame_w = frame.shape[:2]

    resized = cv2.resize(frame, (300, 300))
    blob = cv2.dnn.blobFromImage(resized, 1.0, (300, 300), (104.0, 177.0, 123.0))
    dnn_net.setInput(blob)
    detections = dnn_net.forward()

    # Detector boxes are relative coordinates; this scales them to pixels.
    to_pixels = np.array([frame_w, frame_h, frame_w, frame_h])

    best_face = None
    best_conf = 0
    for detection in detections[0, 0]:
        confidence = detection[2]
        if not confidence > 0.3:
            continue

        left, top, right, bottom = (detection[3:7] * to_pixels).astype("int")
        # Clamp the box to the frame.
        left, top = max(0, left), max(0, top)
        right, bottom = min(frame_w, right), min(frame_h, bottom)
        box_w, box_h = right - left, bottom - top

        # Keep the most confident box that is large enough.
        if box_w >= min_size and box_h >= min_size and confidence > best_conf:
            best_face = (left, top, box_w, box_h)
            best_conf = confidence

    if best_face is not None:
        return best_face

    # Fallback: Haar cascade on the grayscale frame, taking the largest detection.
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = haar_cascade.detectMultiScale(
        gray, scaleFactor=1.05, minNeighbors=3,
        minSize=(min_size, min_size)
    )
    if len(faces) == 0:
        return None

    x, y, w2, h2 = max(faces, key=lambda rect: rect[2]*rect[3])
    return (x, y, w2, h2)

print("Preprocessing functions ready")


Preprocessing functions ready


In [None]:
# CELL 9: FUSION CONFIG & PROCESS_VIDEO

FUSION_STRATEGY   = 'adaptive'
FUSION_TAU        = 3.0      # higher value = more sensitive to confidence
AUDIO_TEMPERATURE = 2.5      # reduces the influence of the audio branch
VIDEO_WEIGHT      = 0.85     # only used with the 'video_trust' strategy

# Banner template; the four placeholders are filled in the order of the constants above.
_FUSION_BANNER = """
============================================================
 FUSION CONFIGURATION
============================================================
Strategy: {}
Fusion Tau: {}
Audio Temperature: {}
Video Weight: {}
============================================================
"""
print(_FUSION_BANNER.format(FUSION_STRATEGY, FUSION_TAU, AUDIO_TEMPERATURE, VIDEO_WEIGHT))


def compute_adaptive_weights(video_logits, audio_logits, tau=2.0):
    """Compute per-sample fusion weights from each modality's confidence.

    The confidence of a modality is its largest softmax probability. The two
    confidences are scaled by ``tau`` and passed through a softmax over the
    modality axis, so the weights are positive and sum to 1 per sample.

    Args:
        video_logits: Tensor of shape (batch, classes).
        audio_logits: Tensor of shape (batch, classes).
        tau: Sharpness; larger values shift more weight to the more confident modality.

    Returns:
        ``(alpha_video, alpha_audio)``, each of shape (batch,).
    """
    r_video = F.softmax(video_logits, dim=1).max(dim=1).values
    r_audio = F.softmax(audio_logits, dim=1).max(dim=1).values

    # Same result as exp(tau*r) / (exp(tau*r_v) + exp(tau*r_a)), but softmax subtracts the
    # maximum first, so a large tau cannot overflow to inf and produce NaN weights.
    weights = torch.softmax(tau * torch.stack([r_video, r_audio], dim=0), dim=0)

    return weights[0], weights[1]


def fuse_predictions(video_logits, audio_logits,
                     strategy='adaptive', tau=2.0, video_weight=0.8):
    """Fuse video and audio logits with a weighted sum, then apply softmax.

    Args:
        video_logits: Tensor of shape (batch, classes).
        audio_logits: Tensor of shape (batch, classes).
        strategy: ``'equal'`` (0.5 / 0.5), ``'video_trust'`` (``video_weight`` /
            ``1 - video_weight``) or ``'adaptive'`` (confidence-based weights).
        tau: Sharpness for the adaptive strategy.
        video_weight: Video weight for the ``'video_trust'`` strategy.

    Returns:
        ``(fused_probs, weights)``: the softmax of the fused logits, and a dict
        with float entries ``'alpha_video'`` and ``'alpha_audio'`` (batch means
        for the adaptive strategy).

    Raises:
        ValueError: If ``strategy`` is not one of the three names above.
    """
    if strategy not in ('equal', 'video_trust', 'adaptive'):
        raise ValueError(f"Unknown fusion strategy: {strategy}")

    if strategy == 'adaptive':
        w_video, w_audio = compute_adaptive_weights(video_logits, audio_logits, tau)
        # Per-sample weights are broadcast over the class dimension.
        fused_logits = w_video.unsqueeze(1) * video_logits + w_audio.unsqueeze(1) * audio_logits
        alpha_video = w_video.mean().item()
        alpha_audio = w_audio.mean().item()
    else:
        if strategy == 'equal':
            alpha_video, alpha_audio = 0.5, 0.5
        else:
            alpha_video, alpha_audio = video_weight, 1.0 - video_weight
        fused_logits = alpha_video * video_logits + alpha_audio * audio_logits

    weights = {'alpha_video': alpha_video, 'alpha_audio': alpha_audio}
    return F.softmax(fused_logits, dim=1), weights


def process_video(video_path,
                  frame_skip=5,
                  max_frames=80,
                  audio_temperature=AUDIO_TEMPERATURE,
                  fusion_strategy=FUSION_STRATEGY,
                  fusion_tau=FUSION_TAU,
                  video_weight=VIDEO_WEIGHT):
    """Run the video model, the audio model and their fusion on one video file.

    Every ``frame_skip``-th frame among the first ``max_frames`` frames is
    searched for a face; each detected face is scored by the video model. For
    every detected face one audio window (chosen round-robin) is scored by the
    audio model. Predictions use the notebook convention 1 = REAL, 0 = FAKE.

    Args:
        video_path: Path of the video file.
        frame_skip: Sample one frame out of this many.
        max_frames: Stop reading after this many frames.
        audio_temperature: Divisor applied to the audio logits.
        fusion_strategy: Strategy name passed to ``fuse_predictions``.
        fusion_tau: ``tau`` passed to ``fuse_predictions``.
        video_weight: ``video_weight`` passed to ``fuse_predictions``.

    Returns:
        A dict with keys ``video_pred``, ``video_real_prob``,
        ``video_confidence``, ``audio_pred``, ``audio_real_prob``,
        ``audio_confidence``, ``fused_pred``, ``fused_real_prob``,
        ``fusion_weights`` and ``n_frames``. The audio entries are ``None`` and
        the fused entries equal the video ones when no audio window is
        available. ``None`` is returned if the video cannot be opened or no
        face is detected.
    """
    audio_path = extract_audio_from_video(video_path)
    cap = None

    video_logits_list = []
    audio_logits_list = []
    faces_found = 0

    try:
        audio_mels = []
        if audio_path and os.path.exists(audio_path):
            audio_mels = preprocess_audio(audio_path, overlap=0.5)

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return None

        # Audio logits per window index: the round-robin below revisits the same windows,
        # and the model is in eval mode, so each window only needs one forward pass.
        audio_logits_cache = {}
        frame_count = 0

        with torch.no_grad():
            while cap.isOpened() and frame_count < max_frames:
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % frame_skip == 0:
                    face_result = detect_face_hybrid(frame)

                    if face_result is not None:
                        x, y, w, h = face_result
                        face_img = frame[y:y+h, x:x+w]
                        face_pil = Image.fromarray(cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB))
                        face_tensor = video_transform(face_pil).unsqueeze(0).to(device)

                        # Video prediction
                        video_logits = video_model(face_tensor)
                        video_logits_list.append(video_logits.cpu())

                        # Audio prediction
                        # NOTE(review): windows are paired with faces by count, not by time, so
                        # the audio score depends on how many faces were detected — confirm intended.
                        if len(audio_mels) > 0:
                            audio_idx = faces_found % len(audio_mels)
                            if audio_idx not in audio_logits_cache:
                                audio_tensor = audio_mels[audio_idx].unsqueeze(0).to(device)
                                audio_logits_raw = audio_model(audio_tensor)
                                audio_logits = audio_logits_raw / audio_temperature

                                # Flip the two logits so the audio class order matches the
                                # video model's (index 1 is read as FAKE below).
                                audio_logits_flipped = torch.flip(audio_logits, dims=[1])
                                audio_logits_cache[audio_idx] = audio_logits_flipped.cpu()
                            audio_logits_list.append(audio_logits_cache[audio_idx])

                        faces_found += 1

                frame_count += 1
    finally:
        # Always release the capture and delete the temporary WAV, including on the
        # early return above and when an exception propagates to the caller.
        if cap is not None:
            cap.release()
        if audio_path and os.path.exists(audio_path):
            os.remove(audio_path)

    if faces_found == 0 or len(video_logits_list) == 0:
        return None

    # VIDEO: index 1 is treated as FAKE -> prob REAL = 1 - prob_fake
    video_logits_all = torch.cat(video_logits_list, dim=0)
    video_probs = F.softmax(video_logits_all, dim=1).mean(dim=0)
    video_fake_prob = video_probs[1].item()
    video_real_prob = 1.0 - video_fake_prob
    video_pred = 1 if video_real_prob > 0.5 else 0
    video_confidence = max(video_real_prob, 1.0 - video_real_prob)

    # AUDIO + FUSION
    if len(audio_logits_list) > 0:
        audio_logits_all = torch.cat(audio_logits_list, dim=0)
        audio_probs = F.softmax(audio_logits_all, dim=1).mean(dim=0)

        audio_fake_prob = audio_probs[1].item()
        audio_real_prob = 1.0 - audio_fake_prob
        audio_pred = 1 if audio_real_prob > 0.5 else 0
        audio_confidence = max(audio_real_prob, 1.0 - audio_real_prob)

        # Fusion works on the mean logits of each modality (one row per video).
        video_logits_mean = video_logits_all.mean(dim=0, keepdim=True)
        audio_logits_mean = audio_logits_all.mean(dim=0, keepdim=True)

        fused_probs, fusion_weights = fuse_predictions(
            video_logits_mean,
            audio_logits_mean,
            strategy=fusion_strategy,
            tau=fusion_tau,
            video_weight=video_weight
        )

        fused_probs = fused_probs.squeeze(0)
        fused_fake_prob = fused_probs[1].item()
        fused_real_prob = 1.0 - fused_fake_prob
        fused_pred = 1 if fused_real_prob > 0.5 else 0
    else:
        # No audio windows: the fused result falls back to the video-only result.
        audio_real_prob = None
        audio_pred = None
        audio_confidence = None
        fused_real_prob = video_real_prob
        fused_pred = video_pred
        fusion_weights = {'alpha_video': 1.0, 'alpha_audio': 0.0}

    return {
        'video_pred': video_pred,
        'video_real_prob': video_real_prob,
        'video_confidence': video_confidence,
        'audio_pred': audio_pred,
        'audio_real_prob': audio_real_prob,
        'audio_confidence': audio_confidence,
        'fused_pred': fused_pred,
        'fused_real_prob': fused_real_prob,
        'fusion_weights': fusion_weights,
        'n_frames': faces_found
    }

print("Video processing function ready")



 FUSION CONFIGURATION
Strategy: adaptive
Fusion Tau: 3.0
Audio Temperature: 2.5
Video Weight: 0.85

Video processing function ready


In [None]:
# CELL 10: EVALUATION LOOP (GLOBAL LAV-DF TEST)

# The evaluated set is whatever lavdf_test_df currently holds (possibly the subsample).
print(f"Running cross-dataset evaluation on LAV-DF ({len(lavdf_test_df)} videos)...")

results = []
failed_videos = []  # (video_path, reason) for every video that produced no result

for idx, row in tqdm(lavdf_test_df.iterrows(), total=len(lavdf_test_df), desc="LAV-DF ALL"):
    video_path = row["path"]
    ground_truth = row["label"]

    if not os.path.exists(video_path):
        failed_videos.append((video_path, "file not found"))
        continue

    try:
        out = process_video(
            video_path,
            frame_skip=5,
            max_frames=80,
            fusion_strategy=FUSION_STRATEGY,
            fusion_tau=FUSION_TAU,
            video_weight=VIDEO_WEIGHT,
        )
    except Exception as exc:
        # Keep going, but record the failure instead of discarding it silently.
        failed_videos.append((video_path, f"{type(exc).__name__}: {exc}"))
        continue

    if out is None:
        failed_videos.append((video_path, "video could not be opened or no face detected"))
        continue

    out["ground_truth"] = ground_truth
    out["modify_type"] = row["modify_type"]
    out["video_path"] = video_path
    results.append(out)

print(f"\nProcessed {len(results)} / {len(lavdf_test_df)} videos")
print(f" REAL: {sum(1 for r in results if r['ground_truth'] == 1)}")
print(f" FAKE: {sum(1 for r in results if r['ground_truth'] == 0)}")

# Skipped videos change the class balance behind the metrics, so make them visible.
if failed_videos:
    print(f" Skipped: {len(failed_videos)}")
    for failed_path, reason in failed_videos[:5]:
        print(f"   {failed_path}: {reason}")


Running cross-dataset evaluation on LAV-DF (full test set)...


LAV-DF ALL:   0%|          | 0/150 [00:00<?, ?it/s]


Processed 150 / 150 videos
 REAL: 75
 FAKE: 75


In [None]:
# CELL 11: METRICS (GLOBAL)

results_df = pd.DataFrame(results)

y_true = results_df["ground_truth"].values


def _threshold_metrics(labels, preds):
    """Return (accuracy, precision, recall, F1); label 1 (REAL) is the positive class."""
    return (
        accuracy_score(labels, preds),
        precision_score(labels, preds, zero_division=0),
        recall_score(labels, preds, zero_division=0),
        f1_score(labels, preds, zero_division=0),
    )


# VIDEO-ONLY
y_pred_video = results_df["video_pred"].values
y_prob_video = results_df["video_real_prob"].values

video_acc, video_prec, video_rec, video_f1 = _threshold_metrics(y_true, y_pred_video)
fpr_v, tpr_v, _ = roc_curve(y_true, y_prob_video)
video_auc = auc(fpr_v, tpr_v)

# AUDIO-ONLY (only videos for which an audio prediction exists)
audio_results = results_df[results_df['audio_pred'].notna()]
if len(audio_results) > 0:
    y_true_audio = audio_results['ground_truth'].values
    p_real = audio_results['audio_real_prob'].values.astype(float)  # probability of REAL

    # ROC + AUC
    fpr_a, tpr_a, thr_a = roc_curve(y_true_audio, p_real)
    audio_auc = auc(fpr_a, tpr_a)

    # Pick the threshold that maximises Youden's J = TPR - FPR.
    # NOTE(review): this threshold is tuned on the same data it is scored on, while the
    # video and fused rows use the fixed 0.5 cut-off, so the audio row is optimistic.
    youden = tpr_a - fpr_a
    best_idx = np.argmax(youden)
    best_thr = thr_a[best_idx]

    # roc_curve computes each point with `score >= threshold`; use the same comparison
    # so the predictions correspond to the operating point that was selected.
    y_pred_audio = (p_real >= best_thr).astype(int)

    audio_acc, audio_prec, audio_rec, audio_f1 = _threshold_metrics(y_true_audio, y_pred_audio)

    print(f"\n[Audio] Best threshold: {best_thr:.3f}")
else:
    audio_acc = audio_prec = audio_rec = audio_f1 = audio_auc = 0.0

# FUSED
y_pred_fused = results_df["fused_pred"].values
y_prob_fused = results_df["fused_real_prob"].values

fused_acc, fused_prec, fused_rec, fused_f1 = _threshold_metrics(y_true, y_pred_fused)
fpr_f, tpr_f, _ = roc_curve(y_true, y_prob_fused)
fused_auc = auc(fpr_f, tpr_f)

avg_alpha_v = np.mean([r["fusion_weights"]["alpha_video"] for r in results])
avg_alpha_a = np.mean([r["fusion_weights"]["alpha_audio"] for r in results])

print("\n" + "="*70)
print(" CROSS-DATASET RESULTS: LAV-DF (GLOBAL)")
print("="*70 + "\n")

print(f"{'Method':<25} {'Accuracy':>10} {'Precision':>10} {'Recall':>10} {'F1':>10} {'AUC':>10}")
print("-"*75)
print(f"{'Video-Only':<25} {video_acc*100:>9.2f}% {video_prec*100:>9.2f}% {video_rec*100:>9.2f}% {video_f1*100:>9.2f}% {video_auc:>10.4f}")
print(f"{'Audio-Only':<25} {audio_acc*100:>9.2f}% {audio_prec*100:>9.2f}% {audio_rec*100:>9.2f}% {audio_f1*100:>9.2f}% {audio_auc:>10.4f}")
print(f"{'Adaptive Fusion':<25} {fused_acc*100:>9.2f}% {fused_prec*100:>9.2f}% {fused_rec*100:>9.2f}% {fused_f1*100:>9.2f}% {fused_auc:>10.4f}")
print("-"*75)
print(f"\nAverage Fusion Weights: alpha_v = {avg_alpha_v:.4f}, alpha_a = {avg_alpha_a:.4f}")



[Audio] Best threshold: 0.081

 CROSS-DATASET RESULTS: LAV-DF (GLOBAL)

Method                      Accuracy  Precision     Recall         F1        AUC
---------------------------------------------------------------------------
Video-Only                    50.67%     50.34%    100.00%     66.96%     0.6837
Audio-Only                    51.33%     51.35%     50.67%     51.01%     0.4700
Adaptive Fusion               56.00%     53.24%     98.67%     69.16%     0.6850
---------------------------------------------------------------------------

Average Fusion Weights: alpha_v = 0.5544, alpha_a = 0.4456


In [None]:
# CELL X: COMPARE FUSION STRATEGIES ON LAV-DF

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

# Each entry names a fusion configuration; "Video-Only" and "Audio-Only" are expressed
# as 'video_trust' with the video weight set to 1.0 and 0.0 respectively.
STRATEGIES = [
    {"name": "Video-Only",          "strategy": "video_trust", "tau": 1.0, "video_weight": 1.0},
    {"name": "Audio-Only",          "strategy": "video_trust", "tau": 1.0, "video_weight": 0.0},
    {"name": "Equal (50/50)",       "strategy": "equal",       "tau": 1.0, "video_weight": 0.5},
    {"name": "Video-Trust (80/20)", "strategy": "video_trust", "tau": 1.0, "video_weight": 0.8},
    {"name": "Adaptive (τ=2.0)",    "strategy": "adaptive",    "tau": 2.0, "video_weight": 0.5},
    {"name": "Adaptive (τ=3.0)",    "strategy": "adaptive",    "tau": 3.0, "video_weight": 0.5},
]

print("="*70)
print(" COMPARING FUSION STRATEGIES ON LAV-DF")
print("="*70)

all_strategy_results = {}

# NOTE(review): every strategy re-runs the whole pipeline (ffmpeg, face detection, both
# models) although only the fusion step differs; caching per-video logits would avoid this,
# but process_video does not currently return them.
for cfg in STRATEGIES:
    name = cfg["name"]
    print(f"\nTesting: {name}")

    strat_results = []
    n_failed = 0
    first_error = None

    for _, row in tqdm(lavdf_test_df.iterrows(), total=len(lavdf_test_df), desc=name):
        video_path = row["path"]
        gt = row["label"]

        if not os.path.exists(video_path):
            continue

        # Same policy as the main evaluation loop: one bad video must not abort the comparison.
        try:
            out = process_video(
                video_path,
                frame_skip=5,
                max_frames=80,
                fusion_strategy=cfg["strategy"],
                fusion_tau=cfg["tau"],
                video_weight=cfg["video_weight"],
            )
        except Exception as exc:
            n_failed += 1
            if first_error is None:
                first_error = f"{video_path}: {type(exc).__name__}: {exc}"
            continue

        if out is None:
            continue

        out["ground_truth"] = gt
        strat_results.append(out)

    if n_failed:
        print(f" {n_failed} video(s) raised an error and were skipped (first: {first_error})")

    if not strat_results:
        print(f"{name}: no valid results")
        continue

    df_s = pd.DataFrame(strat_results)
    y_true = df_s["ground_truth"].values
    y_pred = df_s["fused_pred"].values
    y_prob = df_s["fused_real_prob"].values

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    auc_score = auc(fpr, tpr)

    avg_v = np.mean([r["fusion_weights"]["alpha_video"] for r in strat_results])
    avg_a = np.mean([r["fusion_weights"]["alpha_audio"] for r in strat_results])

    all_strategy_results[name] = {
        "accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc_score,
        "avg_video_weight": avg_v, "avg_audio_weight": avg_a,
    }

    print(f" Accuracy: {acc*100:.2f}% | F1: {f1*100:.2f}% | AUC: {auc_score:.4f}")


print("\n" + "="*70)
print(" FUSION STRATEGY COMPARISON - LAV-DF")
print("="*70)
print()
print(f"{'Strategy':<25} {'Accuracy':>10} {'Precision':>10} {'Recall':>10} {'F1':>10} {'AUC':>10}")
print("-"*75)

for name, m in all_strategy_results.items():
    print(f"{name:<25} {m['accuracy']*100:>9.2f}% {m['precision']*100:>9.2f}% "
          f"{m['recall']*100:>9.2f}% {m['f1']*100:>9.2f}% {m['auc']:>10.4f}")

print("-"*75)
if all_strategy_results:
    # Ties on accuracy resolve to the first strategy in STRATEGIES order.
    best = max(all_strategy_results.items(), key=lambda x: x[1]["accuracy"])
    print(f"\nBEST on LAV-DF: {best[0]} with {best[1]['accuracy']*100:.2f}% accuracy")
else:
    # max() on an empty dict would raise ValueError.
    print("\nNo strategy produced valid results")


 COMPARING FUSION STRATEGIES ON LAV-DF

Testing: Video-Only


Video-Only:   0%|          | 0/150 [00:00<?, ?it/s]

 Accuracy: 50.67% | F1: 66.96% | AUC: 0.6828

Testing: Audio-Only


Audio-Only:   0%|          | 0/150 [00:00<?, ?it/s]

 Accuracy: 50.00% | F1: 0.00% | AUC: 0.4898

Testing: Equal (50/50)


Equal (50/50):   0%|          | 0/150 [00:00<?, ?it/s]

 Accuracy: 56.00% | F1: 69.16% | AUC: 0.6869

Testing: Video-Trust (80/20)


Video-Trust (80/20):   0%|          | 0/150 [00:00<?, ?it/s]

 Accuracy: 51.33% | F1: 67.26% | AUC: 0.6857

Testing: Adaptive (τ=2.0)


Adaptive (τ=2.0):   0%|          | 0/150 [00:00<?, ?it/s]

 Accuracy: 56.00% | F1: 69.16% | AUC: 0.6884

Testing: Adaptive (τ=3.0)


Adaptive (τ=3.0):   0%|          | 0/150 [00:00<?, ?it/s]

 Accuracy: 56.00% | F1: 69.16% | AUC: 0.6850

 FUSION STRATEGY COMPARISON - LAV-DF

Strategy                    Accuracy  Precision     Recall         F1        AUC
---------------------------------------------------------------------------
Video-Only                    50.67%     50.34%    100.00%     66.96%     0.6828
Audio-Only                    50.00%      0.00%      0.00%      0.00%     0.4898
Equal (50/50)                 56.00%     53.24%     98.67%     69.16%     0.6869
Video-Trust (80/20)           51.33%     50.68%    100.00%     67.26%     0.6857
Adaptive (τ=2.0)              56.00%     53.24%     98.67%     69.16%     0.6884
Adaptive (τ=3.0)              56.00%     53.24%     98.67%     69.16%     0.6850
---------------------------------------------------------------------------

BEST on LAV-DF: Equal (50/50) with 56.00% accuracy
