# Punjabi Speech Spoof Detection Evaluation
## Samsung PRISM Project

This notebook implements comprehensive evaluation metrics for spoof detection on the Punjabi speech dataset.

**Authors:** Harsh Partap Jain, Gurkirat Singh, Ashmit Singh

### Features:
- **Baseline**: MFCC + Cosine Similarity to Bonafide Centroid
- **Advanced**: LFCC + CQCC Features with Logistic Regression

### Evaluation Metrics:
- **EER** (Equal Error Rate)
- **minDCF** (Minimum Detection Cost Function)
- **actDCF** (Actual Detection Cost Function)
- **Cllr** (Log-Likelihood Ratio Cost)

In [None]:
# =============================================================================
# CELL 1: Import Required Libraries and Configuration
# =============================================================================

import os
import glob
import math
import numpy as np
import pandas as pd
import librosa
import scipy.fftpack
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from scipy.optimize import brentq
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet plus the seaborn colour palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# =============================================================================
# CONFIGURATION - Punjabi Dataset Paths
# =============================================================================
# Machine-specific dataset folders; edit these to point at your local copy.
PATH_BONAFIDE = r"C:\Users\Harsh Jain\Downloads\prism\Bonafide"
PATH_SPOOFED_1 = r"C:\Users\Harsh Jain\Downloads\prism\Spoofed-1"
PATH_SPOOFED_2 = r"C:\Users\Harsh Jain\Downloads\prism\Spoofed-2"

# Audio parameters
SAMPLE_RATE = 16000  # Hz; every file is loaded at this rate
MIN_DURATION = 0.2   # seconds; shorter clips are skipped by the extractors

# Feature extraction parameters
N_MFCC = 20       # MFCC coefficients kept per frame
N_LFCC = 20       # LFCC coefficients kept per frame
N_CQCC = 20       # CQCC coefficients kept per frame
N_FFT = 512       # STFT size used for LFCC
HOP_LENGTH = 160  # hop in samples, shared by the STFT and the CQT
WIN_LENGTH = 400  # STFT window length in samples
N_FILTERS = 40    # number of triangular filters in the linear filterbank

# Echo the active configuration so each notebook run is self-describing.
print(
    "=" * 60,
    "PUNJABI SPEECH SPOOF DETECTION - Configuration",
    "=" * 60,
    f"Bonafide Path: {PATH_BONAFIDE}",
    f"Spoofed-1 Path: {PATH_SPOOFED_1}",
    f"Spoofed-2 Path: {PATH_SPOOFED_2}",
    f"Sample Rate: {SAMPLE_RATE} Hz",
    f"MFCC Features: {N_MFCC}",
    f"LFCC Features: {N_LFCC}",
    f"CQCC Features: {N_CQCC}",
    sep="\n",
)

In [None]:
# =============================================================================
# CELL 2: Utility Functions
# =============================================================================

def list_audio_files(folder):
    """Recursively find all audio files in a folder.

    A path is kept when its extension, compared case-insensitively, is one of
    wav, flac, mp3, ogg or m4a. Matching on the lower-cased suffix makes the
    result the same on case-sensitive and case-insensitive filesystems
    (e.g. ``clip.WAV`` is found everywhere).

    Args:
        folder: Root directory to search.

    Returns:
        Sorted list of matching paths (empty if the folder has no matches).
    """
    extensions = {'.wav', '.flac', '.mp3', '.ogg', '.m4a'}
    # One recursive walk instead of one walk per extension.
    candidates = glob.glob(os.path.join(folder, '**', '*'), recursive=True)
    return sorted(
        path for path in candidates
        if os.path.splitext(path)[1].lower() in extensions
    )


def parse_punjabi_filename(filepath):
    """
    Parse Punjabi dataset filename convention.
    Format: pa_S{sentence}_{speaker}_{gender}_{device}_{condition}_{distance}_{angle}_{direction}_{noise}_{channel}_{mic}.wav
    Example: pa_S01_f1_female_IP14p_na_1m_90_east_57db_0_B.wav

    The first five underscore-separated tokens are read positionally. The
    distance is the first token that looks like ``<number>m`` (e.g. ``1m``,
    ``0.5m``), wherever it appears.

    Args:
        filepath: Path (or bare filename) of the audio file.

    Returns:
        dict with keys 'filename', 'filepath', 'language', 'sentence_id',
        'speaker_id', 'gender', 'device' and 'distance'. Any field that cannot
        be read from the name is set to 'unknown', so every key is always
        present.
    """
    filename = os.path.basename(filepath)
    stem = os.path.splitext(filename)[0]
    parts = stem.split('_')

    info = {'filename': filename, 'filepath': filepath}
    positional_fields = ('language', 'sentence_id', 'speaker_id', 'gender', 'device')
    for position, field in enumerate(positional_fields):
        info[field] = parts[position] if position < len(parts) else 'unknown'

    # Default to 'unknown' like the other fields instead of omitting the key.
    info['distance'] = next(
        (
            part for part in parts
            if part.endswith('m') and part[:-1].replace('.', '').isdigit()
        ),
        'unknown',
    )

    return info


# Smoke-test the helpers and build the file lists that later cells rely on.
print("Testing utility functions...")
bona_files, spoof1_files, spoof2_files = [
    list_audio_files(folder)
    for folder in (PATH_BONAFIDE, PATH_SPOOFED_1, PATH_SPOOFED_2)
]

print(
    f"\nFound {len(bona_files)} bonafide files",
    f"Found {len(spoof1_files)} spoofed-1 files",
    f"Found {len(spoof2_files)} spoofed-2 files",
    sep="\n",
)

# Show how the first bonafide filename is decomposed (skipped when no files).
if bona_files:
    print(f"\nSample filename parsing:")
    sample_info = parse_punjabi_filename(bona_files[0])
    for field, value in sample_info.items():
        print(f"  {field}: {value}")

## Feature Extraction Functions

Three types of features are extracted:
1. **MFCC** - Mel-Frequency Cepstral Coefficients (baseline)
2. **LFCC** - Linear Frequency Cepstral Coefficients
3. **CQCC** - Constant-Q Cepstral Coefficients

In [None]:
# =============================================================================
# CELL 3: MFCC Feature Extraction
# =============================================================================

def extract_mfcc_mean(filepath, sr=SAMPLE_RATE, n_mfcc=N_MFCC):
    """Return the time-averaged, L2-normalised MFCC vector of one audio file.

    Args:
        filepath: Audio file to load (resampled to ``sr``, mixed to mono).
        sr: Target sample rate passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        1-D array with ``n_mfcc`` values, or None when the clip is shorter
        than ``MIN_DURATION`` seconds or any step raises (a warning is
        printed in both cases).
    """
    try:
        signal, _ = librosa.load(filepath, sr=sr, mono=True)
        if len(signal) < sr * MIN_DURATION:
            print(f"[WARN] File too short: {filepath}")
            return None

        # Average each coefficient over axis 1 of the MFCC matrix.
        pooled = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc).mean(axis=1)

        # Scale to unit length; an all-zero vector is returned unchanged.
        magnitude = np.linalg.norm(pooled)
        return pooled / magnitude if magnitude > 0 else pooled
    except Exception as exc:
        # Best effort: one unreadable file must not abort the whole run.
        print(f"[WARN] Failed to read {filepath}: {exc}")
        return None


# Sanity-check MFCC extraction on the first bonafide file, if there is one.
if bona_files:
    print("Testing MFCC extraction...")
    test_mfcc = extract_mfcc_mean(bona_files[0])
    if test_mfcc is not None:
        print(
            f"MFCC shape: {test_mfcc.shape}",
            f"MFCC L2 norm: {np.linalg.norm(test_mfcc):.4f} (should be ~1.0)",
            sep="\n",
        )

In [None]:
# =============================================================================
# CELL 4: LFCC Feature Extraction
# =============================================================================

def linear_filterbank(n_fft, sr, n_filters=N_FILTERS, fmin=0, fmax=None):
    """Build a bank of triangular filters spaced linearly in frequency.

    Args:
        n_fft: FFT size; each filter has ``n_fft // 2 + 1`` bins.
        sr: Sample rate in Hz.
        n_filters: Number of triangular filters.
        fmin: Lower edge of the first filter in Hz.
        fmax: Upper edge of the last filter in Hz (defaults to ``sr / 2``).

    Returns:
        Array of shape ``(n_filters, n_fft // 2 + 1)``.
    """
    upper_hz = sr / 2 if fmax is None else fmax

    # n_filters + 2 edges: each filter uses three consecutive edges.
    edge_hz = np.linspace(fmin, upper_hz, n_filters + 2)
    edge_bins = np.floor((n_fft + 1) * edge_hz / sr).astype(int)

    bank = np.zeros((n_filters, n_fft // 2 + 1))
    triples = zip(edge_bins[:-2], edge_bins[1:-1], edge_bins[2:])
    for row, (left, centre, right) in enumerate(triples):
        # Rising slope over [left, centre); skipped if the edges coincide.
        if centre > left:
            bank[row, left:centre] = (np.arange(left, centre) - left) / (centre - left)
        # Falling slope over [centre, right), starting at 1 on the centre bin.
        if right > centre:
            bank[row, centre:right] = (right - np.arange(centre, right)) / (right - centre)

    return bank


def extract_lfcc(y, sr=SAMPLE_RATE, n_fft=N_FFT, hop_length=HOP_LENGTH,
                 win_length=WIN_LENGTH, n_filters=N_FILTERS, n_ceps=N_LFCC):
    """Extract utterance-level LFCC (Linear Frequency Cepstral Coefficient) statistics.

    Pipeline: power spectrogram -> linear triangular filterbank -> log ->
    orthonormal DCT along the filter axis -> keep the first ``n_ceps``
    coefficients -> pool each coefficient over axis 1 with mean and std.

    Args:
        y: Audio samples.
        sr: Sample rate in Hz.
        n_fft: FFT size for the STFT.
        hop_length: STFT hop in samples.
        win_length: STFT window length in samples.
        n_filters: Number of linear filterbank channels.
        n_ceps: Number of cepstral coefficients kept.

    Returns:
        1-D array of length ``2 * n_ceps``: the per-coefficient means followed
        by the per-coefficient standard deviations.
    """
    power_spec = np.abs(
        librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    ) ** 2
    fb = linear_filterbank(n_fft, sr, n_filters=n_filters)

    band_energy = np.dot(fb, power_spec[:n_fft // 2 + 1, :])
    band_energy[band_energy == 0] = 1e-8  # avoid log(0)
    log_energy = np.log(band_energy)

    ceps = scipy.fftpack.dct(log_energy, axis=0, norm='ortho')[:n_ceps, :]

    # Pool the raw cepstra. Normalising each coefficient to zero mean and unit
    # variance over time *before* taking its mean/std over the same axis (as
    # the previous version did) yields ~0 and ~1 for every file, so the
    # feature vector carried no information about the audio.
    return np.hstack([ceps.mean(axis=1), ceps.std(axis=1)])


# Test LFCC extraction on the first bonafide file (skipped when no files were found).
if bona_files:
    print("Testing LFCC extraction...")
    # y_test holds the decoded waveform and is reused by the CQCC smoke test.
    # NOTE: the name y_test is rebound to the test-set labels by
    # train_test_split in the advanced evaluation cell.
    y_test, _ = librosa.load(bona_files[0], sr=SAMPLE_RATE, mono=True)
    test_lfcc = extract_lfcc(y_test)
    print(f"LFCC feature shape: {test_lfcc.shape}")

In [None]:
# =============================================================================
# CELL 5: CQCC Feature Extraction
# =============================================================================

def extract_cqcc(y, sr=SAMPLE_RATE, bins_per_octave=24, n_octaves=7, n_ceps=N_CQCC):
    """Extract utterance-level CQCC (Constant-Q Cepstral Coefficient) statistics.

    Pipeline: constant-Q transform starting at 20 Hz -> magnitude -> log ->
    orthonormal DCT along the frequency-bin axis -> keep the first ``n_ceps``
    coefficients -> pool each coefficient over axis 1 with mean and std.
    This is a simplified CQCC: the DCT is applied directly to the log CQT
    magnitudes, with no resampling of the frequency axis.

    Args:
        y: Audio samples.
        sr: Sample rate in Hz.
        bins_per_octave: CQT resolution.
        n_octaves: Number of octaves covered above 20 Hz.
        n_ceps: Number of cepstral coefficients kept.

    Returns:
        1-D array of length ``2 * n_ceps``: the per-coefficient means followed
        by the per-coefficient standard deviations.
    """
    fmin = 20.0
    n_bins = n_octaves * bins_per_octave

    # NOTE(review): some librosa releases reject hop lengths that are not a
    # multiple of 2**(n_octaves - 1); confirm HOP_LENGTH works with the
    # installed version.
    C = librosa.cqt(y, sr=sr, hop_length=HOP_LENGTH, fmin=fmin,
                    n_bins=n_bins, bins_per_octave=bins_per_octave)
    magnitude = np.abs(C)
    magnitude[magnitude == 0] = 1e-8  # avoid log(0)

    log_magnitude = np.log(magnitude)
    ceps = scipy.fftpack.dct(log_magnitude, axis=0, norm='ortho')[:n_ceps, :]

    # Pool the raw cepstra. Normalising each coefficient to zero mean and unit
    # variance over time *before* taking its mean/std over the same axis (as
    # the previous version did) yields ~0 and ~1 for every file, so the
    # feature vector carried no information about the audio.
    return np.hstack([ceps.mean(axis=1), ceps.std(axis=1)])


def extract_combined_features(filepath, sr=SAMPLE_RATE):
    """Load one audio file and return its LFCC and CQCC vectors concatenated.

    Args:
        filepath: Audio file to load (resampled to ``sr``, mixed to mono).
        sr: Target sample rate.

    Returns:
        1-D array (LFCC statistics followed by CQCC statistics), or None when
        the clip is shorter than ``MIN_DURATION`` seconds (silently) or when
        loading/extraction raises (a warning is printed).
    """
    try:
        waveform, _ = librosa.load(filepath, sr=sr, mono=True)
        if len(waveform) < sr * MIN_DURATION:
            return None

        # LFCC first, then CQCC, so the column layout is stable.
        pieces = [
            extractor(waveform, sr=sr)
            for extractor in (extract_lfcc, extract_cqcc)
        ]
        return np.hstack(pieces)
    except Exception as exc:
        # Best effort: skip files that cannot be processed.
        print(f"[WARN] Failed to extract features from {filepath}: {exc}")
        return None


# Test CQCC and combined extraction (skipped when no bonafide files were found).
if bona_files:
    print("Testing CQCC extraction...")
    # y_test is the waveform loaded by the LFCC smoke-test cell; that cell
    # must have been run first.
    test_cqcc = extract_cqcc(y_test)
    print(f"CQCC feature shape: {test_cqcc.shape}")
    
    print("\nTesting combined LFCC+CQCC extraction...")
    test_combined = extract_combined_features(bona_files[0])
    if test_combined is not None:
        print(f"Combined feature shape: {test_combined.shape}")
        # Each feature family contributes a mean and a std per coefficient.
        print(f"  - LFCC: {N_LFCC * 2} features (mean + std)")
        print(f"  - CQCC: {N_CQCC * 2} features (mean + std)")

## Evaluation Metrics

Standard metrics for spoof detection evaluation:
- **EER** - Equal Error Rate (where FPR = FNR)
- **minDCF** - Minimum Detection Cost Function
- **actDCF** - Actual DCF at a given threshold
- **Cllr** - Log-Likelihood Ratio Cost

In [None]:
# =============================================================================
# CELL 6: Evaluation Metrics Functions
# =============================================================================

def compute_eer(labels, scores, pos_label=1):
    """
    Estimate the Equal Error Rate from the ROC operating points.

    The ROC point whose false-positive and false-negative rates are closest
    is selected, and the EER is reported as the average of the two rates at
    that point (no interpolation between points).

    Returns:
        eer: Equal Error Rate (0-1)
        threshold: Score threshold of the selected ROC point
        fpr, tpr, thresholds: ROC curve data from ``roc_curve``
    """
    fpr, tpr, thresholds = roc_curve(labels, scores, pos_label=pos_label)
    miss_rate = 1 - tpr

    # Index where |FPR - FNR| is smallest, ignoring NaNs.
    closest = np.nanargmin(np.abs(fpr - miss_rate))
    eer = (fpr[closest] + miss_rate[closest]) / 2.0

    return eer, thresholds[closest], fpr, tpr, thresholds


def compute_min_dcf(labels, scores, beta=1.9, C_miss=1.0, C_fa=10.0, pos_label=1):
    """
    Compute the minimum Detection Cost Function (minDCF) over ROC thresholds.

    The cost at each threshold is ``beta * FNR * C_miss + FPR * C_fa``; the
    smallest value over all thresholds returned by ``roc_curve`` is reported.

    Returns:
        min_dcf: Minimum DCF value
        threshold: Threshold at minDCF
    """
    fpr, tpr, thresholds = roc_curve(labels, scores, pos_label=pos_label)

    costs = beta * (1 - tpr) * C_miss + fpr * C_fa
    cheapest = np.argmin(costs)

    return costs[cheapest], thresholds[cheapest]


def compute_act_dcf(labels, scores, threshold, beta=1.9, C_miss=1.0, C_fa=10.0):
    """
    Compute Actual Detection Cost Function (actDCF) at a given threshold.

    A trial is accepted when ``score >= threshold``. Label 1 is the positive
    (target) class and label 0 the negative class. Inputs may be lists or
    arrays; they are converted with ``np.asarray``.

    Args:
        labels: Ground-truth labels (1 = positive, 0 = negative).
        scores: Detection scores, same length as ``labels``.
        threshold: Decision threshold.
        beta, C_miss, C_fa: Weights in ``beta * P_miss * C_miss + P_fa * C_fa``.

    Returns:
        act_dcf: Actual DCF value
        P_miss: Miss probability
        P_fa: False alarm probability
        ``(nan, None, None)`` when either class has no trials.
    """
    # Coerce first: comparing a plain list with ``== 1`` yields a single
    # False, which previously made list inputs look like "no trials".
    labels = np.asarray(labels)
    scores = np.asarray(scores)

    is_pos = labels == 1
    is_neg = labels == 0
    pos_count = np.sum(is_pos)
    neg_count = np.sum(is_neg)

    if pos_count == 0 or neg_count == 0:
        return float('nan'), None, None

    accepted = scores >= threshold
    P_miss = np.sum(is_pos & ~accepted) / pos_count
    P_fa = np.sum(is_neg & accepted) / neg_count

    act_dcf = beta * P_miss * C_miss + P_fa * C_fa

    return act_dcf, P_miss, P_fa


def compute_cllr(labels, scores):
    """
    Compute Log-Likelihood Ratio Cost (Cllr) in bits.

    Cllr = 0.5 * [ mean_pos log2(1 + e^{-s}) + mean_neg log2(1 + e^{s}) ],
    where ``s`` is the score interpreted as a natural-log likelihood ratio.
    A system that always outputs s = 0 therefore scores exactly 1 bit.

    Note: Treats scores as LLR-like for baseline; uncalibrated scores (e.g.
    cosine similarities) give a valid number but not a calibrated cost.

    Args:
        labels: Ground-truth labels (1 = positive, 0 = negative).
        scores: Scores, same length as ``labels``.

    Returns:
        Cllr in bits, or NaN when either class has no trials.
    """
    labels = np.asarray(labels)
    s = np.asarray(scores, dtype=float)
    pos = s[labels == 1]
    neg = s[labels == 0]

    if pos.size == 0 or neg.size == 0:
        return float('nan')

    # logaddexp(0, x) == log(1 + e^x) without overflow for large |x|.
    cost_pos = np.mean(np.logaddexp(0.0, -pos))
    cost_neg = np.mean(np.logaddexp(0.0, neg))

    # The 0.5 averages the two per-class costs (it was missing before, which
    # doubled every reported value).
    cllr_bits = 0.5 * (cost_pos + cost_neg) / math.log(2)

    return cllr_bits


def compute_cllr_proba(scores, labels):
    """Compute Cllr (bits) from probability scores (0-1 range).

    Probabilities are clipped away from 0 and 1, converted to log-odds, and
    the per-class logarithmic costs are averaged with equal class weight:
    ``0.5 * [mean_pos log2(1 + e^{-l}) + mean_neg log2(1 + e^{l})]``.
    Averaging per class (rather than pooling all trials) keeps the value
    independent of how many trials each class has.

    NOTE(review): the log-odds of a classifier posterior include the training
    prior; they are used here as if they were log-likelihood ratios.

    Args:
        scores: Probabilities of the positive class (label 1).
        labels: Ground-truth labels (1 = positive, 0 = negative).

    Returns:
        Cllr in bits, or NaN when either class has no trials.
    """
    eps = 1e-15
    labels = np.asarray(labels)
    proba = np.clip(np.asarray(scores, dtype=float), eps, 1 - eps)
    llr = np.log(proba / (1 - proba))

    pos = llr[labels == 1]
    neg = llr[labels == 0]
    if pos.size == 0 or neg.size == 0:
        return float('nan')

    # logaddexp(0, x) == log(1 + e^x), evaluated stably.
    cost_pos = np.mean(np.logaddexp(0.0, -pos))
    cost_neg = np.mean(np.logaddexp(0.0, neg))
    return 0.5 * (cost_pos + cost_neg) / np.log(2)


# Confirmation message printed once all metric helpers in this cell are defined.
print("Evaluation metric functions defined successfully!")

## Baseline Evaluation: MFCC + Cosine Similarity

This approach:
1. Extracts mean MFCC vectors from all audio files
2. Computes centroid of bonafide samples
3. Scores each sample by cosine similarity to bonafide centroid
4. Higher score = more likely genuine

In [None]:
# =============================================================================
# CELL 7: Baseline MFCC Evaluation - Spoofed-1 vs Bonafide
# =============================================================================

print("=" * 60)
print("BASELINE MFCC EVALUATION: Spoofed-1 vs Bonafide")
print("=" * 60)

# Collect one record per usable file. For this baseline bonafide is the
# positive class (label 1) and spoofed-1 the negative class (label 0).
records = []
for banner, file_list, class_label in (
    ("\nExtracting MFCC from bonafide files...", bona_files, 1),
    ("Extracting MFCC from spoofed-1 files...", spoof1_files, 0),
):
    print(banner)
    for p in file_list:
        feats = extract_mfcc_mean(p)
        if feats is None:
            continue  # unreadable or too short; already reported
        records.append({'path': p, 'label': class_label, 'feat': feats})

print(f"\nTotal samples: {len(records)}")
print(f"  - Bonafide: {sum(1 for r in records if r['label']==1)}")
print(f"  - Spoofed: {sum(1 for r in records if r['label']==0)}")

# Unit-length centroid of all bonafide vectors. Note that the bonafide files
# scored below are the same ones that define the centroid.
feats_pos = np.stack([r['feat'] for r in records if r['label'] == 1])
centroid = feats_pos.mean(axis=0)
centroid_norm = np.linalg.norm(centroid)
if centroid_norm > 0:
    centroid = centroid / centroid_norm

# Score every trial by cosine similarity to the bonafide centroid.
centroid_row = centroid.reshape(1, -1)
scores_baseline = np.array([
    float(cosine_similarity(r['feat'].reshape(1, -1), centroid_row)[0, 0])
    for r in records
])
labels_baseline = np.array([r['label'] for r in records])

# Metrics; actDCF is evaluated at the EER threshold.
eer, eer_thr, fpr, tpr, _ = compute_eer(labels_baseline, scores_baseline)
min_dcf, min_dcf_thr = compute_min_dcf(labels_baseline, scores_baseline)
act_dcf, P_miss, P_fa = compute_act_dcf(labels_baseline, scores_baseline, eer_thr)
cllr = compute_cllr(labels_baseline, scores_baseline)

print("\n" + "=" * 60)
print("BASELINE RESULTS (Spoofed-1 vs Bonafide)")
print("=" * 60)
print(f"EER: {eer * 100:.2f}%")
print(f"EER Threshold: {eer_thr:.4f}")
print(f"minDCF: {min_dcf:.4f}")
print(f"actDCF (at EER threshold): {act_dcf:.4f}")
print(f"Cllr (bits): {cllr:.4f}")

# Keep the headline numbers and ROC points for the comparison cells.
baseline_s1_results = dict(
    eer=eer,
    eer_pct=eer * 100,
    min_dcf=min_dcf,
    cllr=cllr,
    fpr=fpr,
    tpr=tpr,
)

## Advanced Evaluation: LFCC + CQCC with Logistic Regression

This approach:
1. Extracts combined LFCC and CQCC features
2. Trains Logistic Regression classifier
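3. Evaluates on the held-out part of a stratified 80/20 train/test split

Note: in this section the labels are bonafide = 0 and spoofed = 1, so the classifier scores are spoof probabilities. This is the opposite of the baseline above, where bonafide = 1.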

In [None]:
# =============================================================================
# CELL 8: Advanced LFCC+CQCC Evaluation - Spoofed-1 vs Bonafide
# =============================================================================

# Section banner for the advanced evaluation.
print(
    "=" * 60,
    "ADVANCED LFCC+CQCC EVALUATION: Spoofed-1 vs Bonafide",
    "=" * 60,
    sep="\n",
)

# Extract combined features
def load_combined_features(files, label):
    """Extract LFCC+CQCC vectors for ``files`` and pair each with ``label``.

    Files for which ``extract_combined_features`` returns None are dropped.

    Returns:
        (X, y): array of stacked feature vectors and an equally long array
        filled with ``label``.
    """
    kept = [
        feat
        for feat in map(extract_combined_features, files)
        if feat is not None
    ]
    return np.array(kept), np.array([label] * len(kept))

print("\nExtracting LFCC+CQCC from bonafide files...")
X_bona, y_bona = load_combined_features(bona_files, 0)  # Bonafide = 0

print("Extracting LFCC+CQCC from spoofed-1 files...")
X_spoof, y_spoof = load_combined_features(spoof1_files, 1)  # Spoofed = 1

# Fail early with a readable message instead of an obscure shape error from
# np.vstack / train_test_split when one class produced no features.
if len(y_bona) == 0 or len(y_spoof) == 0:
    raise RuntimeError(
        "No usable features for bonafide and/or spoofed-1 files; "
        "check the dataset paths and the extraction warnings above."
    )

X_adv = np.vstack([X_bona, X_spoof])
y_adv = np.hstack([y_bona, y_spoof])

print(f"\nTotal samples: {len(y_adv)}")
print(f"Feature dimension: {X_adv.shape[1]}")

# Train/Test split (stratified so both sets keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X_adv, y_adv, test_size=0.2, stratify=y_adv, random_state=42
)

# Scale features; the scaler is fitted on the training set only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nTrain set: {len(y_train)} samples")
print(f"Test set: {len(y_test)} samples")

# Train classifier
print("\nTraining Logistic Regression...")
clf = LogisticRegression(max_iter=1000, random_state=42)
clf.fit(X_train_scaled, y_train)

# Get scores: probability of class 1 (spoofed) for each test sample
scores_adv = clf.predict_proba(X_test_scaled)[:, 1]

train_acc = clf.score(X_train_scaled, y_train)
test_acc = clf.score(X_test_scaled, y_test)
print(f"Training accuracy: {train_acc * 100:.2f}%")
print(f"Test accuracy: {test_acc * 100:.2f}%")

# Compute EER
fpr_adv, tpr_adv, thresholds_adv = roc_curve(y_test, scores_adv)
fnr_adv = 1 - tpr_adv

try:
    # Root of FNR(x) - x on the interpolated ROC curve, i.e. where FPR == FNR.
    # The interpolator is built once instead of on every brentq evaluation.
    roc_interp_adv = interp1d(fpr_adv, tpr_adv)
    eer_adv = brentq(lambda x: 1. - x - roc_interp_adv(x), 0., 1.)
    eer_thresh_adv = float(interp1d(fpr_adv, thresholds_adv)(eer_adv))
    if not np.isfinite(eer_adv):
        raise ValueError("interpolated EER is not finite")
except (ValueError, RuntimeError):
    # brentq/interp1d could not produce a usable root (bad bracket, value
    # outside the interpolation range, no convergence): fall back to the ROC
    # point where FPR and FNR are closest.
    abs_diffs = np.abs(fpr_adv - fnr_adv)
    idx = np.nanargmin(abs_diffs)
    eer_adv = (fpr_adv[idx] + fnr_adv[idx]) / 2.0
    eer_thresh_adv = thresholds_adv[idx]
# Compute other metrics
def min_dcf_simple(scores, labels, C_miss=1, C_fa=10, pi_spoof=0.05):
    """Minimum (unnormalised) detection cost over all score thresholds.

    Spoof (label 1) is the target class and ``scores`` are spoof scores, so a
    trial is flagged as spoof when ``score >= t``:

    * P_miss = fraction of spoof trials with score < t (missed spoofs)
    * P_fa   = fraction of bonafide trials with score >= t (false alarms)
    * cost   = C_miss * pi_spoof * P_miss + C_fa * (1 - pi_spoof) * P_fa

    Args:
        scores: Spoof scores (higher = more spoof-like).
        labels: Ground-truth labels (1 = spoof, 0 = bonafide).
        C_miss: Cost of a miss.
        C_fa: Cost of a false alarm.
        pi_spoof: Prior probability of the spoof class.

    Returns:
        The smallest cost over the candidate thresholds, or NaN when either
        class has no trials.
    """
    scores = np.asarray(scores, dtype=float)
    labels = np.asarray(labels)
    spoof_scores = scores[labels == 1]
    bona_scores = scores[labels == 0]

    if spoof_scores.size == 0 or bona_scores.size == 0:
        return float('nan')

    # Every distinct score is a candidate threshold; +inf adds the
    # "flag nothing" operating point so the cost can never exceed
    # C_miss * pi_spoof.
    thresholds = np.append(np.unique(scores), np.inf)

    min_cost = np.inf
    for t in thresholds:
        P_miss = np.mean(spoof_scores < t)
        P_fa = np.mean(bona_scores >= t)
        cost = C_miss * pi_spoof * P_miss + C_fa * (1 - pi_spoof) * P_fa
        if cost < min_cost:
            min_cost = cost
    return min_cost

# Remaining metrics on the held-out test scores.
min_dcf_adv = min_dcf_simple(scores_adv, y_test)
cllr_adv = compute_cllr_proba(scores_adv, y_test)
auc_adv = roc_auc_score(y_test, scores_adv)

print("\n" + "=" * 60)
print("ADVANCED RESULTS (Spoofed-1 vs Bonafide)")
print("=" * 60)
print(f"EER: {eer_adv * 100:.2f}%")
print(f"minDCF: {min_dcf_adv:.4f}")
print(f"Cllr (bits): {cllr_adv:.4f}")
print(f"AUC: {auc_adv:.4f}")

# Keep the headline numbers and ROC points for the comparison cells.
advanced_s1_results = dict(
    eer=eer_adv,
    eer_pct=eer_adv * 100,
    min_dcf=min_dcf_adv,
    cllr=cllr_adv,
    auc=auc_adv,
    fpr=fpr_adv,
    tpr=tpr_adv,
)

## Visualization: ROC Curves and Comparison

In [None]:
# =============================================================================
# CELL 9: Visualization - ROC Curves (Baseline, Advanced, and Comparison)
# =============================================================================

# Create the output folder first; savefig does not create missing directories.
os.makedirs("punjabi_evaluation", exist_ok=True)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Panels 1 and 2: one ROC curve each, with the EER operating point marked.
single_panels = [
    (axes[0], baseline_s1_results, 'b-', 'Baseline MFCC',
     "ROC Curve - Baseline MFCC"),
    (axes[1], advanced_s1_results, 'g-',
     f"Advanced LFCC+CQCC (AUC={advanced_s1_results['auc']:.3f})",
     "ROC Curve - Advanced LFCC+CQCC"),
]
for ax, res, line_style, curve_label, title in single_panels:
    ax.plot(res['fpr'], res['tpr'], line_style, linewidth=2, label=curve_label)
    ax.plot([0, 1], [0, 1], 'k--', linewidth=1)  # chance diagonal
    # At the EER, FPR == FNR, so the point is (eer, 1 - eer).
    ax.scatter([res['eer']], [1 - res['eer']],
               marker='x', color='red', s=100, linewidths=3,
               label=f"EER = {res['eer_pct']:.2f}%")
    ax.set_xlabel("False Positive Rate", fontsize=12)
    ax.set_ylabel("True Positive Rate", fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.3)

# Panel 3: both systems on the same axes.
ax3 = axes[2]
ax3.plot(baseline_s1_results['fpr'], baseline_s1_results['tpr'], 'b-', linewidth=2,
         label=f"Baseline (EER={baseline_s1_results['eer_pct']:.1f}%)")
ax3.plot(advanced_s1_results['fpr'], advanced_s1_results['tpr'], 'g-', linewidth=2,
         label=f"Advanced (EER={advanced_s1_results['eer_pct']:.1f}%)")
ax3.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random')
ax3.set_xlabel("False Positive Rate", fontsize=12)
ax3.set_ylabel("True Positive Rate", fontsize=12)
ax3.set_title("ROC Comparison - Baseline vs Advanced", fontsize=14)
ax3.legend(loc='lower right')
ax3.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig("punjabi_evaluation/roc_comparison_spoofed1.png", dpi=150, bbox_inches='tight')
print("Saved ROC comparison to punjabi_evaluation/roc_comparison_spoofed1.png")
plt.show()

## Full Comparison: Spoofed-1 AND Spoofed-2 Evaluation

Run the complete evaluation for both spoof types and compare results.

In [None]:
# =============================================================================
# CELL 10: Full Comparison - Spoofed-1 AND Spoofed-2
# =============================================================================

print("=" * 70)
print("EVALUATING SPOOFED-2 vs BONAFIDE")
print("=" * 70)

# Baseline for Spoofed-2
print("\n--- Baseline MFCC: Spoofed-2 vs Bonafide ---")

# Bonafide is the positive class (1) and spoofed-2 the negative class (0).
records_s2 = []
for file_list, class_label in ((bona_files, 1), (spoof2_files, 0)):
    for p in file_list:
        feats = extract_mfcc_mean(p)
        if feats is None:
            continue  # unreadable or too short; already reported
        records_s2.append({'path': p, 'label': class_label, 'feat': feats})

# Score against the bonafide centroid computed in the Spoofed-1 baseline cell.
centroid_row = centroid.reshape(1, -1)
scores_baseline_s2 = np.array([
    float(cosine_similarity(r['feat'].reshape(1, -1), centroid_row)[0, 0])
    for r in records_s2
])
labels_baseline_s2 = np.array([r['label'] for r in records_s2])

eer_s2_base, _, fpr_s2_base, tpr_s2_base, _ = compute_eer(labels_baseline_s2, scores_baseline_s2)
min_dcf_s2_base, _ = compute_min_dcf(labels_baseline_s2, scores_baseline_s2)
cllr_s2_base = compute_cllr(labels_baseline_s2, scores_baseline_s2)

print(f"EER: {eer_s2_base * 100:.2f}%")
print(f"minDCF: {min_dcf_s2_base:.4f}")
print(f"Cllr: {cllr_s2_base:.4f}")

# Keep the headline numbers and ROC points for the summary and plots.
baseline_s2_results = dict(
    eer=eer_s2_base,
    eer_pct=eer_s2_base * 100,
    min_dcf=min_dcf_s2_base,
    cllr=cllr_s2_base,
    fpr=fpr_s2_base,
    tpr=tpr_s2_base,
)

# Advanced for Spoofed-2
print("\n--- Advanced LFCC+CQCC: Spoofed-2 vs Bonafide ---")
# Bonafide features (X_bona, y_bona; label 0) come from the Spoofed-1 cell;
# only the spoofed-2 features (label 1) are extracted here.
X_spoof2, y_spoof2 = load_combined_features(spoof2_files, 1)

# Fail early with a readable message instead of an obscure shape error.
if len(y_bona) == 0 or len(y_spoof2) == 0:
    raise RuntimeError(
        "No usable features for bonafide and/or spoofed-2 files; "
        "check the dataset paths and the extraction warnings above."
    )

X_adv_s2 = np.vstack([X_bona, X_spoof2])
y_adv_s2 = np.hstack([y_bona, y_spoof2])

X_train_s2, X_test_s2, y_train_s2, y_test_s2 = train_test_split(
    X_adv_s2, y_adv_s2, test_size=0.2, stratify=y_adv_s2, random_state=42
)

# Scaler is fitted on the training set only.
scaler_s2 = StandardScaler()
X_train_s2 = scaler_s2.fit_transform(X_train_s2)
X_test_s2 = scaler_s2.transform(X_test_s2)

clf_s2 = LogisticRegression(max_iter=1000, random_state=42)
clf_s2.fit(X_train_s2, y_train_s2)
# Probability of class 1 (spoofed) for each test sample.
scores_adv_s2 = clf_s2.predict_proba(X_test_s2)[:, 1]

fpr_adv_s2, tpr_adv_s2, thresholds_adv_s2 = roc_curve(y_test_s2, scores_adv_s2)
try:
    # Root of FNR(x) - x on the interpolated ROC curve, i.e. where FPR == FNR.
    # The interpolator is built once instead of on every brentq evaluation.
    roc_interp_s2 = interp1d(fpr_adv_s2, tpr_adv_s2)
    eer_adv_s2 = brentq(lambda x: 1. - x - roc_interp_s2(x), 0., 1.)
    if not np.isfinite(eer_adv_s2):
        raise ValueError("interpolated EER is not finite")
except (ValueError, RuntimeError):
    # No usable root: fall back to the ROC point where FPR and FNR are closest.
    fnr_adv_s2 = 1 - tpr_adv_s2
    idx = np.nanargmin(np.abs(fpr_adv_s2 - fnr_adv_s2))
    eer_adv_s2 = (fpr_adv_s2[idx] + fnr_adv_s2[idx]) / 2.0

min_dcf_adv_s2 = min_dcf_simple(scores_adv_s2, y_test_s2)
cllr_adv_s2 = compute_cllr_proba(scores_adv_s2, y_test_s2)
auc_adv_s2 = roc_auc_score(y_test_s2, scores_adv_s2)

print(f"EER: {eer_adv_s2 * 100:.2f}%")
print(f"minDCF: {min_dcf_adv_s2:.4f}")
print(f"Cllr: {cllr_adv_s2:.4f}")
print(f"AUC: {auc_adv_s2:.4f}")

# Keep the headline numbers and ROC points for the summary and plots.
advanced_s2_results = {
    'eer': eer_adv_s2, 'eer_pct': eer_adv_s2 * 100,
    'min_dcf': min_dcf_adv_s2, 'cllr': cllr_adv_s2, 'auc': auc_adv_s2,
    'fpr': fpr_adv_s2, 'tpr': tpr_adv_s2
}

# =============================================
# SUMMARY COMPARISON TABLE
# =============================================
print("\n\n" + "=" * 70)
print("SUMMARY COMPARISON - ALL EVALUATIONS")
print("=" * 70)

# One row per (spoof set, system) pair, in display order.
# NOTE: the baseline minDCF comes from compute_min_dcf (beta-weighted cost)
# and the advanced minDCF from min_dcf_simple (prior-weighted cost), so the
# two minDCF columns are on different scales.
summary_rows = [
    ('Spoofed-1 (Baseline MFCC)', baseline_s1_results),
    ('Spoofed-1 (Advanced LFCC+CQCC)', advanced_s1_results),
    ('Spoofed-2 (Baseline MFCC)', baseline_s2_results),
    ('Spoofed-2 (Advanced LFCC+CQCC)', advanced_s2_results),
]
summary_data = [
    {
        'Evaluation': name,
        'EER (%)': res['eer_pct'],
        'minDCF': res['min_dcf'],
        'Cllr (bits)': res['cllr'],
    }
    for name, res in summary_rows
]

summary_df = pd.DataFrame(summary_data)
print("\n")
print(summary_df.to_string(index=False))

# Save summary; to_csv does not create missing directories, so make it first.
os.makedirs("punjabi_evaluation", exist_ok=True)
summary_df.to_csv("punjabi_evaluation/summary_comparison.csv", index=False)
print("\n\nSaved summary to punjabi_evaluation/summary_comparison.csv")

In [None]:
# =============================================================================
# CELL 11: Final Visualization - Complete Comparison
# =============================================================================

# Create the output folder first; savefig does not create missing directories.
os.makedirs("punjabi_evaluation", exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Rows: spoof set (1, 2); columns: system (baseline in blue, advanced in green).
roc_panels = [
    (axes[0, 0], baseline_s1_results, 'b-', "Spoofed-1 Baseline"),
    (axes[0, 1], advanced_s1_results, 'g-', "Spoofed-1 Advanced"),
    (axes[1, 0], baseline_s2_results, 'b-', "Spoofed-2 Baseline"),
    (axes[1, 1], advanced_s2_results, 'g-', "Spoofed-2 Advanced"),
]
for ax, res, line_style, panel_name in roc_panels:
    ax.plot(res['fpr'], res['tpr'], line_style, linewidth=2)
    ax.plot([0, 1], [0, 1], 'k--', linewidth=1)  # chance diagonal
    # At the EER, FPR == FNR, so the point is (eer, 1 - eer).
    ax.scatter([res['eer']], [1 - res['eer']],
               marker='x', color='red', s=100, linewidths=3)
    ax.set_xlabel("FPR")
    ax.set_ylabel("TPR")
    ax.set_title(f"{panel_name} (EER={res['eer_pct']:.1f}%)")
    ax.grid(True, alpha=0.3)

plt.suptitle("Punjabi Spoof Detection - ROC Curves Comparison\nSamsung PRISM Project", fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig("punjabi_evaluation/full_roc_comparison.png", dpi=150, bbox_inches='tight')
print("Saved full ROC comparison to punjabi_evaluation/full_roc_comparison.png")
plt.show()

# Bar chart comparison of the four EER values
# Create the output folder first; savefig does not create missing directories.
os.makedirs("punjabi_evaluation", exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))

methods = ['S1-Baseline', 'S1-Advanced', 'S2-Baseline', 'S2-Advanced']
eer_values = [baseline_s1_results['eer_pct'], advanced_s1_results['eer_pct'],
              baseline_s2_results['eer_pct'], advanced_s2_results['eer_pct']]
colors = ['#3498db', '#2ecc71', '#3498db', '#2ecc71']  # blue = baseline, green = advanced

bars = ax.bar(methods, eer_values, color=colors, edgecolor='black', linewidth=1.5)
ax.set_ylabel("EER (%)", fontsize=12)
ax.set_title("Equal Error Rate Comparison\nPunjabi Spoof Detection", fontsize=14)

# Leave 20% headroom above the tallest bar; if every EER is 0 the computed
# top would equal the bottom, so use a fixed upper limit instead.
y_top = max(eer_values) * 1.2
ax.set_ylim(0, y_top if y_top > 0 else 1.0)

# Add value labels just above each bar
for bar, val in zip(bars, eer_values):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, 
            f'{val:.1f}%', ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig("punjabi_evaluation/eer_comparison_bar.png", dpi=150, bbox_inches='tight')
print("Saved EER bar chart to punjabi_evaluation/eer_comparison_bar.png")
plt.show()

# Closing banner listing the artefacts written by the cells above.
print(
    "\n" + "=" * 70,
    "EVALUATION COMPLETE!",
    "=" * 70,
    "\nOutput files saved to: punjabi_evaluation/",
    "  - summary_comparison.csv",
    "  - roc_comparison_spoofed1.png",
    "  - full_roc_comparison.png",
    "  - eer_comparison_bar.png",
    sep="\n",
)