# Out-of-Distribution (OOD) Evaluation

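This notebook evaluates model robustness on out-of-distribution music samples—tracks that differ systematically from the training distribution. The samples here are synthetic feature vectors drawn from controlled random distributions, standing in for real audio features. This kind of evaluation is critical for understanding real-world deployment performance.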

In [None]:
import sys
import warnings
from pathlib import Path

sys.path.insert(0, '../')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Notebook-wide configuration.
# Installs an "ignore" filter for every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Plot styling: seaborn "whitegrid" theme and a 14x8 default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (14, 8)})

# Title banner followed by a 70-character rule.
banner_rule = "=" * 70
print("Out-of-Distribution Robustness Evaluation", banner_rule, sep="\n")

## 1. Create OOD Datasets

Generate systematically different distributions to simulate real-world scenarios.

In [None]:
# Seed the legacy global NumPy RNG so every dataset below is reproducible
# under Restart & Run All.
np.random.seed(42)

# In-distribution training data
n_train = 2000
n_genres = 8
n_features = 20  # feature dimensionality shared by every dataset in this cell
genres = ['rock', 'electronic', 'hip-hop', 'classical', 'jazz', 'folk', 'pop', 'experimental']

# NOTE: all label vectors in this cell are drawn uniformly at random and
# independently of the features, so any classifier is expected to score near
# chance (1 / n_genres) on every dataset, including the in-distribution one.
X_in_dist = np.random.normal(0, 1, (n_train, n_features))
y_train = np.random.randint(0, n_genres, n_train)

# In-distribution test data
n_test = 500
X_in_dist_test = np.random.normal(0, 1, (n_test, n_features))
y_in_dist_test = np.random.randint(0, n_genres, n_test)

print(f"In-distribution training: {X_in_dist.shape}")
print(f"In-distribution test: {X_in_dist_test.shape}")

# Create OOD datasets with different characteristics
print("\nGenerating OOD datasets...")

# OOD 1: Shifted mean (music with different loudness/energy)
X_ood_shifted = np.random.normal(2.0, 1, (n_test, n_features))
y_ood_shifted = np.random.randint(0, n_genres, n_test)
print("  ✓ OOD Shifted: Mean shift of 2.0 (different audio energy)")

# OOD 2: Increased variance (extreme dynamics)
X_ood_high_var = np.random.normal(0, 3, (n_test, n_features))
y_ood_high_var = np.random.randint(0, n_genres, n_test)
print("  ✓ OOD High Variance: 3x increased std dev (extreme dynamics)")

# OOD 3: Sparse features (missing information / lossy compression)
# Start from in-distribution-like features and knock out ~70% of the entries.
# (Sampling from [0, NaN] and then zero-filling NaN would produce an all-zero
# matrix, i.e. 100% missing rather than 70% missing.)
sparse_missing_rate = 0.7
X_ood_sparse = np.random.normal(0, 1, (n_test, n_features))
missing_mask = np.random.random((n_test, n_features)) < sparse_missing_rate
X_ood_sparse[missing_mask] = np.nan
X_ood_sparse = np.nan_to_num(X_ood_sparse)  # Replace NaN with 0
y_ood_sparse = np.random.randint(0, n_genres, n_test)
print("  ✓ OOD Sparse: 70% missing features (compressed audio)")

# OOD 4: Corrupted features (noise / degraded audio)
X_ood_noisy = (
    np.random.normal(0, 1, (n_test, n_features))
    + np.random.normal(0, 2, (n_test, n_features))
)
y_ood_noisy = np.random.randint(0, n_genres, n_test)
print("  ✓ OOD Noisy: Added Gaussian noise (degraded quality)")

# OOD 5: Bimodal distribution (mixture of genres / covers)
# Split n_test so the two modes always add up to exactly n_test rows, keeping
# features and labels the same length even when n_test is odd.
n_mode_1 = n_test // 2
n_mode_2 = n_test - n_mode_1
X_ood_bimodal_1 = np.random.normal(-1.5, 0.8, (n_mode_1, n_features))
X_ood_bimodal_2 = np.random.normal(1.5, 0.8, (n_mode_2, n_features))
X_ood_bimodal = np.vstack([X_ood_bimodal_1, X_ood_bimodal_2])
y_ood_bimodal = np.random.randint(0, n_genres, n_test)
print("  ✓ OOD Bimodal: Mixture of two distributions (cover songs)")

## 2. Train Model on In-Distribution Data

Train the model on the standard (in-distribution) training set.

In [None]:
# Standardize in-distribution data: the scaler statistics come from the
# training features only and are then reused for the held-out test features.
scaler = StandardScaler()
scaler.fit(X_in_dist)
X_train_scaled = scaler.transform(X_in_dist)
X_test_scaled = scaler.transform(X_in_dist_test)

# Train model (fit() returns the fitted estimator itself)
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    random_state=42,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

# Evaluate on in-distribution test set: hard predictions and class probabilities
y_pred_in = model.predict(X_test_scaled)
y_pred_proba_in = model.predict_proba(X_test_scaled)

# Baseline metrics; later cells read in_dist_accuracy and in_dist_f1.
in_dist_accuracy = accuracy_score(y_in_dist_test, y_pred_in)
in_dist_f1 = f1_score(y_in_dist_test, y_pred_in, average='weighted')

print("In-Distribution Performance:")
print(f"  Accuracy:  {in_dist_accuracy:.4f}")
print(f"  F1-Score:  {in_dist_f1:.4f}")

## 3. Evaluate on OOD Datasets

Test model robustness across different OOD scenarios.

In [None]:
# Function to evaluate on OOD data
def evaluate_ood(X_ood, y_ood, scaler, model, ood_name, baseline_accuracy=None):
    """Evaluate a fitted model on one (possibly out-of-distribution) dataset.

    Parameters
    ----------
    X_ood : array-like
        Raw (unscaled) feature matrix. Non-finite entries are replaced before
        scaling: NaN -> 0.0, +inf -> 1.0, -inf -> -1.0.
    y_ood : array-like
        Ground-truth labels for ``X_ood``.
    scaler : object
        Fitted transformer exposing ``transform``.
    model : object
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    ood_name : str
        Scenario label stored under the ``'OOD Type'`` key.
    baseline_accuracy : float, optional
        Reference accuracy used for ``'Performance Drop'``. When omitted, the
        notebook-level ``in_dist_accuracy`` is used, so that variable must
        already be defined.

    Returns
    -------
    dict
        Keys: ``'OOD Type'``, ``'Accuracy'``, ``'F1-Score'`` (weighted),
        ``'Mean Confidence'`` (mean of the per-sample maximum class
        probability), ``'Entropy'`` (mean per-sample entropy of the predicted
        class probabilities) and ``'Performance Drop'`` (baseline accuracy
        minus this dataset's accuracy).
    """
    # Sanitize the raw input up front. The previous try/except fallback simply
    # re-ran the same scaler.transform call that had just failed, so it could
    # never recover from bad values.
    X_as_float = np.asarray(X_ood, dtype=float)
    if not np.isfinite(X_as_float).all():
        X_ood = np.nan_to_num(X_as_float, nan=0.0, posinf=1.0, neginf=-1.0)
    X_ood_scaled = scaler.transform(X_ood)

    if baseline_accuracy is None:
        # Backward-compatible default: the in-distribution accuracy computed
        # in the training cell.
        baseline_accuracy = in_dist_accuracy

    y_pred_ood = model.predict(X_ood_scaled)
    y_pred_proba_ood = model.predict_proba(X_ood_scaled)

    accuracy = accuracy_score(y_ood, y_pred_ood)
    f1 = f1_score(y_ood, y_pred_ood, average='weighted')

    # Compute confidence metrics
    max_probs = np.max(y_pred_proba_ood, axis=1)
    mean_confidence = max_probs.mean()
    # The 1e-10 offset keeps log() finite for classes with zero probability.
    entropy = -np.sum(y_pred_proba_ood * np.log(y_pred_proba_ood + 1e-10), axis=1).mean()

    return {
        'OOD Type': ood_name,
        'Accuracy': accuracy,
        'F1-Score': f1,
        'Mean Confidence': mean_confidence,
        'Entropy': entropy,
        'Performance Drop': baseline_accuracy - accuracy
    }

# Evaluate all OOD scenarios
# Each entry is (scenario label, raw features, labels); the in-distribution
# test set comes first so it occupies row 0 of the results table.
evaluation_sets = [
    ('In-Distribution', X_in_dist_test, y_in_dist_test),
    ('Shifted Mean', X_ood_shifted, y_ood_shifted),
    ('High Variance', X_ood_high_var, y_ood_high_var),
    ('Sparse Features', X_ood_sparse, y_ood_sparse),
    ('Noisy', X_ood_noisy, y_ood_noisy),
    ('Bimodal', X_ood_bimodal, y_ood_bimodal),
]
ood_results = [
    evaluate_ood(features, labels, scaler, model, scenario_label)
    for scenario_label, features, labels in evaluation_sets
]

# One row per scenario, in the order listed above.
results_df = pd.DataFrame(ood_results)

results_rule = "=" * 70
print("\n" + results_rule)
print("OUT-OF-DISTRIBUTION EVALUATION RESULTS")
print(results_rule)
print(results_df.to_string(index=False))

## 4. Visualize OOD Performance

Compare model performance across different OOD scenarios.

In [None]:
# Performance comparison visualization
def _plot_metric_bars(ax, labels, values, bar_colors, ylabel, title,
                      baseline=None, ylim=None):
    """Draw one bar panel with a bar per scenario.

    ``baseline`` (optional) adds a dashed green reference line plus a legend;
    ``ylim`` (optional) fixes the y-axis range.
    """
    positions = list(range(len(values)))
    ax.bar(positions, values, color=bar_colors, alpha=0.7, edgecolor='black', linewidth=1.5)
    if baseline is not None:
        ax.axhline(y=baseline, color='green', linestyle='--', linewidth=2,
                   label='In-Distribution Baseline')
        ax.legend()
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(ylim)
    ax.grid(True, alpha=0.3, axis='y')


fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Out-of-Distribution Robustness Analysis', fontweight='bold', fontsize=16)

scenario_labels = results_df['OOD Type']

# Bar colours for the metric panels: green for the in-distribution baseline,
# orange for the shifted-mean scenario, red for every other scenario.
colors = ['green' if x == 'In-Distribution' else 'orange' if x.startswith('Shifted') else 'red'
          for x in scenario_labels]

# Plot 1: Accuracy comparison
_plot_metric_bars(axes[0, 0], scenario_labels, results_df['Accuracy'], colors,
                  ylabel='Accuracy', title='Accuracy Across OOD Scenarios',
                  baseline=in_dist_accuracy, ylim=[0, 1.0])

# Plot 2: F1-Score comparison
_plot_metric_bars(axes[0, 1], scenario_labels, results_df['F1-Score'], colors,
                  ylabel='F1-Score', title='F1-Score Across OOD Scenarios',
                  baseline=in_dist_f1, ylim=[0, 1.0])

# Plot 3: Performance drop
# Colour by severity: no drop -> green, under 0.15 -> orange, otherwise red.
perf_drop = results_df['Performance Drop']
drop_colors = ['green' if x <= 0 else 'orange' if x < 0.15 else 'red' for x in perf_drop]
_plot_metric_bars(axes[1, 0], scenario_labels, perf_drop, drop_colors,
                  ylabel='Performance Drop', title='Accuracy Loss on OOD Data')
axes[1, 0].axhline(y=0, color='black', linestyle='-', linewidth=1)

# Plot 4: Confidence vs Entropy
ax = axes[1, 1]
ax.scatter(results_df['Mean Confidence'], results_df['Entropy'],
           s=300, c=range(len(results_df)), cmap='viridis', alpha=0.7,
           edgecolor='black', linewidth=1.5)
for ood_type, confidence, entropy_value in zip(
        scenario_labels, results_df['Mean Confidence'], results_df['Entropy']):
    ax.annotate(ood_type, (confidence, entropy_value), fontsize=8, ha='center', va='bottom')
ax.set_xlabel('Mean Prediction Confidence')
ax.set_ylabel('Prediction Entropy')
ax.set_title('Confidence vs. Uncertainty')
ax.grid(True, alpha=0.3)

plt.tight_layout()

# Create the output directory first: savefig raises FileNotFoundError when
# the target directory does not exist (e.g. on a fresh checkout).
output_path = Path('../outputs/ood_robustness_analysis.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

## 5. Robustness Metrics

Compute quantitative robustness measures.

In [None]:
# Compute robustness metrics
metrics_rule = "=" * 70
print("\n" + metrics_rule)
print("ROBUSTNESS METRICS")
print(metrics_rule)

# 1. Average performance drop (over every row except the in-distribution baseline)
is_ood_row = results_df['OOD Type'] != 'In-Distribution'
ood_only = results_df[is_ood_row]
ood_drops = ood_only['Performance Drop']
avg_drop = ood_drops.mean()
max_drop = ood_drops.max()
max_drop_scenario = ood_only.loc[ood_drops.idxmax(), 'OOD Type']

print("\nPerformance Degradation:")
print(f"  Average drop across OOD scenarios: {avg_drop:.2%}")
print(f"  Maximum drop: {max_drop:.2%} ({max_drop_scenario})")

# 2. Robustness score: complement of the average absolute accuracy drop
robustness_score = 1 - avg_drop
print(f"  Robustness Score: {robustness_score:.2%}")

# 3. Worst-case scenario (lowest OOD accuracy)
ood_accuracy = ood_only['Accuracy']
worst_accuracy = ood_accuracy.min()
worst_scenario = ood_only.loc[ood_accuracy.idxmin(), 'OOD Type']
print("\nWorst-Case Performance:")
print(f"  Scenario: {worst_scenario}")
print(f"  Accuracy: {worst_accuracy:.2%}")

# 4. Confidence calibration on OOD
# Index label 0 of results_df is read as the in-distribution baseline row.
baseline_confidence = results_df.at[0, 'Mean Confidence']
baseline_entropy = results_df.at[0, 'Entropy']
print("\nConfidence Calibration:")
print(f"  In-distribution mean confidence: {baseline_confidence:.4f}")
print(f"  OOD mean confidence: {ood_only['Mean Confidence'].mean():.4f}")
print(f"  Entropy increase: {ood_only['Entropy'].mean() - baseline_entropy:.4f}")

# Map the score onto a verdict: the first tier whose threshold is strictly
# exceeded wins; anything at or below 0.7 falls through to the last message.
print("\nInterpretation:")
interpretation_tiers = (
    (0.9, "  ✓ Excellent robustness: Model maintains performance on OOD data"),
    (0.8, "  ✓ Good robustness: Minor performance degradation on OOD data"),
    (0.7, "  ⚠ Moderate robustness: Noticeable performance degradation on OOD data"),
)
print(next(
    (message for threshold, message in interpretation_tiers if robustness_score > threshold),
    "  ✗ Poor robustness: Significant performance degradation on OOD data",
))

## 6. Per-Scenario Analysis

Deep dive into each OOD scenario.

In [None]:
scenario_rule = "=" * 70
print("\n" + scenario_rule)
print("PER-SCENARIO ANALYSIS")
print(scenario_rule)

# (scenario label as it appears in results_df, human-readable description)
scenarios = [
    ("Shifted Mean", "Audio with different energy/loudness profiles (e.g., live vs. studio recordings)"),
    ("High Variance", "Extreme dynamics (e.g., dynamic classical music)"),
    ("Sparse Features", "Lossy compression artifacts (e.g., MP3-compressed audio)"),
    ("Noisy", "Degraded audio quality (e.g., low-bitrate streaming)"),
    ("Bimodal", "Hybrid genres / cover songs (e.g., acoustic pop cover)")
]

for scenario_name, description in scenarios:
    scenario_row = results_df.loc[results_df['OOD Type'] == scenario_name]
    if scenario_row.empty:
        # Scenario was not evaluated; nothing to report.
        continue

    # Report the first matching row.
    acc = scenario_row['Accuracy'].iloc[0]
    drop = scenario_row['Performance Drop'].iloc[0]
    conf = scenario_row['Mean Confidence'].iloc[0]

    print(f"\n{scenario_name}:")
    print(f"  Description: {description}")
    print(f"  Accuracy: {acc:.2%} (drop: {drop:.2%})")
    print(f"  Mean Confidence: {conf:.4f}")

    # Grade the accuracy drop: under 0.1 is robust, under 0.2 is minor,
    # anything else is significant.
    if drop < 0.1:
        assessment = "  Assessment: Model robust to this scenario ✓"
    elif drop < 0.2:
        assessment = "  Assessment: Minor performance degradation ⚠"
    else:
        assessment = "  Assessment: Significant performance degradation ✗"
    print(assessment)

## 7. Summary and Recommendations

Synthesize OOD evaluation findings and provide recommendations.

In [None]:
summary_rule = "=" * 70
print("\n" + summary_rule)
print("OUT-OF-DISTRIBUTION EVALUATION SUMMARY")
print(summary_rule)

# Headline numbers; ood_only and robustness_score come from the robustness
# metrics cell, in_dist_accuracy from the training cell.
print("\nKey Findings:")
print(f"  1. In-distribution accuracy: {in_dist_accuracy:.2%}")
print(f"  2. Average OOD accuracy: {ood_only['Accuracy'].mean():.2%}")
print(f"  3. Overall robustness score: {robustness_score:.2%}")

# Scenarios with the smallest and largest accuracy drop.
summary_drops = ood_only['Performance Drop']
most_robust = ood_only.loc[summary_drops.idxmin()]
least_robust = ood_only.loc[summary_drops.idxmax()]

print("\nMost Robust To:")
print(f"  • {most_robust['OOD Type']}: {most_robust['Accuracy']:.2%}")

print("\nLeast Robust To:")
print(f"  • {least_robust['OOD Type']}: {least_robust['Accuracy']:.2%}")

# Pick the recommendation list from the robustness score (cut-off: 0.8).
print("\nRecommendations for Production Deployment:")
if robustness_score < 0.8:
    recommendations = [
        "  1. Implement model ensemble for improved robustness",
        "  2. Apply data augmentation to training set (synthetic OOD data)",
        "  3. Consider domain adaptation techniques",
        "  4. Implement input validation and preprocessing normalization",
        "  5. Add confidence-based filtering for low-confidence predictions",
    ]
else:
    recommendations = [
        "  1. Model shows acceptable robustness for deployment",
        "  2. Monitor performance on production data for drift",
        "  3. Implement periodic retraining with new data",
        "  4. Use confidence scores for reliability estimates",
    ]
for recommendation in recommendations:
    print(recommendation)

print("\n✓ Out-of-distribution evaluation complete!")