# Analyze Model Errors - EDA

This notebook helps you understand **why the model makes mistakes** by:
1. Loading a trained model
2. Running predictions on validation or detection data
3. Finding misclassified samples
4. Visualizing their mel-spectrograms and waveforms

**Use this to:**
- Identify common error patterns
- See what sounds confuse the model
- Decide what hard negatives to collect

## Setup

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install dependencies
!pip install -q librosa soundfile tensorflow matplotlib

print("Setup complete!")

In [None]:
# Import modules
# The search path is extended first so that the modules imported at the bottom
# of this cell (config, data_loader, preprocessing, model) can be resolved.
import sys
sys.path.append('/content')  # Adjust if your src/ is elsewhere

# Third-party / standard-library packages used throughout the notebook
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import pandas as pd
import os

# Presumably the project's own modules, found through the sys.path entry above.
# `model` is imported under the alias `model_module` because the name `model`
# is later bound to the loaded network itself.
import config
import data_loader
import preprocessing
import model as model_module

print("Modules imported successfully!")

## Option 1: Analyze Validation Set Errors

Use this to inspect errors on the held-out validation split of the clean training dataset (helps you check whether the model learned the classes properly).

In [None]:
# Load model
# The checkpoint file name 'best_model.h5' is joined onto config.MODEL_SAVE_DIR;
# the actual loading is delegated to model_module.load_trained_model.
model_path = os.path.join(config.MODEL_SAVE_DIR, 'best_model.h5')
# `model` is reused by later cells (validation predictions and long-audio detection).
model = model_module.load_trained_model(model_path)

print("Model loaded!")

In [None]:
# Load and prepare validation data
# (The train/val split has to be recreated to get the same validation set)

from sklearn.model_selection import train_test_split

print("Loading data...")
species_data = data_loader.load_species_data()
background_data = data_loader.load_background_data()

# Audio -> mel-spectrogram, remembering which file every spectrogram came from
print("\nConverting to spectrograms...")
species_specs = {}
species_files = {}  # species name -> file paths, aligned with species_specs

for species_name, audio_list in species_data.items():
    # Single pass over (audio, filepath) pairs, then split into two aligned lists
    converted = [
        (preprocessing.audio_to_melspectrogram(audio), filepath)
        for audio, filepath in audio_list
    ]
    species_specs[species_name] = [mel for mel, _ in converted]
    species_files[species_name] = [path for _, path in converted]

converted_background = [
    (preprocessing.audio_to_melspectrogram(audio), filepath)
    for audio, filepath in background_data
]
background_specs = [mel for mel, _ in converted_background]
background_files = [path for _, path in converted_background]

print("\nPreparing dataset...")
# Class name -> integer label: every entry of config.CLASS_NAMES except the
# last one keeps its position, and 'Background' takes the next free index.
species_class_names = config.CLASS_NAMES[:-1]
label_map = dict(zip(species_class_names, range(len(species_class_names))))
label_map['Background'] = len(label_map)

# Flat, parallel containers: spectrogram, numeric label, source file, text label
X_all = []
y_all = []
file_paths = []
labels_text = []

# Species samples first (NO AUGMENTATION for this EDA)
for species_name, mels in species_specs.items():
    X_all.extend(mels)
    # Generator keeps the label lookup per sample (no lookup for an empty species)
    y_all.extend(label_map[species_name] for _ in mels)
    file_paths.extend(species_files[species_name])
    labels_text.extend(species_name for _ in mels)

# Background samples last
X_all.extend(background_specs)
y_all.extend(label_map['Background'] for _ in background_specs)
file_paths.extend(background_files)
labels_text.extend('Background' for _ in background_specs)

X_all = np.array(X_all)
y_all = np.array(y_all)

print(f"Total samples: {len(X_all)}")

In [None]:
# Split into train/val (intended to reproduce the split used during training:
# same split fraction, same seed, stratified on the numeric labels)
split_parts = train_test_split(
    X_all, y_all, file_paths, labels_text,
    test_size=config.VALIDATION_SPLIT,
    random_state=config.RANDOM_SEED,
    stratify=y_all,
)
# train_test_split returns a train/test pair for every array that was passed in
(X_train, X_val,
 y_train, y_val,
 files_train, files_val,
 labels_train, labels_val) = split_parts

print(f"Validation samples: {len(X_val)}")
print("Class distribution:")
for class_id, class_name in enumerate(config.CLASS_NAMES):
    n_in_class = (y_val == class_id).sum()
    print(f"  {class_name}: {n_in_class}")

In [None]:
# Prepare validation data for model input
print("Preprocessing validation data...")


def _to_model_input(spec):
    """Run one spectrogram through: normalize -> resize -> RGB -> model preprocessing."""
    normalized = preprocessing.normalize_spectrogram(spec)
    resized = preprocessing.resize_spectrogram(normalized)
    as_rgb = preprocessing.spectrogram_to_rgb(resized)
    return preprocessing.preprocess_for_model(as_rgb)


# X_val itself is left untouched; it is still needed for plotting later on
X_val_images = np.array([_to_model_input(spec) for spec in X_val])
print(f"Shape: {X_val_images.shape}")

In [None]:
# Run predictions
print("Running predictions...")
predictions = model.predict(X_val_images, batch_size=32, verbose=1)
# Predicted class = position of the largest value in each row of `predictions`
y_pred = np.argmax(predictions, axis=1)

# Find errors: positions where the predicted label differs from the true one
errors_mask = ~(y_pred == y_val)
error_indices = np.flatnonzero(errors_mask)

n_total = len(y_val)
n_errors = len(error_indices)
print(f"\nTotal validation samples: {n_total}")
print(f"Errors: {n_errors}")
print(f"Accuracy: {(1 - n_errors/n_total) * 100:.2f}%")

In [None]:
# Create error analysis dataframe
# The columns are declared explicitly so the frame has them even when the model
# makes no mistakes. Without this, an empty record list yields a column-less
# frame and sorting by 'confidence' raises KeyError.
error_columns = [
    'index',            # position of the sample in the validation arrays
    'file',             # base name of the source audio file
    'true_class',
    'predicted_class',
    'confidence',       # probability assigned to the (wrong) predicted class
    'true_label_num',
    'pred_label_num',
]

error_data = []

for idx in error_indices:
    true_label = y_val[idx]
    pred_label = y_pred[idx]
    confidence = predictions[idx, pred_label]

    error_data.append({
        'index': idx,
        'file': os.path.basename(files_val[idx]),
        'true_class': config.CLASS_NAMES[true_label],
        'predicted_class': config.CLASS_NAMES[pred_label],
        'confidence': confidence,
        'true_label_num': true_label,
        'pred_label_num': pred_label
    })

errors_df = pd.DataFrame(error_data, columns=error_columns)
# Most confident mistakes first
errors_df = errors_df.sort_values('confidence', ascending=False)

if errors_df.empty:
    print("\nNo misclassified validation samples - nothing to analyze.")
else:
    print("\nError Summary:")
    print(errors_df.head(20))

In [None]:
# Confusion patterns
print("\nMost common confusion patterns:")
# Number of errors per (true class, predicted class) pair, largest first
pair_counts = errors_df.groupby(['true_class', 'predicted_class']).size()
confusion_patterns = (
    pair_counts
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
print(confusion_patterns)

## Visualize Error Samples

In [None]:
def visualize_error_sample(idx, show_waveform=True):
    """
    Visualize an error sample: waveform + spectrogram + prediction probabilities

    Reads the notebook-level `errors_df`, `files_val`, `X_val` and `predictions`,
    so the validation cells that create them must have been run first.

    Args:
        idx: Position of the sample in the validation arrays. It must appear in
            the 'index' column of `errors_df`.
        show_waveform: When True (default), the audio is loaded through
            `data_loader.load_audio_file` and its waveform is drawn above the
            spectrogram. When False, only the spectrogram and the probability
            bars are drawn and the audio is not loaded at all.

    Raises:
        IndexError: If `idx` is not listed in `errors_df`.
    """
    matches = errors_df[errors_df['index'] == idx]
    if matches.empty:
        # Explicit message instead of pandas' "positional indexer is out-of-bounds"
        raise IndexError(
            f"Sample {idx} is not listed in errors_df "
            f"(it is not a misclassified validation sample)"
        )
    error_row = matches.iloc[0]

    filepath = files_val[idx]

    # Spectrogram as stored in X_val, i.e. before the normalize/resize/RGB steps
    spec = X_val[idx]

    # Per-class probabilities for this sample
    pred_probs = predictions[idx]

    # Create figure: three stacked panels with the waveform, two without
    if show_waveform:
        # The audio is only needed for the waveform panel, so it is loaded lazily
        audio = data_loader.load_audio_file(filepath)
        fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(16, 10))
    else:
        fig, (ax2, ax3) = plt.subplots(2, 1, figsize=(16, 8))

    # Title
    fig.suptitle(
        f"ERROR SAMPLE: {error_row['file']}\n" +
        f"True: {error_row['true_class']} | Predicted: {error_row['predicted_class']} " +
        f"(confidence: {error_row['confidence']:.3f})",
        fontsize=14, fontweight='bold'
    )

    # Plot waveform
    if show_waveform:
        # assumes load_audio_file returns samples at config.SAMPLE_RATE - TODO confirm
        times = np.arange(len(audio)) / config.SAMPLE_RATE
        ax1.plot(times, audio, linewidth=0.5, color='blue')
        ax1.set_xlabel('Time (s)')
        ax1.set_ylabel('Amplitude')
        ax1.set_title('Waveform')
        ax1.grid(True, alpha=0.3)

    # Plot spectrogram
    img = librosa.display.specshow(
        spec,
        x_axis='time',
        y_axis='mel',
        sr=config.SAMPLE_RATE,
        hop_length=config.HOP_LENGTH,
        fmin=config.FMIN,
        fmax=config.FMAX,
        ax=ax2,
        cmap='viridis'
    )
    ax2.set_title('Mel-Spectrogram')
    # NOTE(review): the 'dB' label presumes the spectrogram is dB-scaled - confirm
    fig.colorbar(img, ax=ax2, format='%+2.0f dB')

    # Plot prediction probabilities: true class green, predicted class red
    colors = ['green' if i == error_row['true_label_num'] else
              'red' if i == error_row['pred_label_num'] else 'gray'
              for i in range(len(config.CLASS_NAMES))]

    bars = ax3.bar(config.CLASS_NAMES, pred_probs, color=colors, alpha=0.7)
    ax3.set_ylabel('Probability')
    ax3.set_title('Model Predictions (Green=True label, Red=Predicted label)')
    ax3.set_ylim([0, 1])
    ax3.grid(True, alpha=0.3, axis='y')

    # Add value labels on bars
    for bar, prob in zip(bars, pred_probs):
        height = bar.get_height()
        ax3.text(bar.get_x() + bar.get_width()/2., height,
                 f'{prob:.3f}',
                 ha='center', va='bottom', fontsize=10)

    plt.tight_layout()
    plt.show()

    # Print detailed info
    print(f"\nFile: {filepath}")
    print(f"True class: {error_row['true_class']}")
    print(f"Predicted: {error_row['predicted_class']} (confidence: {error_row['confidence']:.3f})")
    print(f"\nAll probabilities:")
    for i, (class_name, prob) in enumerate(zip(config.CLASS_NAMES, pred_probs)):
        marker = "<- TRUE" if i == error_row['true_label_num'] else "<- PRED" if i == error_row['pred_label_num'] else ""
        print(f"  {class_name:30s}: {prob:.4f} {marker}")

In [None]:
# Visualize top 10 errors (highest confidence wrong predictions)
print("Visualizing top 10 most confident errors...\n")

divider = "\n" + "="*80 + "\n"
# errors_df is already sorted by confidence, so head(10) is the ten most confident
for error_record in errors_df.head(10).to_dict('records'):
    visualize_error_sample(error_record['index'])
    print(divider)

## Option 2: Analyze Detection Errors from Long Audio

Use this to see false positives from real-world detection (more relevant for hard negative mining)

In [None]:
# Load a long audio file
long_audio_files = data_loader.get_long_audio_files()

print(f"Available long audio files: {len(long_audio_files)}")
# Only the first five are listed to keep the output short
for i, f in enumerate(long_audio_files[:5]):
    print(f"  {i}: {os.path.basename(f)}")

# Choose one
audio_idx = 0  # Change this to select different file

# Fail with an explanatory message (instead of a bare "list index out of range")
# when no files were found or the chosen index does not exist. Negative indices
# remain valid, as with normal list indexing.
if not -len(long_audio_files) <= audio_idx < len(long_audio_files):
    raise IndexError(
        f"audio_idx={audio_idx} is out of range: "
        f"{len(long_audio_files)} long audio file(s) available"
    )
audio_path = long_audio_files[audio_idx]

print(f"\nSelected: {os.path.basename(audio_path)}")

In [None]:
# Load and detect
import detection

print("Running detection...")
# Lower threshold to see more detections
detections_df = detection.detect_in_long_audio(
    model, audio_path, confidence_threshold=0.5
)

n_detections = len(detections_df)
print(f"\nFound {n_detections} detections")
if n_detections > 0:
    print("\nDetection summary:")
    # Per-species count and confidence statistics
    confidence_by_species = detections_df.groupby('species')['confidence']
    print(confidence_by_species.agg(['count', 'mean', 'min', 'max']))

In [None]:
# Focus on medium-confidence detections (likely false positives)
# Inclusive confidence band treated as "uncertain"
UNCERTAIN_MIN_CONFIDENCE = 0.5
UNCERTAIN_MAX_CONFIDENCE = 0.85

if len(detections_df) > 0:
    # Keep detections whose confidence lies inside the band (both ends included)
    uncertain_detections = detections_df[
        (detections_df['confidence'] >= UNCERTAIN_MIN_CONFIDENCE) &
        (detections_df['confidence'] <= UNCERTAIN_MAX_CONFIDENCE)
    ].copy()

    print(f"\nUncertain detections ({UNCERTAIN_MIN_CONFIDENCE}-{UNCERTAIN_MAX_CONFIDENCE} confidence): {len(uncertain_detections)}")
    print(uncertain_detections.head(20))
else:
    # Always define the name: the visualization cell further down reads
    # `uncertain_detections` and would otherwise fail with NameError.
    uncertain_detections = detections_df.copy()
    print("No detections to analyze")

In [None]:
def visualize_detection_sample(audio_path, start_time, end_time, species, confidence):
    """
    Visualize a detected segment from long audio

    Draws the waveform and the mel-spectrogram of the window between
    `start_time` and `end_time` of `audio_path`, then prints the detection details.

    Args:
        audio_path: Path of the long audio file the detection belongs to.
        start_time: Start of the window, in seconds.
        end_time: End of the window, in seconds.
        species: Predicted class name; only used in the title and the printout.
        confidence: Prediction confidence; only used in the title and the printout.
    """
    # Decode only the requested window rather than the whole recording. This
    # function runs once per detection, and re-reading a long file every time
    # is needlessly slow and memory hungry.
    offset = max(0.0, float(start_time))
    duration = max(0.0, float(end_time) - offset)
    segment, sr = librosa.load(
        audio_path,
        sr=config.SAMPLE_RATE,
        offset=offset,
        duration=duration
    )

    # Nothing to draw if the window is empty or lies past the end of the file
    if len(segment) == 0:
        print(
            f"No audio between {start_time:.1f}s and {end_time:.1f}s in "
            f"{os.path.basename(audio_path)} - nothing to plot"
        )
        return

    # Compute spectrogram
    spec = preprocessing.audio_to_melspectrogram(segment, sr)

    # Create figure: waveform on top, spectrogram below
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 8))

    fig.suptitle(
        f"Detection at {start_time:.1f}-{end_time:.1f}s | " +
        f"Predicted: {species} (confidence: {confidence:.3f})",
        fontsize=14, fontweight='bold'
    )

    # Waveform (time axis is relative to the start of the window)
    times = np.arange(len(segment)) / sr
    ax1.plot(times, segment, linewidth=0.5, color='blue')
    ax1.set_xlabel('Time (s)')
    ax1.set_ylabel('Amplitude')
    ax1.set_title('Waveform')
    ax1.grid(True, alpha=0.3)

    # Spectrogram
    img = librosa.display.specshow(
        spec,
        x_axis='time',
        y_axis='mel',
        sr=sr,
        hop_length=config.HOP_LENGTH,
        fmin=config.FMIN,
        fmax=config.FMAX,
        ax=ax2,
        cmap='viridis'
    )
    ax2.set_title('Mel-Spectrogram')
    # NOTE(review): the 'dB' label presumes the spectrogram is dB-scaled - confirm
    fig.colorbar(img, ax=ax2, format='%+2.0f dB')

    plt.tight_layout()
    plt.show()

    print(f"File: {os.path.basename(audio_path)}")
    print(f"Time: {start_time:.1f} - {end_time:.1f} seconds")
    print(f"Predicted species: {species}")
    print(f"Confidence: {confidence:.3f}")
    print("\nIs this a correct detection? (Look at the spectrogram)")

In [None]:
# Visualize uncertain detections (potential false positives)
n_uncertain = len(uncertain_detections)

if n_uncertain == 0:
    print("No uncertain detections to visualize")
else:
    print(f"Visualizing {min(10, n_uncertain)} uncertain detections...\n")

    divider = "\n" + "="*80 + "\n"
    # At most the first ten rows, each plotted against the selected long audio file
    for det in uncertain_detections.head(10).itertuples(index=False):
        visualize_detection_sample(
            audio_path,
            det.start_time,
            det.end_time,
            det.species,
            det.confidence
        )
        print(divider)

## Interactive: Visualize Specific Detection by Index

In [None]:
# Choose a specific detection to examine
detection_idx = 0  # Change this number to look at different detections

if detection_idx >= len(detections_df):
    print(f"Index {detection_idx} out of range (total detections: {len(detections_df)})")
else:
    # Positional lookup into the full detection table (not only the uncertain ones)
    chosen = detections_df.iloc[detection_idx]
    visualize_detection_sample(
        audio_path=audio_path,
        start_time=chosen['start_time'],
        end_time=chosen['end_time'],
        species=chosen['species'],
        confidence=chosen['confidence'],
    )

## Summary & Insights

After running this notebook, you should be able to answer:

1. **What types of sounds does the model confuse?**
   - Bird calls that look like primate calls?
   - Certain frequency patterns?

2. **Are there visual patterns in error spectrograms?**
   - Do they have similar frequency ranges?
   - Similar temporal patterns?

3. **What should go into hard negatives?**
   - Uncertain detections that are clearly NOT primate calls
   - Sounds with medium confidence (0.5-0.85)

4. **Is the model's confusion systematic?**
   - Does it always confuse Species A with Species B?
   - Does it misclassify background as a specific species?

Use these insights to:
- Collect better hard negatives
- Add more training data for confused classes
- Adjust preprocessing parameters (frequency range, etc.)