# Data Augmentation Impact Comparison

## Overview

This notebook compares model performance **with** and **without** data augmentation to demonstrate the benefits.

### What This Does:

1. **Baseline Model**: Train on original data only (no augmentation)
2. **Augmented Model**: Train with full augmentation pipeline (7× multiplier)
3. **Compare Results**: Side-by-side comparison of:
   - Validation accuracy
   - Per-class performance
   - Training curves
   - Confusion matrices

### Expected Outcomes:

- **More training data**: Augmentation increases training samples from ~350 to ~2,500
- **Better accuracy**: Expected improvement of 5-15 percentage points
- **Better generalization**: Reduced overfitting, better real-world performance
- **Balanced performance**: More consistent accuracy across all classes

**⏱️ Time Required**: ~1-2 hours (both models need to train)

## Setup

In [None]:
# Install dependencies
!pip install -q librosa soundfile tensorflow scikit-learn pandas matplotlib

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

print("Setup complete!")

In [None]:
# Import all modules
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow import keras

import config
import data_loader
import preprocessing
import augmentation
import model as model_module
import train

# Print configuration: call the project-local `config` module's summary helper
# (presumably prints the active settings; verify against config.py) so the
# values used for this run are visible next to the results.
config.print_config_summary()

## Part 1: Baseline Model (No Augmentation)

Train a model using **only original data** with no augmentation.

In [None]:
print("=" * 70)
print("PART 1: BASELINE MODEL - NO AUGMENTATION")
print("=" * 70)


def _audio_to_rgb_image(audio):
    """Convert one raw audio clip into the RGB spectrogram image used as model input.

    Applies the project's `preprocessing` steps in a fixed order:
    mel spectrogram -> normalize -> resize -> RGB. Sharing this helper keeps
    species and background clips on exactly the same pipeline.
    """
    mel_spec = preprocessing.audio_to_melspectrogram(audio)
    spec_norm = preprocessing.normalize_spectrogram(mel_spec)
    spec_resized = preprocessing.resize_spectrogram(spec_norm)
    return preprocessing.spectrogram_to_rgb(spec_resized)


# Load raw data
species_data = data_loader.load_species_data()
background_data = data_loader.load_background_data()

print("\nüìä Creating dataset WITHOUT augmentation...")

# Label mapping: every class name except the last one gets its position in
# config.CLASS_NAMES; 'Background' takes the next free index.
label_map = {species: i for i, species in enumerate(config.CLASS_NAMES[:-1])}
label_map['Background'] = len(label_map)

# Accumulate images/labels in plain lists; they become arrays once, below.
images = []
labels = []

# Process species (each entry of audio_list is a 2-tuple; only the audio is used)
for species_name, audio_list in species_data.items():
    print(f"   Processing {species_name}...")
    species_label = label_map[species_name]

    for audio, _ in audio_list:
        images.append(_audio_to_rgb_image(audio))
        labels.append(species_label)

# Process background
print("   Processing Background...")
background_label = label_map['Background']
for audio, _ in background_data:
    images.append(_audio_to_rgb_image(audio))
    labels.append(background_label)

# Convert to arrays and apply the model-specific input preprocessing
X_all = preprocessing.preprocess_for_model(np.array(images))
y_all = np.array(labels)

print(f"\n Total samples: {len(X_all)}")

# Split into train/val, stratified so both splits keep the class proportions
X_train_base, X_val_base, y_train_base, y_val_base = train_test_split(
    X_all, y_all,
    test_size=config.VALIDATION_SPLIT,
    random_state=config.RANDOM_SEED,
    stratify=y_all
)

print(f"\n   Training: {len(X_train_base)} samples")
print(f"   Validation: {len(X_val_base)} samples")

# Show distribution over the full (pre-split) dataset
print("\n   Class Distribution:")
for class_id, class_name in enumerate(config.CLASS_NAMES):
    count = np.sum(y_all == class_id)
    print(f"      {class_name}: {count} samples")

In [None]:
# Train baseline model
print("\n Training Baseline Model...\n")

model_baseline = model_module.create_and_compile_model()

# Balanced class weights. compute_class_weight returns one weight per entry of
# `classes`, in that order, so the dict must be keyed by the class label itself
# (not by position) to stay correct if a class id is missing from the split.
classes = np.unique(y_train_base)
weights = compute_class_weight('balanced', classes=classes, y=y_train_base)
class_weights_base = {
    int(label): float(weight) for label, weight in zip(classes, weights)
}

# Callbacks: stop when val_loss stalls (restoring the best weights) and halve
# the learning rate after 5 epochs without val_loss improvement.
callbacks_base = [
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=config.PATIENCE,
        restore_best_weights=True,
        verbose=1
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        verbose=1
    )
]

# Train
history_baseline = model_baseline.fit(
    X_train_base, y_train_base,
    validation_data=(X_val_base, y_val_base),
    epochs=config.EPOCHS,
    batch_size=config.BATCH_SIZE,
    class_weight=class_weights_base,
    callbacks=callbacks_base,
    verbose=1
)

print("\n Baseline model training complete!")

In [None]:
# Score the baseline model on its validation split and report each metric.
print("\n  Baseline Model Evaluation:\n")
results_baseline = model_baseline.evaluate(X_val_base, y_val_base, verbose=0)

# Positions in the evaluate() result: 0 = loss, 1 = accuracy, 2 = top-2 accuracy
# (assumes the model was compiled with those two metrics — TODO confirm in model.py).
for metric_label, position in (
    ("Validation Loss", 0),
    ("Validation Accuracy", 1),
    ("Top-2 Accuracy", 2),
):
    print(f"   {metric_label}: {results_baseline[position]:.4f}")

## Part 2: Augmented Model (With Data Augmentation)

Train a model using the **full augmentation pipeline** (7× multiplier).

In [None]:

# Build the augmented dataset through the project's standard training pipeline.
# prepare_dataset() yields five values; the last one is not needed here.
# NOTE(review): confirm that this validation split contains the same clips as
# X_val_base — otherwise Part 3 compares the two models on different data.
augmented_splits = train.prepare_dataset()
X_train_aug, X_val_aug, y_train_aug, y_val_aug, _ = augmented_splits

n_train_aug = len(X_train_aug)
n_val_aug = len(X_val_aug)
print(f"   Training: {n_train_aug} samples")
print(f"   Validation: {n_val_aug} samples")

# How many times larger the augmented training set is than the baseline one.
data_multiplier = n_train_aug / len(X_train_base)
print(f"\n   Data multiplier: {data_multiplier:.1f}x")

In [None]:
# Train augmented model
print("\n Training Augmented Model...\n")

model_augmented = model_module.create_and_compile_model()

# Balanced class weights. compute_class_weight returns one weight per entry of
# `classes`, in that order, so the dict must be keyed by the class label itself
# (not by position) to stay correct if a class id is missing from the split.
classes = np.unique(y_train_aug)
weights = compute_class_weight('balanced', classes=classes, y=y_train_aug)
class_weights_aug = {
    int(label): float(weight) for label, weight in zip(classes, weights)
}

# Callbacks: same early-stopping / LR-reduction settings as the baseline run so
# the two trainings differ only in their data.
callbacks_aug = [
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=config.PATIENCE,
        restore_best_weights=True,
        verbose=1
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        verbose=1
    )
]

# Train
history_augmented = model_augmented.fit(
    X_train_aug, y_train_aug,
    validation_data=(X_val_aug, y_val_aug),
    epochs=config.EPOCHS,
    batch_size=config.BATCH_SIZE,
    class_weight=class_weights_aug,
    callbacks=callbacks_aug,
    verbose=1
)


In [None]:
# Score the augmented model on its validation split and report each metric.
print("\n Augmented Model Evaluation:\n")
results_augmented = model_augmented.evaluate(X_val_aug, y_val_aug, verbose=0)

# Positions in the evaluate() result: 0 = loss, 1 = accuracy, 2 = top-2 accuracy
# (assumes the model was compiled with those two metrics — TODO confirm in model.py).
for metric_label, position in (
    ("Validation Loss", 0),
    ("Validation Accuracy", 1),
    ("Top-2 Accuracy", 2),
):
    print(f"   {metric_label}: {results_augmented[position]:.4f}")

## Part 3: Comprehensive Comparison

Compare both models side-by-side.

In [None]:

# Side-by-side summary table. Accuracy deltas are augmented minus baseline in
# percentage points; the loss delta is the relative reduction versus baseline.
print("{:<25s} {:>15s} {:>15s} {:>15s}".format(
    'Metric', 'Baseline', 'Augmented', 'Improvement'))

# Accuracy
base_acc, aug_acc = results_baseline[1], results_augmented[1]
acc_diff = (aug_acc - base_acc) * 100
print("{:<25s} {:>14.4f} {:>14.4f} {:>+14.2f}pp".format(
    'Validation Accuracy', base_acc, aug_acc, acc_diff))

# Loss (positive value = the augmented model's loss is lower)
base_loss, aug_loss = results_baseline[0], results_augmented[0]
loss_diff = ((base_loss - aug_loss) / base_loss) * 100
print("{:<25s} {:>14.4f} {:>14.4f} {:>+14.2f}%".format(
    'Validation Loss', base_loss, aug_loss, loss_diff))

# Top-2 accuracy
base_top2, aug_top2 = results_baseline[2], results_augmented[2]
top2_diff = (aug_top2 - base_top2) * 100
print("{:<25s} {:>14.4f} {:>14.4f} {:>+14.2f}pp".format(
    'Top-2 Accuracy', base_top2, aug_top2, top2_diff))

# Training samples (relative growth of the training set)
n_train_base = len(X_train_base)
n_train_aug = len(X_train_aug)
sample_growth = (n_train_aug / n_train_base - 1) * 100
print("{:<25s} {:>14d} {:>14d} {:>+14.1f}%".format(
    'Training Samples', n_train_base, n_train_aug, sample_growth))



In [None]:
# Per-class comparison of the two models on their respective validation splits.
def _class_accuracy(y_true, y_pred_classes, class_id):
    """Fraction of samples whose true label is `class_id` that were predicted as it.

    Returns 0 when the class has no samples in `y_true`.
    """
    in_class = y_true == class_id
    n_in_class = np.sum(in_class)
    if n_in_class > 0:
        return np.sum((y_pred_classes == class_id) & in_class) / n_in_class
    return 0


print("\n Per-Class Performance:\n")

# Predicted class = index of the highest score in each prediction row.
y_pred_base = model_baseline.predict(X_val_base, verbose=0)
y_pred_base_classes = np.argmax(y_pred_base, axis=1)

y_pred_aug = model_augmented.predict(X_val_aug, verbose=0)
y_pred_aug_classes = np.argmax(y_pred_aug, axis=1)

print(f"{'Class':<30s} {'Baseline Acc':>15s} {'Aug Acc':>15s} {'Improvement':>15s}")
print("-" * 75)

for class_id, class_name in enumerate(config.CLASS_NAMES):
    acc_base = _class_accuracy(y_val_base, y_pred_base_classes, class_id)
    acc_aug = _class_accuracy(y_val_aug, y_pred_aug_classes, class_id)

    # Difference expressed in percentage points.
    improvement = (acc_aug - acc_base) * 100

    print(f"{class_name:<30s} {acc_base:>14.2%} {acc_aug:>14.2%} {improvement:>+14.2f}pp")

In [None]:
# Visualization: Training curves comparison
fig, axes = plt.subplots(1, 2, figsize=(16, 5))

# One panel per metric: (axis, history key, panel title, y-axis label).
curve_panels = [
    (axes[0], 'accuracy', 'Training Accuracy Comparison', 'Accuracy'),
    (axes[1], 'loss', 'Training Loss Comparison', 'Loss'),
]

for ax, metric, title, ylabel in curve_panels:
    # Baseline curves are dashed and lighter so the augmented run stands out.
    ax.plot(history_baseline.history[metric], label='Baseline Train',
            linestyle='--', alpha=0.7, linewidth=2)
    ax.plot(history_baseline.history[f'val_{metric}'], label='Baseline Val',
            linestyle='--', alpha=0.7, linewidth=2)
    ax.plot(history_augmented.history[metric], label='Augmented Train', linewidth=2.5)
    ax.plot(history_augmented.history[f'val_{metric}'], label='Augmented Val', linewidth=2.5)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

plt.tight_layout()

# Create the output folder first: savefig raises if the directory is missing,
# which would otherwise discard the figure after a long training run.
curves_path = '/content/drive/MyDrive/chimp-audio/outputs/visualizations/comparison_curves.png'
os.makedirs(os.path.dirname(curves_path), exist_ok=True)
plt.savefig(curves_path, dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Visualization: Confusion matrices
def _plot_confusion(ax, y_true, y_pred_classes, title, cmap):
    """Draw a row-normalized confusion matrix on `ax`.

    The matrix is always len(config.CLASS_NAMES) square: passing `labels`
    explicitly keeps a row/column for classes that are absent from `y_true`
    and `y_pred_classes`, so the annotation loop below never indexes out of
    bounds. Rows with no true samples are shown as zeros instead of NaN.

    Returns (raw_counts, normalized_matrix, image) where `image` is the
    AxesImage produced by imshow.
    """
    n_classes = len(config.CLASS_NAMES)
    class_ids = np.arange(n_classes)

    cm = confusion_matrix(y_true, y_pred_classes, labels=class_ids)
    row_totals = cm.sum(axis=1, keepdims=True)
    # Divide only where the row has samples; untouched cells keep the 0.0 fill.
    cm_norm = np.divide(
        cm, row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    image = ax.imshow(cm_norm, interpolation='nearest', cmap=cmap)
    ax.set_title(title, fontsize=14, fontweight='bold')

    # Class names are truncated to 10 characters to keep the tick labels readable.
    short_names = [c[:10] for c in config.CLASS_NAMES]
    ax.set_xticks(class_ids)
    ax.set_yticks(class_ids)
    ax.set_xticklabels(short_names, rotation=45, ha='right')
    ax.set_yticklabels(short_names)

    # White text on dark cells, black on light ones.
    thresh = cm_norm.max() / 2.
    for i in range(n_classes):
        for j in range(n_classes):
            ax.text(j, i, f'{cm_norm[i, j]:.2f}',
                    ha="center", va="center",
                    color="white" if cm_norm[i, j] > thresh else "black")

    plt.colorbar(image, ax=ax)
    return cm, cm_norm, image


fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Baseline confusion matrix
cm_base, cm_base_norm, im1 = _plot_confusion(
    axes[0], y_val_base, y_pred_base_classes,
    'Baseline Model - Confusion Matrix', 'Blues')

# Augmented confusion matrix
cm_aug, cm_aug_norm, im2 = _plot_confusion(
    axes[1], y_val_aug, y_pred_aug_classes,
    'Augmented Model - Confusion Matrix', 'Greens')

plt.tight_layout()

# Create the output folder first: savefig raises if the directory is missing.
confusion_path = '/content/drive/MyDrive/chimp-audio/outputs/visualizations/comparison_confusion.png'
os.makedirs(os.path.dirname(confusion_path), exist_ok=True)
plt.savefig(confusion_path, dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Save models
# Make sure the target directory exists so saving cannot fail after both
# models have already been trained.
os.makedirs(config.MODEL_SAVE_DIR, exist_ok=True)

baseline_path = os.path.join(config.MODEL_SAVE_DIR, 'baseline_no_augmentation.h5')
augmented_path = os.path.join(config.MODEL_SAVE_DIR, 'model_with_augmentation.h5')

model_baseline.save(baseline_path)
model_augmented.save(augmented_path)

# Report the exact paths that were written.
print(f"   Baseline: {baseline_path}")
print(f"   Augmented: {augmented_path}")