# CNN Development: End-to-End Spectrogram Classification

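This notebook trains and evaluates a Convolutional Neural Network (CNN) that classifies music genre directly from spectrograms, demonstrating end-to-end learning from time-frequency representations. To keep the demonstration self-contained, the spectrograms are synthetic (generated in Section 1) rather than computed from real audio.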

In [None]:
import os
import sys
sys.path.insert(0, '../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import librosa
import librosa.display
import torch
import torch.nn as nn

from models.cnn_model import AudioCNN, CNNTrainer, create_spectrogram_dataset

# Pick the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Seed the global NumPy and PyTorch random number generators (same seed for both)
np.random.seed(42)
torch.manual_seed(42)

## 1. Generate Sample Spectrogram Data

Create demonstration spectrograms for model training.

In [None]:
# Synthetic spectrogram dataset, used purely for demonstration
n_samples = 2000
n_genres = 8
genres = ['rock', 'electronic', 'hip-hop', 'classical', 'jazz', 'folk', 'pop', 'experimental']

# Every spectrogram is laid out as (frequency bins, time frames)
spec_height = 128  # Frequency bins
spec_width = 128   # Time frames

print(f"Generating {n_samples} synthetic spectrograms...")
print(f"Spectrogram shape: ({spec_height}, {spec_width})")

# Genres whose signature is extra energy inside one band of frequency rows:
# genre index -> (first row, end row (exclusive), uniform low, uniform high)
band_profiles = {
    0: (40, 80, 0.5, 1.5),    # rock
    1: (60, 100, 0.8, 1.5),   # electronic
    2: (20, 60, 0.6, 1.2),    # hip-hop
    4: (30, 100, 0.3, 1.2),   # jazz
    5: (50, 110, 0.4, 1.0),   # folk
    6: (50, 90, 0.7, 1.3),    # pop
}

# Labels cycle 0, 1, ..., n_genres - 1, 0, 1, ... across the samples
labels = (np.arange(n_samples) % n_genres).astype(int)
spectrograms = np.zeros((n_samples, spec_height, spec_width))

for sample_no, genre_idx in enumerate(labels.tolist()):
    # Low-amplitude Gaussian noise floor shared by every genre
    spec = np.random.randn(spec_height, spec_width) * 0.1

    if genre_idx in band_profiles:
        row_lo, row_hi, amp_lo, amp_hi = band_profiles[genre_idx]
        spec[row_lo:row_hi, :] += np.random.uniform(amp_lo, amp_hi, (row_hi - row_lo, spec_width))
    elif genre_idx == 3:  # classical: ramp rising along the time axis, same for every row
        spec += np.linspace(0.1, 1.0, spec_width) * 0.5
    elif genre_idx == 7:  # experimental: rectified noise over the whole spectrogram
        spec += np.abs(np.random.randn(spec_height, spec_width)) * 0.3

    # Store magnitudes only
    spectrograms[sample_no] = np.abs(spec)

print(f"Generated spectrograms shape: {spectrograms.shape}")
print(f"Labels distribution:\n{pd.Series(labels).value_counts().sort_index()}")

# Standardize with one scalar mean/std taken over the whole dataset.
# NOTE(review): these statistics are computed before the train/val/test split,
# so they include samples that later land in the validation and test sets.
dataset_mean = spectrograms.mean()
dataset_std = spectrograms.std()
spectrograms = (spectrograms - dataset_mean) / (dataset_std + 1e-8)

## 2. Visualize Sample Spectrograms

Examine spectrograms for different genres.

In [None]:
# Visualize one sample spectrogram per genre (the first sample carrying each label).
# The grid is derived from n_genres so the cell keeps working if the genre list
# changes; with 8 genres this is the same 2x4 grid and 16x8 inch figure as before.
n_cols = 4
n_rows = max(1, (n_genres + n_cols - 1) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False)
fig.suptitle('Sample Spectrograms by Genre', fontsize=16, fontweight='bold')

for genre_idx in range(n_genres):
    ax = axes[genre_idx // n_cols, genre_idx % n_cols]
    sample_idx = np.where(labels == genre_idx)[0][0]

    # origin='lower' puts frequency bin 0 at the bottom of the panel
    im = ax.imshow(spectrograms[sample_idx], aspect='auto', origin='lower', cmap='viridis')
    ax.set_title(genres[genre_idx], fontweight='bold')
    ax.set_xlabel('Time Frames')
    ax.set_ylabel('Frequency Bins')
    plt.colorbar(im, ax=ax)

# Hide any panels left over when n_genres is not a multiple of n_cols
for unused_ax in axes.ravel()[n_genres:]:
    unused_ax.set_visible(False)

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists first
os.makedirs('../outputs', exist_ok=True)
plt.savefig('../outputs/cnn_sample_spectrograms.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Create Data Loaders

Prepare training, validation, and test sets with stratification.

In [None]:
# Two-stage stratified split over sample indices: 60% train, 20% val, 20% test
all_idx = np.arange(n_samples)

# Stage 1: hold out 40% of the indices, stratified on the genre labels
train_idx, temp_idx = train_test_split(
    all_idx,
    test_size=0.4,
    stratify=labels,
    random_state=42,
)

# Stage 2: split the held-out pool in half, again stratified, into val and test
temp_labels = labels[temp_idx]
val_idx, test_idx = train_test_split(
    temp_idx,
    test_size=0.5,
    stratify=temp_labels,
    random_state=42,
)

print(f"Train samples: {len(train_idx)}")
print(f"Val samples: {len(val_idx)}")
print(f"Test samples: {len(test_idx)}")

# Build the three loaders with the project helper.
# NOTE(review): batch size and shuffling are decided inside create_spectrogram_dataset.
loaders = create_spectrogram_dataset(spectrograms, labels, train_idx, val_idx, test_idx)
train_loader, val_loader, test_loader = loaders

print(f"\nTrain loader batches: {len(train_loader)}")
print(f"Val loader batches: {len(val_loader)}")
print(f"Test loader batches: {len(test_loader)}")

## 4. Initialize and Train CNN Model

Build and train the CNN architecture.

In [None]:
# Instantiate the project-defined CNN for n_genres classes and 1 input channel
model = AudioCNN(num_genres=n_genres, input_channels=1)
print("Model architecture:")
print(model)
param_count = sum(tensor.numel() for tensor in model.parameters())
print(f"\nTotal parameters: {param_count:,}")

# Wrap the model in the project trainer with the optimisation hyperparameters
trainer = CNNTrainer(
    model,
    device=device,
    learning_rate=0.001,
    weight_decay=1e-5,
)

# Train on the training loader while monitoring the validation loader.
# NOTE(review): `patience` is presumably an early-stopping window - confirm in CNNTrainer.train
history = trainer.train(train_loader, val_loader, epochs=50, patience=10)

## 5. Visualize Training History

Plot training and validation curves.

In [None]:
# Draw the training/validation curves held by the trainer after `trainer.train(...)`.
# NOTE(review): which panels are drawn (loss, accuracy, ...) is defined in
# CNNTrainer.plot_history - confirm there.
trainer.plot_history()

## 6. Evaluate on Test Set

Comprehensive evaluation with confusion matrix and metrics.

In [None]:
# Evaluate the trained CNN on the held-out test loader.
# NOTE(review): trainer.evaluate is project code; it is used here as returning
# (predicted labels, true labels) in a form sklearn's metric functions accept.
predictions, true_labels = trainer.evaluate(test_loader, genres)

# Headline metrics. 'weighted' averages the per-class scores weighted by class support.
# accuracy_score / f1_score / precision_score / recall_score are imported in the
# notebook's import cell, so this cell and the comparison cell share one import.
accuracy = accuracy_score(true_labels, predictions)
f1_weighted = f1_score(true_labels, predictions, average='weighted')
precision = precision_score(true_labels, predictions, average='weighted')
recall = recall_score(true_labels, predictions, average='weighted')

print(f"\n{'='*70}")
print("CNN TEST SET PERFORMANCE")
print(f"{'='*70}")
print(f"Accuracy:  {accuracy:.4f}")
print(f"F1-Score:  {f1_weighted:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")

## 7. Comparative Analysis

Compare the CNN with a Random Forest baseline trained on the flattened spectrograms.

In [None]:
# Baseline comparison: fit a Random Forest on flattened spectrograms and score it
# on the same test indices as the CNN, then tabulate and plot both models.

# Flatten each 2-D spectrogram into a single feature vector for the tree model
specs_flat = spectrograms.reshape(n_samples, -1)

X_train_flat = specs_flat[train_idx]
X_val_flat = specs_flat[val_idx]
X_test_flat = specs_flat[test_idx]

y_train = labels[train_idx]
y_val = labels[val_idx]
y_test = labels[test_idx]

# Train Random Forest for comparison (fit on the training split only;
# the validation split is prepared above but not used by this baseline)
print("Training Random Forest for comparison...")
rf_model = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
rf_model.fit(X_train_flat, y_train)

# Score the baseline with the same weighted metrics reported for the CNN
rf_pred = rf_model.predict(X_test_flat)
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred, average='weighted')
rf_precision = precision_score(y_test, rf_pred, average='weighted')
rf_recall = recall_score(y_test, rf_pred, average='weighted')

print(f"\nRandom Forest Accuracy:  {rf_accuracy:.4f}")
print(f"Random Forest F1-Score:  {rf_f1:.4f}")

# Comparison table: CNN metrics come from the test-set evaluation cell
comparison_df = pd.DataFrame({
    'Model': ['CNN', 'Random Forest'],
    'Accuracy': [accuracy, rf_accuracy],
    'F1-Score': [f1_weighted, rf_f1],
    'Precision': [precision, rf_precision],
    'Recall': [recall, rf_recall]
})

print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)
print(comparison_df.to_string(index=False))

# Visualization: grouped bars, one group per model and one bar per metric
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(comparison_df))
width = 0.2
metrics = ['Accuracy', 'F1-Score', 'Precision', 'Recall']

for i, metric in enumerate(metrics):
    ax.bar(x + i*width, comparison_df[metric], width, label=metric, alpha=0.8)

ax.set_xlabel('Model')
ax.set_ylabel('Score')
ax.set_title('CNN vs Random Forest: Performance Comparison')
# Centre each tick under its group of four bars
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(comparison_df['Model'])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
ax.set_ylim([0, 1.0])

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists first
os.makedirs('../outputs', exist_ok=True)
plt.savefig('../outputs/cnn_vs_rf_comparison.png', dpi=300, bbox_inches='tight')
plt.show()