# 🎵 Urban Sound Classification using Deep Learning

**Project:** CSC4025Z Assignment 2 - Neural Network Application  
**Dataset:** Urban Sound 8K (8,732 audio samples, 10 classes)  
**Task:** Multi-class audio classification  

---

## Project Overview

This notebook implements and compares two approaches for urban sound classification:

1. **Baseline:** Naive Bayes Classifier
2. **Neural Network:** Artificial Neural Network (ANN)

---

## Dataset: Urban Sound 8K

**Classes (10):**
1. air_conditioner
2. car_horn
3. children_playing
4. dog_bark
5. drilling
6. engine_idling
7. gun_shot
8. jackhammer
9. siren
10. street_music

**Samples:** 8,732 audio files (up to 4 seconds each)  

---
# Setting Up
---

### SYSTEM INFORMATION

In [None]:
import torch
import sys

# Report interpreter and framework versions so runs can be reproduced later.
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")

# Pick the compute device once: CUDA when present, otherwise CPU.
has_gpu = torch.cuda.is_available()
device = torch.device('cuda' if has_gpu else 'cpu')

if not has_gpu:
    print("WARNING: GPU not available, using CPU")
else:
    # Describe the first visible GPU and the CUDA build PyTorch was compiled with.
    print(f"GPU Available: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")

print(f"\nUsing device: {device}")

In [None]:
import os

# Root folder for everything this notebook writes (models, results, figures).
PROJECT_DIR = '/kaggle/working/UrbanSound_Project'

# Create the root first, then one subdirectory per kind of output.
# exist_ok=True makes the cell safe to re-run.
for folder in (
    PROJECT_DIR,
    f'{PROJECT_DIR}/models',
    f'{PROJECT_DIR}/results',
    f'{PROJECT_DIR}/figures',
):
    os.makedirs(folder, exist_ok=True)

print(f"Project directory: {PROJECT_DIR}")
print("Subdirectories created")

### IMPORTING LIBRARIES

In [None]:
# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
import json
import pickle
import time
from pathlib import Path

# Audio processing
import librosa
import librosa.display
import soundfile as sf
import IPython.display as ipd

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

# Deep Learning
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Setting random seeds for reproducibility: NumPy and PyTorch (CPU) first.
RANDOM_SEED = 42
for seed_fn in (np.random.seed, torch.manual_seed):
    seed_fn(RANDOM_SEED)

if torch.cuda.is_available():
    # Seed the current GPU and every other GPU as well.
    for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        cuda_seed_fn(RANDOM_SEED)
    # Request deterministic cuDNN behaviour and switch off its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Plotting settings (note: all warnings are silenced for the whole session).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
warnings.filterwarnings('ignore')

# Display settings: show every column, cap printed rows at 100.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

print("All libraries imported successfully!")
print(f"Random seed set to: {RANDOM_SEED}")
print(f"Device: {device}")

### GLOBAL CONFIGURATION

In [None]:
# Central configuration, assembled section by section.
# Key insertion order is preserved, so the printed listing and the saved JSON
# keep the same layout: paths, audio, dataset, training, architecture, runtime.
CONFIG = {}

# Locations: project outputs and the (read-only) Kaggle dataset.
CONFIG.update({
    'project_dir': PROJECT_DIR,
    'data_dir': '/kaggle/input/urbansound8k',
    'base_dir': '/kaggle/input/urbansound8k',
    'audio_dir': '/kaggle/input/urbansound8k',
    'metadata_path': '/kaggle/input/urbansound8k/UrbanSound8K.csv',
    'models_dir': f'{PROJECT_DIR}/models',
    'results_dir': f'{PROJECT_DIR}/results',
    'figures_dir': f'{PROJECT_DIR}/figures',
})

# Audio parameters
CONFIG.update({
    'sample_rate': 22050,
    'duration': 4,
    'n_mfcc': 120,  # Based on Barua et al. (2023)
})

# Dataset parameters
CONFIG.update({
    'n_classes': 10,
    'class_names': [
        'air_conditioner',
        'car_horn',
        'children_playing',
        'dog_bark',
        'drilling',
        'engine_idling',
        'gun_shot',
        'jackhammer',
        'siren',
        'street_music',
    ],
})

# Training parameters
CONFIG.update({
    'train_split': 0.70,
    'val_split': 0.10,
    'test_split': 0.20,
    'batch_size': 400,  # Based on Barua et al. (2023)
    'learning_rate': 0.001,  # Start value
    'num_epochs': 100,  # Start value
    'early_stopping_patience': 20,
})

# Model architecture (Based on Barua et al. 2023)
CONFIG.update({
    'hidden_layers': [1000, 750, 500, 250, 100],
    'dropout_rate': 0.5,
})

# Runtime: random seed, and the device stored as a string so it is JSON-serialisable.
CONFIG.update({
    'random_seed': RANDOM_SEED,
    'device': str(device),
})

# Persist the configuration next to the other results.
config_path = f"{CONFIG['results_dir']}/config.json"
with open(config_path, 'w') as f:
    json.dump(CONFIG, f, indent=4)

# Echo the configuration; the class list is summarised by its length.
print("\nConfiguration:")
for key, value in CONFIG.items():
    shown = f"{len(value)} classes" if key == 'class_names' else value
    print(f"   {key}: {shown}")

print(f"\nConfiguration saved to: {config_path}")

---

# Dataset Loading & Exploration

---

### VERIFYING DATASET SETUP

In [None]:
# Locations to verify: the dataset root (holding the fold folders) and the metadata CSV.
dataset_path = CONFIG['base_dir']
metadata_path = CONFIG['metadata_path']

dataset_found = os.path.exists(dataset_path)
metadata_found = os.path.exists(metadata_path)

# --- Audio folders -----------------------------------------------------------
if not dataset_found:
    print(f"ERROR: Dataset not found at {dataset_path}")
    print("Please ensure you added the urbansound8k dataset to your notebook")
else:
    # Keep only directories whose name starts with "fold".
    folds = sorted(
        entry for entry in os.listdir(dataset_path)
        if entry.startswith('fold') and os.path.isdir(f"{dataset_path}/{entry}")
    )

    print(f"\nFound {len(folds)} fold directories:")

    # Count the .wav files in each fold and keep a running total.
    total_files = 0
    for fold in folds:
        fold_path = os.path.join(dataset_path, fold)
        n_wav = sum(1 for name in os.listdir(fold_path) if name.endswith('.wav'))
        total_files += n_wav
        print(f"   {fold}: {n_wav} files")

    print(f"\nTotal audio files: {total_files}")

    if total_files != 8732:
        print(f"Warning: Expected 8,732, found {total_files}")
    else:
        print("All 8,732 files present")

# --- Metadata file -----------------------------------------------------------
if metadata_found:
    print(f"\nMetadata file found: UrbanSound8K.csv")
else:
    print(f"\nERROR: Metadata not found at {metadata_path}")

# --- Overall verdict (based on path existence only) --------------------------
banner = "="*70
print("\n" + banner)
if dataset_found and metadata_found:
    print("DATASET VERIFIED AND READY!")
else:
    print("DATASET SETUP INCOMPLETE - Check errors above")
print(banner)

### LOADING METADATA

In [None]:
# Read the metadata table; later cells use its fold, file name and class columns.
metadata = pd.read_csv(CONFIG['metadata_path'])

print(f"\nMetadata shape: {metadata.shape}")
print(f"Total samples: {len(metadata)}")

# Quick overview: each heading is printed, followed by the matching view of the table.
overview = (
    ("\nFirst 5 rows:", metadata.head()),
    ("\nColumn names:", metadata.columns.tolist()),
    ("\nColumn data types:", metadata.dtypes),
    ("\nClass names in dataset:", metadata['class'].unique()),
)
for heading, content in overview:
    print(heading)
    print(content)

# Keep a reference in CONFIG for later use.
CONFIG['metadata'] = metadata

banner = "="*70
print("\n" + banner)
print("Metadata loaded successfully!")
print(banner)

### DATASET STATISTICS

In [None]:
# Headline numbers
total_samples = len(metadata)
print(f"\nTotal number of audio samples: {total_samples}")
print(f"Number of classes: {metadata['classID'].nunique()}")
print(f"Number of folds: {metadata['fold'].nunique()}")

# Per-class sample counts (alphabetical by class name) with their share of the dataset.
divider = "-"*70
print("\nClass Distribution:")
print(divider)
class_counts = metadata['class'].value_counts().sort_index()
for class_name, count in class_counts.items():
    percentage = count / total_samples * 100
    print(f"   {class_name:20s}: {count:4d} samples ({percentage:5.2f}%)")

print(divider)
print(f"   {'TOTAL':20s}: {total_samples:4d} samples (100.00%)")

# Imbalance ratio = largest class size / smallest class size.
max_count = class_counts.max()
min_count = class_counts.min()
imbalance_ratio = max_count / min_count
print(f"\nClass Imbalance Ratio: {imbalance_ratio:.2f}")

# Translate the ratio into a verdict, using thresholds of 1.5 and 2.0.
if imbalance_ratio >= 2.0:
    verdict = "Dataset shows significant imbalance (ratio >= 2.0)"
elif imbalance_ratio >= 1.5:
    verdict = "Dataset shows moderate imbalance (ratio < 2.0)"
else:
    verdict = "Dataset is relatively balanced (ratio < 1.5)"
print(verdict)

# Number of samples in each fold, ordered by fold number.
print("\nSamples per fold:")
fold_counts = metadata['fold'].value_counts().sort_index()
for fold, count in fold_counts.items():
    print(f"   fold{fold}: {count} samples")

### VISUALIZING CLASS DISTRIBUTION

In [None]:
# Bar chart of the number of samples per class, with the mean drawn as a reference line.
fig, ax = plt.subplots(figsize=(12, 6))

# Per-class counts, ordered alphabetically by class name.
class_counts = metadata['class'].value_counts().sort_index()
positions = range(len(class_counts))

bars = ax.bar(positions, class_counts.values, color='steelblue', alpha=0.8)

# Axis labels share one style; the title is slightly larger.
axis_label_style = {'fontsize': 12, 'fontweight': 'bold'}
ax.set_xlabel('Class Name', **axis_label_style)
ax.set_ylabel('Number of Samples', **axis_label_style)
ax.set_title('Urban Sound 8K - Class Distribution', fontsize=14, fontweight='bold')
ax.set_xticks(positions)
ax.set_xticklabels(class_counts.index, rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)

# Write each bar's count just above its top edge, centred horizontally.
for bar in bars:
    height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width()/2.
    ax.text(bar_centre, height, f'{int(height)}',
            ha='center', va='bottom', fontsize=10)

# Dashed horizontal line at the mean class size.
mean_samples = class_counts.mean()
ax.axhline(y=mean_samples, color='red', linestyle='--', alpha=0.5,
           label=f'Mean: {mean_samples:.0f}')
ax.legend()

plt.tight_layout()

# Save before showing so the written file contains the finished figure.
fig_path = f"{CONFIG['figures_dir']}/class_distribution.png"
plt.savefig(fig_path, dpi=300, bbox_inches='tight')
print(f"\nFigure saved to: {fig_path}")

plt.show()

### SELECTING SAMPLE AUDIO FILES

In [None]:
# Pick one representative clip per class: the first metadata row of that class.
# Classes with no rows in the metadata are skipped.
sample_files = []
for class_name in CONFIG['class_names']:
    class_samples = metadata[metadata['class'] == class_name]
    if len(class_samples) > 0:
        sample_files.append(class_samples.iloc[0])

print(f"\nSelected {len(sample_files)} sample audio files (one per class)")
print("\nSample details:")
print("-"*70)

# Check that every selected file exists on disk and count the ones that do not,
# so the summary below reflects the real outcome.
missing_count = 0
for i, sample in enumerate(sample_files):
    fold = sample['fold']
    filename = sample['slice_file_name']
    class_name = sample['class']
    class_id = sample['classID']

    # Build full path: <audio_dir>/fold<N>/<file name>
    audio_path = f"{CONFIG['audio_dir']}/fold{fold}/{filename}"

    exists = os.path.exists(audio_path)
    if not exists:
        missing_count += 1
    status = "OK" if exists else "NOT FOUND"

    print(f"{i+1:2d}. Class: {class_name:20s} | ID: {class_id} | File: {filename:30s} | {status}")

print("-"*70)
# Only claim success when every file was actually found.
if missing_count == 0:
    print("All sample files verified!")
else:
    print(f"WARNING: {missing_count} of {len(sample_files)} sample files were NOT FOUND")
print("="*70)

# Store samples for later use
CONFIG['sample_files'] = sample_files

### VISUALIZING SAMPLE WAVEFORMS

In [None]:
# Plot the waveform of one sample clip per class on a 5x2 grid.
fig, axes = plt.subplots(5, 2, figsize=(15, 12))
axes = axes.ravel()  # flatten so panels can be addressed by a single index

print("\nGenerating waveform plots for all 10 classes...")
print("This may take 1-2 minutes...")

for i, sample in enumerate(CONFIG['sample_files']):
    fold = sample['fold']
    filename = sample['slice_file_name']
    class_name = sample['class']
    audio_path = f"{CONFIG['audio_dir']}/fold{fold}/{filename}"

    # Load audio at the configured sample rate, reading at most `duration` seconds.
    signal, sr = librosa.load(audio_path, sr=CONFIG['sample_rate'], duration=CONFIG['duration'])

    # Time stamp (seconds) for every audio sample.
    # Deliberately NOT named `time`: that would rebind the imported `time`
    # module and break later `time.time()` calls in this notebook.
    time_axis = np.linspace(0, len(signal)/sr, len(signal))

    # Draw the waveform in this class's panel.
    axes[i].plot(time_axis, signal, linewidth=0.5, color='blue', alpha=0.7)
    axes[i].set_title(f"{i+1}. {class_name}", fontsize=10, fontweight='bold')
    axes[i].set_xlabel('Time (s)', fontsize=8)
    axes[i].set_ylabel('Amplitude', fontsize=8)
    axes[i].grid(alpha=0.3)
    # Same x-range on every panel so clips shorter than `duration` stay comparable.
    axes[i].set_xlim([0, CONFIG['duration']])

plt.tight_layout()

# Save figure
fig_path = f"{CONFIG['figures_dir']}/sample_waveforms.png"
plt.savefig(fig_path, dpi=300, bbox_inches='tight')
print(f"\nFigure saved to: {fig_path}")

plt.show()

### AUDIO PLAYBACK

In [None]:
# Render an inline audio player for each per-class sample clip.
for position, sample in enumerate(CONFIG['sample_files'], start=1):
    filename = sample['slice_file_name']
    audio_path = f"{CONFIG['audio_dir']}/fold{sample['fold']}/{filename}"

    print(f"\n{position}. Class: {sample['class']}")
    print(f"   File: {filename}")
    display(ipd.Audio(audio_path))

---
# Feature Extraction
---

### EXTRACTING MFCC FEATURES FROM ALL AUDIO FILES

In [None]:
# Extract one fixed-length MFCC feature vector per audio file.
# `features[i]` and `labels[i]` must describe metadata row i, because later
# cells look up the fold of each feature row by position.
features = []
labels = []
failed_rows = []  # metadata index labels of files that could not be processed

print(f"\nExtracting MFCC features (n_mfcc={CONFIG['n_mfcc']})")
print(f"Processing {len(metadata)} audio files...")
print("This will take 10-15 minutes")

# Loop through all audio files
for idx, row in tqdm(metadata.iterrows(), total=len(metadata)):
    fold = row['fold']
    filename = row['slice_file_name']
    class_id = row['classID']

    # Build audio path: <audio_dir>/fold<N>/<file name>
    audio_path = f"{CONFIG['audio_dir']}/fold{fold}/{filename}"

    try:
        # Load audio at the configured sample rate, reading at most `duration` seconds.
        signal, sr = librosa.load(audio_path, sr=CONFIG['sample_rate'], duration=CONFIG['duration'])

        # Extract MFCC
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=CONFIG['n_mfcc'])

        # Average across time so every clip yields one value per coefficient.
        mfcc_mean = np.mean(mfcc, axis=1)
    except Exception as e:
        # Best effort: report the file, remember its row, and carry on.
        print(f"\nError processing {filename}: {e}")
        failed_rows.append(idx)
        continue

    features.append(mfcc_mean)
    labels.append(class_id)

# Convert to numpy arrays
features = np.array(features)
labels = np.array(labels)

# Skipped files leave gaps: without this step `features` would be shorter than
# `metadata` and every positional lookup after the first gap would be wrong.
# Drop the failed rows so metadata row i again corresponds to features[i].
if failed_rows:
    print(f"\nWARNING: {len(failed_rows)} file(s) could not be processed and were skipped")
    metadata = metadata.drop(index=failed_rows).reset_index(drop=True)
    CONFIG['metadata'] = metadata

print("\n" + "-"*70)
print(f"Feature extraction complete!")
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")

# Save features
features_path = f"{CONFIG['project_dir']}/features.npy"
labels_path = f"{CONFIG['project_dir']}/labels.npy"

np.save(features_path, features)
np.save(labels_path, labels)

print(f"\nFeatures saved to: {features_path}")
print(f"Labels saved to: {labels_path}")

### ORGANIZING DATA BY FOLDS FOR 10-FOLD CROSS-VALIDATION

In [None]:
# Group the extracted features and labels by fold number (1-10).
# Fold membership is taken from the metadata by row position, so `features`
# must be row-aligned with `metadata`.

# Metadata index -> fold number.
fold_mapping = metadata['fold'].to_dict()

# Fold number of every metadata row, in row order.
fold_ids = metadata['fold'].to_numpy()

# Fail loudly on misalignment (for example when files were skipped during
# feature extraction) instead of silently assigning samples to wrong folds.
if len(features) != len(fold_ids) or len(labels) != len(fold_ids):
    raise ValueError(
        f"features ({len(features)}) / labels ({len(labels)}) are not row-aligned "
        f"with metadata ({len(fold_ids)} rows); cannot assign samples to folds"
    )

# Create fold-organized data structure:
#   'indices'  - row positions (list of int) belonging to the fold
#   'features' - feature rows of the fold (numpy array)
#   'labels'   - class IDs of the fold (numpy array)
folds_data = {}
for fold_num in range(1, 11):
    fold_rows = np.flatnonzero(fold_ids == fold_num)
    folds_data[fold_num] = {
        'indices': fold_rows.tolist(),
        'features': features[fold_rows],
        'labels': labels[fold_rows],
    }

# Display fold sizes
print("\nSamples per fold:")
print("-"*70)
for fold_num in range(1, 11):
    n_samples = len(folds_data[fold_num]['features'])
    print(f"   Fold {fold_num:2d}: {n_samples:4d} samples")
print("-"*70)
print(f"   Total:  {sum(len(folds_data[i]['features']) for i in range(1, 11)):4d} samples")

# Store in CONFIG
CONFIG['folds_data'] = folds_data

### CREATING FOLD SPLIT FUNCTION

In [None]:
def get_fold_split(test_fold, val_fold):
    """
    Build a standardised train/validation/test split from the per-fold data.

    One fold is held out for testing, a second for validation, and the
    remaining eight are stacked (in ascending fold order) to form the
    training set. Reads the module-level ``folds_data``.

    Args:
        test_fold: Fold number to use as test set (1-10)
        val_fold: Fold number to use as validation set (1-10, must be != test_fold)

    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test, scaler
        where ``scaler`` is the StandardScaler fitted on the training features.

    Raises:
        AssertionError: if the folds are equal or outside 1-10.
    """
    assert test_fold != val_fold, "Test and validation folds must be different!"
    assert 1 <= test_fold <= 10, "Test fold must be 1-10"
    assert 1 <= val_fold <= 10, "Validation fold must be 1-10"

    held_out = (test_fold, val_fold)
    train_folds = [k for k in range(1, 11) if k not in held_out]

    # Stack the training folds into one feature matrix / label vector.
    X_train = np.vstack([folds_data[k]['features'] for k in train_folds])
    y_train = np.concatenate([folds_data[k]['labels'] for k in train_folds])

    # Held-out folds are taken as they are.
    y_val = folds_data[val_fold]['labels']
    y_test = folds_data[test_fold]['labels']

    # Fit the scaler on training data only, then apply the same transform to
    # validation and test data so no held-out statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(folds_data[val_fold]['features'])
    X_test = scaler.transform(folds_data[test_fold]['features'])

    return X_train, X_val, X_test, y_train, y_val, y_test, scaler

# Smoke-test the split helper: fold 10 held out for testing, fold 9 for validation.
print("\nTesting fold split function...")
X_train, X_val, X_test, y_train, y_val, y_test, scaler = get_fold_split(test_fold=10, val_fold=9)

# Report the size of each partition and their sum.
n_train, n_val, n_test = len(X_train), len(X_val), len(X_test)
print("\nExample split (test=fold10, val=fold9):")
print(f"   Training samples:   {n_train}")
print(f"   Validation samples: {n_val}")
print(f"   Test samples:       {n_test}")
print(f"   Total:              {n_train + n_val + n_test}")

# Keep a handle to the helper in CONFIG.
CONFIG['get_fold_split'] = get_fold_split

---
# Baseline Model (NAIVE BAYES) 
---

### TRAINING NAIVE BAYES BASELINE - 10-FOLD CROSS-VALIDATION

In [None]:
# Gaussian Naive Bayes baseline, evaluated with 10-fold cross-validation over
# the dataset's predefined folds.

# Cell-local imports keep this cell self-contained:
#  - `precision_recall_fscore_support` is not in the notebook's shared sklearn
#    import list, so using it without this import raises NameError;
#  - re-importing `time` guarantees the name refers to the module here, even if
#    the notebook session has rebound it to something else.
import time
from sklearn.metrics import precision_recall_fscore_support

# Store results for each fold
baseline_results = {
    'fold_accuracies': [],   # test accuracy per fold
    'fold_times': [],        # fit time per fold, in seconds
    'fold_details': []       # full metric record per fold
}

print("\nRunning 10-fold cross-validation...")
print("This will take ~5-10 minutes")
print("-"*70)

# Loop through all 10 folds
for test_fold in range(1, 11):
    # Use next fold as validation (wrap around: fold 10 -> fold 1)
    val_fold = (test_fold % 10) + 1

    print(f"\nFold {test_fold}/10: Test=fold{test_fold}, Val=fold{val_fold}")

    # Get data split (features are standardised inside get_fold_split)
    X_train, X_val, X_test, y_train, y_val, y_test, scaler = get_fold_split(test_fold, val_fold)

    # Train model, timing only the fit call
    nb_model = GaussianNB()
    start_time = time.time()
    nb_model.fit(X_train, y_train)
    train_time = time.time() - start_time

    # Predict on validation
    y_pred_val = nb_model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_pred_val)

    # Predict on test
    y_pred_test = nb_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Macro-averaged precision / recall / F1 on the test fold
    # (the fourth return value, support, is not needed)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_test, average='macro')

    # Store results
    baseline_results['fold_accuracies'].append(test_accuracy)
    baseline_results['fold_times'].append(train_time)
    baseline_results['fold_details'].append({
        'test_fold': test_fold,
        'val_fold': val_fold,
        'val_accuracy': val_accuracy,
        'test_accuracy': test_accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'train_time': train_time
    })

    print(f"   Val Acc: {val_accuracy*100:.2f}% | Test Acc: {test_accuracy*100:.2f}% | Time: {train_time:.2f}s")

# Calculate aggregate statistics over the 10 test folds
mean_accuracy = np.mean(baseline_results['fold_accuracies'])
std_accuracy = np.std(baseline_results['fold_accuracies'])
total_time = sum(baseline_results['fold_times'])

print("\n" + "="*70)
print("BASELINE RESULTS (10-FOLD CROSS-VALIDATION)")
print("="*70)
print(f"\nTest Accuracy: {mean_accuracy*100:.2f}% ± {std_accuracy*100:.2f}%")
print(f"Total training time: {total_time:.2f} seconds")
print(f"\nPer-fold accuracies:")
for i, acc in enumerate(baseline_results['fold_accuracies'], 1):
    print(f"   Fold {i:2d}: {acc*100:.2f}%")

# Store in CONFIG
CONFIG['baseline_results'] = baseline_results
CONFIG['baseline_mean_accuracy'] = mean_accuracy
CONFIG['baseline_std_accuracy'] = std_accuracy

### SAVING BASELINE RESULTS

In [None]:
# Build a JSON-friendly summary of the baseline run.
# Metric values are cast to plain Python floats before serialisation.
fold_accuracy_list = baseline_results['fold_accuracies']

# Per-fold records: fold numbers are copied as-is, metric values are cast to float.
float_metric_keys = ('test_accuracy', 'val_accuracy', 'precision', 'recall', 'f1')
fold_records = []
for detail in baseline_results['fold_details']:
    record = {'test_fold': detail['test_fold'], 'val_fold': detail['val_fold']}
    for metric_key in float_metric_keys:
        record[metric_key] = float(detail[metric_key])
    fold_records.append(record)

baseline_summary = {
    'model': 'Naive Bayes (GaussianNB)',
    'evaluation': '10-Fold Cross-Validation',
    'mean_accuracy': float(mean_accuracy),
    'std_accuracy': float(std_accuracy),
    'min_accuracy': float(min(fold_accuracy_list)),
    'max_accuracy': float(max(fold_accuracy_list)),
    'total_training_time': float(total_time),
    'fold_results': fold_records,
}

# Write the summary into the results folder.
results_path = f"{CONFIG['results_dir']}/baseline_results_10fold.json"
with open(results_path, 'w') as f:
    json.dump(baseline_summary, f, indent=4)

print(f"\nBaseline results saved to: {results_path}")

### VISUALIZING BASELINE RESULTS

In [None]:
# Bar chart of the baseline's test accuracy per fold, with mean and ±1 std overlays.
fig, ax = plt.subplots(figsize=(12, 6))

# One bar per test fold; accuracies are shown as percentages.
folds = list(range(1, 11))
accuracies = [detail['test_accuracy']*100 for detail in baseline_results['fold_details']]

bars = ax.bar(folds, accuracies, color='steelblue', alpha=0.8)

# Mean accuracy as a dashed line, plus a shaded band one standard deviation either side.
mean_pct = mean_accuracy*100
std_pct = std_accuracy*100
ax.axhline(y=mean_pct, color='red', linestyle='--', linewidth=2,
           label=f'Mean: {mean_pct:.2f}%')
ax.axhspan(mean_pct - std_pct, mean_pct + std_pct,
           alpha=0.2, color='red', label=f'±1 Std: {std_pct:.2f}%')

# Labels, title, ticks and grid.
axis_label_style = {'fontsize': 12, 'fontweight': 'bold'}
ax.set_xlabel('Test Fold', **axis_label_style)
ax.set_ylabel('Test Accuracy (%)', **axis_label_style)
ax.set_title('Baseline (Naive Bayes) - 10-Fold Cross-Validation Results',
             fontsize=14, fontweight='bold')
ax.set_xticks(folds)
ax.grid(axis='y', alpha=0.3)
ax.legend()

# Write each accuracy just above its bar, centred horizontally.
for bar, acc in zip(bars, accuracies):
    bar_centre = bar.get_x() + bar.get_width()/2.
    ax.text(bar_centre, bar.get_height(), f'{acc:.1f}%',
            ha='center', va='bottom', fontsize=9)

plt.tight_layout()

# Save before showing so the written file contains the finished figure.
fig_path = f"{CONFIG['figures_dir']}/baseline_10fold_results.png"
plt.savefig(fig_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"\nFigure saved to: {fig_path}")