In [2]:
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.utils import to_categorical
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Set random seeds for reproducibility.
# np.random.seed seeds NumPy's global RNG, which the noise augmentation
# (np.random.randn) in load_and_preprocess_data draws from;
# tf.random.set_seed sets TensorFlow's global seed.
# NOTE(review): Python's built-in `random` module is not seeded here.
np.random.seed(42)
tf.random.set_seed(42)

In [4]:
# 1. ENHANCED FEATURE EXTRACTION FUNCTION
def extract_features(file_path, n_mfcc=40, n_mels=128, n_chroma=12):
    """
    Build a fixed-length feature vector summarising one audio file.

    The file is loaded with ``librosa.load`` (``res_type='kaiser_fast'``) and
    the following descriptors are concatenated, in this order:

    1. MFCCs, averaged over frames (``n_mfcc`` values)
    2. Mel spectrogram bands, averaged over frames (``n_mels`` values)
    3. Chroma bins, averaged over frames (``n_chroma`` values)
    4. Spectral contrast bands, averaged over frames
    5. Mean zero-crossing rate
    6. Mean RMS energy
    7. Mean spectral centroid
    8. Mean spectral bandwidth
    9. Mean spectral roll-off
    10. Mean of the harmonic and of the percussive waveform (two values)
    11. Estimated tempo

    Args:
        file_path: Path of the audio file to analyse.
        n_mfcc: Number of MFCC coefficients to compute.
        n_mels: Number of Mel bands to compute.
        n_chroma: Number of chroma bins to compute.

    Returns:
        A 1-D ``numpy.ndarray`` of features, or ``None`` if anything went
        wrong (the error is printed rather than raised so that one bad file
        does not abort a whole dataset scan).
    """
    try:
        audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')

        features = []

        # 1-4. Frame-wise descriptors, each collapsed to its per-row mean
        # over time so every clip yields the same number of values.
        framewise = (
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc),
            librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels),
            librosa.feature.chroma_stft(y=audio, sr=sample_rate, n_chroma=n_chroma),
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
        )
        for matrix in framewise:
            features.extend(np.mean(matrix.T, axis=0))

        # 5-9. Descriptors reduced to a single scalar (mean over all frames).
        scalar_tracks = (
            librosa.feature.zero_crossing_rate(y=audio),
            librosa.feature.rms(y=audio),
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate),
            librosa.feature.spectral_bandwidth(y=audio, sr=sample_rate),
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate),
        )
        for track in scalar_tracks:
            features.append(np.mean(track))

        # 10. Harmonic and percussive components.
        # NOTE(review): these are means of raw time-domain samples; if the
        # waveform is roughly zero-mean they may carry little information.
        # Kept unchanged so the feature layout stays compatible.
        y_harmonic, y_percussive = librosa.effects.hpss(audio)
        features.extend([np.mean(y_harmonic), np.mean(y_percussive)])

        # 11. Tempo estimate from the onset-strength envelope.
        # Newer librosa exposes the estimator as librosa.feature.tempo and
        # deprecates librosa.beat.tempo; use whichever this install provides.
        onset_env = librosa.onset.onset_strength(y=audio, sr=sample_rate)
        tempo_fn = getattr(librosa.feature, 'tempo', None) or librosa.beat.tempo
        tempo = tempo_fn(onset_envelope=onset_env, sr=sample_rate)
        features.append(np.ravel(tempo)[0])

        return np.array(features)

    except Exception as e:
        # Deliberate best-effort: report and signal failure with None.
        print(f"Error processing {file_path}: {str(e)}")
        return None

In [5]:
# 2. IMPROVED DATA LOADING WITH AUGMENTATION
def load_and_preprocess_data(base_path, metadata_file='UrbanSound8K/metadata/UrbanSound8K.csv'):
    """
    Load the UrbanSound8K clips listed in the metadata CSV and extract features.

    For every metadata row the clip ``{base_path}/fold{fold}/{slice_file_name}``
    is passed to ``extract_features``. When that succeeds, three augmented
    variants of the clip (additive Gaussian noise, time stretch at rate 0.9,
    pitch shift by +2 semitones) are written to a private temporary WAV file
    and run through ``extract_features`` as well, so each clip contributes up
    to four samples carrying the same label.

    NOTE: the augmented samples are derived from the same clips as the
    originals. Splitting the returned arrays at random can therefore place
    variants of one clip on both sides of the split.

    Args:
        base_path: Directory that contains the ``fold<N>`` sub-directories.
        metadata_file: Path of the CSV with at least the columns ``fold``,
            ``slice_file_name`` and ``class``.

    Returns:
        ``(features, labels)`` as NumPy arrays; ``labels`` holds the values of
        the ``class`` column.
    """
    import os
    import tempfile
    from scipy.io import wavfile

    # Load metadata
    metadata = pd.read_csv(metadata_file)

    features = []
    labels = []

    print("Loading and extracting features...")

    # Private scratch file instead of a fixed "temp_aug.wav" in the working
    # directory; it is removed again in the `finally` below.
    fd, temp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    def _augmented_features(samples, sr):
        """Write `samples` to the scratch WAV and extract its features."""
        # librosa.output.write_wav no longer exists in current librosa;
        # scipy's WAV writer handles float32 PCM directly.
        wavfile.write(temp_path, int(sr), np.asarray(samples, dtype=np.float32))
        return extract_features(temp_path)

    augmentation_failures = 0

    try:
        for _, row in metadata.iterrows():
            # Construct file path
            fold = row['fold']
            filename = row['slice_file_name']
            file_path = f"{base_path}/fold{fold}/{filename}"

            # Extract features from original audio
            feature = extract_features(file_path)
            if feature is None:
                continue

            features.append(feature)
            labels.append(row['class'])

            # DATA AUGMENTATION - add variations of the same clip.
            # Each variant is generated lazily, so an error in a later
            # variant does not discard the earlier ones.
            try:
                audio, sr = librosa.load(file_path, sr=None)

                variants = (
                    # Augmentation 1: additive noise
                    lambda: audio + np.random.randn(len(audio)) * 0.005,
                    # Augmentation 2: time stretching
                    lambda: librosa.effects.time_stretch(audio, rate=0.9),
                    # Augmentation 3: pitch shifting (sr must be passed by
                    # keyword in current librosa)
                    lambda: librosa.effects.pitch_shift(audio, sr=sr, n_steps=2),
                )
                for make_variant in variants:
                    augmented = _augmented_features(make_variant(), sr)
                    if augmented is not None:
                        features.append(augmented)
                        labels.append(row['class'])

            except Exception as e:
                # Keep going (best effort) but do not hide the problem: a
                # systematic failure would otherwise silently disable
                # augmentation for the whole dataset.
                augmentation_failures += 1
                if augmentation_failures <= 5:
                    print(f"Augmentation failed for {file_path}: {e}")
    finally:
        if os.path.exists(temp_path):
            os.remove(temp_path)

    if augmentation_failures:
        print(f"Augmentation failed for {augmentation_failures} file(s).")

    features = np.array(features)
    labels = np.array(labels)

    return features, labels

In [6]:
# 3. ENHANCED NEURAL NETWORK ARCHITECTURE
def create_enhanced_model(input_shape, num_classes):
    """
    Build and compile a fully connected classifier for flat feature vectors.

    Architecture: input -> BatchNorm -> four Dense/BatchNorm/Dropout blocks
    (512, 256, 128, 64 units) -> softmax output with ``num_classes`` units.

    Args:
        input_shape: Number of input features (an int; it is wrapped into a
            one-element shape tuple here).
        num_classes: Number of output classes.

    Returns:
        The compiled Keras ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss; accuracy, precision and recall metrics).
    """
    # (units, dropout rate) of each hidden block, from input side to output side
    hidden_blocks = [(512, 0.3), (256, 0.3), (128, 0.2), (64, 0.2)]

    # Input followed by a BatchNormalization applied to the raw features
    stack = [
        layers.Input(shape=(input_shape,)),
        layers.BatchNormalization(),
    ]

    for units, drop_rate in hidden_blocks:
        stack.append(layers.Dense(units, activation='relu'))
        stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(drop_rate))

    # Class-probability head
    stack.append(layers.Dense(num_classes, activation='softmax'))

    network = models.Sequential(stack)

    # Adam with a learning rate of 0.0005; the remaining arguments are
    # spelled out explicitly.
    adam = tf.keras.optimizers.Adam(
        learning_rate=0.0005,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07
    )

    tracked_metrics = [
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]

    network.compile(
        optimizer=adam,
        loss='categorical_crossentropy',
        metrics=tracked_metrics
    )

    return network

In [7]:
# 4. CREATE CNN MODEL FOR SPECTROGRAM IMAGES (Alternative approach)
def create_cnn_model(input_shape, num_classes):
    """
    Build and compile a 2-D CNN classifier (intended for spectrogram input).

    Four Conv2D(3x3, relu) -> BatchNorm -> MaxPool(2x2) blocks with 32, 64,
    128 and 256 filters are followed by Flatten, Dropout and two dense layers
    before the softmax output.

    Args:
        input_shape: Shape handed to the first Conv2D layer as ``input_shape``.
        num_classes: Number of output classes.

    Returns:
        The compiled Keras ``Sequential`` model ('adam' optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    def conv_block(filters, **conv_kwargs):
        # One convolution stage: conv -> batch norm -> 2x2 max pooling
        return [
            layers.Conv2D(filters, (3, 3), activation='relu', **conv_kwargs),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
        ]

    # Convolutional trunk; only the first block carries the input shape
    stack = conv_block(32, input_shape=input_shape)
    for filters in (64, 128, 256):
        stack.extend(conv_block(filters))

    # Flatten and dense head
    stack.extend([
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(num_classes, activation='softmax'),
    ])

    network = models.Sequential(stack)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return network

# 5. ENSEMBLE MODEL (Combines multiple models)
class EnsembleModel:
    """Combine several models by averaging their predicted scores."""

    def __init__(self, models_list):
        # Every member must expose a ``predict(X)`` method.
        self.models = models_list

    def predict(self, X):
        """
        Return the predicted class index for each row of ``X``.

        Each member's ``predict(X)`` output is averaged element-wise across
        members, and the arg-max along axis 1 of that average is returned.
        """
        per_model_scores = [member.predict(X) for member in self.models]
        return np.argmax(np.mean(per_model_scores, axis=0), axis=1)

# 6. CALLBACKS FOR BETTER TRAINING
def get_callbacks():
    """
    Return the list of Keras callbacks used during training.

    The list contains, in order: early stopping on ``val_loss``, learning-rate
    reduction on a ``val_loss`` plateau, and a checkpoint that writes the
    model with the best ``val_accuracy`` to ``best_model.h5``.
    """
    # Stop after 15 epochs without val_loss improvement, keeping the best weights
    early_stop = callbacks.EarlyStopping(
        monitor='val_loss',
        patience=15,
        restore_best_weights=True,
        verbose=1
    )

    # Halve the learning rate after 5 stagnant epochs, down to 1e-6 at most
    lr_schedule = callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-6,
        verbose=1
    )

    # Persist only the model with the highest validation accuracy so far
    checkpoint = callbacks.ModelCheckpoint(
        'best_model.h5',
        monitor='val_accuracy',
        save_best_only=True,
        mode='max',
        verbose=1
    )

    return [early_stop, lr_schedule, checkpoint]

In [8]:
# 7. MAIN TRAINING PIPELINE
def main():
    """
    Run the full pipeline: load data, train the dense model, evaluate, plot.

    Steps: feature loading via ``load_and_preprocess_data``, label encoding,
    stratified 80/20 train/test split, feature standardisation (fitted on the
    training part only), model training with the callbacks from
    ``get_callbacks``, test-set evaluation, classification report, confusion
    matrix plot and training-history plots.

    NOTE(review): ``load_and_preprocess_data`` returns augmented copies of
    each clip alongside the original, and the split below is random, so
    variants of one clip can land in both train and test. Confirm whether
    that is acceptable for the reported test metrics.

    Returns:
        ``(model, scaler, label_encoder, history)``.
    """
    # Location of the fold<N> audio directories
    audio_root = "UrbanSound8K/audio"

    # Load and preprocess data
    print("Step 1: Loading data...")
    X, y = load_and_preprocess_data(audio_root)

    # Integer-encode the class names, then one-hot encode them
    print("Step 2: Encoding labels...")
    label_encoder = LabelEncoder()
    class_ids = label_encoder.fit_transform(y)
    one_hot = to_categorical(class_ids)

    # Stratified hold-out split
    print("Step 3: Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, one_hot, test_size=0.2, random_state=42, stratify=class_ids
    )

    # Fit the scaler on the training part only, then apply it to both parts
    print("Step 4: Standardizing features...")
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Build the network
    print("Step 5: Creating model...")
    n_features = X_train.shape[1]
    n_classes = len(np.unique(class_ids))

    model = create_enhanced_model(n_features, n_classes)
    model.summary()

    # Train; the last 20% of the training data is held out for validation
    print("Step 6: Training model...")
    history = model.fit(
        X_train, y_train,
        epochs=100,
        batch_size=32,
        validation_split=0.2,
        callbacks=get_callbacks(),
        verbose=1
    )

    # Evaluate on the held-out test set (loss, accuracy, precision, recall)
    print("Step 7: Evaluating model...")
    test_loss, test_accuracy, test_precision, test_recall = model.evaluate(
        X_test, y_test, verbose=0
    )

    print(f"\nTest Accuracy: {test_accuracy:.4f}")
    print(f"Test Precision: {test_precision:.4f}")
    print(f"Test Recall: {test_recall:.4f}")

    # Convert probabilities / one-hot rows to class indices
    predicted_ids = np.argmax(model.predict(X_test), axis=1)
    actual_ids = np.argmax(y_test, axis=1)

    # Classification report
    print("\nClassification Report:")
    report = classification_report(
        actual_ids, predicted_ids, target_names=label_encoder.classes_
    )
    print(report)

    # Confusion matrix
    plt.figure(figsize=(10, 8))
    cm = confusion_matrix(actual_ids, predicted_ids)
    class_names = label_encoder.classes_
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()

    # Training history: one panel per tracked metric, train vs. validation
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    panels = [
        ('accuracy', 'Accuracy'),
        ('loss', 'Loss'),
        ('precision', 'Precision'),
        ('recall', 'Recall'),
    ]
    for ax, (metric_key, metric_label) in zip(axes.flat, panels):
        ax.plot(history.history[metric_key], label=f'Train {metric_label}')
        ax.plot(history.history[f'val_{metric_key}'], label=f'Val {metric_label}')
        ax.set_title(f'Model {metric_label}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(metric_label)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.show()

    return model, scaler, label_encoder, history

In [9]:
# 8. HYPERPARAMETER TUNING FUNCTION
def hyperparameter_tuning(X_train, y_train, run_search=False, param_grid=None,
                          validation_split=0.2):
    """
    Grid-search a small dense network over a few hyperparameters.

    The search is opt-in because it is computationally expensive: with the
    default ``run_search=False`` the function only prints a notice and
    returns ``None``. It no longer imports
    ``tensorflow.keras.wrappers.scikit_learn``, which is unavailable in
    recent TensorFlow releases and made the function fail on import.

    When ``run_search=True`` every combination in ``param_grid`` is trained
    once with Keras directly and scored by the validation accuracy of its
    final epoch (a single hold-out split, not k-fold cross-validation).

    Args:
        X_train: 2-D feature array; ``X_train.shape[1]`` sets the input width.
        y_train: One-hot label array; ``y_train.shape[1]`` sets the number of
            output units.
        run_search: Whether to actually run the grid search.
        param_grid: Mapping of parameter name to candidate values. The keys
            ``batch_size`` and ``epochs`` are forwarded to ``fit``; all other
            keys are forwarded to the model builder (``learning_rate``,
            ``dropout_rate``). Defaults to the built-in grid below.
        validation_split: Fraction of the training data held out for scoring.

    Returns:
        ``None`` when the search is skipped or the grid yields no
        combination, otherwise ``(best_params, best_score)``.
    """
    from itertools import product

    def create_model(learning_rate=0.001, dropout_rate=0.3):
        model = models.Sequential([
            layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
            layers.Dropout(dropout_rate),
            layers.Dense(128, activation='relu'),
            layers.Dropout(dropout_rate),
            layers.Dense(64, activation='relu'),
            layers.Dense(y_train.shape[1], activation='softmax')
        ])

        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        model.compile(optimizer=optimizer,
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    if param_grid is None:
        param_grid = {
            'batch_size': [16, 32, 64],
            'epochs': [50, 100],
            'learning_rate': [0.001, 0.0005, 0.0001],
            'dropout_rate': [0.2, 0.3, 0.4]
        }

    # Note: This can be computationally expensive
    print("Hyperparameter tuning might take a while...")
    if not run_search:
        return None

    # Parameters consumed by fit(); everything else goes to create_model().
    fit_keys = ('batch_size', 'epochs')
    names = list(param_grid)

    best_params = None
    best_score = None
    for combo in product(*(param_grid[name] for name in names)):
        params = dict(zip(names, combo))
        fit_kwargs = {k: v for k, v in params.items() if k in fit_keys}
        build_kwargs = {k: v for k, v in params.items() if k not in fit_keys}

        # Drop graph/state left over from the previous candidate.
        tf.keras.backend.clear_session()
        candidate = create_model(**build_kwargs)
        history = candidate.fit(
            X_train, y_train,
            validation_split=validation_split,
            verbose=0,
            **fit_kwargs
        )
        score = history.history['val_accuracy'][-1]

        if best_score is None or score > best_score:
            best_score = score
            best_params = params

    if best_params is None:
        # An empty candidate list for any parameter yields no combinations.
        return None

    print(f"Best: {best_score} using {best_params}")
    return best_params, best_score

In [None]:
# Run the main function
if __name__ == "__main__":
    import os

    model, scaler, label_encoder, history = main()

    # Save the model. Create the target directory first so a missing folder
    # does not throw away the result of the whole training run.
    save_path = 'saved_models/urban_sound_classifier.h5'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    model.save(save_path)
    print(f"\nModel saved as '{save_path}'")

Step 1: Loading data...
Loading and extracting features...
