In [9]:
import librosa
import librosa.display
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

class RobustMFCCExtractor:
    """
    Enhanced MFCC feature extraction with multiple robust techniques
    """
    
    def __init__(self, sr=22050, n_mfcc=40, n_fft=2048, hop_length=512):
        self.sr = sr
        self.n_mfcc = n_mfcc
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        
    def extract_mfcc_features(self, audio_path, augment=False):
        """
        Extract comprehensive MFCC-based features
        """
        try:
            # Load audio with robust loading
            y, sr = librosa.load(audio_path, sr=self.sr, duration=4.0)
            
            # Zero-padding or truncation for consistent length
            target_length = self.sr * 4  # 4 seconds
            if len(y) < target_length:
                y = np.pad(y, (0, target_length - len(y)), mode='constant')
            else:
                y = y[:target_length]
            
            # Apply pre-emphasis filter
            y = librosa.effects.preemphasis(y)
            
            # Extract base MFCCs
            mfccs = librosa.feature.mfcc(
                y=y, 
                sr=sr, 
                n_mfcc=self.n_mfcc,
                n_fft=self.n_fft,
                hop_length=self.hop_length
            )
            
            # Extract delta and delta-delta features
            mfcc_delta = librosa.feature.delta(mfccs)
            mfcc_delta2 = librosa.feature.delta(mfccs, order=2)
            
            # Extract other complementary features
            chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=self.hop_length)
            spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr, hop_length=self.hop_length)
            tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
            mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=self.n_fft, hop_length=self.hop_length)
            
            # Root Mean Square Energy
            rms = librosa.feature.rms(y=y, hop_length=self.hop_length)
            
            # Zero Crossing Rate
            zcr = librosa.feature.zero_crossing_rate(y, hop_length=self.hop_length)
            
            # Spectral Centroid and Rolloff
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=self.hop_length)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, hop_length=self.hop_length)
            
            # Combine all features
            features = np.vstack([
                mfccs,
                mfcc_delta,
                mfcc_delta2,
                chroma,
                spectral_contrast,
                tonnetz,
                rms,
                zcr,
                spectral_centroid,
                spectral_rolloff
            ])
            
            # Extract statistical features
            feature_stats = self._extract_statistics(features)
            
            return feature_stats
            
        except Exception as e:
            print(f"Error processing {audio_path}: {str(e)}")
            return np.zeros(280)  # Return zero array with expected feature dimension
    
    def _extract_statistics(self, features):
        """
        Extract statistical features from feature matrix
        """
        stats = []
        for feature in features:
            stats.extend([
                np.mean(feature),
                np.std(feature),
                np.median(feature),
                np.min(feature),
                np.max(feature),
                np.percentile(feature, 25),
                np.percentile(feature, 75),
                np.mean(np.diff(feature)),  # Average change
                np.std(np.diff(feature))    # Std of changes
            ])
        return np.array(stats)
    
    def extract_log_mel_spectrogram(self, audio_path):
        """
        Alternative: Log-Mel Spectrogram features
        """
        y, sr = librosa.load(audio_path, sr=self.sr, duration=4.0)
        
        # Extract Mel-spectrogram
        mel_spec = librosa.feature.melspectrogram(
            y=y, 
            sr=sr, 
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=128
        )
        
        # Convert to log scale
        log_mel = librosa.power_to_db(mel_spec, ref=np.max)
        
        return log_mel
    
    def extract_all_features(self, audio_paths, labels=None):
        """
        Extract features from multiple audio files
        """
        features = []
        valid_paths = []
        valid_labels = []
        
        for idx, path in enumerate(audio_paths):
            feat = self.extract_mfcc_features(path)
            if np.sum(feat) != 0:  # Skip zero features
                features.append(feat)
                valid_paths.append(path)
                if labels is not None:
                    valid_labels.append(labels[idx])
        
        features = np.array(features)
        
        # Normalize features
        if len(features) > 0:
            features = self.scaler.fit_transform(features)
        
        # Encode labels if provided
        if labels is not None and len(valid_labels) > 0:
            encoded_labels = self.label_encoder.fit_transform(valid_labels)
        else:
            encoded_labels = None
        
        return features, encoded_labels, valid_paths

In [10]:
import os
import glob
from tqdm import tqdm

class UrbanSound8KProcessor:
    """
    Process UrbanSound8K dataset
    """
    
    def __init__(self, dataset_path):
        self.dataset_path = dataset_path
        self.metadata_path = os.path.join(dataset_path, 'metadata', 'UrbanSound8K.csv')
        self.metadata = pd.read_csv(self.metadata_path)
        
    def prepare_data(self, folds=None):
        """
        Prepare data for specific folds or all folds
        """
        if folds is None:
            folds = list(range(1, 11))
        elif isinstance(folds, int):
            folds = [folds]
        
        audio_paths = []
        labels = []
        fold_numbers = []
        
        for fold in folds:
            fold_data = self.metadata[self.metadata['fold'] == fold]
            
            for _, row in tqdm(fold_data.iterrows(), desc=f'Processing Fold {fold}'):
                audio_file = os.path.join(
                    self.dataset_path,
                    'fold' + str(row['fold']),
                    row['slice_file_name']
                )
                
                if os.path.exists(audio_file):
                    audio_paths.append(audio_file)
                    labels.append(row['class'])
                    fold_numbers.append(fold)
        
        return audio_paths, labels, fold_numbers
    
    def get_class_distribution(self):
        """
        Get class distribution statistics
        """
        return self.metadata['class'].value_counts()

In [17]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

class TraditionalMLClassifier:
    """
    Traditional machine learning classifiers
    """
    
    def __init__(self):
        self.models = {
            'random_forest': RandomForestClassifier(n_estimators=200, random_state=42),
            'svm': SVC(probability=True, random_state=42),
            'xgboost': XGBClassifier(n_estimators=200, random_state=42),
            'gradient_boosting': GradientBoostingClassifier(n_estimators=200, random_state=42)
        }
        
    def train(self, X_train, y_train):
        """
        Train all models
        """
        trained_models = {}
        
        for name, model in self.models.items():
            print(f"Training {name}...")
            model.fit(X_train, y_train)
            trained_models[name] = model
            
        return trained_models
    
    def evaluate(self, models, X_test, y_test):
        """
        Evaluate all models
        """
        results = {}
        
        for name, model in models.items():
            accuracy = model.score(X_test, y_test)
            results[name] = accuracy
            print(f"{name} Accuracy: {accuracy:.4f}")
            
        return results

In [13]:
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

class CNNClassifier:
    """
    CNN-based classifier for audio features
    """
    
    def __init__(self, input_shape, num_classes):
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.model = self._build_model()
        
    def _build_model(self):
        """
        Build CNN model architecture
        """
        model = models.Sequential([
            # Reshape for CNN input
            layers.Reshape((self.input_shape[0], self.input_shape[1], 1), 
                          input_shape=self.input_shape),
            
            # First Conv Block
            layers.Conv2D(64, (3, 3), padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(0.25),
            
            # Second Conv Block
            layers.Conv2D(128, (3, 3), padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(0.25),
            
            # Third Conv Block
            layers.Conv2D(256, (3, 3), padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(0.25),
            
            # Global Pooling and Dense Layers
            layers.GlobalAveragePooling2D(),
            layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
            layers.BatchNormalization(),
            layers.Dropout(0.5),
            
            layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
            layers.BatchNormalization(),
            layers.Dropout(0.5),
            
            # Output layer
            layers.Dense(self.num_classes, activation='softmax')
        ])
        
        model.compile(
            optimizer='adam',
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def train(self, X_train, y_train, X_val, y_val, epochs=100, batch_size=32):
        """
        Train CNN model
        """
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
        ]
        
        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=1
        )
        
        return history

In [20]:
def main():
    # Initialize paths
    DATASET_PATH = "UrbanSound8K/"  # Update this path
    OUTPUT_PATH = "results"
    
    # Create output directory
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    
    # Step 1: Process data
    print("Step 1: Processing UrbanSound8K dataset...")
    processor = UrbanSound8KProcessor(DATASET_PATH)
    
    # Prepare data for folds 1-8 for training, 9-10 for testing
    train_folds = list(range(1, 9))
    test_folds = [9, 10]
    
    train_paths, train_labels, _ = processor.prepare_data(train_folds)
    test_paths, test_labels, _ = processor.prepare_data(test_folds)
    
    print(f"Training samples: {len(train_paths)}")
    print(f"Testing samples: {len(test_paths)}")
    
    # Step 2: Extract features
    print("\nStep 2: Extracting features...")
    extractor = RobustMFCCExtractor()
    
    X_train, y_train, _ = extractor.extract_all_features(train_paths, train_labels)
    X_test, y_test, _ = extractor.extract_all_features(test_paths, test_labels)
    
    print(f"Feature shape - Train: {X_train.shape}, Test: {X_test.shape}")
    
    # Step 3: Train traditional models
    print("\nStep 3: Training traditional ML models...")
    ml_classifier = TraditionalMLClassifier()
    trained_models = ml_classifier.train(X_train, y_train)
    
    # Step 4: Evaluate models
    print("\nStep 4: Evaluating models...")
    results = ml_classifier.evaluate(trained_models, X_test, y_test)
    
    # Step 5: For CNN - reshape features for 2D input
    print("\nStep 5: Training CNN model...")
    
    # Reshape features to 2D (assuming we use log-mel spectrograms)
    X_train_cnn = []
    X_test_cnn = []
    
    for path in tqdm(train_paths, desc="Extracting CNN features for train"):
        log_mel = extractor.extract_log_mel_spectrogram(path)
        X_train_cnn.append(log_mel)
    
    for path in tqdm(test_paths, desc="Extracting CNN features for test"):
        log_mel = extractor.extract_log_mel_spectrogram(path)
        X_test_cnn.append(log_mel)
    
    X_train_cnn = np.array(X_train_cnn)
    X_test_cnn = np.array(X_test_cnn)
    
    # Encode labels for CNN
    label_encoder = LabelEncoder()
    y_train_cnn = label_encoder.fit_transform(train_labels)
    y_test_cnn = label_encoder.transform(test_labels)
    
    # Build and train CNN
    cnn_classifier = CNNClassifier(
        input_shape=X_train_cnn[0].shape,
        num_classes=len(np.unique(y_train_cnn))
    )
    
    history = cnn_classifier.train(
        X_train_cnn, y_train_cnn,
        X_test_cnn, y_test_cnn,
        epochs=50,
        batch_size=32
    )
    
    # Evaluate CNN
    cnn_test_loss, cnn_test_acc = cnn_classifier.model.evaluate(X_test_cnn, y_test_cnn)
    print(f"\nCNN Test Accuracy: {cnn_test_acc:.4f}")
    
    # Step 6: Save results
    results_df = pd.DataFrame({
        'Model': list(results.keys()) + ['CNN'],
        'Accuracy': list(results.values()) + [cnn_test_acc]
    })
    
    results_df.to_csv(os.path.join(OUTPUT_PATH, 'model_results.csv'), index=False)
    print("\nResults saved to 'results/model_results.csv'")

if __name__ == "__main__":
    main()

Step 1: Processing UrbanSound8K dataset...


Processing Fold 1: 873it [00:00, 17267.43it/s]
Processing Fold 2: 888it [00:00, 38398.13it/s]
Processing Fold 3: 925it [00:00, 9325.92it/s]
Processing Fold 4: 990it [00:00, 14548.66it/s]
Processing Fold 5: 936it [00:00, 14790.82it/s]
Processing Fold 6: 823it [00:00, 14524.70it/s]
Processing Fold 7: 838it [00:00, 13646.00it/s]
Processing Fold 8: 806it [00:00, 13961.67it/s]
Processing Fold 9: 816it [00:00, 14644.94it/s]
Processing Fold 10: 837it [00:00, 13699.44it/s]


Training samples: 0
Testing samples: 0

Step 2: Extracting features...
Feature shape - Train: (0,), Test: (0,)

Step 3: Training traditional ML models...
Training random_forest...


ValueError: This RandomForestClassifier estimator requires y to be passed, but the target y is None.

In [15]:
class AdvancedMFCCTechniques:
    """
    Advanced techniques for improved MFCC extraction
    """
    
    @staticmethod
    def apply_vtlp(audio, sr, alpha=0.9):
        """
        Vocal Tract Length Perturbation
        """
        # Implement VTLP warping
        n_fft = 2048
        mel_basis = librosa.filters.mel(sr, n_fft)
        warped_mel_basis = AdvancedMFCCTechniques._warp_mel_basis(mel_basis, alpha)
        
        # Apply warped mel basis
        stft = librosa.stft(audio, n_fft=n_fft)
        mel_spec = np.dot(warped_mel_basis, np.abs(stft))
        
        return librosa.power_to_db(mel_spec)
    
    @staticmethod
    def _warp_mel_basis(mel_basis, alpha):
        """
        Warp mel basis for VTLP
        """
        # Simplified warping - implement full VTLP for production
        return mel_basis
    
    @staticmethod
    def extract_mfcc_with_snr_weighting(audio, sr, snr_threshold=20):
        """
        Extract MFCCs with SNR-based weighting
        """
        # Calculate SNR
        signal_power = np.mean(audio**2)
        noise_power = np.mean((audio - np.mean(audio))**2)
        snr = 10 * np.log10(signal_power / (noise_power + 1e-10))
        
        # Weight features based on SNR
        if snr < snr_threshold:
            # Apply noise reduction or use robust features
            mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
            # Apply additional processing for noisy audio
            return mfccs * (snr / snr_threshold)
        else:
            return librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    
    @staticmethod
    def time_frequency_masking(mfcc_features, time_mask=2, freq_mask=2):
        """
        Apply SpecAugment-like time and frequency masking
        """
        # Time masking
        if time_mask > 0:
            t = np.random.randint(0, mfcc_features.shape[1] - time_mask)
            mfcc_features[:, t:t+time_mask] = 0
        
        # Frequency masking
        if freq_mask > 0:
            f = np.random.randint(0, mfcc_features.shape[0] - freq_mask)
            mfcc_features[f:f+freq_mask, :] = 0
        
        return mfcc_features

In [19]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

def evaluate_model_performance(model, X_test, y_test, label_encoder):
    """
    Comprehensive model evaluation
    """
    # Predictions
    y_pred = model.predict(X_test)
    
    if len(y_pred.shape) > 1:  # For neural networks
        y_pred_classes = np.argmax(y_pred, axis=1)
        y_pred_proba = y_pred
    else:  # For traditional models
        y_pred_classes = y_pred
        y_pred_proba = model.predict_proba(X_test)
    
    # Classification report
    print("\n" + "="*50)
    print("CLASSIFICATION REPORT")
    print("="*50)
    print(classification_report(y_test, y_pred_classes, 
                                target_names=label_encoder.classes_))
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred_classes)
    
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_,
                yticklabels=label_encoder.classes_)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig('confusion_matrix.png', dpi=300)
    plt.show()
    
    # ROC-AUC for multi-class
    try:
        roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
        print(f"\nROC-AUC Score: {roc_auc:.4f}")
    except:
        print("\nROC-AUC calculation skipped (requires probability estimates)")

In [None]:
import os
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')

# ============================================
# 1. Enhanced Feature Extraction (FIXED)
# ============================================

class RobustMFCCExtractor:
    """
    Enhanced MFCC feature extraction with multiple robust techniques
    """
    
    def __init__(self, sr=22050, n_mfcc=40, n_fft=2048, hop_length=512):
        self.sr = sr
        self.n_mfcc = n_mfcc
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        
    def extract_mfcc_features(self, audio_path):
        """
        Extract comprehensive MFCC-based features
        """
        try:
            # Load audio with robust loading
            y, sr = librosa.load(audio_path, sr=self.sr, duration=4.0)
            
            # Zero-padding or truncation for consistent length
            target_length = self.sr * 4  # 4 seconds
            if len(y) < target_length:
                y = np.pad(y, (0, target_length - len(y)), mode='constant')
            else:
                y = y[:target_length]
            
            # Apply pre-emphasis filter
            y = librosa.effects.preemphasis(y)
            
            # Extract base MFCCs
            mfccs = librosa.feature.mfcc(
                y=y, 
                sr=sr, 
                n_mfcc=self.n_mfcc,
                n_fft=self.n_fft,
                hop_length=self.hop_length
            )
            
            # Extract delta and delta-delta features
            mfcc_delta = librosa.feature.delta(mfccs)
            mfcc_delta2 = librosa.feature.delta(mfccs, order=2)
            
            # Extract other complementary features
            chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=self.hop_length)
            spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr, hop_length=self.hop_length)
            
            # Root Mean Square Energy
            rms = librosa.feature.rms(y=y, hop_length=self.hop_length)
            
            # Zero Crossing Rate
            zcr = librosa.feature.zero_crossing_rate(y, hop_length=self.hop_length)
            
            # Spectral Centroid and Rolloff
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=self.hop_length)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, hop_length=self.hop_length)
            
            # Combine all features
            features = np.vstack([
                mfccs,
                mfcc_delta,
                mfcc_delta2,
                chroma,
                spectral_contrast,
                rms,
                zcr,
                spectral_centroid,
                spectral_rolloff
            ])
            
            # Extract statistical features
            feature_stats = self._extract_statistics(features)
            
            return feature_stats
            
        except Exception as e:
            print(f"Error processing {audio_path}: {str(e)}")
            # Return zeros with appropriate dimension
            # Calculate expected dimension: (40 + 40 + 40 + 12 + 7 + 1 + 1 + 1 + 1) * 9 = 1026
            return np.zeros(1026)  # Fixed dimension
    
    def _extract_statistics(self, features):
        """
        Extract statistical features from feature matrix
        """
        stats = []
        for feature in features:
            stats.extend([
                np.mean(feature),
                np.std(feature),
                np.median(feature),
                np.min(feature),
                np.max(feature),
                np.percentile(feature, 25),
                np.percentile(feature, 75),
                np.mean(np.diff(feature)),  # Average change
                np.std(np.diff(feature))    # Std of changes
            ])
        return np.array(stats)
    
    def extract_all_features(self, audio_paths, labels=None):
        """
        Extract features from multiple audio files
        """
        features = []
        valid_labels = []
        
        for idx, path in enumerate(tqdm(audio_paths, desc="Extracting features")):
            feat = self.extract_mfcc_features(path)
            if np.sum(feat) != 0 and not np.isnan(feat).any():  # Skip zero or NaN features
                features.append(feat)
                if labels is not None:
                    valid_labels.append(labels[idx])
        
        if len(features) == 0:
            print("WARNING: No valid features extracted!")
            return np.array([]), np.array([])
        
        features = np.array(features)
        
        # Normalize features
        features = self.scaler.fit_transform(features)
        
        # Encode labels if provided
        if labels is not None and len(valid_labels) > 0:
            encoded_labels = self.label_encoder.fit_transform(valid_labels)
        else:
            encoded_labels = None
        
        return features, encoded_labels

# ============================================
# 2. UrbanSound8K Processor (FIXED)
# ============================================

class UrbanSound8KProcessor:
    """
    Process UrbanSound8K dataset
    """
    
    def __init__(self, dataset_path):
        self.dataset_path = dataset_path
        self.metadata_path = os.path.join(dataset_path, 'metadata', 'UrbanSound8K.csv')
        if not os.path.exists(self.metadata_path):
            # Try alternative path structure
            self.metadata_path = os.path.join(dataset_path, 'UrbanSound8K.csv')
        
        if not os.path.exists(self.metadata_path):
            raise FileNotFoundError(f"Metadata file not found at {self.metadata_path}")
        
        self.metadata = pd.read_csv(self.metadata_path)
        
    def prepare_data(self, folds=None):
        """
        Prepare data for specific folds or all folds
        """
        if folds is None:
            folds = list(range(1, 11))
        elif isinstance(folds, int):
            folds = [folds]
        
        audio_paths = []
        labels = []
        fold_numbers = []
        
        for fold in folds:
            fold_data = self.metadata[self.metadata['fold'] == fold]
            
            for _, row in fold_data.iterrows():
                # Try different possible paths
                possible_paths = [
                    os.path.join(self.dataset_path, 'fold' + str(row['fold']), row['slice_file_name']),
                    os.path.join(self.dataset_path, 'audio', 'fold' + str(row['fold']), row['slice_file_name']),
                    os.path.join(self.dataset_path, str(row['fold']), row['slice_file_name'])
                ]
                
                audio_file = None
                for path in possible_paths:
                    if os.path.exists(path):
                        audio_file = path
                        break
                
                if audio_file:
                    audio_paths.append(audio_file)
                    labels.append(row['class'])
                    fold_numbers.append(fold)
                else:
                    print(f"WARNING: File not found: {row['slice_file_name']} in fold {fold}")
        
        print(f"Found {len(audio_paths)} valid audio files out of {len(fold_data) * len(folds)} expected")
        return audio_paths, labels, fold_numbers
    
    def get_class_distribution(self):
        """
        Get class distribution statistics
        """
        return self.metadata['class'].value_counts()

# ============================================
# 3. Traditional ML Classifier
# ============================================

class TraditionalMLClassifier:
    """
    Traditional machine learning classifiers
    """
    
    def __init__(self):
        self.models = {
            'random_forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
            'svm': SVC(probability=True, random_state=42),
            'xgboost': XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1),
            'gradient_boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
        }
        
    def train(self, X_train, y_train):
        """
        Train all models
        """
        if X_train.shape[0] == 0 or y_train is None:
            raise ValueError("Training data is empty or labels are None!")
            
        trained_models = {}
        
        for name, model in self.models.items():
            print(f"Training {name}...")
            model.fit(X_train, y_train)
            trained_models[name] = model
            
        return trained_models
    
    def evaluate(self, models, X_test, y_test):
        """
        Evaluate all models
        """
        results = {}
        
        for name, model in models.items():
            accuracy = model.score(X_test, y_test)
            results[name] = accuracy
            print(f"{name} Accuracy: {accuracy:.4f}")
            
        return results

# ============================================
# 4. Main Execution Pipeline (FIXED)
# ============================================

def main():
    # Initialize paths
    # UPDATE THIS PATH to your UrbanSound8K dataset location
    DATASET_PATH = "UrbanSound8K/"  # Update this path
    
    # Create output directory
    os.makedirs("results", exist_ok=True)
    
    # Step 1: Process data
    print("="*60)
    print("Step 1: Processing UrbanSound8K dataset...")
    print("="*60)
    
    try:
        processor = UrbanSound8KProcessor(DATASET_PATH)
    except FileNotFoundError as e:
        print(f"ERROR: {e}")
        print("\nPlease ensure:")
        print("1. You have downloaded UrbanSound8K dataset")
        print("2. The dataset is extracted in the correct location")
        print(f"3. Update DATASET_PATH variable (currently: {DATASET_PATH})")
        print("\nDownload from: https://urbansounddataset.weebly.com/urbansound8k.html")
        return
    
    # Show class distribution
    class_dist = processor.get_class_distribution()
    print("\nClass Distribution:")
    print(class_dist)
    print(f"\nTotal samples: {len(processor.metadata)}")
    
    # Prepare data for folds 1-8 for training, 9-10 for testing
    train_folds = list(range(1, 9))
    test_folds = [9, 10]
    
    print(f"\nPreparing training data (folds {train_folds})...")
    train_paths, train_labels, _ = processor.prepare_data(train_folds)
    
    print(f"\nPreparing testing data (folds {test_folds})...")
    test_paths, test_labels, _ = processor.prepare_data(test_folds)
    
    print(f"\nTraining samples: {len(train_paths)}")
    print(f"Testing samples: {len(test_paths)}")
    
    if len(train_paths) == 0 or len(test_paths) == 0:
        print("ERROR: No audio files found!")
        print("Check if audio files exist in the dataset directory")
        return
    
    # Step 2: Extract features
    print("\n" + "="*60)
    print("Step 2: Extracting features...")
    print("="*60)
    
    extractor = RobustMFCCExtractor()
    
    print("Extracting training features...")
    X_train, y_train = extractor.extract_all_features(train_paths, train_labels)
    
    print("\nExtracting testing features...")
    X_test, y_test = extractor.extract_all_features(test_paths, test_labels)
    
    print(f"\nFeature shape - Train: {X_train.shape if len(X_train) > 0 else 'Empty'}")
    print(f"Feature shape - Test: {X_test.shape if len(X_test) > 0 else 'Empty'}")
    
    if len(X_train) == 0 or len(X_test) == 0:
        print("ERROR: Feature extraction failed!")
        print("Possible issues:")
        print("1. Audio files might be corrupted")
        print("2. Librosa might not be reading the files")
        print("3. Check file formats (should be .wav)")
        return
    
    # Check if labels are extracted
    if y_train is None or y_test is None:
        print("ERROR: Labels not extracted!")
        return
    
    # Step 3: Train traditional models
    print("\n" + "="*60)
    print("Step 3: Training traditional ML models...")
    print("="*60)
    
    ml_classifier = TraditionalMLClassifier()
    
    try:
        trained_models = ml_classifier.train(X_train, y_train)
    except ValueError as e:
        print(f"Training failed: {e}")
        return
    
    # Step 4: Evaluate models
    print("\n" + "="*60)
    print("Step 4: Evaluating models...")
    print("="*60)
    
    results = ml_classifier.evaluate(trained_models, X_test, y_test)
    
    # Step 5: Detailed evaluation for best model
    print("\n" + "="*60)
    print("Step 5: Detailed evaluation...")
    print("="*60)
    
    # Find best model
    best_model_name = max(results, key=results.get)
    best_model = trained_models[best_model_name]
    print(f"\nBest model: {best_model_name} (Accuracy: {results[best_model_name]:.4f})")
    
    # Make predictions
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test) if hasattr(best_model, 'predict_proba') else None
    
    # Classification report
    print("\nClassification Report:")
    print("="*50)
    print(classification_report(y_test, y_pred, 
                                target_names=extractor.label_encoder.classes_))
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=extractor.label_encoder.classes_,
                yticklabels=extractor.label_encoder.classes_)
    plt.title(f'Confusion Matrix - {best_model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig('results/confusion_matrix.png', dpi=300)
    plt.show()
    
    # ROC-AUC if probabilities available
    if y_pred_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')
            print(f"\nROC-AUC Score (macro): {roc_auc:.4f}")
        except:
            print("\nROC-AUC calculation skipped")
    
    # Step 6: Save results
    results_df = pd.DataFrame({
        'Model': list(results.keys()),
        'Accuracy': list(results.values())
    })
    
    results_df = results_df.sort_values('Accuracy', ascending=False)
    results_df.to_csv('results/model_results.csv', index=False)
    
    print("\n" + "="*60)
    print("RESULTS SUMMARY")
    print("="*60)
    print(results_df.to_string(index=False))
    print(f"\nResults saved to 'results/model_results.csv'")
    
    # Save feature information
    feature_info = {
        'train_samples': len(X_train),
        'test_samples': len(X_test),
        'feature_dim': X_train.shape[1] if len(X_train) > 0 else 0,
        'classes': list(extractor.label_encoder.classes_)
    }
    
    import json
    with open('results/feature_info.json', 'w') as f:
        json.dump(feature_info, f, indent=2)

# ============================================
# Alternative: Simple Debug Version
# ============================================

def debug_version():
    """
    Simple debug version to test basic functionality
    """
    print("Running debug version...")
    
    # Test with a single file first
    test_file = "UrbanSound8K/fold1/7061-6-0-0.wav"  # Try with a known file
    
    if not os.path.exists(test_file):
        # Try to find any .wav file
        import glob
        wav_files = glob.glob("UrbanSound8K/**/*.wav", recursive=True)
        if wav_files:
            test_file = wav_files[0]
            print(f"Using file: {test_file}")
        else:
            print("No .wav files found!")
            return
    
    # Test feature extraction
    extractor = RobustMFCCExtractor()
    features = extractor.extract_mfcc_features(test_file)
    print(f"Feature extraction test - Shape: {features.shape}")
    print(f"Features (first 10): {features[:10]}")
    
    # Test loading metadata
    if os.path.exists("UrbanSound8K/UrbanSound8K.csv"):
        metadata = pd.read_csv("UrbanSound8K/UrbanSound8K.csv")
        print(f"\nMetadata loaded: {len(metadata)} rows")
        print(f"Columns: {list(metadata.columns)}")
        print(f"\nClass distribution:")
        print(metadata['class'].value_counts())
        
        # Test with first 10 files
        sample_files = metadata.head(10)
        audio_paths = []
        labels = []
        
        for _, row in sample_files.iterrows():
            path = f"UrbanSound8K/fold{row['fold']}/{row['slice_file_name']}"
            if os.path.exists(path):
                audio_paths.append(path)
                labels.append(row['class'])
        
        print(f"\nFound {len(audio_paths)} valid files out of 10")
        
        if len(audio_paths) > 0:
            X, y = extractor.extract_all_features(audio_paths, labels)
            print(f"\nExtracted features shape: {X.shape}")
            print(f"Labels shape: {y.shape if y is not None else 'None'}")

if __name__ == "__main__":
    print("UrbanSound8K Audio Classification System")
    print("="*60)
    
    # First run debug to check basic functionality
    debug_version()
    
    # Ask user if they want to run full pipeline
    response = input("\nDo you want to run the full pipeline? (yes/no): ")
    if response.lower() in ['yes', 'y']:
        main()
    else:
        print("\nDebug mode completed. Fix any issues before running full pipeline.")

UrbanSound8K Audio Classification System
Running debug version...
Using file: UrbanSound8K\audio\fold1\101415-3-0-2.wav
Feature extraction test - Shape: (1287,)
Features (first 10): [-539.57764898  170.28682325 -606.35736084 -718.22149658 -204.11100769
 -701.68151855 -365.28274536    1.41649184   43.2259207    43.78187459]



Do you want to run the full pipeline? (yes/no):  yes


Step 1: Processing UrbanSound8K dataset...

Class Distribution:
class
dog_bark            1000
children_playing    1000
air_conditioner     1000
street_music        1000
jackhammer          1000
engine_idling       1000
drilling            1000
siren                929
car_horn             429
gun_shot             374
Name: count, dtype: int64

Total samples: 8732

Preparing training data (folds [1, 2, 3, 4, 5, 6, 7, 8])...
Found 7079 valid audio files out of 6448 expected

Preparing testing data (folds [9, 10])...
Found 1653 valid audio files out of 1674 expected

Training samples: 7079
Testing samples: 1653

Step 2: Extracting features...
Extracting training features...


Extracting features: 100%|█████████████████████████████████████████████████████████| 7079/7079 [16:52<00:00,  6.99it/s]



Extracting testing features...


Extracting features: 100%|█████████████████████████████████████████████████████████| 1653/1653 [04:03<00:00,  6.79it/s]



Feature shape - Train: (7079, 1287)
Feature shape - Test: (1653, 1287)

Step 3: Training traditional ML models...
Training random_forest...
Training svm...
