In [None]:
import os
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from scipy.fft import dct, idct
from scipy.sparse.linalg import lsqr
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import (classification_report, confusion_matrix, 
                           roc_auc_score, accuracy_score, precision_score, 
                           recall_score, f1_score)
import warnings
from tqdm import tqdm
import pickle
import json
import time
from datetime import datetime
warnings.filterwarnings('ignore')

# ============================================
# 1. Enhanced Feature Extraction (Base Class)
# ============================================

class RobustMFCCExtractor:
    """
    Enhanced MFCC feature extraction with multiple robust techniques

    Each clip is loaded at `sr`, forced to exactly 4 seconds, pre-emphasised
    and turned into a stack of frame-level features (MFCC, delta, delta-delta,
    chroma, spectral contrast, RMS, zero-crossing rate, spectral centroid and
    rolloff). Every row of that stack is then summarised by nine statistics,
    giving one fixed-length vector per clip.
    """

    # Layout of the stacked frame-level matrix built in extract_mfcc_features():
    # 3 * n_mfcc MFCC rows (static, delta, delta-delta) plus the rows below.
    _CHROMA_ROWS = 12     # librosa chroma_stft default (n_chroma=12)
    _CONTRAST_ROWS = 7    # librosa spectral_contrast default (n_bands=6 -> 7 rows)
    _SCALAR_ROWS = 4      # rms, zcr, spectral centroid, spectral rolloff
    _STATS_PER_ROW = 9    # statistics emitted per row by _extract_statistics()

    def __init__(self, sr=22050, n_mfcc=40, n_fft=2048, hop_length=512):
        self.sr = sr
        self.n_mfcc = n_mfcc
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()

    def _feature_dim(self):
        """
        Length of the vector returned by extract_mfcc_features()

        Derived from the extractor configuration so that the all-zero
        fallback vector always has the same length as a real one
        (1287 for the default n_mfcc=40).
        """
        rows = (3 * self.n_mfcc + self._CHROMA_ROWS
                + self._CONTRAST_ROWS + self._SCALAR_ROWS)
        return rows * self._STATS_PER_ROW

    def extract_mfcc_features(self, audio_path):
        """
        Extract comprehensive MFCC-based features

        Parameters:
        -----------
        audio_path : str
            Path to the audio file

        Returns:
        --------
        feature_stats : np.array
            1D statistics vector. If anything goes wrong while loading or
            processing the file, the error is printed and an all-zero vector
            of the same length is returned instead.
        """
        try:
            # Load audio with robust loading
            y, sr = librosa.load(audio_path, sr=self.sr, duration=4.0)

            # Zero-padding or truncation for consistent length
            target_length = self.sr * 4  # 4 seconds
            if len(y) < target_length:
                y = np.pad(y, (0, target_length - len(y)), mode='constant')
            else:
                y = y[:target_length]

            # Apply pre-emphasis filter
            y = librosa.effects.preemphasis(y)

            # Extract base MFCCs
            mfccs = librosa.feature.mfcc(
                y=y,
                sr=sr,
                n_mfcc=self.n_mfcc,
                n_fft=self.n_fft,
                hop_length=self.hop_length
            )

            # Extract delta and delta-delta features
            mfcc_delta = librosa.feature.delta(mfccs)
            mfcc_delta2 = librosa.feature.delta(mfccs, order=2)

            # Extract other complementary features
            chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=self.hop_length)
            spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr, hop_length=self.hop_length)

            # Root Mean Square Energy
            rms = librosa.feature.rms(y=y, hop_length=self.hop_length)

            # Zero Crossing Rate
            zcr = librosa.feature.zero_crossing_rate(y, hop_length=self.hop_length)

            # Spectral Centroid and Rolloff
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=self.hop_length)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, hop_length=self.hop_length)

            # Combine all features (one row per frame-level feature track)
            features = np.vstack([
                mfccs,
                mfcc_delta,
                mfcc_delta2,
                chroma,
                spectral_contrast,
                rms,
                zcr,
                spectral_centroid,
                spectral_rolloff
            ])

            # Extract statistical features
            return self._extract_statistics(features)

        except Exception as e:
            print(f"Error processing {audio_path}: {str(e)}")
            # Return zeros with the same dimension as a successful extraction,
            # so callers that stack the vectors never see ragged rows.
            return np.zeros(self._feature_dim())

    def _extract_statistics(self, features):
        """
        Extract statistical features from feature matrix

        For every row, in order: mean, std, median, min, max, 25th and 75th
        percentile, mean of the frame-to-frame differences and std of the
        frame-to-frame differences.

        Parameters:
        -----------
        features : np.array
            2D matrix, one feature track per row

        Returns:
        --------
        stats : np.array
            1D vector with nine values per input row
        """
        stats = []
        for feature in features:
            steps = np.diff(feature)  # frame-to-frame change, computed once
            stats.extend([
                np.mean(feature),
                np.std(feature),
                np.median(feature),
                np.min(feature),
                np.max(feature),
                np.percentile(feature, 25),
                np.percentile(feature, 75),
                np.mean(steps),  # Average change
                np.std(steps)    # Std of changes
            ])
        return np.array(stats)

    def extract_all_features(self, audio_paths, labels=None):
        """
        Extract features from multiple audio files

        Files whose vector is all zeros (the failure fallback) or contains
        NaN are skipped, together with their label.

        Parameters:
        -----------
        audio_paths : sequence of str
            Audio files to process
        labels : sequence, optional
            Labels aligned with `audio_paths`

        Returns:
        --------
        features : np.array
            Standardised feature matrix (the scaler is fitted on this data);
            an empty array if no file produced valid features
        encoded_labels : np.array or None
            Integer-encoded labels of the kept files, None when `labels` is
            not given; an empty array if no file produced valid features
        """
        features = []
        valid_labels = []

        for idx, path in enumerate(tqdm(audio_paths, desc="Extracting features")):
            feat = self.extract_mfcc_features(path)
            # np.any instead of a sum: a valid vector whose entries cancel out
            # to zero must not be mistaken for the all-zero failure fallback.
            if np.any(feat) and not np.isnan(feat).any():
                features.append(feat)
                if labels is not None:
                    valid_labels.append(labels[idx])

        if len(features) == 0:
            print("WARNING: No valid features extracted!")
            return np.array([]), np.array([])

        features = np.array(features)

        # Normalize features
        features = self.scaler.fit_transform(features)

        # Encode labels if provided
        if labels is not None and len(valid_labels) > 0:
            encoded_labels = self.label_encoder.fit_transform(valid_labels)
        else:
            encoded_labels = None

        return features, encoded_labels

# ============================================
# 2. UrbanSound8K Processor
# ============================================

class UrbanSound8KProcessor:
    """
    Process UrbanSound8K dataset

    Loads the metadata CSV and resolves the audio file of every metadata row
    against the directory layouts tried in prepare_data().
    """

    def __init__(self, dataset_path):
        """
        Locate and load the metadata CSV

        Parameters:
        -----------
        dataset_path : str
            Root directory of the dataset

        Raises:
        -------
        FileNotFoundError
            If 'UrbanSound8K.csv' is found neither in `<root>/metadata/`
            nor directly in `<root>/`
        """
        self.dataset_path = dataset_path
        self.metadata_path = os.path.join(dataset_path, 'metadata', 'UrbanSound8K.csv')
        if not os.path.exists(self.metadata_path):
            # Try alternative path structure
            self.metadata_path = os.path.join(dataset_path, 'UrbanSound8K.csv')

        if not os.path.exists(self.metadata_path):
            raise FileNotFoundError(f"Metadata file not found at {self.metadata_path}")

        self.metadata = pd.read_csv(self.metadata_path)

    def prepare_data(self, folds=None):
        """
        Prepare data for specific folds or all folds

        Parameters:
        -----------
        folds : int, list of int or None
            Fold number(s) to collect; None selects folds 1-10

        Returns:
        --------
        audio_paths : list of str
            Paths of the audio files that exist on disk
        labels : list
            'class' value of each returned file
        fold_numbers : list
            Fold of each returned file
        """
        if folds is None:
            folds = list(range(1, 11))
        elif isinstance(folds, (int, np.integer)):
            folds = [folds]

        audio_paths = []
        labels = []
        fold_numbers = []
        # Counted per fold: folds differ in size, and an empty `folds` list
        # must not leave the summary line without a value to print.
        expected_files = 0

        for fold in folds:
            fold_data = self.metadata[self.metadata['fold'] == fold]
            expected_files += len(fold_data)

            rows = zip(fold_data['fold'], fold_data['slice_file_name'], fold_data['class'])
            for row_fold, file_name, label in rows:
                # Try different possible paths
                possible_paths = [
                    os.path.join(self.dataset_path, 'fold' + str(row_fold), file_name),
                    os.path.join(self.dataset_path, 'audio', 'fold' + str(row_fold), file_name),
                    os.path.join(self.dataset_path, str(row_fold), file_name)
                ]

                audio_file = next((p for p in possible_paths if os.path.exists(p)), None)

                if audio_file:
                    audio_paths.append(audio_file)
                    labels.append(label)
                    fold_numbers.append(fold)
                else:
                    print(f"WARNING: File not found: {file_name} in fold {fold}")

        print(f"Found {len(audio_paths)} valid audio files out of {expected_files} expected")
        return audio_paths, labels, fold_numbers

    def get_class_distribution(self):
        """
        Get class distribution statistics

        Returns:
        --------
        counts : pd.Series
            Number of metadata rows per 'class' value
        """
        return self.metadata['class'].value_counts()

# ============================================
# 3. Compressed Sensing Modules
# ============================================

class BernoulliCompressor:
    """Bernoulli Random Matrix Compression"""

    def __init__(self, compression_ratio=0.5, seed=42):
        """
        Initialize Bernoulli compressor

        Parameters:
        -----------
        compression_ratio : float (0-1)
            Ratio of compressed dimension to original dimension
        seed : int
            Random seed for reproducibility
        """
        self.compression_ratio = compression_ratio
        self.seed = seed
        self.measurement_matrix = None
        self.reconstruction_matrix = None
        # (n_compressed, n_original, seed) of the matrix currently stored
        self._matrix_key = None

    def create_bernoulli_matrix(self, n_original, n_compressed):
        """
        Create Bernoulli random measurement matrix

        Parameters:
        -----------
        n_original : int
            Original signal dimension
        n_compressed : int
            Compressed dimension

        Returns:
        --------
        measurement_matrix : np.array
            Matrix of shape (n_compressed, n_original) with entries
            +1/sqrt(n_compressed) and -1/sqrt(n_compressed)
        """
        # A private RandomState yields the same draws as seeding the global
        # generator with the same seed, without disturbing the RNG state of
        # the rest of the program.
        rng = np.random.RandomState(self.seed)
        bernoulli_values = rng.choice([1, -1], size=(n_compressed, n_original))
        return bernoulli_values / np.sqrt(n_compressed)

    def compress(self, signal):
        """
        Compress signal using Bernoulli random matrix

        The measurement matrix is stored on the instance and reused for
        subsequent signals of the same length.

        Parameters:
        -----------
        signal : np.array
            Original signal (1D array)

        Returns:
        --------
        compressed_signal : np.array
            Compressed signal of length int(len(signal) * compression_ratio)
        """
        n_original = len(signal)
        n_compressed = int(n_original * self.compression_ratio)

        # The matrix depends only on the seed and the two dimensions, so it is
        # rebuilt only when one of them changes. With seed=None every call
        # draws a fresh matrix, so nothing is reused in that case.
        matrix_key = (n_compressed, n_original, self.seed)
        reusable = (
            self.seed is not None
            and self.measurement_matrix is not None
            and getattr(self, '_matrix_key', None) == matrix_key
        )
        if not reusable:
            self.measurement_matrix = self.create_bernoulli_matrix(n_original, n_compressed)
            self._matrix_key = matrix_key

        # Compress signal
        return np.dot(self.measurement_matrix, signal)

    def reconstruct_l1(self, compressed_signal, max_iter=100):
        """
        Reconstruct original signal from its compressed measurements

        NOTE: despite the name, this is not an L1 / Basis Pursuit solver.
        The system `measurement_matrix @ x = compressed_signal` is solved with
        scipy's LSQR (an iterative least-squares method); no sparsity prior is
        applied.

        Parameters:
        -----------
        compressed_signal : np.array
            Compressed signal
        max_iter : int
            Maximum iterations for reconstruction

        Returns:
        --------
        reconstructed_signal : np.array
            Reconstructed signal, one value per column of the measurement
            matrix

        Raises:
        -------
        ValueError
            If compress() has not been called yet
        """
        if self.measurement_matrix is None:
            raise ValueError("Measurement matrix not created. Run compress() first.")

        # lsqr returns a tuple; element 0 is the solution vector
        return lsqr(self.measurement_matrix, compressed_signal, iter_lim=max_iter)[0]

class DWTCompressor:
    """Discrete Wavelet Transform (Haar) Compression"""

    def __init__(self, compression_ratio=0.5, wavelet='haar'):
        """
        Initialize DWT compressor

        Parameters:
        -----------
        compression_ratio : float (0-1)
            Fraction of wavelet coefficients kept by compress()
        wavelet : str
            Wavelet type (default: 'haar'). Only stored; the transforms in
            this class always use the Haar wavelet.
        """
        self.compression_ratio = compression_ratio
        self.wavelet = wavelet

    def haar_transform(self, signal):
        """
        Apply Haar wavelet transform

        Parameters:
        -----------
        signal : np.array
            Original signal (length must be power of 2). For other lengths
            each level pairs up as many samples as it can and leaves an
            unpaired trailing sample untouched, so the result is not a
            complete decomposition.

        Returns:
        --------
        coeffs : np.array
            Float wavelet coefficients: overall approximation first, then
            detail coefficients from coarsest to finest level
        """
        # Work on a float copy: integer input would otherwise be truncated
        # when the averages are written back between levels.
        coeffs = np.array(signal, dtype=float)
        root2 = np.sqrt(2)
        length = len(coeffs)

        while length > 1:
            half = length // 2
            evens = coeffs[0:2 * half:2]
            odds = coeffs[1:2 * half:2]
            # Both results are materialised before either is written back,
            # because the writes overlap the slices being read.
            averages = (evens + odds) / root2
            details = (evens - odds) / root2
            coeffs[:half] = averages
            coeffs[half:2 * half] = details
            length = half

        return coeffs

    def inverse_haar_transform(self, coeffs):
        """
        Apply inverse Haar wavelet transform

        Parameters:
        -----------
        coeffs : np.array
            Wavelet coefficients in the layout produced by haar_transform()

        Returns:
        --------
        signal : np.array
            Reconstructed signal (float)
        """
        signal = np.array(coeffs, dtype=float)
        root2 = np.sqrt(2)
        n = len(signal)
        length = 2

        while length <= n:
            half = length // 2
            # Copies: the interleaved writes below overwrite these positions
            approx = signal[:half].copy()
            detail = signal[half:length].copy()
            signal[0:length:2] = (approx + detail) / root2
            signal[1:length:2] = (approx - detail) / root2
            length *= 2

        return signal

    def compress(self, signal):
        """
        Compress signal using DWT and thresholding

        Parameters:
        -----------
        signal : np.array
            Original signal

        Returns:
        --------
        compressed_signal : np.array
            Wavelet coefficients with everything except the
            int(len * compression_ratio) largest magnitudes set to zero
        compression_mask : np.array
            Mask indicating which coefficients were kept
        """
        # Apply wavelet transform
        coeffs = self.haar_transform(signal)

        # Keep only largest coefficients based on compression ratio
        n_coeffs = len(coeffs)
        n_keep = max(0, min(n_coeffs, int(n_coeffs * self.compression_ratio)))

        # Indices of the largest absolute coefficients. Slicing from the
        # front offset keeps nothing when n_keep is 0; a "[-n_keep:]" slice
        # would select every coefficient in that case.
        indices = np.argsort(np.abs(coeffs))[n_coeffs - n_keep:]
        compression_mask = np.zeros(n_coeffs, dtype=bool)
        compression_mask[indices] = True

        # Create compressed signal (only keep selected coefficients)
        compressed_coeffs = np.zeros_like(coeffs)
        compressed_coeffs[indices] = coeffs[indices]

        return compressed_coeffs, compression_mask

    def reconstruct(self, compressed_coeffs):
        """
        Reconstruct signal from compressed coefficients

        Parameters:
        -----------
        compressed_coeffs : np.array
            Full-length (zero-filled) compressed wavelet coefficients

        Returns:
        --------
        reconstructed_signal : np.array
            Reconstructed signal
        """
        return self.inverse_haar_transform(compressed_coeffs)

class HybridCompressor:
    """Hybrid Bernoulli + DWT Compression"""

    def __init__(self, compression_ratio=0.5, seed=42):
        """
        Set up the two stages of the hybrid scheme

        Parameters:
        -----------
        compression_ratio : float (0-1)
            Overall compression ratio; applied entirely by the Bernoulli stage
        seed : int
            Random seed handed to the Bernoulli stage
        """
        self.compression_ratio = compression_ratio
        self.seed = seed
        self.bernoulli_compressor = BernoulliCompressor(compression_ratio, seed)
        # Ratio 1.0: the wavelet stage is used as a transform only
        self.dwt_compressor = DWTCompressor(1.0)

    def compress(self, signal):
        """
        Haar-transform the signal, then project the coefficients

        Parameters:
        -----------
        signal : np.array
            Original signal

        Returns:
        --------
        compressed_signal : np.array
            Bernoulli measurements of the Haar coefficients
        """
        wavelet_domain = self.dwt_compressor.haar_transform(signal)
        return self.bernoulli_compressor.compress(wavelet_domain)

    def reconstruct(self, compressed_signal):
        """
        Undo compress(): recover the coefficients, then invert the transform

        Parameters:
        -----------
        compressed_signal : np.array
            Output of compress()

        Returns:
        --------
        reconstructed_signal : np.array
            Signal estimate obtained from the recovered Haar coefficients
        """
        recovered_coeffs = self.bernoulli_compressor.reconstruct_l1(compressed_signal)
        return self.dwt_compressor.inverse_haar_transform(recovered_coeffs)

# ============================================
# 4. Enhanced Feature Extraction with Compression
# ============================================

class CompressedFeatureExtractor(RobustMFCCExtractor):
    """
    Feature extractor with compression capabilities
    """

    def __init__(self, sr=22050, n_mfcc=40, n_fft=2048, hop_length=512):
        super().__init__(sr, n_mfcc, n_fft, hop_length)

    def extract_features_with_compression(self, audio_path, compression_method='bernoulli', 
                                          compression_ratio=0.5):
        """
        Extract the feature vector of one file and optionally compress it

        Parameters:
        -----------
        audio_path : str
            Path to audio file
        compression_method : str
            'bernoulli', 'dwt', 'hybrid', or 'none'
        compression_ratio : float
            Compression ratio (0-1); values >= 1.0 disable compression

        Returns:
        --------
        features : np.array
            Compressed vector, or the uncompressed one when compression is off
        original_features : np.array
            Feature vector before compression
        compression_info : dict
            'method' and 'ratio'; when compression ran also 'original_length'
            (length after any padding), 'compressed_length', 'compressor' and,
            for 'dwt', the boolean 'mask' of kept coefficients

        Raises:
        -------
        ValueError
            If `compression_method` is not one of the supported names
        """
        baseline = self.extract_mfcc_features(audio_path)

        # Compression switched off: the same vector is returned twice
        if compression_method == 'none' or compression_ratio >= 1.0:
            return baseline, baseline, {'method': 'none', 'ratio': 1.0}

        vector = baseline.flatten()

        # The Haar transform needs a power-of-two length: zero-pad up to it
        if compression_method in ('dwt', 'hybrid'):
            raw_len = len(vector)
            pow2_len = 2 ** int(np.ceil(np.log2(raw_len)))
            vector = np.pad(vector, (0, pow2_len - raw_len), 'constant')

        working_len = len(vector)
        compression_info = {
            'method': compression_method,
            'ratio': compression_ratio,
            'original_length': working_len,
            'compressed_length': int(working_len * compression_ratio)
        }

        if compression_method == 'dwt':
            compressor = DWTCompressor(compression_ratio=compression_ratio)
            thresholded, keep_mask = compressor.compress(vector)
            # Only the kept coefficients are returned; their positions are
            # recorded in the mask. NOTE(review): the kept positions are
            # chosen per signal, so entry i may refer to different
            # coefficients for different files - confirm this is intended.
            compressed_features = thresholded[keep_mask]
            compression_info['compressor'] = compressor
            compression_info['mask'] = keep_mask
        elif compression_method in ('bernoulli', 'hybrid'):
            if compression_method == 'bernoulli':
                compressor = BernoulliCompressor(compression_ratio=compression_ratio)
            else:
                compressor = HybridCompressor(compression_ratio=compression_ratio)
            compressed_features = compressor.compress(vector)
            compression_info['compressor'] = compressor
        else:
            raise ValueError(f"Unknown compression method: {compression_method}")

        return compressed_features, baseline, compression_info

# ============================================
# 5. Compression Experiment Pipeline
# ============================================

class CompressionExperiment:
    """
    Pipeline for compression and classification experiments
    """
    
    def __init__(self, dataset_path, output_dir='compression_results'):
        """
        Store the paths, create the output directory and define the grid

        Parameters:
        -----------
        dataset_path : str
            Dataset root handed to the dataset processor
        output_dir : str
            Directory for result files; created if it does not exist
        """
        self.dataset_path = dataset_path
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

        # Grid of configurations to evaluate (ratio 1.0 = no compression)
        self.compression_methods = ['bernoulli', 'dwt', 'hybrid', 'none']
        self.compression_ratios = [0.25, 0.50, 0.75, 1.0]

        # Classifiers evaluated for every configuration
        forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        support_vectors = SVC(probability=True, random_state=42)
        boosted_trees = XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        self.classifiers = {
            'random_forest': forest,
            'svm': support_vectors,
            'xgboost': boosted_trees,
        }
        
    def run_experiments(self, train_folds=None, test_folds=None):
        """
        Run full compression-classification experiments

        For every (compression method, compression ratio) pair the features of
        all training and test files are extracted and compressed, standardised
        with a scaler fitted on the training split, and every configured
        classifier is trained and scored. Results are saved and reported at
        the end.

        Parameters:
        -----------
        train_folds : list, optional
            Folds to use for training (default: folds 1-8)
        test_folds : list, optional
            Folds to use for testing (default: folds 9 and 10)

        Returns:
        --------
        all_results : list of dict
            One record per (method, ratio, classifier) combination

        Raises:
        -------
        ValueError
            If no audio file is found for the training or the test folds
        """
        # None sentinels instead of list defaults shared between calls
        if train_folds is None:
            train_folds = list(range(1, 9))
        if test_folds is None:
            test_folds = [9, 10]

        print("="*70)
        print("COMPRESSION-CLASSIFICATION EXPERIMENTS")
        print("="*70)

        # Initialize processor and extractor
        processor = UrbanSound8KProcessor(self.dataset_path)
        extractor = CompressedFeatureExtractor()

        # Prepare data
        print("\nPreparing data...")
        train_paths, train_labels, _ = processor.prepare_data(train_folds)
        test_paths, test_labels, _ = processor.prepare_data(test_folds)
        if not train_paths or not test_paths:
            raise ValueError(
                f"No audio files found (train: {len(train_paths)}, test: {len(test_paths)})"
            )

        # Encode labels
        label_encoder = LabelEncoder()
        y_train_full = label_encoder.fit_transform(train_labels)
        y_test_full = label_encoder.transform(test_labels)

        # Store all results
        all_results = []
        reconstruction_errors = []

        # NOTE(review): the uncompressed features are re-extracted from audio
        # for every configuration; extracting them once and compressing the
        # stored vectors would save most of the run time.
        for method in self.compression_methods:
            print(f"\n{'='*60}")
            print(f"Compression Method: {method.upper()}")
            print('='*60)

            for ratio in self.compression_ratios:
                print(f"\nCompression Ratio: {ratio*100:.0f}%")
                print("-"*40)

                print("Processing training data...")
                X_train_compressed, X_train_original, comp_info = self._extract_feature_split(
                    extractor, train_paths, method, ratio, f"Train {method} {ratio}"
                )
                print("Processing test data...")
                X_test_compressed, _, _ = self._extract_feature_split(
                    extractor, test_paths, method, ratio, f"Test {method} {ratio}"
                )

                # Vectors can differ in length (e.g. the fallback vector of an
                # unreadable file). Pad BEFORE stacking: current NumPy raises
                # on ragged input instead of building an object array.
                width = max(len(row) for row in X_train_compressed + X_test_compressed)
                X_train = self._stack_feature_rows(X_train_compressed, width)
                X_test = self._stack_feature_rows(X_test_compressed, width)
                X_train_orig = self._stack_feature_rows(
                    X_train_original, max(len(row) for row in X_train_original)
                )

                # Normalize features
                scaler = StandardScaler()
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)

                # Calculate reconstruction error if not 'none'
                if method != 'none' and ratio < 1.0:
                    try:
                        avg_error = self.calculate_reconstruction_error(
                            X_train_orig, X_train_compressed, method, ratio, comp_info
                        )
                    except (ValueError, IndexError, TypeError) as exc:
                        # The error figure is a diagnostic; failing to compute
                        # it must not discard the classification results.
                        print(f"  WARNING: reconstruction error unavailable: {exc}")
                        avg_error = np.nan
                    reconstruction_errors.append({
                        'method': method,
                        'ratio': ratio,
                        'reconstruction_error': avg_error
                    })

                # Train and evaluate classifiers
                for clf_name, clf in self.classifiers.items():
                    print(f"  Training {clf_name}...")
                    scores = self._evaluate_classifier(
                        clf, X_train_scaled, y_train_full, X_test_scaled, y_test_full
                    )

                    # Store results
                    result = {
                        'compression_method': method,
                        'compression_ratio': ratio,
                        'classifier': clf_name,
                    }
                    result.update(scores)
                    result.update({
                        'feature_dim_original': X_train_orig.shape[1],
                        'feature_dim_compressed': X_train.shape[1],
                        'compression_rate': (1 - ratio) * 100,
                        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    })
                    all_results.append(result)

                    print(f"    Accuracy: {scores['accuracy']:.4f}, "
                          f"F1: {scores['f1_score']:.4f}, Time: {scores['train_time']:.2f}s")

        # Save all results
        self.save_results(all_results, reconstruction_errors)

        # Generate comprehensive reports
        self.generate_reports(all_results)

        return all_results

    @staticmethod
    def _extract_feature_split(extractor, paths, method, ratio, desc):
        """Extract features for every path; return (compressed rows, original rows, last comp_info)."""
        compressed_rows = []
        original_rows = []
        last_info = None
        for path in tqdm(paths, desc=desc):
            compressed_feat, original_feat, last_info = extractor.extract_features_with_compression(
                path, method, ratio
            )
            compressed_rows.append(compressed_feat)
            original_rows.append(original_feat)
        return compressed_rows, original_rows, last_info

    @staticmethod
    def _stack_feature_rows(rows, width):
        """Zero-pad every 1D feature vector to `width` and stack them into a 2D array."""
        padded_rows = []
        for row in rows:
            row = np.ravel(row)
            padded_rows.append(np.pad(row, (0, width - len(row)), 'constant'))
        return np.array(padded_rows)

    @staticmethod
    def _evaluate_classifier(clf, X_train, y_train, X_test, y_test):
        """Fit `clf`, predict the test split and return its metrics and timings as a dict."""
        start_time = time.time()
        clf.fit(X_train, y_train)
        train_time = time.time() - start_time

        start_time = time.time()
        y_pred = clf.predict(X_test)
        test_time = time.time() - start_time

        # ROC-AUC is best effort: it stays NaN when probabilities are not
        # available or the score cannot be computed for this split.
        roc_auc = np.nan
        if hasattr(clf, 'predict_proba'):
            try:
                y_proba = clf.predict_proba(X_test)
                roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted')
            except Exception as exc:
                print(f"    WARNING: ROC-AUC unavailable: {exc}")

        return {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred, average='weighted'),
            'recall': recall_score(y_test, y_pred, average='weighted'),
            'f1_score': f1_score(y_test, y_pred, average='weighted'),
            'roc_auc': roc_auc,
            'train_time': train_time,
            'test_time': test_time,
        }
    
    def calculate_reconstruction_error(self, original_features, compressed_features, 
                                      method, ratio, comp_info):
        """
        Calculate reconstruction error between original and compressed features

        Each compressed vector is decompressed with `comp_info['compressor']`,
        cut or zero-padded to the length of its original vector, and compared
        with it by mean squared error.

        Parameters:
        -----------
        original_features : iterable of np.array
            Uncompressed feature vectors
        compressed_features : iterable of np.array
            Compressed vectors, aligned with `original_features`
        method : str
            'bernoulli', 'dwt' or 'hybrid'; any other value yields NaN
        ratio : float
            Compression ratio that was used
        comp_info : dict
            Compression metadata holding the 'compressor' and, for 'dwt',
            a 'mask' whose length is the padded signal length

        Returns:
        --------
        error : float
            Mean of the per-sample mean squared errors, or NaN when nothing
            could be reconstructed
        """
        compressor = comp_info.get('compressor') if comp_info else None
        if compressor is None:
            return np.nan

        def fit_length(vector, length):
            # Pad or truncate to match original length
            if len(vector) < length:
                return np.pad(vector, (0, length - len(vector)), 'constant')
            return vector[:length]

        errors = []

        for orig, comp in zip(original_features, compressed_features):
            orig = np.asarray(orig)
            comp = np.asarray(comp)

            if method == 'bernoulli':
                reconstructed = compressor.reconstruct_l1(comp[:int(len(orig) * ratio)])

            elif method == 'hybrid':
                reconstructed = compressor.reconstruct(comp)

            elif method == 'dwt':
                if 'mask' not in comp_info:
                    continue
                # The stored mask belongs to whichever sample produced
                # comp_info, so it is used only for the padded length. The
                # positions of THIS sample's kept coefficients are recomputed
                # by compressing its (padded) original vector again.
                n_padded = max(len(comp_info['mask']), len(orig))
                padded_orig = np.pad(orig, (0, n_padded - len(orig)), 'constant')
                _, sample_mask = compressor.compress(padded_orig)
                if np.count_nonzero(sample_mask) != len(comp):
                    continue  # compressed vector does not match this original
                full_coeffs = np.zeros(n_padded)
                full_coeffs[sample_mask] = comp
                reconstructed = compressor.reconstruct(full_coeffs)

            else:
                continue

            reconstructed = fit_length(np.asarray(reconstructed), len(orig))

            # Calculate error
            errors.append(np.mean((orig - reconstructed) ** 2))

        return np.mean(errors) if errors else np.nan
    
    def save_results(self, all_results, reconstruction_errors):
        """
        Save experiment results to files
        """
        # Save main results
        results_df = pd.DataFrame(all_results)
        results_df.to_csv(os.path.join(self.output_dir, 'compression_results.csv'), index=False)
        
        # Save reconstruction errors
        if reconstruction_errors:
            recon_df = pd.DataFrame(reconstruction_errors)
            recon_df.to_csv(os.path.join(self.output_dir, 'reconstruction_errors.csv'), index=False)
        
        # Save summary statistics
        summary = self.create_summary_statistics(results_df)
        with open(os.path.join(self.output_dir, 'summary.json'), 'w') as f:
            json.dump(summary, f, indent=2)
        
        print(f"\nResults saved to {self.output_dir}/")
    
    def create_summary_statistics(self, results_df):
        """
        Create comprehensive summary statistics

        Parameters:
        -----------
        results_df : pd.DataFrame
            One row per experiment, with at least the columns
            'compression_method', 'compression_ratio', 'classifier',
            'accuracy', 'f1_score' and 'feature_dim_compressed'.
            May be empty.

        Returns:
        --------
        summary : dict
            Built from plain Python values only (NumPy scalars are unwrapped)
            so it can be passed to json.dump. Keys: 'experiment_date',
            'total_experiments', the three '*_tested' lists,
            'best_by_method', 'compression_analysis' and 'overall_best'
            (None when there is no row with a valid accuracy).
        """
        def to_native(value):
            # NumPy scalars -> plain Python numbers
            return value.item() if isinstance(value, np.generic) else value

        def best_row(frame):
            # Row with the highest accuracy as a plain dict, or None when the
            # frame has no row with a non-NaN accuracy
            if frame.empty or not frame['accuracy'].notna().any():
                return None
            row = frame.loc[frame['accuracy'].idxmax()].to_dict()
            return {key: to_native(val) for key, val in row.items()}

        summary = {
            'experiment_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'total_experiments': len(results_df),
            'compression_methods_tested': [],
            'compression_ratios_tested': [],
            'classifiers_tested': [],
        }

        # Nothing to aggregate: return an empty but well-formed summary
        if results_df.empty:
            summary['best_by_method'] = {}
            summary['compression_analysis'] = {}
            summary['overall_best'] = None
            return summary

        # .tolist() converts the unique values to plain Python objects
        summary['compression_methods_tested'] = results_df['compression_method'].unique().tolist()
        summary['compression_ratios_tested'] = results_df['compression_ratio'].unique().tolist()
        summary['classifiers_tested'] = results_df['classifier'].unique().tolist()

        # Best results by compression method
        best_by_method = {}
        for method in summary['compression_methods_tested']:
            method_df = results_df[results_df['compression_method'] == method]
            best = best_row(method_df)
            if best is not None:
                best_by_method[method] = best

        summary['best_by_method'] = best_by_method

        # Compression vs Accuracy analysis
        compression_analysis = {}
        for ratio in summary['compression_ratios_tested']:
            ratio_df = results_df[results_df['compression_ratio'] == ratio]
            if not ratio_df.empty:
                compression_analysis[f'ratio_{ratio}'] = {
                    'avg_accuracy': to_native(ratio_df['accuracy'].mean()),
                    'avg_f1': to_native(ratio_df['f1_score'].mean()),
                    'avg_feature_dim': to_native(ratio_df['feature_dim_compressed'].mean()),
                    'compression_rate': (1 - ratio) * 100
                }

        summary['compression_analysis'] = compression_analysis

        # Overall best configuration
        summary['overall_best'] = best_row(results_df)

        return summary
    
    def generate_reports(self, all_results):
        """
        Generate visual reports and analysis

        Builds a DataFrame from ``all_results`` and saves four figures into
        ``self.output_dir``:

        * ``accuracy_vs_compression.png`` - accuracy against compression
          ratio, one line per method/classifier pair.
        * ``feature_dimension_reduction.png`` - mean compressed feature
          dimension per compression ratio.
        * ``accuracy_heatmap.png`` - mean accuracy per method and ratio.
        * ``training_time.png`` - training time against compression ratio,
          one line per method/classifier pair.

        Finally calls ``create_html_report`` with the same DataFrame. When
        ``all_results`` is empty nothing is plotted and a message is printed.
        """
        results_df = pd.DataFrame(all_results)
        if results_df.empty:
            print("\nNo results to report; skipping report generation.")
            return

        methods = results_df['compression_method'].unique()
        classifiers = results_df['classifier'].unique()

        def plot_lines(pairs, y_column, marker):
            # One line per (method, classifier) pair that has data.
            for method, clf in pairs:
                subset = results_df[
                    (results_df['compression_method'] == method)
                    & (results_df['classifier'] == clf)
                ]
                if subset.empty:
                    continue
                # Sort by ratio so the line runs left to right even when the
                # results were not appended in ratio order.
                subset = subset.sort_values('compression_ratio', kind='stable')
                plt.plot(subset['compression_ratio'], subset[y_column],
                         marker=marker, label=f'{method}-{clf}')

        def save_and_close(fig, filename):
            plt.tight_layout()
            plt.savefig(os.path.join(self.output_dir, filename), dpi=300)
            plt.show()
            # Release the figure so repeated runs do not accumulate open figures.
            plt.close(fig)

        # 1. Accuracy vs Compression Ratio plot
        fig = plt.figure(figsize=(12, 8))
        plot_lines([(method, clf) for method in methods for clf in classifiers],
                   'accuracy', 'o')
        plt.xlabel('Compression Ratio', fontsize=12)
        plt.ylabel('Accuracy', fontsize=12)
        plt.title('Classification Accuracy vs Compression Ratio', fontsize=14)
        plt.grid(True, alpha=0.3)
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        save_and_close(fig, 'accuracy_vs_compression.png')

        # 2. Feature Dimension Reduction plot
        fig = plt.figure(figsize=(10, 6))
        # groupby returns the ratios in ascending order
        avg_dims = results_df.groupby('compression_ratio')['feature_dim_compressed'].mean()
        plt.plot(avg_dims.index, avg_dims.values, 'b-o', linewidth=2)
        plt.xlabel('Compression Ratio', fontsize=12)
        plt.ylabel('Average Feature Dimension', fontsize=12)
        plt.title('Feature Dimension Reduction', fontsize=14)
        plt.grid(True, alpha=0.3)
        save_and_close(fig, 'feature_dimension_reduction.png')

        # 3. Heatmap of accuracy by method and ratio
        fig = plt.figure(figsize=(10, 8))
        pivot_table = results_df.pivot_table(
            values='accuracy',
            index='compression_method',
            columns='compression_ratio',
            aggfunc='mean'
        )
        sns.heatmap(pivot_table, annot=True, fmt='.3f', cmap='YlOrRd',
                    cbar_kws={'label': 'Accuracy'})
        plt.title('Average Accuracy by Compression Method and Ratio', fontsize=14)
        plt.xlabel('Compression Ratio', fontsize=12)
        plt.ylabel('Compression Method', fontsize=12)
        save_and_close(fig, 'accuracy_heatmap.png')

        # 4. Training Time Comparison (classifier-major order for the legend)
        fig = plt.figure(figsize=(12, 6))
        plot_lines([(method, clf) for clf in classifiers for method in methods],
                   'train_time', 's')
        plt.xlabel('Compression Ratio', fontsize=12)
        plt.ylabel('Training Time (seconds)', fontsize=12)
        plt.title('Training Time vs Compression Ratio', fontsize=14)
        plt.grid(True, alpha=0.3)
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        save_and_close(fig, 'training_time.png')

        # 5. Create detailed report HTML
        self.create_html_report(results_df)
    
    def create_html_report(self, results_df):
        """
        Create HTML report of experiment results

        Writes ``experiment_report.html`` into ``self.output_dir``. The page
        contains an experiment summary, the best configuration (the row with
        the highest ``accuracy``), the ten rows with the highest accuracy, a
        list of the generated plot files and a table of every result row.

        Columns read from ``results_df``: ``compression_method``,
        ``compression_ratio``, ``classifier``, ``accuracy``, ``precision``,
        ``recall``, ``f1_score``, ``roc_auc``, ``train_time``, ``test_time``
        and ``feature_dim_compressed``.

        When ``results_df`` is empty no file is written and a message is
        printed instead.
        """
        if results_df.empty:
            print("\nNo results available; HTML report not generated.")
            return

        def format_value(value):
            # The float/non-float decision has to happen before interpolation:
            # inside an f-string everything after ':' is the format spec, so a
            # conditional written there raises "Invalid format specifier".
            if isinstance(value, (float, np.floating)):
                return f"{value:.4f}"
            return str(value)

        methods_text = ', '.join(results_df['compression_method'].unique())
        ratios_text = ', '.join(
            str(r) for r in sorted(results_df['compression_ratio'].unique())
        )
        classifiers_text = ', '.join(results_df['classifier'].unique())

        # Fragments are collected in a list and joined once at the end.
        parts = [f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>Compression-Classification Experiment Report</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 40px; }}
                h1, h2, h3 {{ color: #333; }}
                table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
                th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
                th {{ background-color: #f2f2f2; }}
                tr:nth-child(even) {{ background-color: #f9f9f9; }}
                .best {{ background-color: #d4edda; }}
                .summary {{ background-color: #f8f9fa; padding: 20px; border-radius: 5px; }}
                .image {{ max-width: 100%; height: auto; margin: 20px 0; }}
            </style>
        </head>
        <body>
            <h1>Audio Compression-Classification Experiment Report</h1>
            <p>Generated on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>
            
            <div class="summary">
                <h2>Experiment Summary</h2>
                <p><strong>Total Experiments:</strong> {len(results_df)}</p>
                <p><strong>Compression Methods:</strong> {methods_text}</p>
                <p><strong>Compression Ratios:</strong> {ratios_text}</p>
                <p><strong>Classifiers:</strong> {classifiers_text}</p>
            </div>
            
            <h2>Overall Best Configuration</h2>
            <table>
                <tr>
                    <th>Metric</th>
                    <th>Value</th>
                </tr>
        """]

        # Find overall best
        best_row = results_df.loc[results_df['accuracy'].idxmax()]

        for metric in ('compression_method', 'compression_ratio', 'classifier',
                       'accuracy', 'precision', 'recall', 'f1_score', 'roc_auc',
                       'train_time', 'test_time', 'feature_dim_compressed'):
            parts.append(f"""
                <tr>
                    <td>{metric.replace('_', ' ').title()}</td>
                    <td>{format_value(best_row[metric])}</td>
                </tr>
            """)

        parts.append("""
            </table>
            
            <h2>Top 10 Performances</h2>
            <table>
                <tr>
                    <th>Rank</th>
                    <th>Method</th>
                    <th>Ratio</th>
                    <th>Classifier</th>
                    <th>Accuracy</th>
                    <th>F1 Score</th>
                    <th>Feature Dim</th>
                </tr>
        """)

        # Sort by accuracy and get top 10; rank 1 gets the highlighted style
        top_results = results_df.sort_values('accuracy', ascending=False).head(10)
        for rank, (_, row) in enumerate(top_results.iterrows(), 1):
            parts.append(f"""
                <tr class="{'best' if rank == 1 else ''}">
                    <td>{rank}</td>
                    <td>{row['compression_method']}</td>
                    <td>{row['compression_ratio']}</td>
                    <td>{row['classifier']}</td>
                    <td>{row['accuracy']:.4f}</td>
                    <td>{row['f1_score']:.4f}</td>
                    <td>{int(row['feature_dim_compressed'])}</td>
                </tr>
            """)

        parts.append("""
            </table>
            
            <h2>Visualizations</h2>
            <p>The following visualizations have been generated:</p>
            <ul>
                <li>accuracy_vs_compression.png - Accuracy vs Compression Ratio</li>
                <li>feature_dimension_reduction.png - Feature Dimension Reduction</li>
                <li>accuracy_heatmap.png - Accuracy Heatmap</li>
                <li>training_time.png - Training Time Analysis</li>
            </ul>
            
            <h2>Detailed Results Table</h2>
            <table>
                <tr>
                    <th>Method</th>
                    <th>Ratio</th>
                    <th>Classifier</th>
                    <th>Accuracy</th>
                    <th>Precision</th>
                    <th>Recall</th>
                    <th>F1</th>
                    <th>Train Time</th>
                    <th>Test Time</th>
                </tr>
        """)

        for _, row in results_df.iterrows():
            parts.append(f"""
                <tr>
                    <td>{row['compression_method']}</td>
                    <td>{row['compression_ratio']}</td>
                    <td>{row['classifier']}</td>
                    <td>{row['accuracy']:.4f}</td>
                    <td>{row['precision']:.4f}</td>
                    <td>{row['recall']:.4f}</td>
                    <td>{row['f1_score']:.4f}</td>
                    <td>{row['train_time']:.2f}s</td>
                    <td>{row['test_time']:.2f}s</td>
                </tr>
            """)

        parts.append("""
            </table>
            
            <footer>
                <p>Experiment conducted using UrbanSound8K dataset</p>
                <p>Compression methods: Bernoulli, DWT (Haar), Hybrid</p>
            </footer>
        </body>
        </html>
        """)

        # Save HTML report (explicit encoding so the output does not depend on
        # the platform default)
        report_path = os.path.join(self.output_dir, 'experiment_report.html')
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(''.join(parts))

        print(f"\nHTML report generated: {self.output_dir}/experiment_report.html")

# ============================================
# 6. Main Execution
# ============================================

def main():
    """
    Main execution function

    Checks that the ``UrbanSound8K`` directory exists in the current working
    directory (printing download instructions and returning otherwise), runs
    ``CompressionExperiment(...).run_experiments()``, and then prints the
    contents of the ``summary.json`` the experiment produced, if that file
    exists: the total number of experiments, the overall best configuration
    and the best result per compression method.
    """
    print("="*70)
    print("AUDIO COMPRESSION-CLASSIFICATION EXPERIMENTAL PIPELINE")
    print("="*70)
    
    # Initialize paths
    DATASET_PATH = "UrbanSound8K"
    
    # Check if dataset exists
    if not os.path.exists(DATASET_PATH):
        print(f"ERROR: Dataset not found at {DATASET_PATH}")
        print("\nPlease download UrbanSound8K dataset from:")
        print("https://urbansounddataset.weebly.com/urbansound8k.html")
        print("\nExtract it to the current directory as 'UrbanSound8K'")
        return
    
    # Create experiment instance and run; results are persisted to disk by the
    # experiment, so the return value is not needed here.
    experiment = CompressionExperiment(DATASET_PATH)
    experiment.run_experiments()
    
    # Print summary
    print("\n" + "="*70)
    print("EXPERIMENT COMPLETED SUCCESSFULLY!")
    print("="*70)
    
    # Load and display summary. Prefer the directory the experiment actually
    # wrote to; fall back to the historical default when the attribute is
    # missing.
    output_dir = getattr(experiment, 'output_dir', 'compression_results')
    summary_path = os.path.join(output_dir, 'summary.json')
    if not os.path.exists(summary_path):
        return

    with open(summary_path, 'r') as f:
        summary = json.load(f)
    
    print(f"\nTotal experiments conducted: {summary['total_experiments']}")
    
    # Show overall best
    best = summary['overall_best']
    # Result rows may not carry 'compression_rate'; derive it from the ratio
    # with the same formula the summary uses for its per-ratio analysis.
    compression_rate = best.get('compression_rate')
    if compression_rate is None:
        compression_rate = (1 - best['compression_ratio']) * 100

    print("\nOVERALL BEST CONFIGURATION:")
    print(f"  Method: {best['compression_method']}")
    print(f"  Ratio: {best['compression_ratio']}")
    print(f"  Classifier: {best['classifier']}")
    print(f"  Accuracy: {best['accuracy']:.4f}")
    print(f"  Feature Dimension: {int(best['feature_dim_compressed'])}")
    print(f"  Compression Rate: {compression_rate:.1f}%")
    
    # Show best by method
    print("\nBEST BY COMPRESSION METHOD:")
    for method, config in summary['best_by_method'].items():
        print(f"  {method}: {config['accuracy']:.4f} "
              f"(Ratio: {config['compression_ratio']}, "
              f"Classifier: {config['classifier']})")

def quick_test():
    """
    Smoke-test the three compressors on a synthetic signal.

    A 1024-sample random signal (NumPy global seed fixed to 42) is passed
    through ``BernoulliCompressor``, ``DWTCompressor`` and
    ``HybridCompressor``, each built with ``compression_ratio=0.5``. For every
    compressor the resulting size is printed together with its percentage of
    the original length. Nothing is returned.
    """
    print("Running quick compression test...")

    # Reproducible synthetic input
    np.random.seed(42)
    signal = np.random.randn(1024)

    print(f"Original signal shape: {signal.shape}")

    # --- Bernoulli: compress() result is used as a single array ---
    print("\n1. Bernoulli Compression (50%):")
    bernoulli_out = BernoulliCompressor(compression_ratio=0.5).compress(signal)
    bernoulli_pct = len(bernoulli_out) / len(signal) * 100
    print(f"   Compressed shape: {bernoulli_out.shape}")
    print(f"   Compression: {bernoulli_pct:.1f}% of original")

    # --- DWT: compress() result is unpacked as (coefficients, mask) ---
    print("\n2. DWT (Haar) Compression (50%):")
    _coeffs, keep_mask = DWTCompressor(compression_ratio=0.5).compress(signal)
    kept = np.sum(keep_mask)
    dwt_pct = kept / len(signal) * 100
    print(f"   Compressed coefficients: {kept}")
    print(f"   Compression: {dwt_pct:.1f}% of original")

    # --- Hybrid: compress() result is used as a single array ---
    print("\n3. Hybrid Compression (50%):")
    hybrid_out = HybridCompressor(compression_ratio=0.5).compress(signal)
    hybrid_pct = len(hybrid_out) / len(signal) * 100
    print(f"   Compressed shape: {hybrid_out.shape}")
    print(f"   Compression: {hybrid_pct:.1f}% of original")

    print("\nQuick test completed successfully!")

if __name__ == "__main__":
    print("Audio Compression-Classification Pipeline")
    print("="*70)
    
    # First run quick test
    quick_test()
    
    # Ask user if they want to run full experiments. Surrounding whitespace is
    # stripped so that answers such as "yes " or " y" are still accepted.
    response = input("\nDo you want to run full compression-classification experiments? (yes/no): ")
    if response.strip().lower() in ('yes', 'y'):
        main()
    else:
        print("\nQuick test completed. Run full experiments when ready.")

Audio Compression-Classification Pipeline
Running quick compression test...
Original signal shape: (1024,)

1. Bernoulli Compression (50%):
   Compressed shape: (512,)
   Compression: 50.0% of original

2. DWT (Haar) Compression (50%):
   Compressed coefficients: 512
   Compression: 50.0% of original

3. Hybrid Compression (50%):
   Compressed shape: (512,)
   Compression: 50.0% of original

Quick test completed successfully!



Do you want to run full compression-classification experiments? (yes/no):  yes


AUDIO COMPRESSION-CLASSIFICATION EXPERIMENTAL PIPELINE
COMPRESSION-CLASSIFICATION EXPERIMENTS

Preparing data...
Found 7079 valid audio files out of 6448 expected
Found 1653 valid audio files out of 1674 expected

Compression Method: BERNOULLI

Compression Ratio: 25%
----------------------------------------
Processing training data...


Train bernoulli 0.25:   5%|██▊                                                      | 345/7079 [00:50<13:13,  8.48it/s]