# 01 - Data Exploration & Preparation

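This notebook loads the processed audio dataset, summarizes its statistics (category counts, sample rates, durations), builds a pandas DataFrame of the clips, and balances the classes for model training by downsampling every category to the size of the smallest one.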

## Imports and Setup

In [4]:
import os
import librosa
import numpy as np
from pathlib import Path
import warnings
import pandas as pd
from collections import Counter

# Folder holding the pre-processed audio; the relative path is resolved
# against the kernel's current working directory.
processed_folder = Path('../ml_data/processed')
print(f"Loading from: {processed_folder}")

# Blanket filter: every warning category is ignored from this point on.
warnings.filterwarnings('ignore')

Loading from: ../ml_data/processed


## Load Audio Dataset

In [5]:
# Load every .wav clip found under `processed_folder`, treating each
# sub-directory as one category.
#
# Results:
#   myVoices     - list of dicts, one per successfully decoded clip, with keys
#                  'path', 'filename', 'waveform', 'sample_rate', 'category',
#                  'duration' (seconds, = number of samples / sample rate).
#   failed_files - list of (category, filename, error message) tuples for
#                  clips whose loading raised an exception.
myVoices = []
failed_files = []

print("Loading audio dataset...")
print(f"Source: {processed_folder}\n")

for category_folder in sorted(processed_folder.iterdir()):
    # Only sub-directories are categories; stray files at the top level are skipped.
    if not category_folder.is_dir():
        continue

    category = category_folder.name
    # Sort the matches: glob order depends on the filesystem, and the seeded
    # sampling performed later in this notebook is only reproducible when the
    # clips arrive in a stable order.
    audio_files = sorted(category_folder.glob('*.wav'))

    print(f"Loading {category:20s}: {len(audio_files):4d} files", end=" ... ")

    success_count = 0
    failure_count = 0
    for audio_path in audio_files:
        try:
            # Load audio with librosa (sr=None keeps original sample rate)
            waveform, sample_rate = librosa.load(str(audio_path), sr=None)
            myVoices.append({
                'path': str(audio_path),
                'filename': audio_path.name,
                'waveform': waveform,
                'sample_rate': sample_rate,
                'category': category,
                'duration': len(waveform) / sample_rate
            })
            success_count += 1
        except Exception as e:
            # Best effort: remember the failure and keep loading the rest.
            failed_files.append((category, audio_path.name, str(e)))
            failure_count += 1

    # Same status line as before when everything loaded; failures are
    # surfaced here instead of only appearing in the later summary.
    status = f"✓ {success_count}"
    if failure_count:
        status += f" (✗ {failure_count} failed)"
    print(status)

print("✓ Audio loading complete!")

Loading audio dataset...
Source: ../ml_data/processed

Loading bike                :  654 files ... ✓ 654
Loading bus                 : 2772 files ... ✓ 2772
Loading car                 : 1695 files ... ✓ 1695
Loading cng_auto            :  508 files ... ✓ 508
Loading construction_noise  :  574 files ... ✓ 574
Loading protest             :  879 files ... ✓ 879
Loading siren               :  552 files ... ✓ 552
Loading traffic_jam         :  517 files ... ✓ 517
Loading train               :  899 files ... 

KeyboardInterrupt: 

## Display Dataset Statistics

In [None]:
# Text report for the loaded clips: totals, load failures, per-category and
# per-sample-rate counts, and overall duration figures.
print(f"\n{'='*60}")
print(f"DATASET SUMMARY")
print(f"{'='*60}")
print(f"Total files loaded: {len(myVoices)}")

# Load failures: print the count, then at most the first five entries.
if len(failed_files) > 0:
    print(f"Failed to load: {len(failed_files)} files")
    for cat, fname, error in failed_files[:5]:
        print(f"  - {cat}/{fname}: {error}")

# Clips per category, listed in sorted category order.
category_counts = Counter(v['category'] for v in myVoices)
print(f"\nCategory Distribution:")
for cat in sorted(category_counts):
    count = category_counts[cat]
    print(f"  {cat:20s}: {count:3d} files")

# Clips per sample rate, listed in ascending sample-rate order.
sample_rates = Counter(v['sample_rate'] for v in myVoices)
print(f"\nSample Rate Distribution:")
for sr in sorted(sample_rates):
    count = sample_rates[sr]
    print(f"  {sr:6d} Hz: {count:4d} files")

# Total and mean clip duration; the mean falls back to 0 when nothing was loaded.
clip_durations = [v['duration'] for v in myVoices]
total_duration = sum(clip_durations)
if myVoices:
    avg_duration = total_duration / len(myVoices)
else:
    avg_duration = 0
print(f"\nDuration Statistics:")
print(f"  Total: {total_duration/60:.2f} minutes")
print(f"  Average per file: {avg_duration:.2f} seconds")
print(f"{'='*60}")

## Create DataFrame

In [None]:
# Build a DataFrame with one row per loaded clip.
# Each `myVoices` entry contributes its path (stored under 'audio_path'),
# waveform, sample rate and category; the other keys are not carried over.
df_data = [
    {
        'audio_path': audio_item['path'],
        'waveform': audio_item['waveform'],
        'sample_rate': audio_item['sample_rate'],
        'category': audio_item['category'],
    }
    for audio_item in myVoices
]

# Passing `columns` explicitly keeps the schema stable when `myVoices` is
# empty (for example after an interrupted load). Without it an empty list
# yields a frame with no columns and the 'category' lookup below raises KeyError.
myVoices_df = pd.DataFrame(
    df_data,
    columns=['audio_path', 'waveform', 'sample_rate', 'category'],
)

# Display the first few rows of the DataFrame
print("DataFrame Head:")
display(myVoices_df.head())

# Display the number of clips per category
print("\nCategory distribution:")
display(myVoices_df['category'].value_counts())

## Balance Dataset

In [None]:
# Balance the classes by downsampling: every category is reduced to the row
# count of the smallest category, then the combined rows are shuffled.
if 'myVoices_df' not in globals() or myVoices_df.empty:
    print("The 'myVoices_df' DataFrame is not available or is empty.")

else:
    # Row count of the smallest category; this is the per-category target size.
    category_sizes = myVoices_df['category'].value_counts()
    min_samples = category_sizes.min()

    # Draw `min_samples` rows from each category, using a fixed seed per draw.
    balanced_df_list = [
        group_df.sample(n=min_samples, random_state=42)
        for category_name, group_df in myVoices_df.groupby('category')
    ]

    # Stack the per-category samples, shuffle all rows with a fixed seed, and
    # renumber the index from zero.
    stacked_df = pd.concat(balanced_df_list)
    shuffled_df = stacked_df.sample(frac=1, random_state=42)
    balanced_myVoices_df = shuffled_df.reset_index(drop=True)

    # Show the first rows and the per-category counts of the balanced frame.
    print("Balanced DataFrame Head:")
    display(balanced_myVoices_df.head())

    print("\nBalanced Category Distribution:")
    display(balanced_myVoices_df['category'].value_counts())

    # Size before and after balancing.
    print(f"\nOriginal number of samples: {len(myVoices_df)}")
    print(f"Number of samples in the balanced DataFrame: {len(balanced_myVoices_df)}")

## Calculate Audio Durations

In [None]:
# Recompute each clip's duration (seconds) from its waveform and collect the
# matching sample rates, then print a short summary.
# Duration = number of samples along the waveform's last axis / sample rate.
audio_durations = [
    item['waveform'].shape[-1] / item['sample_rate']
    for item in myVoices
]
audio_sample_rates = [item['sample_rate'] for item in myVoices]

if audio_durations:
    print("Duration Statistics (first 10 files):")
    print(f"Audio Durations (seconds): {audio_durations[:10]}")
    print(f"Audio Sample Rates: {audio_sample_rates[:10]}")
    print(f"\nMin duration: {min(audio_durations):.2f}s")
    print(f"Max duration: {max(audio_durations):.2f}s")
    print(f"Mean duration: {np.mean(audio_durations):.2f}s")
else:
    # min()/max() raise ValueError on an empty sequence, so report the empty
    # dataset instead of crashing the cell.
    print("No audio loaded; duration statistics are unavailable.")