# 02 - Feature Extraction (MFCC)

Extract MFCC features from the loaded audio waveforms, then shuffle, split, and balance the data for model training.

## Imports

In [None]:
import torchaudio.transforms as T
import torch
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split

print("✓ All imports complete")

## Define MFCC Feature Extraction Function

In [None]:
def extract_mfcc_features(
    waveform,
    sample_rate,
    n_mfcc=40,
    n_fft=400,
    hop_length=160,
    max_padding=176400,
    target_sample_rate=16000,
    n_mels=128,
):
    """
    Extracts MFCC features from an audio waveform.

    The waveform is converted to a float32 tensor, mixed down to mono,
    resampled to ``target_sample_rate`` when needed, zero-padded or truncated
    to exactly ``max_padding`` samples, and then passed through
    ``torchaudio.transforms.MFCC``.

    Args:
        waveform (numpy.ndarray or torch.Tensor): The audio waveform, either
            1D ``(samples,)`` or 2D ``(channels, samples)``. Any numeric dtype
            is accepted; values are cast to float32 without rescaling.
        sample_rate (int): The sample rate of the waveform.
        n_mfcc (int): Number of MFCC coefficients to retain.
        n_fft (int): Size of the FFT window.
        hop_length (int): Number of samples between successive frames.
        max_padding (int): Length, in samples, that the waveform is padded or
            truncated to. It is applied AFTER resampling, so it is measured at
            ``target_sample_rate``.
            NOTE(review): the default of 176400 samples is about 11 s at
            16 kHz (it equals 4 s at 44.1 kHz) - confirm this is intended.
        target_sample_rate (int): Sample rate the audio is resampled to before
            feature extraction. Defaults to 16000.
        n_mels (int): Number of mel filterbanks used by the underlying mel
            spectrogram. Defaults to 128.

    Returns:
        torch.Tensor: The MFCC features with the channel dimension squeezed
        out, i.e. ``(n_mfcc, time_frames)`` for mono output.
    """
    # Normalise every input (numpy array or tensor, any numeric dtype) to a
    # float32 tensor. Previously only numpy input was cast, so integer or
    # float64 tensors followed a different, unconverted path.
    waveform = torch.as_tensor(waveform, dtype=torch.float32)

    # Ensure waveform is 2D: (channels, samples)
    if waveform.ndim == 1:
        waveform = waveform.unsqueeze(0)

    # Mix multi-channel audio down to mono by averaging the channels
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Bring every clip to the common target rate so frames are comparable
    if sample_rate != target_sample_rate:
        resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)
        sample_rate = target_sample_rate

    # Force a fixed length so every clip yields the same number of frames
    num_samples = waveform.shape[-1]
    if num_samples < max_padding:
        # Zero-pad on the right only
        waveform = torch.nn.functional.pad(waveform, (0, max_padding - num_samples))
    elif num_samples > max_padding:
        waveform = waveform[..., :max_padding]

    # Extract MFCC
    mfcc_transform = T.MFCC(
        sample_rate=sample_rate,
        n_mfcc=n_mfcc,
        melkwargs={
            "n_fft": n_fft,
            "hop_length": hop_length,
            "n_mels": n_mels,
            "f_min": 0,
            # Upper mel band edge at the Nyquist frequency
            "f_max": sample_rate / 2,
        },
    )
    mfcc_features = mfcc_transform(waveform)
    return mfcc_features.squeeze(0)

print("✓ MFCC extraction function defined")

## Extract Features from All Audio Files

In [None]:
# One {'features', 'category'} record is collected per successfully processed item
extracted_features = []

# Walk over myVoices; items that fail are reported and skipped (best effort)
print("Extracting MFCC features...")
for position, audio_item in enumerate(myVoices, start=1):
    try:
        clip, rate, label = (
            audio_item['waveform'],
            audio_item['sample_rate'],
            audio_item['category'],
        )
        mfcc = extract_mfcc_features(clip, rate)
    except Exception as err:
        print(f"Could not extract features for {audio_item.get('path', 'an audio file')}: {err}")
    else:
        extracted_features.append({'features': mfcc, 'category': label})

        # Progress line after every 100th item, emitted only when that item succeeded
        if position % 100 == 0:
            print(f"  Processed {position}/{len(myVoices)} files")

print(f"✓ Feature extraction complete! Extracted {len(extracted_features)} features")

## Create Features DataFrame

In [None]:
# Turn the list of extracted records into a table, one row per record
extracted_features_df = pd.DataFrame(extracted_features)

# Preview the table
print("Extracted Features DataFrame:")
display(extracted_features_df.head())

# Report the table dimensions and the feature shape of the first row
table_shape = extracted_features_df.shape
print(f"\nShape of the extracted features DataFrame: {table_shape}")
first_sample_features = extracted_features_df['features'].iloc[0]
print(f"Feature shape per sample: {first_sample_features.shape}")

# Report how many rows fall into each category
print(f"\nCategory distribution:")
category_counts = extracted_features_df['category'].value_counts()
print(category_counts)

## Shuffle Data

In [None]:
# Draw every row exactly once in a seeded random order ...
permuted_rows = extracted_features_df.sample(frac=1.0, random_state=42)
# ... then renumber the rows 0..n-1, discarding the old index
extracted_features_df_shuffled = permuted_rows.reset_index(drop=True)

print("✓ Data shuffled")
print(f"Original class distribution:")
shuffled_category_counts = extracted_features_df_shuffled['category'].value_counts()
print(shuffled_category_counts)

## Train/Validation/Test Split

In [None]:
# Pull the feature tensors and their labels out of the shuffled table
X = extracted_features_df_shuffled['features']
y = extracted_features_df_shuffled['category']

# Stacking requires every tensor to share one shape; stop early if any differs
if any(item.shape != X.iloc[0].shape for item in X):
    raise ValueError("Feature tensor shapes are inconsistent, cannot stack.")
X = torch.stack(list(X)).numpy()

print(f"Stacked feature shape: {X.shape}")

# Stage 1: keep 80% for training, hold out 20%, stratified by label
X_train_raw, X_temp, y_train_raw, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\n✓ First split done")
print(f"Train (raw): {len(X_train_raw)} samples")
print(f"Temp: {len(X_temp)} samples")

# Stage 2: cut the held-out part in half into validation and test, again stratified
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

print(f"\n✓ Second split done")
print(f"Validation: {len(X_val)} samples")
print(f"Test: {len(X_test)} samples")

## Balance Training Set

In [None]:
print(f"Training set BEFORE balancing:")
print(pd.Series(y_train_raw).value_counts())

# Seeded generator so the undersampled subset is identical on every run.
# The previous global np.random calls were unseeded, which made the balanced
# training set change between executions.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Size of the rarest class; every class is cut down to this many samples
class_counts = Counter(y_train_raw)
min_samples = min(class_counts.values())

# Undersample each class to min_samples.
# np.where on the boolean mask yields POSITIONAL indices, which is what both
# the NumPy array X_train_raw and y_train_raw.iloc expect below.
balanced_indices = []
for class_label in class_counts:
    class_indices = np.where(y_train_raw == class_label)[0]
    sampled_indices = rng.choice(
        class_indices,
        size=min_samples,
        replace=False,  # never pick the same sample twice
    )
    balanced_indices.extend(sampled_indices)

# Shuffle so the classes are interleaved rather than grouped one after another
rng.shuffle(balanced_indices)

# Create balanced training set
X_train = X_train_raw[balanced_indices]
y_train = y_train_raw.iloc[balanced_indices].reset_index(drop=True)

print(f"\nTraining set AFTER undersampling:")
print(pd.Series(y_train).value_counts())

# Final summary: only the training set is balanced; validation and test are
# reported at the sizes produced by the earlier split
print(f"\n{'='*60}")
print(f"FINAL DATASET SPLITS")
print(f"{'='*60}")
print(f"Training set:   {X_train.shape[0]} samples (BALANCED)")
print(f"Validation set: {X_val.shape[0]} samples (ORIGINAL)")
print(f"Test set:       {X_test.shape[0]} samples (ORIGINAL)")
print(f"{'='*60}")