# Voice-Controlled LED: Model Training Notebook

This notebook consolidates the entire training pipeline—preprocessing, model architecture, and training—into a single file for easy execution on platforms like Google Colab.


## Step 1: Setup and Dependencies

First, we'll install the necessary libraries that are not typically included in Colab's default environment and import all required modules.


In [None]:
!pip install librosa soundfile


In [None]:
import numpy as np
import librosa
import soundfile as sf
from pathlib import Path
from typing import Tuple, Optional
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
import matplotlib.pyplot as plt


## Step 2: Audio Preprocessing and Dataset Loading

This section contains all the classes needed for loading audio, applying preprocessing steps, extracting features, and wrapping everything in a PyTorch `Dataset`.


In [None]:
# --- Preprocessing Classes ---

class AudioPreprocessor:
    """Base class for audio preprocessing.

    Holds the target sample rate and provides the basic waveform operations
    (loading, peak normalization, silence trimming) used by the pipeline.

    Args:
        sample_rate: Sample rate, in Hz, that loaded audio is resampled to.
    """
    def __init__(self, sample_rate: int = 16000):
        self.sample_rate = sample_rate
    
    def load_audio(self, file_path: str) -> np.ndarray:
        """Load an audio file, resampled to ``self.sample_rate``.

        Args:
            file_path: Path of the audio file to read.

        Returns:
            The waveform returned by ``librosa.load``.
        """
        # librosa also returns the sample rate; it equals self.sample_rate
        # because we asked for resampling, so it is not needed here.
        audio, _ = librosa.load(file_path, sr=self.sample_rate)
        return audio
    
    def normalize(self, audio: np.ndarray) -> np.ndarray:
        """Peak-normalize the signal to the [-1, 1] range.

        Args:
            audio: Waveform samples.

        Returns:
            ``audio`` divided by its maximum absolute value. Empty and
            all-zero signals are returned unchanged.
        """
        # np.max raises ValueError on a zero-size array, so short-circuit.
        if audio.size == 0:
            return audio
        peak = np.max(np.abs(audio))
        # An all-zero signal has no peak to scale by; avoid dividing by zero.
        if peak > 0:
            return audio / peak
        return audio
    
    def trim_silence(self, audio: np.ndarray, top_db: int = 20) -> np.ndarray:
        """Trim leading and trailing silence from the signal.

        Args:
            audio: Waveform samples.
            top_db: Threshold, in dB, forwarded to ``librosa.effects.trim``.

        Returns:
            The trimmed waveform (the interval indices librosa also returns
            are discarded).
        """
        trimmed, _ = librosa.effects.trim(audio, top_db=top_db)
        return trimmed

class VoicePreprocessor(AudioPreprocessor):
    """Master preprocessing pipeline for the project.

    Combines waveform clean-up (``preprocess``) with MFCC feature extraction
    (``extract_features``).

    Args:
        sample_rate: Sample rate, in Hz, used for loading and for the MFCCs.
        n_mfcc: Number of MFCC coefficients to compute.
        n_fft: FFT window size passed to ``librosa.feature.mfcc``.
        hop_length: Hop length passed to ``librosa.feature.mfcc``.
    """
    def __init__(
        self,
        sample_rate: int = 16000,
        n_mfcc: int = 13,
        n_fft: int = 2048,
        hop_length: int = 512,
    ):
        super().__init__(sample_rate)
        # Feature settings live on the instance so training and any later
        # inference code can share one configured preprocessor.
        self.n_mfcc = n_mfcc
        self.n_fft = n_fft
        self.hop_length = hop_length
    
    def preprocess(self, audio: np.ndarray) -> np.ndarray:
        """Normalize the waveform, then trim leading/trailing silence.

        Args:
            audio: Raw waveform samples.

        Returns:
            The cleaned waveform.
        """
        audio = self.normalize(audio)
        audio = self.trim_silence(audio)
        # Add other preprocessing steps like filtering here if needed
        return audio
    
    def extract_features(self, audio: np.ndarray) -> np.ndarray:
        """Extract MFCC features from a waveform.

        Args:
            audio: Waveform samples (typically the output of ``preprocess``).

        Returns:
            The array produced by ``librosa.feature.mfcc`` using this
            instance's ``sample_rate``, ``n_mfcc``, ``n_fft`` and
            ``hop_length``.
        """
        return librosa.feature.mfcc(
            y=audio,
            sr=self.sample_rate,
            n_mfcc=self.n_mfcc,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
        )

# --- Dataset Classes ---

class AudioDataset(Dataset):
    """Base dataset class for loading audio files.

    Expects ``data_dir`` to contain one sub-folder per class (see
    ``CLASSES``), each holding ``*.wav`` files. The files are indexed at
    construction time; audio is only read when an item is requested.

    Args:
        data_dir: Root directory containing the class sub-folders.
        preprocessor: Object whose ``load_audio(path)`` reads a waveform.
        transform: Optional callable applied to each loaded waveform.
    """
    # Class folder names; a sample's integer label is its folder's index here.
    CLASSES = ('pi_on', 'pi_off', 'background')

    def __init__(self, data_dir: str, preprocessor: "AudioPreprocessor", transform=None):
        self.data_dir = Path(data_dir)
        self.preprocessor = preprocessor
        self.transform = transform
        self.samples = []
        self.labels = []
        self._load_samples()
    
    def _load_samples(self):
        """Index every ``*.wav`` file under each class folder with its label."""
        import warnings

        for label, class_name in enumerate(self.CLASSES):
            class_dir = self.data_dir / class_name
            if not class_dir.is_dir():
                # Skipping silently would train on fewer classes without any
                # hint as to why, so surface the missing folder.
                warnings.warn(
                    f"Class directory '{class_dir}' not found; "
                    f"no samples loaded for class '{class_name}'.",
                    stacklevel=2,
                )
                continue
            # glob order is filesystem-dependent; sort so the index -> file
            # mapping (and therefore any seeded split) is reproducible.
            for audio_file in sorted(class_dir.glob('*.wav')):
                self.samples.append(str(audio_file))
                self.labels.append(label)
    
    def __len__(self):
        """Return the number of indexed audio files."""
        return len(self.samples)
    
    def __getitem__(self, idx):
        """Load the waveform at ``idx`` and return ``(audio, label)``."""
        audio = self.preprocessor.load_audio(self.samples[idx])
        label = self.labels[idx]
        
        # Compare against None: a callable may legitimately be falsy
        # (for example an object that defines __len__).
        if self.transform is not None:
            audio = self.transform(audio)
        
        return audio, label

class VoiceDataset(AudioDataset):
    """Dataset that yields MFCC feature tensors instead of raw waveforms.

    Each item is loaded by the base class, run through the preprocessor's
    ``preprocess`` and ``extract_features`` steps, and returned as a float
    tensor together with its integer label.
    """
    def __init__(self, data_dir: str, preprocessor: VoicePreprocessor):
        # The base initializer stores the preprocessor and indexes the files.
        super().__init__(data_dir, preprocessor)
    
    def __getitem__(self, idx):
        """Return ``(features_tensor, label)`` for the sample at ``idx``."""
        raw_audio, label = super().__getitem__(idx)
        
        pipeline = self.preprocessor
        mfcc = pipeline.extract_features(pipeline.preprocess(raw_audio))
        
        return torch.FloatTensor(mfcc), label


## Step 3: Defining the Neural Network Architecture

Here, we define the PyTorch `nn.Module` that will serve as our voice command classifier.


In [None]:
class VoiceClassifier(nn.Module):
    """
    Fully connected neural network for classifying voice commands.
    
    The input is flattened per sample and passed through three hidden layers
    (128 -> 64 -> 32 units) with ReLU activations, batch normalization on the
    first two, and dropout after each, followed by a linear output layer that
    produces one raw score (logit) per class.
    
    Args:
        input_size: Size of input features (e.g., flattened MFCC or spectrogram)
        num_classes: Number of output classes (default: 3 for pi_on, pi_off, background)
    """
    def __init__(self, input_size: int, num_classes: int = 3):
        super().__init__()
        
        self.fc1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.dropout1 = nn.Dropout(0.3)
        
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(0.3)
        
        self.fc3 = nn.Linear(64, 32)
        self.dropout3 = nn.Dropout(0.2)
        
        self.fc4 = nn.Linear(32, num_classes)
        
    def forward(self, x):
        """
        Forward pass through the network. Expects input of shape (batch_size, n_mfcc, time_steps)
        
        The flattened size per sample must equal the ``input_size`` the model
        was built with. Returns a tensor of shape (batch_size, num_classes).
        """
        # Flatten the input features. reshape (unlike view) also accepts
        # non-contiguous tensors, e.g. a transposed or sliced batch.
        x = x.reshape(x.size(0), -1)
        
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        
        x = F.relu(self.fc3(x))
        x = self.dropout3(x)
        
        # No softmax here: the raw logits are returned.
        x = self.fc4(x)
        
        return x


## Step 4: Training Pipeline

This section defines the functions that run a single training epoch and a single validation pass. The main training loop that calls them is in Step 5.


In [None]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Train the model for one epoch.

    Args:
        model: Network to optimize; it is switched to training mode.
        dataloader: Iterable of ``(data, target)`` batches.
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer that updates the model's parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Dict with ``'loss'`` (mean of the per-batch losses) and
        ``'accuracy'`` (percentage of correctly classified samples). When
        the dataloader yields no batches, ``'loss'`` is NaN and
        ``'accuracy'`` is 0.0 instead of raising ``ZeroDivisionError``.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0
    
    for data, target in dataloader:
        data, target = data.to(device), target.to(device)
        
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        # The predicted class is the index of the largest score.
        pred = output.argmax(dim=1)
        correct += pred.eq(target).sum().item()
        total += target.size(0)
        num_batches += 1
    
    # Nothing was processed: report "no data" rather than dividing by zero.
    if num_batches == 0 or total == 0:
        return {'loss': float('nan'), 'accuracy': 0.0}
    
    return {
        'loss': total_loss / num_batches,
        'accuracy': 100. * correct / total
    }

def validate(model, dataloader, criterion, device):
    """Evaluate the model without updating its parameters.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        dataloader: Iterable of ``(data, target)`` batches.
        criterion: Loss function called as ``criterion(output, target)``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Dict with ``'loss'`` (mean of the per-batch losses) and
        ``'accuracy'`` (percentage of correctly classified samples). When
        the dataloader yields no batches (e.g. an empty validation split),
        ``'loss'`` is NaN and ``'accuracy'`` is 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0
    
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for data, target in dataloader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = criterion(output, target)
            
            total_loss += loss.item()
            # The predicted class is the index of the largest score.
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
            total += target.size(0)
            num_batches += 1
    
    # Nothing was processed: report "no data" rather than dividing by zero.
    if num_batches == 0 or total == 0:
        return {'loss': float('nan'), 'accuracy': 0.0}
    
    return {
        'loss': total_loss / num_batches,
        'accuracy': 100. * correct / total
    }


## Step 5: Configuration and Execution

Here we set the hyperparameters, define data paths, and run the complete training process.

**ACTION REQUIRED:**
1.  Upload your `data` directory (containing `pi_on`, `pi_off`, and `background` subfolders of `.wav` files) to the Colab session, or mount your Google Drive.
2.  Adjust the `DATA_DIR` variable below to point to the correct path.


In [None]:
# --- Configuration ---

# Paths
DATA_DIR = 'data'  # IMPORTANT: Change this to your data directory path
MODEL_SAVE_PATH = 'best_model.pt'  # best checkpoint (state_dict) is written here

# Training hyperparameters
EPOCHS = 50
BATCH_SIZE = 32
LEARNING_RATE = 1e-3
VALIDATION_SPLIT = 0.2  # fraction of the dataset held out for validation


In [None]:
# --- Data Padding ---
def pad_collate(batch, max_len: Optional[int] = None):
    """Collate ``(features, label)`` pairs into one zero-padded batch.

    Args:
        batch: Sequence of ``(features, label)`` pairs, where ``features`` is
            a 2-D tensor whose second dimension (the sequence length) may
            differ between samples and ``label`` is an integer.
        max_len: Target sequence length. When ``None`` (the default) the
            batch is padded to its own longest sample, so the width can vary
            from batch to batch. When given, every sample is zero-padded or
            truncated to exactly ``max_len``, which is what a model with a
            fixed input size needs.

    Returns:
        Tuple ``(padded_features, labels)``: a float tensor of shape
        ``(batch_size, features.shape[0], target_len)`` and a ``LongTensor``
        of labels.
    """
    features, labels = zip(*batch)
    
    # Use the caller's fixed length if provided, else the batch maximum.
    if max_len is None:
        target_len = max(f.shape[1] for f in features)
    else:
        target_len = max_len
    
    # Start from zeros so any positions past a sample's length stay padded.
    padded_features = torch.zeros(len(features), features[0].shape[0], target_len)
    for i, f in enumerate(features):
        keep = min(f.shape[1], target_len)  # truncate samples longer than target
        padded_features[i, :, :keep] = f[:, :keep]
        
    labels = torch.LongTensor(labels)
    
    return padded_features, labels

# --- Main Execution ---
# Builds the dataset, fixes the feature length so every batch matches the
# model's input size, trains for EPOCHS epochs and checkpoints the weights
# with the best validation accuracy to MODEL_SAVE_PATH.

# Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Check if data directory exists
if not os.path.isdir(DATA_DIR):
    print(f"Error: Data directory '{DATA_DIR}' not found.")
    print("Please upload your data or mount your Google Drive and update the DATA_DIR variable.")
else:
    # Initialize preprocessing and dataset
    preprocessor = VoicePreprocessor(sample_rate=16000)
    dataset = VoiceDataset(DATA_DIR, preprocessor)
    print(f"Total dataset size: {len(dataset)}")

    if len(dataset) == 0:
        print("Error: The data loader is empty. This might mean your data directory is empty or misconfigured.")
    else:
        # The classifier's first layer has a fixed input size, but clips have
        # different lengths and pad_collate pads only to each batch's own
        # maximum. Scan the data once for the longest clip so that every
        # batch can be padded to one common length.
        n_mfcc, max_frames = 0, 0
        for scan_features, _ in DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=pad_collate):
            n_mfcc = scan_features.shape[1]
            max_frames = max(max_frames, scan_features.shape[2])
        print(f"Fixed feature length: {max_frames} frames x {n_mfcc} coefficients")

        def fixed_length_collate(batch):
            """Collate with pad_collate, then pad/truncate time to max_frames."""
            features, labels = pad_collate(batch)
            missing = max_frames - features.shape[2]
            if missing > 0:
                # (0, missing) pads only the end of the last (time) dimension.
                features = F.pad(features, (0, missing))
            elif missing < 0:
                # Slicing yields a non-contiguous view; make it contiguous so
                # the model can flatten it.
                features = features[:, :, :max_frames].contiguous()
            return features, labels

        # Split dataset
        val_size = int(len(dataset) * VALIDATION_SPLIT)
        train_size = len(dataset) - val_size
        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

        # BatchNorm1d cannot run in training mode on a batch of one sample,
        # so drop the trailing batch only when it would be a singleton.
        drop_singleton = train_size > 1 and train_size % BATCH_SIZE == 1

        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                                  collate_fn=fixed_length_collate, drop_last=drop_singleton)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                                collate_fn=fixed_length_collate)
        print(f"Train samples: {train_size}, Validation samples: {val_size}")

        # The model consumes the flattened (n_mfcc x max_frames) feature matrix.
        input_size = n_mfcc * max_frames
        print(f"Determined input size for the model: {input_size}")

        # Initialize model
        model = VoiceClassifier(input_size=input_size, num_classes=3).to(device)

        # Setup training
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        # Training loop
        # Start below any reachable accuracy so the first epoch always writes
        # a checkpoint; otherwise a run stuck at 0% would report a saved
        # model that was never written.
        best_val_acc = -1.0
        print(f"\nStarting training for {EPOCHS} epochs...")
        print("=" * 60)

        for epoch in range(EPOCHS):
            train_metrics = train_epoch(model, train_loader, criterion, optimizer, device)
            val_metrics = validate(model, val_loader, criterion, device)

            print(f"Epoch {epoch+1}/{EPOCHS} - "
                  f"Train Loss: {train_metrics['loss']:.4f}, "
                  f"Train Acc: {train_metrics['accuracy']:.2f}%, "
                  f"Val Loss: {val_metrics['loss']:.4f}, "
                  f"Val Acc: {val_metrics['accuracy']:.2f}%")

            if val_metrics['accuracy'] > best_val_acc:
                best_val_acc = val_metrics['accuracy']
                torch.save(model.state_dict(), MODEL_SAVE_PATH)
                print(f"  → Saved best model (val acc: {best_val_acc:.2f}%)")

        print("=" * 60)
        print(f"Training complete! Best validation accuracy: {best_val_acc:.2f}%")
        print(f"Model saved to: {MODEL_SAVE_PATH}")
