In [25]:
"""
Wake Word Detection Data Generation Pipeline

This module generates training data (activates, negatives, backgrounds) 
for wake-word detection using the ElevenLabs TTS API.
"""

import os
import glob
import json
import logging
import random
import time
from pathlib import Path
from typing import List, Dict, Tuple
from dataclasses import dataclass, asdict
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from pydub import AudioSegment
from dotenv import load_dotenv
import IPython
from IPython.display import Audio, display

In [58]:
# ============================================================================
# Configuration
# ============================================================================

@dataclass
class PipelineConfig:
    """Configuration parameters for the wake word data pipeline."""
    
    # API Configuration
    api_key: str
    elevenlabs_url: str = "https://api.elevenlabs.io/v1/text-to-speech"
    
    # Directory Configuration
    base_folder: str = "../data"
    output_folder: str = "../data/processed"
    
    # Wake Word Configuration
    wake_word: str = "Jerry"
    
    # Audio Configuration
    audio_sample_rate: int = 44100
    audio_channels: int = 1
    background_duration_ms: int = 10000
    
    # Model Configuration
    tx: int = 5511
    ty: int = 1375
    n_freq: int = 101
    
    # Generation Configuration
    negatives_per_phrase: int = 4
    stability: float = 0.75
    similarity_boost: float = 0.75
    
    # Performance Configuration
    max_workers: int = 4
    retry_attempts: int = 3
    retry_delay: float = 1.0
    
    augmentations_per_voice: int = 10


In [59]:
# ============================================================================
# Pipeline Class
# ============================================================================

class WakeWordDataPipeline:
    """
    Pipeline for generating wake word detection training data.
    
    This class handles:
    - Positive sample generation (wake word)
    - Negative sample generation (similar words)
    - Audio format conversion
    - File organization and standardization
    - Metadata generation
    """
    
    def __init__(self, config: PipelineConfig):
        """
        Initialize the pipeline with configuration.
        
        Args:
            config: PipelineConfig object containing all settings
        """
        self.config = config
        self.logger = self._setup_logging()
        self.voice_ids = self._get_voice_ids()
        self.negative_phrases = self._get_negative_phrases()
        self.folders = self._create_directory_structure()
    
    # ------------------------------------------------------------------------
    # Setup Methods
    # ------------------------------------------------------------------------
    
    def _setup_logging(self) -> logging.Logger:
        """Configure logging for the pipeline."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[logging.StreamHandler()]
        )
        return logging.getLogger(__name__)
    
    def _get_voice_ids(self) -> List[str]:
        """
        Get list of ElevenLabs voice IDs from a txt file.
        Lines starting with '#' are ignored.

        Returns:
            List of active voice ID strings
        """
        file_path = Path("../data/config/voice_ids.txt")
        if not file_path.exists():
            raise FileNotFoundError(f"Voice IDs file not found: {file_path}")

        with file_path.open("r") as f:
            voice_ids = [line.strip() for line in f if line.strip() and not line.startswith("#")]

        return voice_ids
    
    def _get_negative_phrases(self) -> List[str]:
        """
        Get list of negative phrases (similar-sounding words) from a txt file.
        Lines starting with '#' are ignored.

        Returns:
            List of phrase strings
        """
        file_path = Path("../data/config/negative_phrases.txt")
        if not file_path.exists():
            raise FileNotFoundError(f"Negative phrases file not found: {file_path}")

        with file_path.open("r") as f:
            negatives = [line.strip() for line in f if line.strip() and not line.startswith("#")]

        return negatives
    
    def _create_directory_structure(self) -> Dict[str, str]:
        """
        Create necessary directories for data storage.
        
        Returns:
            Dictionary mapping folder names to paths
        """
        base_path = Path(self.config.base_folder)
        
        folders = {
            'base': str(base_path),
            'activates': str(base_path / 'raw/activates'),
            'negatives': str(base_path / 'raw/negatives'),
            'backgrounds': str(base_path / 'raw/backgrounds'),
            'processed': str(base_path / 'processed'),
            'metadata': str(base_path / 'metadata')
        }
        
        for folder_path in folders.values():
            os.makedirs(folder_path, exist_ok=True)
        
        return folders
    
    # ------------------------------------------------------------------------
    # API Methods
    # ------------------------------------------------------------------------
    
    def _make_api_request(
        self, 
        voice_id: str, 
        text: str, 
        output_path: str
    ) -> Tuple[bool, str]:
        """
        Make TTS API request with retry logic.
        
        Args:
            voice_id: ElevenLabs voice ID
            text: Text to synthesize
            output_path: Where to save the audio file
            
        Returns:
            Tuple of (success: bool, message: str)
        """
        url = f"{self.config.elevenlabs_url}/{voice_id}/stream"
        
        headers = {
            "xi-api-key": self.config.api_key,
            "Content-Type": "application/json"
        }
        
        payload = {
            "text": text,
            "voice_settings": {
                "stability": self.config.stability,
                "similarity_boost": self.config.similarity_boost
            }
        }
        
        for attempt in range(self.config.retry_attempts):
            try:
                response = requests.post(
                    url, 
                    headers=headers, 
                    json=payload, 
                    timeout=30
                )
                
                if response.status_code == 200:
                    with open(output_path, 'wb') as f:
                        f.write(response.content)
                    return True, f"Success: {os.path.basename(output_path)}"
                
                if attempt == self.config.retry_attempts - 1:
                    return False, f"API Error {response.status_code}: {response.text}"
                
                time.sleep(self.config.retry_delay * (2 ** attempt))
                
            except requests.exceptions.RequestException as e:
                if attempt == self.config.retry_attempts - 1:
                    return False, f"Request failed: {str(e)}"
                time.sleep(self.config.retry_delay * (2 ** attempt))
        
        return False, "Max retries exceeded"
    
    # ------------------------------------------------------------------------
    # Generation Methods
    # ------------------------------------------------------------------------
    
    # ------------------------------------------------------------------------
    # Modified Positive Sample Generation (1-second enforced)
    # ------------------------------------------------------------------------
    def generate_positive_samples(self) -> int:
        """
        Generate positive samples (wake word audio) with augmentation.
        Each voice ID will generate multiple variants.
        Ensures every audio is exactly 1 second.
        """
        self.logger.info("Generating positive samples with augmentation...")
        total_generated = 0
        target_duration_ms = 1000  # 1 second in milliseconds

        for voice_id in self.voice_ids:
            base_filename = f"{self.config.wake_word.replace(' ', '_').lower()}_{voice_id}"
            temp_path = os.path.join(self.folders['activates'], f"tmp_{base_filename}.mp3")

            # Generate original voice sample via API
            success, message = self._make_api_request(voice_id, self.config.wake_word, temp_path)
            if not success:
                self.logger.error(f"Skipping {voice_id} due to API error.")
                continue

            # Load audio
            try:
                audio = AudioSegment.from_file(temp_path)
            except Exception as e:
                self.logger.error(f"Error loading audio for {voice_id}: {e}")
                continue

            # -----------------------------
            # Ensure exactly 1 second
            # -----------------------------
            if len(audio) < target_duration_ms:
                # Pad with silence at the end
                audio += AudioSegment.silent(duration=(target_duration_ms - len(audio)))
            # Trim to exactly 1 second
            audio = audio[:target_duration_ms]

            # Apply augmentations
            variants = self._augment_audio(audio)

            # Save all variants
            for i, variant in enumerate(variants):
                # Ensure each variant is also exactly 1 second after augmentation
                if len(variant) < target_duration_ms:
                    variant += AudioSegment.silent(duration=(target_duration_ms - len(variant)))
                variant = variant[:target_duration_ms]
                
                variant_path = os.path.join(
                    self.folders['activates'],
                    f"pos_{base_filename}_var{i+1}.wav"
                )
                variant.export(variant_path, format='wav')
                total_generated += 1

            # Remove temporary file
            try:
                os.remove(temp_path)
            except Exception:
                pass

            self.logger.info(f"Generated {len(variants)} variants for voice {voice_id}")

        self.logger.info(f"✅ Total positive samples generated (including augmentations): {total_generated}")
        return total_generated

    
    def _augment_audio(self, audio: AudioSegment) -> List[AudioSegment]:
        """
        Apply random augmentations to create multiple variants of a sample.
        
        Returns:
            List of augmented AudioSegment variants
        """
        variants = [audio]  # include original
        for _ in range(self.config.augmentations_per_voice - 1):
            aug = audio

            # Random pitch shift: ±3 semitones
            pitch_shift = random.uniform(-3, 3)
            new_frame_rate = int(aug.frame_rate * (2.0 ** (pitch_shift / 12.0)))
            aug = aug._spawn(aug.raw_data, overrides={'frame_rate': new_frame_rate})
            aug = aug.set_frame_rate(audio.frame_rate)

            # Random speed change: 0.9x–1.1x
            speed_factor = random.uniform(0.9, 1.1)
            aug = aug._spawn(aug.raw_data, overrides={
                "frame_rate": int(aug.frame_rate * speed_factor)
            }).set_frame_rate(audio.frame_rate)

            # Random volume adjustment: ±3 dB
            aug += random.uniform(-3, 3)

            # Random noise addition
            noise_level = random.uniform(-35, -25)
            noise = AudioSegment.silent(duration=len(aug)).overlay(
                AudioSegment.silent(duration=len(aug)).apply_gain(noise_level)
            )
            aug = aug.overlay(noise)

            variants.append(aug)
        return variants

    def generate_negative_samples(self) -> int:
        """
        Generate negative samples (similar-sounding words) with augmentation.
        Each phrase and voice ID combination will produce multiple augmented variants.
        Ensures every audio is exactly 1 second.
        
        Returns:
            Total number of generated augmented negative samples
        """
        self.logger.info("Generating negative samples with augmentation...")
        total_generated = 0
        target_duration_ms = 1000  # 1 second in milliseconds

        for phrase in self.negative_phrases:
            selected_voices = random.sample(
                self.voice_ids,
                min(self.config.negatives_per_phrase, len(self.voice_ids))
            )

            for voice_id in selected_voices:
                clean_phrase = phrase.replace(" ", "_").replace("'", "").lower()
                base_filename = f"{clean_phrase}_{voice_id}"
                temp_path = os.path.join(self.folders['negatives'], f"tmp_{base_filename}.mp3")

                # Generate base audio using API
                success, message = self._make_api_request(voice_id, phrase, temp_path)
                if not success:
                    self.logger.error(f"Skipping {voice_id} - '{phrase}' due to API error.")
                    continue

                # Load base audio
                try:
                    audio = AudioSegment.from_file(temp_path)
                except Exception as e:
                    self.logger.error(f"Error loading audio for {voice_id} - {phrase}: {e}")
                    continue

                # -----------------------------
                # Ensure exactly 1 second
                # -----------------------------
                if len(audio) < target_duration_ms:
                    # Pad with silence at the end
                    audio += AudioSegment.silent(duration=(target_duration_ms - len(audio)))
                # Trim to exactly 1 second
                audio = audio[:target_duration_ms]

                # Apply augmentations
                variants = self._augment_audio(audio)

                # Export all variants
                for i, variant in enumerate(variants):
                    # Ensure each variant is also exactly 1 second after augmentation
                    if len(variant) < target_duration_ms:
                        variant += AudioSegment.silent(duration=(target_duration_ms - len(variant)))
                    variant = variant[:target_duration_ms]
                    
                    variant_path = os.path.join(
                        self.folders['negatives'],
                        f"neg_{base_filename}_var{i+1}.wav"
                    )
                    variant.export(variant_path, format='wav')
                    total_generated += 1

                # Clean up temp file
                try:
                    os.remove(temp_path)
                except Exception:
                    pass

                self.logger.info(f"Generated {len(variants)} variants for negative phrase '{phrase}' using voice {voice_id}")

        self.logger.info(f"✅ Total negative samples generated (including augmentations): {total_generated}")
        return total_generated


    
    def _execute_generation_tasks(
        self, 
        tasks: List[Tuple[str, str, str]]
    ) -> int:
        """
        Execute generation tasks in parallel.
        
        Args:
            tasks: List of (voice_id, text, output_path) tuples
            
        Returns:
            Number of successful generations
        """
        success_count = 0
        
        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
            future_to_task = {
                executor.submit(self._make_api_request, v, t, p): (v, t, p)
                for v, t, p in tasks
            }
            
            for future in as_completed(future_to_task):
                success, message = future.result()
                if success:
                    success_count += 1
                self.logger.info(message)
        
        return success_count
    
    # ------------------------------------------------------------------------
    # Audio Processing Methods
    # ------------------------------------------------------------------------
    
    def convert_to_wav(self, folder_path: str) -> int:
        """
        Convert all audio files in folder to WAV format.
        
        Args:
            folder_path: Path to folder containing audio files
            
        Returns:
            Number of files converted
        """
        self.logger.info(f"Converting audio in {folder_path} to WAV...")
        
        audio_extensions = ['*.mp3', '*.m4a', '*.mp4', '*.flac', '*.ogg']
        audio_files = [
            f for ext in audio_extensions 
            for f in glob.glob(os.path.join(folder_path, ext))
        ]
        
        count = 0
        for file_path in audio_files:
            try:
                audio = AudioSegment.from_file(file_path)
                audio = audio.set_channels(self.config.audio_channels)
                audio = audio.set_frame_rate(self.config.audio_sample_rate)
                
                wav_path = os.path.splitext(file_path)[0] + '.wav'
                audio.export(wav_path, format='wav')
                
                os.remove(file_path)
                count += 1
                
                self.logger.info(f"Converted: {os.path.basename(file_path)}")
                
            except Exception as e:
                self.logger.error(f"Error converting {file_path}: {str(e)}")
        
        return count
    
    def standardize_filenames(self, folder_path: str, prefix: str) -> int:
        """
        Rename files to standardized format (prefix-0001.wav, etc).
        
        Args:
            folder_path: Path to folder containing files
            prefix: Prefix for renamed files (e.g., 'pos', 'neg', 'bg')
            
        Returns:
            Number of files renamed
        """
        self.logger.info(f"Standardizing filenames in {folder_path}...")
        
        wav_files = sorted(glob.glob(os.path.join(folder_path, '*.wav')))
        
        for i, file_path in enumerate(wav_files, 1):
            new_path = os.path.join(folder_path, f"{prefix}-{i:04d}.wav")
            os.rename(file_path, new_path)
        
        return len(wav_files)
    
    def process_background_audio(self) -> int:
        """
        Process background audio to standard duration and format.
        
        Returns:
            Number of background files processed
        """
        self.logger.info("Processing background audio...")
        
        bg_files = glob.glob(os.path.join(self.folders['backgrounds'], '*.wav'))
        
        for file_path in bg_files:
            audio = AudioSegment.from_wav(file_path)
            audio = audio.set_channels(self.config.audio_channels)
            audio = audio.set_frame_rate(self.config.audio_sample_rate)
            
            if len(audio) < self.config.background_duration_ms:
                silence_duration = self.config.background_duration_ms - len(audio)
                audio += AudioSegment.silent(duration=silence_duration)
            
            audio = audio[:self.config.background_duration_ms]
            audio.export(file_path, format='wav')
        
        return len(bg_files)
    
    # ------------------------------------------------------------------------
    # Utility Methods
    # ------------------------------------------------------------------------
    
    def generate_metadata(self) -> None:
        """Generate and save dataset metadata."""
        metadata = {
            'config': asdict(self.config),
            'folders': self.folders
        }
        
        metadata_path = os.path.join(
            self.folders['metadata'],
            'dataset_info.json'
        )
        
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)
        
        self.logger.info(f"Metadata saved at {metadata_path}")
    
    def load_raw_audio(self) -> Tuple[List[AudioSegment], List[AudioSegment], List[AudioSegment]]:
        """
        Load generated audio files for verification.
        
        Returns:
            Tuple of (activates, negatives, backgrounds) audio segments
        """
        activates = self._load_audio_from_folder(self.folders['activates'])
        negatives = self._load_audio_from_folder(self.folders['negatives'])
        backgrounds = self._load_audio_from_folder(self.folders['backgrounds'])
        
        return activates, negatives, backgrounds
    
    def _load_audio_from_folder(self, folder_path: str) -> List[AudioSegment]:
        """
        Load all WAV files from a folder.
        
        Args:
            folder_path: Path to folder containing WAV files
            
        Returns:
            List of AudioSegment objects
        """
        audio_segments = []
        
        for filename in os.listdir(folder_path):
            if filename.endswith(".wav"):
                file_path = os.path.join(folder_path, filename)
                audio = AudioSegment.from_wav(file_path)
                audio_segments.append(audio)
        
        return audio_segments
    
    # ------------------------------------------------------------------------
    # Main Pipeline
    # ------------------------------------------------------------------------
    
    def run_full_pipeline(self) -> None:
        """Execute the complete data generation pipeline."""
        self.logger.info("Starting full pipeline...")
        
        # Generate samples
        self.generate_positive_samples()
        # self.generate_negative_samples() 
        
        # Convert to WAV
        self.convert_to_wav(self.folders['activates'])
        # self.convert_to_wav(self.folders['negatives'])
        # self.convert_to_wav(self.folders['backgrounds'])
        
        # Standardize filenames
        self.standardize_filenames(self.folders['activates'], 'pos')
        # self.standardize_filenames(self.folders['negatives'], 'neg')
        # self.standardize_filenames(self.folders['backgrounds'], 'bg')
        
        # Process backgrounds
        # self.process_background_audio()
        
        # Generate metadata
        # self.generate_metadata()
        
        self.logger.info("Pipeline finished!")

In [60]:
# ============================================================================
# Main Execution
# ============================================================================
def main():
    """Main execution function."""
    
    # Load environment variables
    load_dotenv()
    api_key = os.getenv("ELEVEN_API_KEY")
    
    if not api_key:
        raise ValueError("ELEVEN_API_KEY not found. Please add it to your .env file.")
    
    # Create configuration
    config = PipelineConfig(
        api_key=api_key,
        base_folder="../data",
        wake_word="Jerry"
    )
    
    # Initialize and run pipeline
    pipeline = WakeWordDataPipeline(config)
    pipeline.run_full_pipeline()
    
    # Load and display results
    print("\nLoading generated audio files...")
    activates, negatives, backgrounds = pipeline.load_raw_audio()
    
    print(f"✓ Loaded {len(activates)} activate samples")
    print(f"✓ Loaded {len(negatives)} negative samples")
    print(f"✓ Loaded {len(backgrounds)} background samples")
    
    print("=" * 60)
    print("DATASET SUMMARY")
    print("=" * 60)
    print(f"Total activate samples: {len(activates)}")
    print(f"Total negative samples: {len(negatives)}")
    print(f"Total background samples: {len(backgrounds)}")
    print(f"\nDataset ready for model training!")

    # return objects for inspection in notebook
    return config, activates, negatives, backgrounds


In [61]:
config, activates, negatives, backgrounds = main()

2025-10-26 01:34:04,261 - INFO - Starting full pipeline...
2025-10-26 01:34:04,262 - INFO - Generating positive samples with augmentation...


2025-10-26 01:34:06,012 - INFO - Generated 10 variants for voice 5mZxJZhSmJTjL7GoYfYI
2025-10-26 01:34:07,686 - INFO - Generated 10 variants for voice WuLq5z7nEcrhppO0ZQJw
2025-10-26 01:34:07,688 - INFO - ✅ Total positive samples generated (including augmentations): 20
2025-10-26 01:34:07,689 - INFO - Converting audio in ..\data\raw\activates to WAV...
2025-10-26 01:34:07,695 - INFO - Standardizing filenames in ..\data\raw\activates...
2025-10-26 01:34:07,762 - INFO - Pipeline finished!



Loading generated audio files...
✓ Loaded 850 activate samples
✓ Loaded 0 negative samples
✓ Loaded 0 background samples
DATASET SUMMARY
Total activate samples: 850
Total negative samples: 0
Total background samples: 0

Dataset ready for model training!


In [26]:
IPython.display.Audio(config.base_folder + "/raw/activates/pos-0001.wav")


In [14]:
IPython.display.Audio(config.base_folder + "/raw/negatives/neg-0001.wav")

ValueError: rate must be specified when data is a numpy array or list of audio samples.

In [None]:
IPython.display.Audio(config.base_folder + "/raw/backgrounds/bg-0001.wav")

In [5]:
import librosa
import soundfile as sf

def convert_to_16khz(input_path, output_path):
    """Convert audio file to 16kHz sample rate"""
    # Load audio at original sample rate
    audio, sr = librosa.load(input_path, sr=None)  # sr=None keeps original rate
    
    # Resample to 16kHz
    audio_16k = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    
    # Save as wav
    sf.write(output_path, audio_16k, 16000)
    
    return audio_16k

# Usage
convert_to_16khz('../data/raw/activates/pos-0001.wav', '../data/raw/negatives/pos-0001.wav')

array([2.6926386e-12, 1.2277397e-14, 3.1643004e-12, ..., 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00], shape=(16000,), dtype=float32)

In [7]:
import librosa
import soundfile as sf
import os
from pathlib import Path
from tqdm import tqdm

def convert_to_16khz(input_path, output_path):
    """Convert audio file to 16kHz sample rate"""
    # Load audio at original sample rate
    audio, sr = librosa.load(input_path, sr=None)  # sr=None keeps original rate
   
    # Resample to 16kHz
    audio_16k = librosa.resample(audio, orig_sr=sr, target_sr=16000)
   
    # Save as wav
    sf.write(output_path, audio_16k, 16000)
   
    return audio_16k

def convert_folder_to_16khz(input_folder, output_folder):
    """
    Convert all wav files in a folder to 16kHz
    
    Args:
        input_folder: Path to input folder containing wav files
        output_folder: Path to output folder (will be created if doesn't exist)
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    # Get all .wav files from input folder
    input_path = Path(input_folder)
    wav_files = list(input_path.glob('*.wav'))
    
    if len(wav_files) == 0:
        print(f"No .wav files found in {input_folder}")
        return
    
    print(f"Found {len(wav_files)} wav files in {input_folder}")
    print(f"Converting to 16kHz and saving to {output_folder}...")
    
    success_count = 0
    error_count = 0
    
    for input_file in tqdm(wav_files, desc="Converting"):
        try:
            # Create output path with same filename
            output_file = Path(output_folder) / input_file.name
            
            # Convert the file
            convert_to_16khz(str(input_file), str(output_file))
            success_count += 1
            
        except Exception as e:
            print(f"\n❌ Error converting {input_file.name}: {e}")
            error_count += 1
    
    print(f"\n✅ Conversion complete!")
    print(f"   Successfully converted: {success_count}")
    print(f"   Errors: {error_count}")

# Usage examples:

# Convert single folder
convert_folder_to_16khz(
    input_folder='../data/raw/activates',
    output_folder='../data/processed/activates_16k'
)

# Convert multiple folders
folders_to_convert = [
    ('../data/raw/activates', '../data/processed/activates_16k')
]

for input_folder, output_folder in folders_to_convert:
    if os.path.exists(input_folder):
        convert_folder_to_16khz(input_folder, output_folder)
    else:
        print(f"⚠️ Folder not found: {input_folder}")

Found 850 wav files in ../data/raw/activates
Converting to 16kHz and saving to ../data/processed/activates_16k...


Converting:   0%|          | 0/850 [00:00<?, ?it/s]

Converting: 100%|██████████| 850/850 [00:08<00:00, 96.23it/s]



✅ Conversion complete!
   Successfully converted: 850
   Errors: 0
Found 850 wav files in ../data/raw/activates
Converting to 16kHz and saving to ../data/processed/activates_16k...


Converting: 100%|██████████| 850/850 [00:01<00:00, 475.76it/s]


✅ Conversion complete!
   Successfully converted: 850
   Errors: 0



