# Improved "Hey Nap" Wake Word Training

This notebook uses an improved training configuration for better model performance.

**Improvements:**
- Larger model (256 units, 2 blocks)
- More training data (200k samples)
- Longer training (20k steps)
- Better augmentation (2 rounds)
- Adversarial negative examples

In [None]:
# Install OpenWakeWord and dependencies
!pip install -q openwakeword torch torchvision torchaudio numpy scipy scikit-learn pyyaml tqdm
print("‚úÖ Dependencies installed")

In [None]:
# Clone OpenWakeWord repository for training script
import os

if not os.path.exists('openwakeword'):
    print("üì• Cloning OpenWakeWord repository...")
    !git clone https://github.com/dscripka/openWakeWord.git openwakeword
    print("‚úÖ Repository cloned")
else:
    print("‚úÖ Repository already exists")

# Install piper TTS sample generator
if not os.path.exists('piper_sample_generator'):
    print("\nüì• Setting up Piper TTS sample generator...")
    !git clone https://github.com/rhasspy/piper-sample-generator.git piper_sample_generator
    print("‚úÖ Piper setup complete")
else:
    print("‚úÖ Piper already set up")

In [None]:
# Create improved training config with ALL required fields
#
# The configuration is assembled from small themed sections, merged into the
# single `config` dict, written to YAML, and then summarised on screen.
import yaml
import os

config_path = 'training_config_hey_nap_improved.yaml'

# Required: Wake word
wake_word_section = {
    'target_phrase': ['hey nap'],
}

# Required: Model config
model_section = {
    'model_name': 'hey_nap',
    'model_type': 'dnn',
    'layer_size': 256,  # Increased from 128
    'n_blocks': 2,      # Increased from 1
}

# Required: Training data
data_section = {
    'n_samples': 200000,     # Increased from 100000
    'n_samples_val': 20000,  # Increased from 10000
}

# Required: Paths
path_section = {
    'output_dir': './my_custom_model',
    'piper_sample_generator_path': './piper_sample_generator',
}

# Required: Augmentation
augmentation_section = {
    'augmentation_rounds': 2,       # Increased from 1
    'augmentation_batch_size': 32,  # Increased from 16
}

# Required: Training params
training_section = {
    'steps': 20000,                          # Increased from 10000
    'max_negative_weight': 15,               # Increased from 10
    'target_false_positives_per_hour': 0.3,  # Lowered from 0.5
    'batch_n_per_class': 8,                  # REQUIRED for training
}

# Required: TTS
tts_section = {
    'tts_batch_size': 32,  # Increased from 16
}

# Required: Background/RIR (can be empty)
background_section = {
    'rir_paths': [],
    'background_paths': [],
    'background_paths_duplication_rate': [1],
}

# Required: Negative phrases (similar-sounding phrases that all start with "hey ")
near_miss_words = (
    'map', 'nab', 'napkin', 'nappy',
    'napper', 'napping', 'napped', 'napp',
    'napa', 'nappie', 'nappier', 'nappies',
)
negative_section = {
    'custom_negative_phrases': ['hey ' + word for word in near_miss_words],
}

# Required: Feature files and validation data (both can be empty)
feature_section = {
    'feature_data_files': {},
    'false_positive_validation_data_path': '',
}

config = {
    **wake_word_section,
    **model_section,
    **data_section,
    **path_section,
    **augmentation_section,
    **training_section,
    **tts_section,
    **background_section,
    **negative_section,
    **feature_section,
}

# Save config
with open(config_path, 'w') as config_fh:
    yaml.dump(config, config_fh, default_flow_style=False)

summary_lines = [
    "‚úÖ Improved config created with ALL required fields",
    f"   Model: {config['layer_size']} units, {config['n_blocks']} blocks",
    f"   Training samples: {config['n_samples']:,}",
    f"   Training steps: {config['steps']:,}",
    f"   Batch size: {config['batch_n_per_class']} per class",
    f"   Piper path: {config['piper_sample_generator_path']}",
]
print("\n".join(summary_lines))

# Verify piper path exists
piper_dir = config['piper_sample_generator_path']
piper_status = (
    "   ‚úÖ Piper path exists"
    if os.path.exists(piper_dir)
    else "   ‚ö†Ô∏è  Piper path does not exist - make sure you ran the clone cell above"
)
print(piper_status)

In [None]:
# Optional: Upload real training data
#
# The upload is off by default. Flip ENABLE_UPLOAD to True and re-run this cell
# to open the Colab file picker.
# NOTE(review): none of the visible cells pass the uploaded files to the
# training script - confirm how they are meant to be consumed.
import os

ENABLE_UPLOAD = False  # set to True to upload recordings

print("üì§ (Optional) Upload real 'Hey Nap' audio samples")
print("   Upload WAV files of people saying 'Hey Nap'")

if ENABLE_UPLOAD:
    # Imported only when needed so that skipping the upload works outside Colab
    from google.colab import files

    uploaded = files.upload()
    print(f"‚úÖ Uploaded {len(uploaded)} files")
else:
    # This cell never reads keyboard input, so point at the flag rather than
    # asking the user to press a key.
    print("   Upload skipped - set ENABLE_UPLOAD = True in this cell to upload files")

In [None]:
# Run training using the OpenWakeWord training script
#
# Flow: print a banner, verify that the training script, the config file and
# the Piper checkout are present, then launch the script as a child process
# and report its exit status.
import subprocess
import sys
import os

banner_lines = [
    "üöÄ Starting improved training...",
    "   This will take 1-2 hours",
    "   Model: 256 units, 2 blocks",
    "   Training samples: 200,000",
    "   Training steps: 20,000",
]
print("\n".join(banner_lines))
print()

# Path to training script
train_script = './openwakeword/openwakeword/train.py'
config_file = 'training_config_hey_nap_improved.yaml'
piper_path = './piper_sample_generator'

# Pre-flight checks: (path to test, message if missing, hint if missing, message if present)
preflight_table = [
    (
        train_script,
        f"‚ùå Training script not found: {train_script}",
        "   Make sure the openwakeword repository was cloned",
        "‚úÖ Training script found",
    ),
    (
        config_file,
        f"‚ùå Config file not found: {config_file}",
        "   Run the config creation cell above first",
        "‚úÖ Config file found",
    ),
    (
        piper_path,
        f"‚ùå Piper path not found: {piper_path}",
        "   Make sure you ran the clone cell above",
        "‚úÖ Piper path found",
    ),
]

checks_passed = True
for required_path, missing_msg, hint_msg, found_msg in preflight_table:
    if os.path.exists(required_path):
        print(found_msg)
    else:
        print(missing_msg)
        print(hint_msg)
        checks_passed = False

if checks_passed:
    print("\n‚úÖ All checks passed. Starting training...")
    print()

    # Run training with all flags: every pipeline stage is requested in one invocation
    stage_flags = ['--generate_clips', '--augment_clips', '--train_model']
    cmd = [sys.executable, train_script, '--training_config', config_file] + stage_flags

    print(f"Command: {' '.join(cmd)}")
    print("\n‚ö†Ô∏è  This will take 1-2 hours. Watch for progress messages.")
    print("   If you see errors, scroll up to see the full error message.")
    print()

    # Execute training; check=False so a failure is reported below instead of raising
    result = subprocess.run(cmd, check=False)

    separator = "=" * 70
    print("\n" + separator)
    if result.returncode != 0:
        failure_lines = (
            f"‚ùå Training failed with exit code: {result.returncode}",
            "\nüí° Common issues:",
            "   1. Missing piper_sample_generator - check if it was cloned",
            "   2. Missing config fields - check config file",
            "   3. GPU out of memory - try reducing n_samples or batch sizes",
            "   4. Check error messages above for specific issues",
            "\nüí° Alternative: Use the official OpenWakeWord Colab notebook:",
            "   https://colab.research.google.com/drive/1q1oe2zOyZp7UsB3jJiQ1IFn8z5YfjwEb",
        )
        for line in failure_lines:
            print(line)
    else:
        print("‚úÖ Training complete!")
        print("   Check the output directory for your model")
    print(separator)
else:
    print("\n‚ùå Pre-flight checks failed. Fix the issues above and try again.")

In [None]:
# Check training output and download model
#
# The exported model is looked for in two places: the per-model sub-directory
# (the location this cell originally assumed) and the top-level output
# directory configured as `output_dir` in the training config.
# NOTE(review): the upstream training script appears to export
# `<output_dir>/<model_name>.onnx` directly under the output directory -
# confirm against the cloned train.py.
from google.colab import files
import os

model_name = 'hey_nap'
base_output_dir = "my_custom_model"
output_dir = f"{base_output_dir}/{model_name}"

# Ordered by priority: the first directory containing the file wins
search_dirs = [output_dir, base_output_dir]


def _find_model_file(filename):
    """Return the first existing `<dir>/<filename>` among search_dirs.

    Falls back to the path under `output_dir` when the file is nowhere to be
    found, so the caller can still display the expected location.
    """
    for folder in search_dirs:
        candidate = f"{folder}/{filename}"
        if os.path.isfile(candidate):
            return candidate
    return f"{output_dir}/{filename}"


def _print_directory_listing(folder):
    """Print every entry of `folder` with its size (files) or item count (directories)."""
    print(f"\n   Directory exists: {folder}")
    print("   Contents:")
    for item in sorted(os.listdir(folder)):
        item_path = os.path.join(folder, item)
        if os.path.isfile(item_path):
            size = os.path.getsize(item_path) / (1024*1024)
            print(f"     üìÑ {item} ({size:.2f} MB)")
        else:
            count = len(os.listdir(item_path)) if os.path.isdir(item_path) else 0
            print(f"     üìÅ {item}/ ({count} items)")


onnx_file = _find_model_file(f"{model_name}.onnx")
tflite_file = _find_model_file(f"{model_name}.tflite")

print("üìã Checking for trained models...")
print(f"   Output directory: {output_dir}")
print(f"   Also checking: {base_output_dir}")
print(f"   ONNX: {onnx_file}")
print(f"   TFLite: {tflite_file}")
print()

if os.path.isfile(onnx_file):
    size = os.path.getsize(onnx_file) / (1024*1024)
    print(f"‚úÖ Found ONNX model: {onnx_file} ({size:.1f} MB)")
    print("üì• Downloading...")
    files.download(onnx_file)
    print("\n‚úÖ ONNX model downloaded!")
    print("\nüìù Next steps:")
    print("   1. Convert to TFLite using Docker: ./convert_with_docker.sh")
    print("   2. Test: python3 test_hey_nap_local.py --model hey_nap.tflite")
    print("   3. Deploy: cp hey_nap.tflite components/openwakeword/models/")
elif os.path.isfile(tflite_file):
    size = os.path.getsize(tflite_file) / (1024*1024)
    print(f"‚úÖ Found TFLite model: {tflite_file} ({size:.1f} MB)")
    print("üì• Downloading...")
    files.download(tflite_file)
    print("\n‚úÖ TFLite model downloaded!")
    print("\nüìù Next steps:")
    print("   1. Test: python3 test_hey_nap_local.py --model hey_nap.tflite")
    print("   2. Deploy: cp hey_nap.tflite components/openwakeword/models/")
else:
    print("‚ùå Model not found in expected location")
    print("\nüìÅ Checking output directory structure...")

    # Show what each search directory holds so the user can spot the model
    existing_dirs = [folder for folder in search_dirs if os.path.isdir(folder)]
    for folder in existing_dirs:
        _print_directory_listing(folder)

    if not existing_dirs:
        print(f"\n   Output directory not found: {output_dir}")
        print("   Training may not have completed successfully.")

    print("\nüí° If training failed:")
    print("   1. Scroll up to see the full error message in the training cell")
    print("   2. Check that piper_sample_generator was cloned")
    print("   3. Verify config file has all required fields")
    print("   4. Try the official OpenWakeWord Colab notebook:")
    print("      https://colab.research.google.com/drive/1q1oe2zOyZp7UsB3jJiQ1IFn8z5YfjwEb")