# Piper Voice Training Notebook
This notebook automates the process of training a Piper voice model, including:
1. Dataset preprocessing
2. Model training
3. Model exporting
4. Model testing

## Step 1: Verify Installation
Ensure that Piper and its dependencies are installed correctly.

In [None]:
# Importing the training package is the installation check: a missing or
# broken install raises ImportError right here.
import piper_train

# Report which Piper release the kernel picked up
installed_version = piper_train.__version__
print(f"Piper version: {installed_version}")

## Step 2: Verify GPU Availability
Check if a GPU is available for training.

In [None]:
# Report which device training will run on
import torch

gpu_present = torch.cuda.is_available()
if not gpu_present:
    print("GPU is not available. Training will use the CPU.")
else:
    # Device index 0 is the first CUDA device visible to this process
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")

## Step 3: Convert and Replace Audio Files
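Convert every non-WAV audio file in the `dataset/wav` folder to 16-bit PCM WAV and delete the original. Files that already have a `.wav` extension are skipped unchanged; files that cannot be read are listed in `dataset/wav/corrupted_files.log`.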

In [None]:
import os
from pathlib import Path

import numpy as np
import scipy.io.wavfile
import scipy.signal  # resample() lives here; importing scipy.io.wavfile alone does not load it
import soundfile as sf
from tqdm import tqdm

# Directory setup
wav_dir = "dataset/wav"

# Name of the error log this cell writes into wav_dir (excluded from the scan below)
corrupted_log_name = "corrupted_files.log"

# Sample rate written to every converted file. Kept equal to the
# `--sample-rate 22050` passed to piper_train.preprocess later in this notebook
# so the audio is not downsampled here and resampled again afterwards.
target_sampling_rate = 22050

# Collect candidate files. Directories and this cell's own log file are
# excluded so that re-running the cell does not report the log as a corrupted
# audio file. Sorting makes the processing order reproducible.
audio_files = [
    path
    for path in sorted(Path(wav_dir).glob("*.*"))
    if path.is_file() and path.name != corrupted_log_name
]
print(f"Number of audio files found: {len(audio_files)}")

if audio_files:
    corrupted_files = []

    print("Converting and replacing audio files in WAV format...")
    for file_path in tqdm(audio_files, desc="Processing audio files"):
        # Files that already carry a .wav extension are left untouched
        if file_path.suffix.lower() == ".wav":
            continue

        try:
            audio, sampling_rate = sf.read(file_path)

            if len(audio) == 0:
                raise ValueError(f"Empty or invalid audio data in file: {file_path}")

            # Resample along the time axis (axis 0) when the source rate differs
            if sampling_rate != target_sampling_rate:
                num_samples = int(len(audio) * target_sampling_rate / sampling_rate)
                audio = scipy.signal.resample(audio, num_samples)

            # Clip before scaling: resampling can overshoot [-1, 1], and an
            # out-of-range value would wrap around when cast to int16.
            pcm_audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)

            # Write next to the original, swapping the extension for .wav
            output_path = file_path.with_suffix(".wav")
            scipy.io.wavfile.write(output_path, target_sampling_rate, pcm_audio)

            # Only reached when the WAV was written successfully
            os.remove(file_path)

        except Exception as e:
            # Best effort: report the failure, remember the file, keep going
            print(f"Error converting {file_path}: {e}")
            corrupted_files.append(str(file_path))

    # Log corrupted files
    if corrupted_files:
        log_path = Path(wav_dir) / corrupted_log_name
        with open(log_path, "w", encoding="utf-8") as log_file:
            log_file.writelines(f"{file}\n" for file in corrupted_files)
        print(f"Logged corrupted files to {log_path}")
else:
    print("No audio files found in the directory.")

print("Audio conversion and replacement complete!")

## Step 4: Transcribe Audio Files
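Generate `dataset/metadata.csv` by transcribing every `.wav` file in the `dataset/wav/` directory with Whisper. Each row has the form `id|text`, where `id` is the file name without the `.wav` extension.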

In [None]:
import csv
import os

import whisper

# Load the Whisper model
model = whisper.load_model("base")  # You can use "small", "medium", or "large" for better accuracy

# Define paths
wav_dir = "dataset/wav"
metadata_path = "dataset/metadata.csv"

# Files whose transcription came back empty; they get no metadata row
untranscribed_files = []

# Create metadata.csv with one `id|text` row per transcribed file
with open(metadata_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter="|")

    # os.listdir() returns entries in arbitrary order; sort so the metadata
    # file is identical from one run to the next.
    for filename in sorted(os.listdir(wav_dir)):
        if not filename.endswith(".wav"):
            continue

        audio_path = os.path.join(wav_dir, filename)

        # Transcribe the audio file
        result = model.transcribe(audio_path)
        text = result["text"].strip()

        # A row with no text gives the training step nothing to learn from
        if not text:
            untranscribed_files.append(filename)
            print(f"Skipped {filename}: empty transcription")
            continue

        # The row ID is the file name without its .wav extension. It is not
        # called `id`, because that would shadow the builtin for every later cell.
        utterance_id = os.path.splitext(filename)[0]
        writer.writerow([utterance_id, text])

        print(f"Transcribed {filename}: {text}")

if untranscribed_files:
    print(f"\n{len(untranscribed_files)} file(s) had an empty transcription and were left out.")

print(f"\nMetadata file generated at: {metadata_path}")

## Step 5: Verify `metadata.csv`
Check the generated `metadata.csv` file to ensure the transcriptions are accurate.

In [None]:
import pandas as pd

# Load metadata.csv exactly as written (metadata_path is set in the
# transcription cell above).
# dtype=str keeps IDs such as "0001" from being parsed as the integer 1, and
# keep_default_na=False keeps transcriptions like "NA" or "null" as text
# instead of turning them into NaN.
metadata = pd.read_csv(
    metadata_path,
    delimiter="|",
    header=None,
    names=["id", "text"],
    dtype=str,
    keep_default_na=False,
)

# Surface rows that would be useless for training before showing the table
empty_text_rows = metadata[metadata["text"].str.strip() == ""]
print(f"{len(metadata)} rows loaded, {len(empty_text_rows)} with empty text")

print(metadata)

## Step 6: Preprocess Dataset
Run the preprocessing script to generate `config.json` and `dataset.jsonl`.

In [None]:
# Run Piper's preprocessing module on the `dataset` folder, which is read as a
# single-speaker, LJSpeech-format, en-us dataset. Output goes to `training_dir`.
# NOTE(review): --sample-rate 22050 presumably has to match the checkpoint
# that the training step resumes from — confirm before changing it.
!python3 -m piper_train.preprocess \
  --language en-us \
  --input-dir dataset \
  --output-dir training_dir \
  --dataset-format ljspeech \
  --single-speaker \
  --sample-rate 22050

## Step 7: Train the Model
Train the Piper voice model using the preprocessed dataset.

In [None]:
# Download a pre-trained model checkpoint (e.g., lessac medium quality)
# and save it as lessac.ckpt in the working directory.
# NOTE(review): the example.com URL below is a placeholder — replace it with
# the real checkpoint location, otherwise the training command has nothing to
# resume from.
!wget https://example.com/path/to/lessac/epoch=2164-step=1355540.ckpt -O lessac.ckpt

# Train the model
# Fine-tunes from lessac.ckpt on the preprocessed data in training_dir, using
# one device, batch size 32, 32-bit precision and a limit of 10000 epochs.
# NOTE(review): --accelerator is hard-coded to 'gpu', although the GPU check
# cell only prints a CPU fallback message — on a machine without a GPU this
# flag presumably needs changing; confirm.
# NOTE(review): --validation-split 0.0 and --num-test-examples 0 presumably
# use every utterance for training with nothing held out — confirm.
!python3 -m piper_train \
    --dataset-dir training_dir \
    --accelerator 'gpu' \
    --devices 1 \
    --batch-size 32 \
    --validation-split 0.0 \
    --num-test-examples 0 \
    --max_epochs 10000 \
    --resume_from_checkpoint lessac.ckpt \
    --checkpoint-epochs 1 \
    --precision 32

## Step 8: Export the Model
Export the trained model to ONNX format.

In [None]:
# Find the latest checkpoint
import glob
checkpoints = glob.glob("training_dir/lightning_logs/version_0/checkpoints/*.ckpt")
latest_checkpoint = checkpoints[-1]

# Export to ONNX
!python3 -m piper_train.export_onnx \
    {latest_checkpoint} \
    model.onnx

# Copy config.json
!cp training_dir/config.json model.onnx.json

## Step 9: Test the Model
Test the exported model by generating audio from text.

In [None]:
import subprocess

from IPython.display import Audio

# Create a test sentence
test_sentence = "This is a test sentence generated by Piper."

# Generate audio.
# The sentence is sent to piper on stdin as data rather than spliced into a
# shell command line, so apostrophes, quotes or `$` in the text cannot break
# or alter the command.
piper_run = subprocess.run(
    ["piper", "-m", "model.onnx", "--output_file", "test.wav"],
    input=test_sentence + "\n",
    text=True,
    encoding="utf-8",
    capture_output=True,
)

# Show what piper reported, then stop here if it failed so that a stale
# test.wav from an earlier run is not played back as if it were new.
print(piper_run.stdout, piper_run.stderr, sep="")
piper_run.check_returncode()

# Play the audio (requires IPython and sound playback support)
Audio("test.wav")

## Step 10: Monitor Training with TensorBoard
Monitor training progress using TensorBoard.

In [None]:
# Load the TensorBoard notebook extension, then open TensorBoard on
# training_dir/lightning_logs — the same directory tree the export step
# reads its checkpoints from.
%load_ext tensorboard
%tensorboard --logdir training_dir/lightning_logs

## Step 11: Export and Download the Model
Export the trained model and provide download links for the ONNX model and JSON config.

In [None]:
import json
import shutil

from IPython.display import FileLink

# Define the source paths for the ONNX model and config
onnx_source_path = "model.onnx"
config_source_path = "model.onnx.json"

# Define the destination paths for download
# NOTE(review): the export step names the config after the model file plus
# ".json" (model.onnx -> model.onnx.json). "piper_model.json" does not follow
# that pattern for piper_model.onnx — confirm whether it should be
# "piper_model.onnx.json".
onnx_destination_path = "./piper_model.onnx"
config_destination_path = "./piper_model.json"

# Copy the ONNX model to the current working directory
shutil.copy(onnx_source_path, onnx_destination_path)

# Start from the config that belongs to this model: model.onnx.json is the
# copy of training_dir/config.json made in the export step. Writing a
# hand-typed phoneme table / symbol count here instead would ship a config
# that has nothing to do with the training run.
with open(config_source_path, "r", encoding="utf-8") as json_file:
    piper_json_data = json.load(json_file)

# Descriptive metadata to add to the config. Update these for your voice.
voice_metadata = {
    "dataset": "norman",  # Update this with your dataset name
    "audio": {
        "quality": "medium"  # Update this based on your model's quality
    },
    "language": {
        "family": "en",                     # Language family
        "region": "US",                     # Region
        "name_native": "English",           # Native language name
        "name_english": "English",          # English language name
        "country_english": "United States"  # Country name in English
    },
}


def _fill_missing(target, defaults):
    """Recursively copy keys from `defaults` into `target`, keeping any value `target` already has."""
    for key, value in defaults.items():
        if isinstance(value, dict) and isinstance(target.get(key), dict):
            _fill_missing(target[key], value)
        else:
            target.setdefault(key, value)


# Values already present in the loaded config always win over the ones above
_fill_missing(piper_json_data, voice_metadata)

# Write the JSON file (UTF-8, non-ASCII characters kept readable)
with open(config_destination_path, "w", encoding="utf-8") as json_file:
    json.dump(piper_json_data, json_file, indent=2, ensure_ascii=False)

# Generate download links for both files
print("Download your files:")
print("ONNX Model:")
display(FileLink(onnx_destination_path))
print("\nJSON Config:")
display(FileLink(config_destination_path))