In [None]:
import numpy as np
import torch
from pathlib import Path

## Step 1: Preprocessing Audio Data

Preprocessing involves:
1. Loading and segmenting audio files
2. Extracting content features (ContentVec/HuBERT)
3. Extracting pitch features (F0)
4. Building a FAISS index for retrieval

In [None]:
from src.modules.preprocessing import preprocess_for_training

# Source folder handed to the preprocessor, and the folder passed as its output
# location.
data_dir, out_dir = "../data/my_voice", "../data/preprocessed"

# quiet=False is forwarded unchanged (presumably keeps progress output visible
# -- confirm against preprocess_for_training).
preprocess_for_training(data_dir=data_dir, out_dir=out_dir, quiet=False)

## Step 2: Training the Generator

Train the voice conversion model on the preprocessed features produced in Step 1.

In [None]:
from src.modules.training import train_generator_from_features

# Same path that the preprocessing cell used as its output folder.
feature_dir = "../data/preprocessed"
model_name = "my_voice_model"

# Hyperparameters gathered in one place so they are easy to tweak between runs.
training_options = {
    "epochs": 100,
    "batch_size": 4,
    "content_dim": 768,
    "use_pitch": True,
    "target_sr": 48000,
    "learning_rate": 1e-4,
}

# The returned object is kept in `generator` for use later in the notebook.
generator = train_generator_from_features(
    feature_dir=feature_dir,
    **training_options,
    model_name=model_name,
)

## Step 3: Running Inference

Convert a source audio file to the target speaker's voice.

In [None]:
from src.modules.inference import run_inference
from src.models.generator import Generator

# Source clip to convert, destination for the converted audio, and the name of
# the model to load.
input_audio = "../data/references/california_gurls.mp3"
output_audio = "../data/output/converted_california_gurls.wav"
model_name = "my_voice_model"

# Nothing earlier in this notebook creates the output folder, so make sure it
# exists before inference tries to write into it. Idempotent on re-runs.
Path(output_audio).parent.mkdir(parents=True, exist_ok=True)

# Load the generator by name rather than reusing the object from the training
# cell, so this cell can be run on its own (presumably restores the weights
# saved under `model_name` -- confirm against Generator.load).
generator = Generator.load(model_name)

run_inference(
    input_path=input_audio,
    generator=generator,
    output_path=output_audio,
)

In [None]:
import IPython.display as ipd
import matplotlib.pyplot as plt
import torchaudio

# Play the source clip and the converted clip, then plot the first second of
# each waveform for a quick visual comparison.
original_path = "../data/references/california_gurls.mp3"
converted_path = "../data/output/converted_california_gurls.wav"

original_waveform, original_sr = torchaudio.load(original_path)

print("Original Audio:")
ipd.display(ipd.Audio(original_waveform.numpy(), rate=original_sr))

# Check for the file explicitly rather than relying on the exception type that
# torchaudio.load raises for a missing path (NOTE(review): this may not be
# FileNotFoundError with every backend -- confirm). Genuine decode or plotting
# errors are left to surface normally.
if not Path(converted_path).is_file():
    print("Converted audio not found. Run inference first!")
else:
    converted_waveform, converted_sr = torchaudio.load(converted_path)
    print("\nConverted Audio:")
    ipd.display(ipd.Audio(converted_waveform.numpy(), rate=converted_sr))

    fig, axes = plt.subplots(2, 1, figsize=(12, 6))

    panels = (
        (axes[0], original_waveform, original_sr, "Original Audio (first 1 second)"),
        (axes[1], converted_waveform, converted_sr, "Converted Audio (first 1 second)"),
    )
    for ax, waveform, sample_rate, title in panels:
        # One second of audio is `sample_rate` samples, so slice by each clip's
        # own rate instead of assuming 48 kHz. Only the first channel is plotted.
        ax.plot(waveform[0, :sample_rate].numpy())
        ax.set_title(title)
        ax.set_xlabel("Sample")
        ax.set_ylabel("Amplitude")

    plt.tight_layout()
    plt.show()