In [1]:
from num2words import num2words
import re

def transform_numbers_to_text(text):
    """Spell out every run of digits in ``text`` as Brazilian Portuguese words.

    Args:
        text: Input string that may contain digit sequences.

    Returns:
        A copy of ``text`` in which each maximal run of digits has been replaced
        by the output of ``num2words(..., lang='pt_BR')``.
    """
    digit_run = re.compile(r'\d+')

    def spell_out(found):
        # Each match is one maximal run of digits, converted as a single integer.
        return num2words(int(found.group()), lang='pt_BR')

    return digit_run.sub(spell_out, text)

# Tens/hundreds words that can precede "e um" inside a compound number such as
# "vinte e um mil" (21 000). There the "um" belongs to the number and must stay.
_COMPOUND_BEFORE_UM = re.compile(
    r"\b(?:vinte|trinta|quarenta|cinquenta|sessenta|setenta|oitenta|noventa|"
    r"cento|duzentos|trezentos|quatrocentos|quinhentos|seiscentos|"
    r"setecentos|oitocentos|novecentos) e $"
)
# Whole-word "um mil": the boundaries keep "algum mil" and "um milhão" untouched.
_UM_MIL = re.compile(r"\bum mil\b")


def handle_special_cases(text):
    """Remove the redundant "um" from a standalone "um mil" in Portuguese text.

    In Portuguese, one thousand is written "mil", not "um mil". The "um" is only
    dropped when it is a whole word directly followed by the whole word "mil",
    and it is not the units part of a compound such as "vinte e um mil" or
    "cento e um mil". Words that merely contain the letters (for example
    "algum mil", "um milhão") are left unchanged.

    Args:
        text: Text in which numbers have already been spelled out.

    Returns:
        The text with every redundant "um mil" rewritten as "mil".
    """
    def _drop_redundant_um(match):
        # Look only at the text that ends right before this "um mil".
        if _COMPOUND_BEFORE_UM.search(text, 0, match.start()):
            return match.group()
        return "mil"

    return _UM_MIL.sub(_drop_redundant_um, text)

# Example usage: spell out the digits, then apply the "um mil" clean-up pass.
input_text = "10 de Abril de 1929"
transformed_text = transform_numbers_to_text(input_text)  # digits -> pt_BR words
final_text = handle_special_cases(transformed_text)

print(final_text)

dez de Abril de mil, novecentos e vinte e nove


In [None]:
# 🎙️ F5-TTS Test - Geração de Voz Nativa (Sem API)

Este notebook permite testar o modelo F5-TTS treinado diretamente, sem necessidade de API ou containers Docker.

## 📋 Features:
- ✅ Carregamento direto do checkpoint fine-tuned
- ✅ Geração de áudio com voz clonada
- ✅ Comparação de qualidade (pretrained vs fine-tuned)
- ✅ Exportação em WAV e MP3
- ✅ Visualização de spectrograms
- ✅ Métricas de qualidade

## 1. Setup e Imports

In [None]:
import os
import sys
import torch
import torchaudio
import numpy as np
import soundfile as sf
from pathlib import Path
from IPython.display import Audio, display
import matplotlib.pyplot as plt
import librosa
import librosa.display

# F5-TTS imports
from f5_tts.model import DiT
from f5_tts.infer.utils_infer import (
    load_model,
    load_vocoder,
    infer_process,
    preprocess_ref_audio_text
)

# Report the PyTorch build and, when CUDA is available, the GPU at index 0.
print(f"‚úÖ PyTorch version: {torch.__version__}")
print(f"‚úÖ CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"‚úÖ CUDA device: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9 so the value is printed in (decimal) GB.
    print(f"‚úÖ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

## 2. Configuração de Paths e Device

In [None]:
# Device configuration: use the GPU when PyTorch reports CUDA, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üéØ Using device: {device}")

# Paths. The project root keeps its original default but can be overridden with
# the F5TTS_BASE_DIR environment variable, so the notebook also runs on machines
# where the repository lives somewhere else.
BASE_DIR = Path(os.environ.get("F5TTS_BASE_DIR", "/home/tts-webui-proxmox-passthrough"))
TRAIN_DIR = BASE_DIR / "train"
OUTPUT_DIR = TRAIN_DIR / "output" / "ptbr_finetuned2"
CHECKPOINT_PATH = OUTPUT_DIR / "model_last.pt"
SAMPLES_DIR = OUTPUT_DIR / "samples"

# Create the test output directory. parents=True also creates missing ancestors,
# so this no longer raises FileNotFoundError when TRAIN_DIR does not exist yet.
TEST_OUTPUT_DIR = TRAIN_DIR / "test_output"
TEST_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"\nüìÅ Checkpoint: {CHECKPOINT_PATH}")
print(f"üìÅ Test output: {TEST_OUTPUT_DIR}")
print(f"‚úÖ Checkpoint exists: {CHECKPOINT_PATH.exists()}")

if CHECKPOINT_PATH.exists():
    # File size in bytes converted to GiB (1024**3) for display.
    checkpoint_size = CHECKPOINT_PATH.stat().st_size / (1024**3)
    print(f"üìä Checkpoint size: {checkpoint_size:.2f} GB")

## 3. Carregar Modelo F5-TTS Fine-Tuned

In [None]:
print("üîÑ Loading F5-TTS model...")

# Load the checkpoint at CHECKPOINT_PATH into a DiT model.
# NOTE(review): model_cfg is hard-coded here and presumably has to match the
# architecture the checkpoint was trained with — confirm against the training config.
model = load_model(
    model_cls=DiT,
    model_cfg=dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
    ckpt_path=str(CHECKPOINT_PATH),
    mel_spec_type="vocos",
    vocab_file="",  # NOTE(review): empty string presumably selects the library default vocab — confirm
    ode_method="euler",
    use_ema=True,
    device=device
)

print("‚úÖ Model loaded successfully!")
# Total parameter count, printed in millions.
print(f"üìä Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M")

# Load the Vocos vocoder (is_local=False, so no local path is supplied).
print("\nüîÑ Loading Vocos vocoder...")
vocoder = load_vocoder(vocoder_name="vocos", is_local=False, local_path="")
print("‚úÖ Vocoder loaded successfully!")

## 4. Preparar Áudio de Referência

Use um áudio de voz existente para clonagem. Pode ser:
- Um arquivo da pasta `uploads/`
- Um sample do treinamento
- Qualquer arquivo WAV de 3-30 segundos

In [None]:
# Option 1: use a sample produced during training
ref_audio_path = SAMPLES_DIR / "update_33200_ref.wav"

# Option 2: use a file from the uploads folder (uncomment if preferred)
# ref_audio_path = BASE_DIR / "uploads" / "seu_arquivo.wav"

# Option 3: custom path (uncomment and adjust)
# ref_audio_path = Path("/path/to/your/reference.wav")

if not ref_audio_path.exists():
    # Help the user pick a valid file by listing the WAVs that are available.
    print(f"‚ùå Arquivo n√£o encontrado: {ref_audio_path}")
    print("\nüìÅ Samples dispon√≠veis:")
    if SAMPLES_DIR.exists():
        for f in SAMPLES_DIR.glob("*.wav"):
            print(f"  - {f.name}")
else:
    print(f"‚úÖ √Åudio de refer√™ncia: {ref_audio_path.name}")

    # Load the file and show basic information about it.
    audio, sr = sf.read(str(ref_audio_path))
    duration = len(audio) / sr
    # sf.read returns a 1-D array for mono files and (frames, channels) otherwise,
    # so report the actual channel count instead of the raw array shape.
    num_channels = 1 if audio.ndim == 1 else audio.shape[1]
    print(f"üìä Sample rate: {sr} Hz")
    print(f"üìä Duration: {duration:.2f}s")
    print(f"üìä Channels: {num_channels}")

    # Play audio in notebook
    display(Audio(str(ref_audio_path)))

## 5. Definir Texto de Referência e Texto para Gerar

**IMPORTANTE:** O F5-TTS funciona melhor quando você fornece a transcrição exata do áudio de referência (`ref_text`).

In [None]:
# Transcript of the reference audio.
# IMPORTANT: this must be the EXACT transcript of the reference audio.
ref_text = """
Ol√°, este √© um teste de s√≠ntese de voz com o modelo F5-TTS fine-tuned em portugu√™s brasileiro.
"""

# Text to synthesize with the cloned voice.
gen_text = """
Bem-vindo ao teste de gera√ß√£o de voz usando F5-TTS. 
Este modelo foi treinado especificamente para portugu√™s brasileiro, 
garantindo naturalidade e expressividade em cada palavra falada.
A tecnologia de flow matching permite uma s√≠ntese de alta qualidade, 
mantendo as caracter√≠sticas √∫nicas da voz de refer√™ncia.
"""

# Strip the leading/trailing newlines introduced by the triple-quoted literals.
ref_text = ref_text.strip()
gen_text = gen_text.strip()

# Preview only: the first 100 / 150 characters are shown, always followed by "...".
print("üìù Texto de refer√™ncia:")
print(f"   {ref_text[:100]}...")
print(f"\nüìù Texto para gerar ({len(gen_text)} caracteres):")
print(f"   {gen_text[:150]}...")

## 6. Gerar Áudio com F5-TTS

Usando os parâmetros EXATOS do treinamento para qualidade máxima:
- `nfe_step=32` (padrão da biblioteca)
- `cfg_strength=2.0` (guidance)
- `sway_sampling_coef=-1.0` (auto)

In [None]:
import time

print("üéôÔ∏è Gerando √°udio com F5-TTS...")
# The timer starts before preprocessing, so generation_time includes it.
start_time = time.time()

# Preprocess reference audio
ref_audio, ref_text_processed = preprocess_ref_audio_text(
    str(ref_audio_path),
    ref_text,
    device=device
)

print(f"‚úÖ Reference audio preprocessed: {ref_audio.shape}")

# Generate speech using infer_process.
# IMPORTANT (original author's note): these parameters are meant to be identical
# to the ones used during training.
audio_output, sample_rate, _ = infer_process(
    ref_audio=ref_audio,
    ref_text=ref_text_processed,
    gen_text=gen_text,
    model_obj=model,
    vocoder=vocoder,
    mel_spec_type="vocos",
    show_info=print,
    progress=None,
    target_rms=0.1,
    cross_fade_duration=0.0,  # disabled to avoid long pauses (author's note)
    nfe_step=32,              # library default, matches training (author's note)
    cfg_strength=2.0,         # library default, matches training (author's note)
    sway_sampling_coef=-1.0,  # library default, "auto" (author's note)
    speed=1.0,
    fix_duration=None,
    device=device
)

generation_time = time.time() - start_time

# Audio duration is len(audio_output) / sample_rate; RTF is wall time divided by it.
# NOTE(review): an empty audio_output would make the RTF line divide by zero.
print(f"\n‚úÖ √Åudio gerado com sucesso!")
print(f"‚è±Ô∏è  Tempo de gera√ß√£o: {generation_time:.2f}s")
print(f"üìä Sample rate: {sample_rate} Hz")
print(f"üìä Dura√ß√£o do √°udio: {len(audio_output) / sample_rate:.2f}s")
print(f"üìä RTF (Real-Time Factor): {generation_time / (len(audio_output) / sample_rate):.2f}x")

## 7. Salvar e Reproduzir Áudio Gerado

In [None]:
from datetime import datetime

# Build a timestamped filename. `timestamp` is also reused by later cells for the
# spectrogram image and the parameter-sweep files.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"f5tts_test_{timestamp}.wav"
output_path = TEST_OUTPUT_DIR / output_filename

# Write the generated waveform at the sample rate returned by inference.
sf.write(str(output_path), audio_output, sample_rate)
print(f"üíæ √Åudio salvo em: {output_path}")

# Inline audio player for the in-memory waveform.
print("\nüîä Reproduzir √°udio:")
display(Audio(audio_output, rate=sample_rate))

## 8. Visualização de Spectrogram

In [None]:
# Three stacked panels: waveform, linear-frequency spectrogram, mel spectrogram.
fig, axes = plt.subplots(3, 1, figsize=(15, 10))

# 1. Waveform (x axis is the sample index, not seconds)
axes[0].plot(audio_output)
axes[0].set_title('Waveform do √Åudio Gerado', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Samples')
axes[0].set_ylabel('Amplitude')
axes[0].grid(True, alpha=0.3)

# 2. Spectrogram: STFT magnitude in dB relative to its maximum
D = librosa.amplitude_to_db(np.abs(librosa.stft(audio_output)), ref=np.max)
img = librosa.display.specshow(D, sr=sample_rate, x_axis='time', y_axis='hz', ax=axes[1])
axes[1].set_title('Spectrogram (Frequ√™ncia vs Tempo)', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Frequ√™ncia (Hz)')
fig.colorbar(img, ax=axes[1], format='%+2.0f dB')

# 3. Mel Spectrogram: 128 mel bands, power in dB relative to its maximum
mel_spec = librosa.feature.melspectrogram(y=audio_output, sr=sample_rate, n_mels=128)
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
img2 = librosa.display.specshow(mel_spec_db, sr=sample_rate, x_axis='time', y_axis='mel', ax=axes[2])
axes[2].set_title('Mel Spectrogram', fontsize=14, fontweight='bold')
axes[2].set_ylabel('Mel Frequency')
axes[2].set_xlabel('Tempo (s)')
fig.colorbar(img2, ax=axes[2], format='%+2.0f dB')

# Save the figure next to the generated audio (same timestamp), then show it.
plt.tight_layout()
plt.savefig(TEST_OUTPUT_DIR / f"spectrogram_{timestamp}.png", dpi=150, bbox_inches='tight')
plt.show()

print(f"üìä Spectrogram salvo em: {TEST_OUTPUT_DIR / f'spectrogram_{timestamp}.png'}")

## 9. Análise de Qualidade de Áudio

In [None]:
def analyze_audio_quality(audio, sr):
    """Compute simple signal-level quality metrics for an audio waveform.

    Args:
        audio: Waveform samples. Either a 1-D array, or a 2-D array laid out as
            (frames, channels) the way ``soundfile.read`` returns multi-channel
            files; 2-D input is averaged down to mono before analysis.
        sr: Sample rate in Hz, forwarded to the librosa spectral features.

    Returns:
        dict with the keys ``rms``, ``peak``, ``clipping_count``,
        ``clipping_percentage``, ``zero_crossing_rate``, ``spectral_centroid``,
        ``spectral_rolloff`` and ``snr_db``.

    Raises:
        ValueError: If ``audio`` contains no samples.
    """
    audio = np.asarray(audio)
    if audio.ndim > 1:
        # Downmix (frames, channels) to mono so len(), the per-sample statistics
        # and the librosa features all refer to the same 1-D signal.
        audio = audio.mean(axis=1)
    if audio.size == 0:
        raise ValueError("audio must contain at least one sample")

    magnitude = np.abs(audio)

    # RMS (Volume)
    rms = np.sqrt(np.mean(audio**2))

    # Peak amplitude
    peak = np.max(magnitude)

    # Clipping detection: samples whose magnitude exceeds 0.99
    clipping_count = np.sum(magnitude > 0.99)
    clipping_percentage = (clipping_count / len(audio)) * 100

    # Zero crossing rate, averaged over frames
    zcr = np.mean(librosa.feature.zero_crossing_rate(audio))

    # Spectral centroid (brightness), averaged over frames
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr))

    # Spectral rolloff, averaged over frames
    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sr))

    # SNR estimate (simplified): treat the quietest 10% of samples, ranked by
    # magnitude, as the noise floor. Ranking the signed values instead would pick
    # the loudest negative peaks. "<=" keeps the selection non-empty even when
    # every sample has the same magnitude.
    signal_power = np.mean(audio**2)
    noise_floor = magnitude[magnitude <= np.percentile(magnitude, 10)]
    noise_estimate = np.mean(noise_floor**2)
    snr_db = 10 * np.log10(signal_power / (noise_estimate + 1e-10))

    return {
        'rms': rms,
        'peak': peak,
        'clipping_count': clipping_count,
        'clipping_percentage': clipping_percentage,
        'zero_crossing_rate': zcr,
        'spectral_centroid': spectral_centroid,
        'spectral_rolloff': spectral_rolloff,
        'snr_db': snr_db
    }

# Compute the metrics for the audio generated earlier in the notebook.
metrics = analyze_audio_quality(audio_output, sample_rate)

banner = "=" * 60
print(banner)
print("üìä M√âTRICAS DE QUALIDADE DO √ÅUDIO GERADO")
print(banner)
print(f"üîä RMS (Volume):              {metrics['rms']:.4f}")
print(f"üìà Peak Amplitude:            {metrics['peak']:.4f}")
print(f"‚ö†Ô∏è  Clipping Samples:          {metrics['clipping_count']} ({metrics['clipping_percentage']:.2f}%)")
print(f"üåä Zero Crossing Rate:        {metrics['zero_crossing_rate']:.4f}")
print(f"‚ú® Spectral Centroid:         {metrics['spectral_centroid']:.0f} Hz")
print(f"üìä Spectral Rolloff:          {metrics['spectral_rolloff']:.0f} Hz")
print(f"üì° SNR (estimado):            {metrics['snr_db']:.1f} dB")
print(banner)

# Quality assessment: four checks worth 25 points each. Every entry pairs the
# pass condition with the message reported when that check fails.
quality_checks = [
    (0.05 < metrics['rms'] < 0.3,
     f"Volume fora do ideal (RMS: {metrics['rms']:.3f})"),
    (metrics['clipping_percentage'] < 0.1,
     f"Clipping detectado ({metrics['clipping_percentage']:.2f}%)"),
    (500 < metrics['spectral_centroid'] < 3000,
     f"Centr√≥ide espectral incomum ({metrics['spectral_centroid']:.0f} Hz)"),
    (metrics['snr_db'] > 20,
     f"SNR baixo ({metrics['snr_db']:.1f} dB)"),
]
quality_score = 25 * sum(1 for passed, _ in quality_checks if passed)
issues = [message for passed, message in quality_checks if not passed]

print(f"\nüéØ SCORE DE QUALIDADE: {quality_score}/100")

# Map the score to a verdict line.
if quality_score >= 75:
    verdict = "‚úÖ Qualidade: EXCELENTE"
elif quality_score >= 50:
    verdict = "‚ö†Ô∏è  Qualidade: BOA (com ressalvas)"
else:
    verdict = "‚ùå Qualidade: PRECISA MELHORAR"
print(verdict)

if len(issues) > 0:
    print("\n‚ö†Ô∏è  Problemas detectados:")
    for issue in issues:
        print(f"  - {issue}")

## 10. Comparação com Sample do Treinamento (Opcional)

In [None]:
# Compare the freshly generated audio with a sample saved during training.
training_sample_path = SAMPLES_DIR / "update_33200_gen.wav"

if training_sample_path.exists():
    print("üìä Comparando com sample do treinamento...\n")

    # Load training sample
    training_audio, training_sr = sf.read(str(training_sample_path))

    # Run the same metric function on both signals, each with its own sample rate.
    metrics_generated = analyze_audio_quality(audio_output, sample_rate)
    metrics_training = analyze_audio_quality(training_audio, training_sr)

    # Side-by-side table; columns are left-aligned to 30 / 20 / 20 characters.
    print("=" * 80)
    print(f"{'M√©trica':<30} | {'Gerado Agora':<20} | {'Sample Treinamento':<20}")
    print("=" * 80)
    print(f"{'RMS (Volume)':<30} | {metrics_generated['rms']:<20.4f} | {metrics_training['rms']:<20.4f}")
    print(f"{'Peak Amplitude':<30} | {metrics_generated['peak']:<20.4f} | {metrics_training['peak']:<20.4f}")
    print(f"{'Clipping %':<30} | {metrics_generated['clipping_percentage']:<20.2f} | {metrics_training['clipping_percentage']:<20.2f}")
    print(f"{'Zero Crossing Rate':<30} | {metrics_generated['zero_crossing_rate']:<20.4f} | {metrics_training['zero_crossing_rate']:<20.4f}")
    print(f"{'Spectral Centroid (Hz)':<30} | {metrics_generated['spectral_centroid']:<20.0f} | {metrics_training['spectral_centroid']:<20.0f}")
    print(f"{'SNR (dB)':<30} | {metrics_generated['snr_db']:<20.1f} | {metrics_training['snr_db']:<20.1f}")
    print("=" * 80)

    # Player for the training sample so both can be compared by ear.
    print("\nüîä Reproduzir sample do treinamento para compara√ß√£o:")
    display(Audio(str(training_sample_path)))

else:
    print(f"‚ö†Ô∏è  Sample do treinamento n√£o encontrado: {training_sample_path}")

## 11. Teste com Diferentes Parâmetros (Experimental)

In [None]:
def generate_with_params(nfe_step, cfg_strength, sway_coef, label):
    """Synthesize a fixed test sentence with the given sampling settings and save it.

    Relies on notebook globals defined in earlier cells: ``ref_audio``,
    ``ref_text_processed``, ``model``, ``vocoder``, ``device``,
    ``TEST_OUTPUT_DIR`` and ``timestamp``.

    Args:
        nfe_step: Forwarded to ``infer_process`` as ``nfe_step``.
        cfg_strength: Forwarded to ``infer_process`` as ``cfg_strength``.
        sway_coef: Forwarded to ``infer_process`` as ``sway_sampling_coef``.
        label: Human-readable name of the configuration; also used, lower-cased
            and with spaces replaced by underscores, in the output filename.

    Returns:
        Tuple ``(audio, sample_rate, output_path)`` for the generated clip.
    """
    print(f"\nüéôÔ∏è Gerando: {label}")
    print(f"   nfe_step={nfe_step}, cfg_strength={cfg_strength}, sway={sway_coef}")

    t0 = time.time()
    wav, rate, _ = infer_process(
        ref_audio=ref_audio,
        ref_text=ref_text_processed,
        gen_text="Teste r√°pido de s√≠ntese com diferentes par√¢metros.",
        model_obj=model,
        vocoder=vocoder,
        mel_spec_type="vocos",
        show_info=lambda x: None,  # silence the library's log messages
        progress=None,
        target_rms=0.1,
        cross_fade_duration=0.0,
        nfe_step=nfe_step,
        cfg_strength=cfg_strength,
        sway_sampling_coef=sway_coef,
        speed=1.0,
        device=device
    )
    seconds = time.time() - t0

    # Write the clip next to the other test outputs, tagged with the run timestamp.
    slug = label.replace(' ', '_').lower()
    wav_path = TEST_OUTPUT_DIR / f"test_{slug}_{timestamp}.wav"
    sf.write(str(wav_path), wav, rate)

    # Real-time factor: wall-clock seconds per second of generated audio.
    audio_seconds = len(wav) / rate
    print(f"   ‚è±Ô∏è  Tempo: {seconds:.2f}s | RTF: {seconds / audio_seconds:.2f}x")
    print(f"   üíæ Salvo: {wav_path.name}")

    return wav, rate, wav_path

# Run the parameter sweep: one generation per configuration below.
print("=" * 80)
print("üß™ TESTE DE DIFERENTES CONFIGURA√á√ïES")
print("=" * 80)

# Each entry is (nfe_step, cfg_strength, sway_coef, label).
configs = [
    (32, 2.0, -1.0, "BALANCED (Default - Training Match)"),
    (16, 1.5, -1.0, "FAST (R√°pido)"),
    (48, 2.5, -1.0, "HIGH_QUALITY (Alta Qualidade)"),
    (64, 2.0, -1.0, "ULTRA_QUALITY (Qualidade M√°xima)"),
]

# Keep (label, audio, sample_rate, path) per run for the playback cell below.
results = []
for nfe, cfg, sway, label in configs:
    audio_test, sr_test, file_path = generate_with_params(nfe, cfg, sway, label)
    results.append((label, audio_test, sr_test, file_path))

print("\n" + "=" * 80)
print("‚úÖ Testes conclu√≠dos! Reproduza abaixo para comparar:")
print("=" * 80)

In [None]:
# Show one inline player per sweep result, labelled with its configuration
# name and the filename it was saved under.
for label, audio_test, sr_test, file_path in results:
    print(f"\nüîä {label}")
    print(f"   üìÅ {file_path.name}")
    display(Audio(audio_test, rate=sr_test))

## 12. Exportar para MP3 (Opcional)

In [None]:
# Optional MP3 export of the main output; skipped with a hint when pydub is missing.
# NOTE(review): only ImportError is handled here. A missing ffmpeg binary would
# presumably surface as a different exception from from_wav/export — confirm.
try:
    from pydub import AudioSegment

    # Convert main output to MP3 (same path, ".mp3" suffix, 192 kbit/s)
    mp3_path = output_path.with_suffix('.mp3')
    audio_segment = AudioSegment.from_wav(str(output_path))
    audio_segment.export(str(mp3_path), format='mp3', bitrate='192k')

    # File sizes are shown in KiB (bytes / 1024).
    print(f"‚úÖ MP3 exportado: {mp3_path}")
    print(f"üìä Tamanho WAV: {output_path.stat().st_size / 1024:.1f} KB")
    print(f"üìä Tamanho MP3: {mp3_path.stat().st_size / 1024:.1f} KB")

except ImportError:
    print("‚ö†Ô∏è  pydub n√£o instalado. Para exportar MP3:")
    print("   pip install pydub")
    print("   Tamb√©m precisa do ffmpeg instalado no sistema")

## 13. Resumo e Próximos Passos

### ✅ O que fizemos neste notebook:

1. ✅ Carregamos o modelo F5-TTS fine-tuned customizado
2. ✅ Geramos áudio com voz clonada usando parâmetros otimizados
3. ✅ Visualizamos spectrograms e waveforms
4. ✅ Analisamos métricas de qualidade de áudio
5. ✅ Comparamos com samples do treinamento
6. ✅ Testamos diferentes configurações de parâmetros

### 🎯 Próximos Passos:

**Para testar outros textos:**
- Volte à célula 5 e mude o `gen_text`
- Execute novamente a partir da célula 6

**Para usar outra voz de referência:**
- Volte à célula 4 e mude o `ref_audio_path`
- Atualize o `ref_text` com a transcrição correta
- Re-execute a partir da célula 6

**Para ajustar qualidade vs velocidade:**
- Experimente diferentes valores na célula 11
- `nfe_step=16`: Rápido, mas com menor qualidade
- `nfe_step=32`: Padrão (match com treinamento)
- `nfe_step=64`: Qualidade máxima, mas mais lento

### 📚 Documentação:

- **Parâmetros F5-TTS:** `docs/F5TTS_QUALITY_FIX.md`
- **Profiles de Qualidade:** `app/quality_profiles.py`
- **Training Samples:** `train/output/ptbr_finetuned2/samples/`

### 🐛 Troubleshooting:

**Se o áudio tiver artefatos:**
- Verifique se `sway_sampling_coef=-1.0` (não use valores positivos!)
- Tente aumentar `nfe_step` para 48 ou 64
- Certifique-se de que `ref_text` é a transcrição EXATA do áudio de referência

**Se estiver muito lento:**
- Reduza `nfe_step` para 16 ou 24
- Use GPU se disponível (device='cuda')

**Se a voz não está sendo clonada bem:**
- Use áudio de referência de 10-30 segundos
- Forneça a transcrição exata (`ref_text`)
- Certifique-se de que o áudio tem boa qualidade (sem ruído)