In [1]:
from utils import process_text

In [2]:
import os 
# Move from the notebook folder up to the repository root so that the relative
# paths used further down resolve against it.
# The guard keeps this cell idempotent: re-running it must not climb one more
# directory on every execution.
# NOTE(review): assumes the `.project-root` marker file exists only at the
# repository root — confirm.
if not os.path.exists('.project-root'):
    os.chdir('..')
print(os.getcwd())
print(os.listdir())

/home/matcha/new_matcha/Matcha-TTS
['output', '.github', 'scripts', 'synthesis.ipynb', '.ipynb_checkpoints', '.env.example', 'notebook', 'matcha_ljspeech.ckpt', '.project-root', 'checkpoint_epoch=100.ckpt', 'checkpoints', '.git', '.gitignore', 'MANIFEST.in', 'weight_distribution.png', 'requirements.txt', 'LICENSE', '.pre-commit-config.yaml', 'configs', 'logs', 'Loss.ipynb', 'data', 'matcha_tts.egg-info', 'val_synthesis_results', 'notebooks', 'train2.log', 'README.md', 'setup.py', '.pylintrc', 'pyproject.toml', 'Makefile', 'matcha']


In [3]:
import datetime as dt
import sys
import time

import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
import torch

In [4]:
# Add the Matcha_TTS_main directory (code from the original repo) to the import
# path. The membership test keeps the cell idempotent: re-running it does not
# pile up duplicate entries in sys.path.
# NOTE(review): confirm this directory still exists relative to the working
# directory; if `matcha` is importable without it, this cell can be dropped.
matcha_src_dir = 'Matcha_TTS_main/'
if matcha_src_dir not in sys.path:
    sys.path.append(matcha_src_dir)


In [5]:
from matcha.models.matcha_tts import MatchaTTS 
# for Hifigan
from matcha.hifigan.config import v1
from matcha.hifigan.denoiser import Denoiser
from matcha.hifigan.env import AttrDict
from matcha.hifigan.models import Generator as HiFiGAN

In [7]:
# Paths of the checkpoints to load (relative to the current working directory).
# NOTE(review): the Matcha-TTS path points at one specific, timestamped training
# run — update it when evaluating a different run.
matcha_checkpoint_path = "logs/train/ljspeech/runs/2026-01-09_12-40-08/checkpoints/last.ckpt"
hifigan_checkpoint_path = "checkpoints/generator_v1"

In [8]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [9]:
def load_model(checkpoint_path):
    """Restore a MatchaTTS model from a checkpoint and switch it to eval mode.

    Args:
        checkpoint_path: Path of the checkpoint file to load.

    Returns:
        The loaded MatchaTTS model, with tensors mapped to the module-level
        ``device``.
    """
    # NOTE(review): weights_only=False presumably is needed because the
    # checkpoint stores more than plain tensors — only load trusted files.
    load_kwargs = {"map_location": device, "weights_only": False}
    matcha = MatchaTTS.load_from_checkpoint(checkpoint_path, **load_kwargs)
    matcha.eval()
    return matcha


model = load_model(matcha_checkpoint_path)


  deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message)


In [10]:
def load_vocoder(checkpoint_path):
    """Build a HiFi-GAN generator from the ``v1`` config and load its weights.

    Args:
        checkpoint_path: Path of a checkpoint whose ``'generator'`` entry holds
            the generator state dict.

    Returns:
        The generator placed on the module-level ``device``, in eval mode,
        with weight normalisation removed.
    """
    generator = HiFiGAN(AttrDict(v1)).to(device)
    checkpoint = torch.load(checkpoint_path, map_location=device)
    generator.load_state_dict(checkpoint['generator'])
    generator.eval()
    generator.remove_weight_norm()
    return generator

vocoder = load_vocoder(hifigan_checkpoint_path)
denoiser = Denoiser(vocoder, mode='zeros')

  WeightNorm.apply(module, name, dim)
  hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)['generator'])


Removing weight norm...


In [19]:
# Directory that receives the synthesised WAV files; created when missing.
# The path is relative to the current working directory.
output_dir = "../eval_audio"
os.makedirs(output_dir, exist_ok=True)

In [20]:
## Number of ODE solver steps used during synthesis
n_timesteps = 10

## Speaking-rate control
## NOTE(review): presumably values above 1.0 slow the speech down — confirm
length_scale=1.0

## Sampling temperature
temperature = 0.667

In [21]:
# Excerpts from the val.txt validation list.
# Maps each utterance id to the text that will be synthesised.
val_phrases = {
    "LJ002-0278": "while the funds of several bequests and charities were applied in adding to the material comfort of the prisoners.",
    "LJ019-0344": "monitor, or schoolmaster, nor to be engaged in the service of any officer of the prison.",
    "LJ018-0074": "Sattler probably misunderstood, and he declared that the police officer had broken faith with him, having, moreover, stated that",
    "LJ014-0111": "At the second exact information was obtained of Mrs. Manning's movements. She had gone to Edinburgh.",
    "LJ047-0171": "Mrs. Paine had nothing to add to what she had already told him, except that during a visit that past weekend,",
    "LJ010-0256": "Undeterred by the well-merited punishment which had overtaken Francis,",
    "LJ047-0140": "advising that an individual tentatively identified as Oswald had been in touch with the Soviet Embassy in Mexico City",
    "LJ045-0099": "She had, quote, the impression that he was just pushed, pushed, pushed, and she Marina Oswald was probably nagging, nagging, nagging, end quote.",
    "LJ013-0098": "Mr. Oxenford having denied that he had made any transfer of stock, the matter was at once put into the hands of the police.",
    "LJ050-0167": "to develop coordinated and mutually compatible systems, even where such coordination would not seem inconsistent"
}

print(f"Lancement de la synthèse sur l'ensemble de validation ({len(val_phrases)} phrases)...")

Lancement de la synthèse sur l'ensemble de validation (10 phrases)...


In [22]:

def to_waveform(mel, vocoder, denoiser, device, n_mels=80, denoiser_strength=0.00025):
    """Convert a mel-spectrogram to a waveform with a vocoder and a denoiser.

    Args:
        mel: 2-D or 3-D mel tensor. A 2-D input gets a leading batch axis.
            A 3-D input whose last axis (but not its middle axis) has size
            ``n_mels`` is transposed so the mel channels sit on axis 1.
        vocoder: Module called on the prepared mel tensor; it is moved to
            ``device`` first.
        denoiser: Callable invoked as ``denoiser(audio, strength=...)``.
        device: Device on which the vocoder is run.
        n_mels: Number of mel channels, used to detect a transposed input.
            Defaults to 80.
        denoiser_strength: Strength forwarded to the denoiser.
            Defaults to 0.00025.

    Returns:
        The denoised audio as a CPU tensor with singleton dimensions removed.
        The vocoder output is clamped to [-1, 1] before denoising.
    """
    if mel.dim() == 2:
        mel = mel.unsqueeze(0)
    # Only transpose when the layout is unambiguous: channels on the last
    # axis and not already on axis 1.
    if mel.shape[1] != n_mels and mel.shape[2] == n_mels:
        mel = mel.transpose(1, 2)

    mel = mel.to(device=device, dtype=torch.float32)
    vocoder = vocoder.to(device)

    with torch.inference_mode():
        audio = vocoder(mel).clamp(-1, 1)
        # Drop the batch axis before denoising; .cpu() copies the result to host memory.
        audio = denoiser(audio.squeeze(0), strength=denoiser_strength).cpu().squeeze()
    return audio





# Sampling rate (Hz) used both to compute durations and to write the WAV files.
# NOTE(review): 22050 Hz is kept from the original cell — confirm it matches
# the vocoder's output rate.
sample_rate = 22050

# Real-time factor (synthesis time / audio duration) of every utterance
rtf_list = []

for audio_id, text in val_phrases.items():
    # A. Text pre-processing happens before the timer starts, so it is not
    # part of the measured synthesis time.
    result = process_text(text, device)

    # CUDA kernels are launched asynchronously: flush any pending GPU work so
    # it is not billed to the synthesis measured below.
    if device.type == "cuda":
        torch.cuda.synchronize()

    # --- START TIMER ---
    # perf_counter() is monotonic and high resolution; wall-clock
    # datetime.now() can jump when the system clock is adjusted.
    start_t = time.perf_counter()

    with torch.inference_mode():
        # B. Matcha-TTS inference
        output = model.synthesise(
            result['x'],
            result['x_lengths'],
            n_timesteps=n_timesteps,
            temperature=temperature,
            length_scale=length_scale
        )

        # C. Vocoding + denoising
        audio_denoised = to_waveform(output['mel'], vocoder, denoiser, device)

    # --- STOP TIMER ---
    synthesis_time = time.perf_counter() - start_t

    # Duration of the generated audio in seconds: samples / sampling rate
    audio_duration_secs = audio_denoised.shape[-1] / sample_rate

    # RTF of this utterance
    rtf = synthesis_time / audio_duration_secs
    rtf_list.append(rtf)

    # D. Save the waveform
    filename = os.path.join(output_dir, f"{audio_id}_matcha.wav")
    sf.write(filename, audio_denoised.numpy(), sample_rate)

    print(f"Phrase {audio_id} | Temps: {synthesis_time:.2f}s | Audio: {audio_duration_secs:.2f}s | RTF: {rtf:.4f}")

# --- MEAN RTF ---
mean_rtf = np.mean(rtf_list)
print("\n" + "="*30)
print("TERMINÉ !")
print(f"Mean RTF: {mean_rtf:.4f}")
if mean_rtf < 1:
    # An RTF below 1 means audio is produced faster than it plays back.
    print(f"Le modèle est plus rapide que le temps réel ! (x{1/mean_rtf:.2f} speed)")
print("="*30)

Processing complete!
Phrase LJ002-0278 | Temps: 0.12s | Audio: 6.65s | RTF: 0.0176
Processing complete!
Phrase LJ019-0344 | Temps: 0.12s | Audio: 5.94s | RTF: 0.0194
Processing complete!
Phrase LJ018-0074 | Temps: 0.13s | Audio: 8.44s | RTF: 0.0151
Processing complete!
Phrase LJ014-0111 | Temps: 0.12s | Audio: 6.58s | RTF: 0.0176
Processing complete!
Phrase LJ047-0171 | Temps: 0.12s | Audio: 6.94s | RTF: 0.0177
Processing complete!
Phrase LJ010-0256 | Temps: 0.11s | Audio: 4.63s | RTF: 0.0246
Processing complete!
Phrase LJ047-0140 | Temps: 0.12s | Audio: 7.58s | RTF: 0.0162
Processing complete!
Phrase LJ045-0099 | Temps: 0.12s | Audio: 9.56s | RTF: 0.0130
Processing complete!
Phrase LJ013-0098 | Temps: 0.13s | Audio: 8.03s | RTF: 0.0160
Processing complete!
Phrase LJ050-0167 | Temps: 0.13s | Audio: 7.95s | RTF: 0.0158

TERMINÉ !
Mean RTF: 0.0173
Le modèle est plus rapide que le temps réel ! (x57.77 speed)


In [23]:
import os
import whisper
from jiwer import wer
import re
import numpy as np
import time

# 1. LJSpeech validation phrases
# `val_phrases` (utterance id -> reference text) is defined once in the
# synthesis section above and reused here. Keeping a second verbatim copy in
# this cell would let the synthesised and the scored texts silently diverge.

# 2. Parameters
asr_model_name = "medium"    # Whisper "medium" is chosen for accuracy

# 3. Load Whisper
print(f"Chargement de Whisper {asr_model_name}...")
asr_model = whisper.load_model(asr_model_name)

def normalize(text):
    """Lower-case ``text``, drop punctuation and trim surrounding whitespace.

    Every character that is neither a word character nor whitespace is
    removed; inner whitespace is left untouched.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return without_punctuation.strip()


Chargement de Whisper medium...


In [24]:

# 4. Evaluation loop: transcribe each synthesised file and score the
#    transcript against its reference text
stats = []

print(f"\nÉvaluation de {len(val_phrases)} phrases en cours...")
print("-" * 60)

for audio_id, original_text in val_phrases.items():
    audio_path = os.path.join(output_dir, f"{audio_id}_matcha.wav")

    # Utterances without an audio file are reported and left out of the score
    if not os.path.exists(audio_path):
        print(f"Erreur : {audio_path} n'existe pas.")
        continue

    # A. ASR transcription
    result = asr_model.transcribe(audio_path)
    predicted_text = result["text"]

    # B. WER computed on the normalised reference and hypothesis
    ground_truth, hypothesis = normalize(original_text), normalize(predicted_text)
    current_wer = wer(ground_truth, hypothesis)
    stats.append(current_wer)

    # One report per utterance; texts are truncated to 70 characters
    print(
        f"[{audio_id}]",
        f"  Vrai : {original_text[:70]}...",
        f"  Pred : {predicted_text[:70]}...",
        f"  WER  : {current_wer * 100:.2f} %",
        "-" * 30,
        sep="\n",
    )

# 5. Final summary (only when at least one utterance was scored)
if stats:
    mean_wer = np.mean(stats)

    # Interpretation of the mean WER
    if mean_wer < 0.05:
        interpretation = "Interprétation : Excellente intelligibilité."
    elif mean_wer < 0.15:
        interpretation = "Interprétation : Bonne intelligibilité (niveau humain)."
    else:
        interpretation = "Interprétation : Des erreurs de synthèse sont présentes."

    print(
        f"\n{'='*50}",
        "RAPPORT DE QUALITÉ FINAL",
        f"{'='*50}",
        f"Nombre de phrases testées : {len(stats)}",
        f"WER MOYEN                 : {mean_wer * 100:.2f} %",
        interpretation,
        f"{'='*50}",
        sep="\n",
    )


Évaluation de 10 phrases en cours...
------------------------------------------------------------
[LJ002-0278]
  Vrai : while the funds of several bequests and charities were applied in addi...
  Pred :  while the funds of several bequests and charities were applied in add...
  WER  : 0.00 %
------------------------------
[LJ019-0344]
  Vrai : monitor, or schoolmaster, nor to be engaged in the service of any offi...
  Pred :  monitor or schoolmaster, nor to be engaged in the service of any offi...
  WER  : 0.00 %
------------------------------
[LJ018-0074]
  Vrai : Sattler probably misunderstood, and he declared that the police office...
  Pred :  Sattler probably misunderstood, and he declared that the police offic...
  WER  : 0.00 %
------------------------------
[LJ014-0111]
  Vrai : At the second exact information was obtained of Mrs. Manning's movemen...
  Pred :  at the second exact information was obtained of Mrs. Manning's moveme...
  WER  : 0.00 %
----------------------------