In [None]:
# 0) Set up paths - use absolute path to repo root
import os
repo_root = r'd:\Tesis\StyleTTS2'
os.chdir(repo_root)
print('Changed to directory:', os.getcwd())

# 1) Imports and deterministic setup
import time
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
import soundfile as sf

# deterministic seeds for reproducibility (optional)
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device:', device)
print('Current working directory:', os.getcwd())

Changed to directory: d:\
Device: cuda
Current working directory: d:\


In [16]:
# 2) helper functions and audio params
sr = 24000
mean, std = -4, 4

# mel transform used across repo
to_mel = torchaudio.transforms.MelSpectrogram(n_mels=80, n_fft=2048, win_length=1200, hop_length=300)

def length_to_mask(lengths):
    mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
    mask = torch.gt(mask+1, lengths.unsqueeze(1))
    return mask


def preprocess(wave):
    wave_tensor = torch.from_numpy(wave).float()
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
    return mel_tensor

## 3) Load Config and Set Checkpoint

By default this notebook loads `Configs/config.es.yml`. The `checkpoint_path` defaults to `os.path.join(config['log_dir'], 'epoch_2nd_00004.pth')` — change this to the checkpoint you want to use.

In [17]:
cfg_path = r'd:\Tesis\StyleTTS2\Configs\config.es.yml'
config = yaml.safe_load(open(cfg_path))
print('Loaded config:', cfg_path)
print('Config file exists:', os.path.exists(cfg_path))

# default checkpoint (change if needed)
default_ckpt = os.path.join(config['log_dir'], 'epoch_2nd_%05d.pth' % 4)
checkpoint_path = default_ckpt
print('Default checkpoint:', checkpoint_path)

# If your training log_dir uses an absolute path, update root_path in config or set checkpoint_path manually.

Loaded config: d:\Tesis\StyleTTS2\Configs\config.es.yml
Config file exists: True
Default checkpoint: logs/angelina_es\epoch_2nd_00004.pth


## 4) Load model components (ASR, F0, PLBERT) and build StyleTTS2 model

This replicates the same setup used during training so inference uses identical components.

In [18]:
# import model builders & helpers from repo
from models import build_model, load_checkpoint, load_F0_models, load_ASR_models
from Utils.PLBERT.util import load_plbert
from text_utils import TextCleaner
from utils import recursive_munch

# Load ASR, F0 extractor and PL-BERT according to config
ASR_config = config.get('ASR_config', None)
ASR_path = config.get('ASR_path', None)
print('ASR config:', ASR_config, 'ASR path:', ASR_path)

if ASR_config and ASR_path:
    text_aligner = load_ASR_models(ASR_path, ASR_config)
else:
    text_aligner = None

F0_path = config.get('F0_path', None)
if F0_path:
    pitch_extractor = load_F0_models(F0_path)
else:
    pitch_extractor = None

plbert_dir = config.get('PLBERT_dir', None)
plbert = load_plbert(plbert_dir) if plbert_dir else None

# Build model
model_params = recursive_munch(config['model_params'])
model = build_model(model_params, text_aligner, pitch_extractor, plbert)
_ = [model[key].to(device) for key in model]
_ = [model[key].eval() for key in model]
print('Model built and moved to device')

ModuleNotFoundError: No module named 'models'

## 5) Load checkpoint weights

If a checkpoint uses a `module.` prefix (from DataParallel), the loader will handle that.

In [None]:
import torch
from collections import OrderedDict

if not os.path.exists(checkpoint_path):
    print('Checkpoint not found:', checkpoint_path)
    print('List files in', config['log_dir'], 'to choose a different checkpoint:')
    try:
        print('\n'.join(sorted(os.listdir(config['log_dir']))))
    except Exception as e:
        print('Could not list directory:', e)
else:
    print('Loading checkpoint:', checkpoint_path)
    params_whole = torch.load(checkpoint_path, map_location='cpu')
    params = params_whole.get('net', params_whole)

    # try loading directly, otherwise handle DataParallel `module.` prefix
    for key in model:
        if key in params:
            try:
                model[key].load_state_dict(params[key])
                print(f'{key} loaded')
            except Exception:
                # try removing `module.` prefix
                sd = params[key]
                new_sd = OrderedDict()
                for k, v in sd.items():
                    name = k[7:] if k.startswith('module.') else k
                    new_sd[name] = v
                model[key].load_state_dict(new_sd, strict=False)
                print(f'{key} loaded with module.* prefix removal')

    print('Checkpoint loaded (weights).')

## 6) Build Diffusion Sampler

In [None]:
from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule

sampler = DiffusionSampler(
    model.diffusion.diffusion,
    sampler=ADPM2Sampler(),
    sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0),
    clamp=False
)
print('Sampler ready')

## 7) Prepare phonemizer (Spanish) and TextCleaner

We use eSpeak phonemizer for Spanish. Ensure `phonemizer` package is installed in your environment. If not available, you can feed raw characters instead (less accurate prosody).

In [None]:
try:
    import phonemizer
    from phonemizer.backend import EspeakBackend
    global_phonemizer = EspeakBackend(language='es', preserve_punctuation=True, with_stress=True)
    print('Phonemizer loaded (es)')
except Exception as e:
    print('Phonemizer not available:', e)
    global_phonemizer = None

textcleaner = TextCleaner()

# NLTK tokenizer will be used for tokenization
import nltk

print('Text utilities ready')

## 8) Inference function

This function mirrors the LJSpeech demo: it tokenizes / phonemizes Spanish text, gets BERT embedding, samples a style vector via the diffusion sampler, predicts durations and prosody, then uses the decoder to synthesize waveform.

In [None]:
from nltk.tokenize import word_tokenize
import IPython.display as ipd


def inference(text, noise=None, diffusion_steps=5, embedding_scale=1.0):
    text = text.strip()
    text = text.replace('"', '')

    if global_phonemizer is not None:
        ps = global_phonemizer.phonemize([text])
        ps = word_tokenize(ps[0])
        ps = ' '.join(ps)
    else:
        # fallback: basic whitespace tokenizer
        ps = ' '.join(word_tokenize(text))

    tokens = textcleaner(ps)
    tokens.insert(0, 0)
    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)

    with torch.no_grad():
        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)
        text_mask = length_to_mask(input_lengths).to(tokens.device)

        t_en = model.text_encoder(tokens, input_lengths, text_mask)
        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

        if noise is None:
            noise = torch.randn(1,1,256).to(device)

        s_pred = sampler(noise,
                         embedding=bert_dur[0].unsqueeze(0),
                         num_steps=diffusion_steps,
                         embedding_scale=embedding_scale).squeeze(0)

        s = s_pred[:, 128:]
        ref = s_pred[:, :128]

        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)

        x, _ = model.predictor.lstm(d)
        duration = model.predictor.duration_proj(x)
        duration = torch.sigmoid(duration).sum(axis=-1)
        pred_dur = torch.round(duration.squeeze()).clamp(min=1)

        pred_dur[-1] += 5

        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
        c_frame = 0
        for i in range(pred_aln_trg.size(0)):
            pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
            c_frame += int(pred_dur[i].data)

        en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
        out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)),
                            F0_pred, N_pred, ref.squeeze().unsqueeze(0))

    return out.squeeze().cpu().numpy()

print('Inference function ready')

## 9) Example: synthesize a Spanish sentence

In [None]:
# Example Spanish text (edit as you like)
spanish_text = "Hola, esta es una prueba rápida del modelo StyleTTS2 entrenado con datos en español."

# generate noise and run inference
noise = torch.randn(1,1,256).to(device)
start = time.time()
wav = inference(spanish_text, noise=noise, diffusion_steps=5, embedding_scale=1.0)
print('Synthesis time:', time.time() - start, 'seconds')

# display audio widget
display(ipd.Audio(wav, rate=sr))

# save output to disk
out_path = 'Demo/output_spanish.wav'
sf.write(out_path, wav, sr)
print('Saved:', out_path)

## 10) Optional: longer generation or different styles

You can increase `diffusion_steps` for more diverse / higher-quality samples (slower), or change `embedding_scale` to control expressiveness.