In [1]:
from inference_onnx import StyleTTS2, Preprocess
import onnxruntime
import librosa
import IPython.display as ipd
import torch.cuda

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Load G2P

If you did not use eSpeak to phonemize your language, replace `get_phoneme` below with your own G2P (grapheme-to-phoneme) function.

In [2]:
import sys
import phonemizer
# On Windows, point phonemizer's eSpeak wrapper at the library path reported by
# espeakng_loader. Other platforms skip this step.
if sys.platform.startswith("win"):
    from phonemizer.backend.espeak.wrapper import EspeakWrapper
    import espeakng_loader

    espeak_library_path = espeakng_loader.get_library_path()
    EspeakWrapper.set_library(espeak_library_path)

def get_phoneme(text, lang):
    """Convert text to a phoneme string with phonemizer's eSpeak backend.

    Args:
        text: Text to phonemize.
        lang: Language code handed to ``EspeakBackend`` (this notebook uses "en-us").

    Returns:
        The phonemized form of ``text`` (the single entry the backend returns
        for the one-element input list).

    Raises:
        RuntimeError: If the backend cannot be created or phonemization fails.
            The original exception is chained as ``__cause__``.
    """
    try:
        # Punctuation and stress marks are kept; language-switch flags are removed.
        backend = phonemizer.backend.EspeakBackend(
            language=lang,
            preserve_punctuation=True,
            with_stress=True,
            language_switch='remove-flags',
        )
        return backend.phonemize([text])[0]
    except Exception as e:
        # Fail here instead of printing and returning None, which would only
        # surface later as an unrelated error in the synthesis cells.
        raise RuntimeError(f"Phonemization failed for language {lang!r}: {e}") from e

### Load models

In [3]:
# Model configuration (YAML) and pretrained checkpoint. Both paths are passed to
# StyleTTS2 and Preprocess in the cells below.
config_path = "../Models/Pretrained/hifi/en/config.yaml"
models_path = "../Models/Pretrained/hifi/en/libri_100k.pth"

### Synthesize speech

In [4]:
# Reference speaker settings: the reference audio clip and the speaking speed.
speaker = {
    "path": "../Demo/Audio/15_liam.wav",  # reference audio path
    "speed": 1.0,                         # speaking speed
}

sample_rate = 24000              # rate used to load and play the reference audio
max_samples = sample_rate * 20   # keep at most 20 seconds of reference audio
print(speaker['path'])
# librosa.load resamples to the requested rate, so `sr` always equals
# `sample_rate` here and no separate resampling step is needed.
wave, sr = librosa.load(speaker['path'], sr=sample_rate)
# Strip leading/trailing audio that is more than 30 dB below the peak.
audio, index = librosa.effects.trim(wave, top_db=30)
# Slicing is a no-op when the clip is already shorter than the limit.
audio = audio[:max_samples]
display(ipd.Audio(audio, rate=sr, normalize=True))

../Demo/Audio/15_liam.wav


In [5]:
# Text to synthesize. The triple-quoted literal keeps the newline right after the
# opening quotes and the one right before the closing quotes.
text = '''
Nearly 300 scholars currently working in the United States have applied for positions at Aix Marseille University in France, which has announced a program to provide a haven for academics affected by the Trump administration's policies. Aix Marseille launched the Safe Place for Science initiative earlier this year, offering three year funded placements for approximately 20 researchers. The program aims to support scholars facing budget cuts and policy changes that have disrupted US academic institutions.
'''
# Phonemize the text with the eSpeak-based helper, using the "en-us" language code.
phonemes = get_phoneme(text=text, lang="en-us")

#### PyTorch

In [6]:
# Build the PyTorch synthesizer and the input preprocessor from the same
# config file and checkpoint.
model = StyleTTS2(config_path, models_path, device=device)
preprocess = Preprocess(config_path, models_path, device=device)

# Inference only, so gradient tracking is switched off for the whole pipeline.
with torch.no_grad():
    tokens, mel, speed = preprocess.preprocess_input(phonemes, speaker)
    style = preprocess.get_style(mel)

    # Move the model output to host memory as a NumPy array for playback.
    r = (
        model(tokens, style, speed)
        .cpu()
        .numpy()
    )

print('Synthesized:')
display(ipd.Audio(data=r, rate=24000, normalize=True))

  WeightNorm.apply(module, name, dim)



Found: 178 symbols
Synthesized:


#### ONNX

In [7]:
import numpy as np
def to_numpy(x):
    """Convert a model input to a NumPy array suitable for an ONNX feed dict.

    Args:
        x: A real-valued scalar (Python ``int``/``float`` or a NumPy scalar),
            a tensor-like object exposing ``.cpu()`` and ``.numpy()``, or a
            ``numpy.ndarray``.

    Returns:
        A one-element ``float32`` array for scalars, the host-memory array of a
        tensor-like object, or ``x`` itself when it is already an ndarray.

    Raises:
        TypeError: If ``x`` is none of the supported kinds (``bool`` included).
    """
    # bool is a subclass of int; keep rejecting it rather than silently
    # turning a flag into 0.0 / 1.0.
    is_scalar = isinstance(x, (int, float, np.integer, np.floating))
    if is_scalar and not isinstance(x, bool):
        return np.array([x], dtype=np.float32)
    if hasattr(x, 'cpu'):
        # Detach first so tensors that still track gradients can be converted.
        if hasattr(x, 'detach'):
            x = x.detach()
        return x.cpu().numpy()
    if isinstance(x, np.ndarray):
        return x
    raise TypeError(f"Unsupported type for ONNX input: {type(x)}")

In [None]:
# Choose execution providers explicitly: GPU-enabled onnxruntime builds may refuse
# to create a session without them. CUDA is used only when the notebook runs on
# 'cuda' and this onnxruntime build offers it; CPU is always the fallback.
providers = ["CPUExecutionProvider"]
if device == 'cuda' and "CUDAExecutionProvider" in onnxruntime.get_available_providers():
    providers.insert(0, "CUDAExecutionProvider")
styletts2_session = onnxruntime.InferenceSession("styletts.onnx", providers=providers)
preprocess = Preprocess(config_path, models_path, device=device)
# Still needs the PyTorch weights to load style_encoder.
# TODO: export style_encoder to ONNX separately.

# Preprocessing and style extraction still run in PyTorch, without gradients.
with torch.no_grad():
    tokens, mel, speed = preprocess.preprocess_input(phonemes, speaker)
    style              = preprocess.get_style(mel)
model_inputs = {
    "tokens": to_numpy(tokens),
    "style": to_numpy(style),
    "speed": to_numpy(speed)
}
# InferenceSession.run returns a list with one entry per model output, so play
# the first entry rather than the list itself.
# NOTE(review): assumes the first output of styletts.onnx is the waveform - confirm
# against the export script.
wav = styletts2_session.run(None, model_inputs)[0]
print('Synthesized:')
display(ipd.Audio(wav, rate=24000, normalize=True))


Found: 178 symbols
Synthesized:
