In [1]:
%cd ../..
from inference import StyleTTS2

import librosa
import IPython.display as ipd
import torch.cuda

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# To force CPU inference, uncomment the next line:
#device = 'cpu'

d:\Project\TTS\StyleTTS2-lite


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\catto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Load G2P

If you do not use eSpeak for your language, please add your own G2P (grapheme-to-phoneme) function here.

In [2]:
import sys
import phonemizer
# Windows only: hand phonemizer's EspeakWrapper the library path reported by
# espeakng_loader. Any failure is printed instead of raised, so the notebook
# keeps running.
if sys.platform.startswith("win"):
    try:
        from phonemizer.backend.espeak.wrapper import EspeakWrapper
        import espeakng_loader

        _espeak_library = espeakng_loader.get_library_path()
        EspeakWrapper.set_library(_espeak_library)
    except Exception as err:
        print(err)

# Backends already built by get_phoneme, keyed by language code, so repeated
# calls for the same language reuse one EspeakBackend instead of rebuilding it.
_PHONEMIZER_BACKENDS = {}


def get_phoneme(text, lang):
    """Convert ``text`` to a phoneme string with phonemizer's eSpeak backend.

    The backend for each ``lang`` is created on first use and then reused.

    Args:
        text: Text to phonemize.
        lang: Language code passed to ``EspeakBackend`` (e.g. ``"en-us"``).

    Returns:
        The phonemized text, or ``None`` when the backend could not be created
        or phonemization raised. The exception is printed in that case, so
        callers should check for ``None`` before using the result.
    """
    try:
        backend = _PHONEMIZER_BACKENDS.get(lang)
        if backend is None:
            backend = phonemizer.backend.EspeakBackend(
                language=lang,
                preserve_punctuation=True,
                with_stress=True,
                language_switch='remove-flags',
            )
            # Cache only after successful construction so a failed attempt
            # is retried on the next call.
            _PHONEMIZER_BACKENDS[lang] = backend
        return backend.phonemize([text])[0]
    except Exception as e:
        print(e)
        return None

### Load models

In [None]:
# Checkpoint file and configuration file used to build the StyleTTS2 model.
models_path = "Models/Finetune/current_model_100k.pth"
config_path = "Configs/config.yaml"

### Synthesize speech

Note: The reference audio has a huge impact on the result. It is best to choose a clip that is about 10 seconds long and consistent in both tone and speed.

In [4]:
speaker = {
    "path": "./Demo/Audio/1_heart.wav",  #Ref audio path
    "speed": 1.0,                        #Speaking speed
}

sample_rate = 24000             # rate (Hz) the reference clip is loaded and played at
max_samples = sample_rate * 20  # max 20 seconds ref audio
print(speaker['path'])

# librosa.load resamples to the requested `sr`, so `sr` comes back equal to
# sample_rate here.
wave, sr = librosa.load(speaker['path'], sr=sample_rate)
# Trim leading/trailing audio that is more than 30 dB below the peak.
audio, index = librosa.effects.trim(wave, top_db=30)
if sr != sample_rate:
    # Defensive guard only. orig_sr/target_sr are passed by keyword because
    # librosa >= 0.10 no longer accepts them positionally.
    audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
# Slicing is a no-op when the clip is already short enough.
audio = audio[:max_samples]
display(ipd.Audio(audio, rate=sample_rate, normalize=True))

./Demo/Audio/1_heart.wav


In [5]:
# Input text to synthesize. The literal begins and ends with a newline and
# keeps the line break between the two paragraphs.
text = '''
Nearly 300 scholars currently working in the United States have applied for positions at Aix-Marseille University in France, which has announced a program to provide a haven for academics affected by the Trump administration's policies.
Aix-Marseille launched the "Safe Place for Science" initiative earlier this year, offering three-year funded placements for approximately 20 researchers. The program aims to support scholars facing budget cuts and policy changes that have disrupted U.S. academic institutions.
'''

| Parameter      | Type   | Description                                                                 | Performance Impact                                |
|----------------|--------|-----------------------------------------------------------------------------|----------------------------------------------------|
| `avg_style`     | BOOL   | Split the reference audio and calculate the average speaking style.        | Higher computation during style extraction         |
| `denoise`       | FLOAT  | Adjusts denoiser strength; range [0, 1].                                   | Additional computation for style processing        |
| `stabilize`     | BOOL   | Stabilizes speaking speed for long-form synthesis.                         | Slight additional computation                      |
| `n_merge`       | INT    | Avoids short sentences by merging if words < `n_merge`.                   | Higher VRAM usage as value increases               |


In [None]:
# Build the model from the config and checkpoint, call .eval() on it, then
# move it to the selected device.
model = (
    StyleTTS2(config_path, models_path)
    .eval()
    .to(device)
)

# Options passed to model.get_styles() and model.generate() when synthesizing.
avg_style = True   # split the reference audio and average the speaking style
denoise = 0.3      # denoiser strength, range [0, 1]
stabilize = False  # stabilize speaking speed for long-form synthesis
n_merge = 16       # merge sentences that have fewer than n_merge words



decoder : 54289492
predictor : 16194612
text_encoder : 5606400
style_encoder : 13845440

Total : 89935944


In [7]:
# Phonemization does not involve the model, so it runs before torch.no_grad().
phonemes = get_phoneme(text=text, lang="en-us")
if phonemes is None:
    # get_phoneme prints the error and returns None on failure; stop here
    # instead of handing None to model.generate().
    raise RuntimeError("Phonemization failed; see the error printed above.")

with torch.no_grad():
    styles = model.get_styles(speaker, denoise, avg_style)
    r = model.generate(phonemes, styles, stabilize, n_merge)

print('Synthesized:')
display(ipd.Audio(r, rate=24000, normalize=True))

Computing the style for: ./Demo/Audio/1_heart.wav
Generating Audio...
Synthesized:


Optional: The styles tensor can be saved and reused later for faster generation.

In [8]:
# Save the styles to a file, then load them back from that same file.
styles_file = "./style1.pt"
model.save_styles(save_dir=styles_file)
model.load_styles(save_dir=styles_file)

Saved styles!
Loaded styles!


In [9]:
# Phonemization does not involve the model, so it runs before torch.no_grad().
phonemes = get_phoneme(text=text, lang="en-us")
if phonemes is None:
    # get_phoneme prints the error and returns None on failure; stop here
    # instead of handing None to model.generate().
    raise RuntimeError("Phonemization failed; see the error printed above.")

with torch.no_grad():
    # load_styles=True asks get_styles to use the previously loaded styles.
    styles = model.get_styles(speaker, load_styles=True)
    r = model.generate(phonemes, styles, stabilize, n_merge)

print('Synthesized:')
display(ipd.Audio(r, rate=24000, normalize=True))

Generating Audio...
Synthesized:
