# TTS Inference

This notebook can be used to generate audio samples with either NeMo's pretrained models or NeMo TTS models that you have trained yourself. The notebook currently uses a two-step inference procedure. First, a model is used to generate a mel spectrogram from text. Second, another model is used to generate audio from that mel spectrogram.

Currently supported models are:
Mel Spectrogram Generators:
- Tacotron 2
- Glow-TTS

Audio Generators:
- Griffin-Lim
- WaveGlow

In [None]:
# Copyright 2020 NVIDIA. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.
Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, run this cell.
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install unidecode
!pip install nemo_toolkit[tts]

!mkdir configs
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/candidate/examples/tts/conf/tacotron2.yaml
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/candidate/examples/tts/conf/waveglow.yaml
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/candidate/examples/tts/conf/glow_tts.yaml
CONFIG_PATH = "conf/"

In [None]:
from pathlib import Path

# Keep a CONFIG_PATH defined by an earlier cell; otherwise fall back to the
# "conf" directory one level above the notebook.
try:
    CONFIG_PATH
except NameError:
    CONFIG_PATH = Path("..") / "conf"

# Wider option lists, currently commented out:
# supported_spec_gen = ["tacotron2", "glow-tts"]
# supported_audio_gen = ["grifflin-lim", "waveglow"]
# supported_audio_gen_req_checkpoint = ["waveglow"]

# Options that are enabled right now.
supported_spec_gen = ["tacotron2"]
supported_audio_gen = ["waveglow"]
supported_audio_gen_req_checkpoint = ["waveglow"]

# Ask for the text -> mel spectrogram model.
print("Choose one of the following spectrogram generators:")
print(list(supported_spec_gen))
spectrogram_generator = input()

# Ask for the mel spectrogram -> audio model.
print("Choose one of the following audio generators:")
print(list(supported_audio_gen))
audio_generator = input()

# # TODO
# spectrogram_generator = "tacotron2"
# audio_generator = "waveglow"

# Stop here if either answer is not one of the enabled options.
assert spectrogram_generator in supported_spec_gen
assert audio_generator in supported_audio_gen

# Download pre-trained checkpoints

TODO: Enable downloading pretrained models

In [None]:
# Interactive prompts for the checkpoint locations (currently disabled):
# spectrogram_generator_checkpoint_path = input(f"Input the path to the {spectrogram_generator} checkpoint: ")
# if audio_generator in supported_audio_gen_req_checkpoint:
#     audio_generator_checkpoint_path = input(f"Input the path to the {audio_generator} checkpoint: ")

# TODO
# While the prompts above stay disabled, both checkpoints are read from fixed
# locations underneath the current user's home directory.
audio_generator_checkpoint_path = Path.home().joinpath(
    "nemo/NeMo/examples/tts/experiments/1374354-Waveglow_O2_LJS_V1b/WaveGlow/2020-07-27_18-54-10/checkpoints/WaveGlow--last.ckpt"
)
spectrogram_generator_checkpoint_path = Path.home().joinpath(
    "nemo/NeMo/examples/tts/experiments/1325283-Tacotron_O0_LJS_V1b/Tacotron 2/2020-07-24_21-39-14/checkpoints/Tacotron 2--last.ckpt"
)

In [None]:
from omegaconf import OmegaConf
import torch
from ruamel.yaml import YAML
from nemo.collections.asr.parts import parsers

def load_spectrogram_model():
    """Instantiate the spectrogram generator chosen in ``spectrogram_generator``.

    Reads the global ``spectrogram_generator`` choice, loads the matching YAML
    config from ``CONFIG_PATH`` and builds the model from its ``model`` section
    after removing the ``train_ds``, ``validation_ds`` and ``optim`` entries.
    No checkpoint weights are loaded here.

    Returns:
        The model instance constructed from the trimmed config.

    Raises:
        NotImplementedError: if the choice is "glow-tts" (not enabled yet) or
            any other unrecognised value.
    """
    if spectrogram_generator == "tacotron2":
        from nemo.collections.tts.models import Tacotron2Model as SpecModel
        cfg_file = Path(CONFIG_PATH) / "tacotron2.yaml"
    elif spectrogram_generator == "glow-tts":
        # Planned wiring once enabled: GlowTTSModel with CONFIG_PATH / "glow_tts.yaml".
        raise NotImplementedError("The 'glow-tts' spectrogram generator is not supported yet.")
    else:
        raise NotImplementedError(f"Unknown spectrogram generator: {spectrogram_generator!r}")

    with open(cfg_file) as file:
        cfg = OmegaConf.load(file)
    # These sections are deleted before construction; presumably they are only
    # needed for training -- TODO confirm against the model class.
    del cfg.model["train_ds"]
    del cfg.model["validation_ds"]
    del cfg.model["optim"]
    return SpecModel(cfg=cfg.model)

def load_vocoder_model():
    """Build the vocoder named by the global ``audio_generator``.

    Only "waveglow" is handled: its YAML config is read from ``CONFIG_PATH``,
    the ``train_ds``, ``validation_ds`` and ``optim`` entries are removed from
    the ``model`` section, and the model is constructed from what remains.
    No checkpoint weights are loaded here.

    Returns:
        The vocoder model instance constructed from the trimmed config.

    Raises:
        NotImplementedError: for "grifflin-lim" and for any other value that
            is not "waveglow".
    """
    # Reject every choice except WaveGlow before doing any work.
    if audio_generator == "grifflin-lim":
        raise NotImplementedError
    if audio_generator != "waveglow":
        raise NotImplementedError

    from nemo.collections.tts.models import WaveGlowModel as VocoderModel

    config_path = Path(CONFIG_PATH) / "waveglow.yaml"
    with open(config_path) as stream:
        cfg = OmegaConf.load(stream)

    # Remove these sections, in this order, before handing the config to the model.
    for section in ("train_ds", "validation_ds", "optim"):
        del cfg.model[section]
    return VocoderModel(cfg=cfg.model)

# Build both models from their YAML configs; no trained weights yet.
spec_gen = load_spectrogram_model()
vocoder = load_vocoder_model()

# Deserialize the checkpoints onto the CPU so loading does not depend on the
# device they were saved from (for example a GPU index missing on this
# machine). The models are moved to the GPU afterwards.
spec_gen.load_state_dict(
    torch.load(spectrogram_generator_checkpoint_path, map_location="cpu")["state_dict"]
)
vocoder.load_state_dict(
    torch.load(audio_generator_checkpoint_path, map_location="cpu")["state_dict"]
)

# Inference below runs on the GPU.
spec_gen = spec_gen.cuda()
vocoder = vocoder.cuda()

In [None]:
def infer(spec_gen_model, vocder_model, str_input):
    """Run the two-step text-to-speech pipeline on one piece of text.

    Args:
        spec_gen_model: Spectrogram generator; must provide ``parse(text)``
            and ``generate_spectrogram(tokens=...)``.
        vocder_model: Vocoder; must provide
            ``convert_spectrogram_to_audio(spec=...)``. The parameter name is
            misspelled but kept so existing callers keep working.
        str_input: The text to synthesize.

    Returns:
        A ``(spectrogram, audio)`` tuple: the spectrogram generator's output
        and the vocoder's output for that spectrogram.
    """
    # Use the models and text passed in, not the notebook-level globals, so the
    # function works with any model pair and any input string.
    parsed = spec_gen_model.parse(str_input)
    spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed)
    audio = vocder_model.convert_spectrogram_to_audio(spec=spectrogram)
    return spectrogram, audio

In [None]:
# Ask for the sentence to synthesize, then run both inference stages on it.
text_to_generate = input("Input what you want the model to say: ")
spec, audio = infer(
    spec_gen_model=spec_gen,
    vocder_model=vocoder,
    str_input=text_to_generate,
)

In [None]:
import IPython.display as ipd
import numpy as np
from PIL import Image
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt


# Hand the generated audio to the notebook's player as a NumPy array; 22050 is
# the sample rate passed to the player.
ipd.Audio(data=audio.detach().cpu().numpy(), rate=22050)

In [None]:
%matplotlib inline
imshow(spec.detach().cpu().numpy()[0], origin="lower")
plt.show()