## Tacotron 2 inference code 
Edit the variables **checkpoint_path** and **text** to match your setup, then run all cells to plot the mel outputs and alignments and to synthesize audio from the generated mel-spectrogram using WaveGlow.

#### Import libraries and setup matplotlib

In [2]:
import matplotlib
%matplotlib inline
import matplotlib.pylab as plt

import IPython.display as ipd

import sys
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
# from denoiser import Denoiser

In [3]:
def plot_data(data, figsize=(16, 4)):
    """Draw every item of ``data`` as an image, side by side in one row.

    Args:
        data: Sequence of array-like images accepted by ``imshow``; one
            subplot is created per item.
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    # squeeze=False keeps ``axes`` two-dimensional even for a single panel,
    # so a one-item ``data`` no longer breaks the indexing.
    _, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        # 'lower' places row 0 at the bottom of the panel; the former
        # 'bottom' spelling is rejected by current matplotlib releases.
        ax.imshow(image, aspect='auto', origin='lower',
                  interpolation='none')

#### Setup hparams

In [4]:
# Build the hyper-parameter object (no overrides are passed here).
hparams = create_hparams()
# Pin the sampling rate explicitly; the audio cells below pass it to
# ipd.Audio as the playback rate.
hparams.sampling_rate = 22050

attention_dim: 128
attention_location_kernel_size: 31
attention_location_n_filters: 32
attention_rnn_dim: 1024
batch_size: 16
cudnn_benchmark: false
cudnn_enabled: true
decoder_rnn_dim: 1024
dist_backend: nccl
dist_url: tcp://localhost:54321
distributed_run: false
dynamic_loss_scaling: true
encoder_embedding_dim: 512
encoder_kernel_size: 5
encoder_n_convolutions: 3
epochs: 37
filter_length: 1024
fp16_run: false
gate_threshold: 0.5
grad_clip_thresh: 1.0
hop_length: 256
ignore_layers:
- embedding.weight
iters_per_checkpoint: 200
learning_rate: 0.001
load_mel_from_disk: false
mask_padding: true
max_decoder_steps: 1000
max_wav_value: 32768.0
mel_fmax: 8000.0
mel_fmin: 0.0
n_frames_per_step: 1
n_mel_channels: 80
n_symbols: 150
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
prenet_dim: 256
sampling_rate: 22050
seed: 1234
symbols_embedding_dim: 512
text_cleaners:
- flowtron_cleaners
training_files: filelists/vlsp/tra

#### Load model from checkpoint

In [5]:
# Path of the Tacotron 2 checkpoint to run inference with; edit to match yours.
checkpoint_path = "./checkpoints/checkpoint_10400"
model = load_model(hparams)
# map_location='cpu' makes deserialization independent of the GPU index the
# checkpoint was saved from; load_state_dict then copies the values into the
# model's own parameters.
model.load_state_dict(
    torch.load(checkpoint_path, map_location='cpu')['state_dict'])
# Move to the GPU, switch to eval mode and cast to half precision. Assigning
# to `_` keeps the notebook from echoing the module repr.
_ = model.cuda().eval().half()

#### Load WaveGlow for mel2audio synthesis and denoiser

In [6]:
# The synthesis and bias-removal cells below use `waveglow` and `denoiser`,
# so this cell must actually run for them to work.
# Denoiser is imported here because its import at the top of the notebook is
# commented out.
from denoiser import Denoiser

# Path of the pretrained WaveGlow checkpoint; edit to match yours.
waveglow_path = 'waveglow_256channels.pt'
waveglow = torch.load(waveglow_path)['model']
waveglow.cuda().eval().half()
for k in waveglow.convinv:
    # Cast each entry of `convinv` back to float32 after the half() above.
    k.float()
denoiser = Denoiser(waveglow)

#### Prepare text input

In [None]:
# Sentence to synthesize; edit to match yours.
text = "Waveglow is really awesome!"
# Use the cleaners configured in hparams instead of a hard-coded list, so the
# text front-end follows the configuration (the hparams dump above lists
# flowtron_cleaners, not english_cleaners).
# NOTE(review): assumes the checkpoint was trained with hparams.text_cleaners
# - confirm.
sequence = np.array(text_to_sequence(text, hparams.text_cleaners))[None, :]
# [None, :] above adds a leading axis of size 1. Tensors no longer need the
# deprecated torch.autograd.Variable wrapper.
sequence = torch.from_numpy(sequence).cuda().long()

#### Decode text input and plot results

In [None]:
# Run inference without autograd bookkeeping, as the WaveGlow cell below does;
# no gradients are needed here and recording them only costs memory.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
# Cast to float32 and move to the CPU for plotting; [0] takes the first item
# along the leading axis, and the alignment is transposed for display.
plot_data((mel_outputs.float().cpu().numpy()[0],
           mel_outputs_postnet.float().cpu().numpy()[0],
           alignments.float().cpu().numpy()[0].T))

#### Synthesize audio from spectrogram using WaveGlow

In [None]:
# Vocode the post-net mel-spectrogram with WaveGlow; gradients are not needed.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
# Take the first waveform and hand it to the notebook audio player as a NumPy
# array on the CPU.
first_waveform = audio[0].data.cpu().numpy()
ipd.Audio(first_waveform, rate=hparams.sampling_rate)

#### (Optional) Remove WaveGlow bias

In [None]:
# Run the denoiser over the WaveGlow output, then keep index 0 of the second
# axis of its result.
audio_denoised = denoiser(audio, strength=0.01)
audio_denoised = audio_denoised[:, 0]
# Play the denoised waveform at the configured sampling rate.
ipd.Audio(audio_denoised.cpu().numpy(), rate=hparams.sampling_rate)