## Tacotron 2 inference code 
Edit the variables **checkpoint_path**, **waveglow_path** and **text** to match your setup, then run the whole notebook to plot the mel outputs and alignments and to synthesize audio from the generated mel-spectrogram using WaveGlow.

#### Import libraries and setup matplotlib

In [1]:
import matplotlib
%matplotlib inline
import matplotlib.pylab as plt
import re

import IPython.display as ipd

import sys
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence, cmudict
from denoiser import Denoiser

In [2]:
def plot_data(data, figsize=(16, 4)):
    """Draw every image in ``data`` side by side in a single row of subplots.

    Args:
        data: Sized sequence of array-like images accepted by ``imshow``;
            one subplot is created per element.
        figsize: ``(width, height)`` of the figure, forwarded to
            ``plt.subplots``.
    """
    # squeeze=False makes subplots always return a 2-D array of Axes.  With
    # the default squeezing, a single-element `data` yields one bare Axes
    # object and indexing it fails.
    fig, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        # origin='lower' puts row 0 at the bottom of the plot.
        ax.imshow(image, aspect='auto', origin='lower',
                  interpolation='none')

#### Setup hparams

In [3]:
# Create the hyperparameter object (no overrides are passed here).
hparams = create_hparams()
# Sampling rate of the audio; used below as the playback rate for ipd.Audio.
hparams.sampling_rate = 22050

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



#### Load model from checkpoint

In [4]:
# Location of the trained Tacotron 2 checkpoint -- edit to point at your file.
checkpoint_path = "/home/burning/Workspace/Project/tacotron2/checkpoint_42000"
# Build the model from hparams, then restore the weights stored under the
# checkpoint's 'state_dict' key.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model = load_model(hparams)
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
# Move to the GPU, switch to eval mode and cast to half precision.  Binding
# the result to `_` keeps the model repr from being echoed as cell output.
_ = model.cuda().eval().half()

#### Load WaveGlow for mel2audio synthesis and denoiser

In [5]:
# Location of the pretrained WaveGlow checkpoint -- edit to point at your file.
waveglow_path = '/home/burning/Workspace/Project/tacotron2/waveglow_256channels_universal_v5.pt'
# The whole model object is stored under the 'model' key.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints from a trusted source.
waveglow = torch.load(waveglow_path)['model']
# Move to the GPU, switch to eval mode and cast to half precision.
waveglow.cuda().eval().half()
# Cast each entry of waveglow.convinv back to float after the half() call
# above (presumably these layers need full precision -- TODO confirm).
for k in waveglow.convinv:
    k.float()
# Wrap the vocoder in a Denoiser; later cells call it on the synthesized audio.
denoiser = Denoiser(waveglow)



#### Prepare text input

In [8]:
# Load the CMU pronouncing dictionary used below for ARPAbet lookups; edit the
# path to your local copy.
# NOTE(review): keep_ambiguous=True presumably keeps words that have several
# pronunciations -- confirm against text/cmudict.py.
_cmudict = cmudict.CMUDict("/home/burning/Workspace/Project/tacotron2/cmudict-0.7b", keep_ambiguous=True)

In [9]:
# Look up the dictionary entry for a single word.  The bare `arpabet` on the
# last line displays the lookup result as the cell output.
words = "Christmas"
arpabet = _cmudict.lookup(words)
arpabet

['K R IH1 S M AH0 S']

In [10]:
text = "She found funny socks for {K R IH1 S M AH0 S}."
target_words = "funny socks"
target_words_list = target_words.split()

# Map every target word to its first dictionary pronunciation wrapped in
# braces -- the same inline form already used for "Christmas" in `text`.
arpabet_by_word = {}
for tw in target_words_list:
    entries = _cmudict.lookup(tw)
    if not entries:
        # A word with no dictionary entry stays as plain text instead of
        # failing on `entries[0]`.
        print("No CMUdict entry for %r; leaving it as plain text." % tw)
        continue
    arpabet_by_word[tw] = "{%s}" % entries[0]

if arpabet_by_word:
    # Single pass over the text:
    #   * re.escape treats each word literally rather than as a regex;
    #   * the (?<!\w)/(?!\w) guards only match whole words, so "some" cannot
    #     hit the middle of "awesome";
    #   * the callable replacement inserts the ARPAbet string verbatim.
    word_pattern = re.compile(
        r"(?<!\w)(?:%s)(?!\w)"
        % "|".join(re.escape(tw) for tw in arpabet_by_word))
    text = word_pattern.sub(lambda m: arpabet_by_word[m.group(0)], text)
print(text)

She found {F AH1 N IY0} {S AA1 K S} for {K R IH1 S M AH0 S}.


In [20]:
text = "She’s 28 but has only had seven birthdays."
# Encode the text with the 'english_cleaners' pipeline (presumably into integer
# symbol IDs -- see text/__init__.py); [None, :] adds a leading axis of size 1.
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
# torch.autograd.Variable is a deprecated no-op wrapper (Tensors and Variables
# were merged in PyTorch 0.4), so the tensor is built directly.
sequence = torch.from_numpy(sequence).cuda().long()

#### Decode text input and plot results

In [21]:
# Inference needs no gradients.  Running under no_grad (as the WaveGlow cells
# already do) avoids building an autograd graph for the whole decoder pass.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

# Cast the half-precision outputs to float and move them to the CPU, then plot
# the first item of each: mel, post-net mel, and the transposed alignment.
plot_data((mel_outputs.float().cpu().numpy()[0],
           mel_outputs_postnet.float().cpu().numpy()[0],
           alignments.float().cpu().numpy()[0].T))

In [22]:
# Vocode the post-net mel-spectrogram with WaveGlow, gradients disabled.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

# Run the denoiser over the waveform, then keep index 0 along its second axis.
audio_denoised = denoiser(audio, strength=0.01)
audio_denoised = audio_denoised[:, 0]

# The Audio widget is the cell's last expression, so it renders as a player.
ipd.Audio(data=audio_denoised.cpu().numpy(), rate=hparams.sampling_rate)

#### Synthesize audio from spectrogram using WaveGlow

In [63]:
text = "They really need some thanks for that."
target_words = "some thanks"
target_words_list = target_words.split()

# Map every target word to its first dictionary pronunciation wrapped in
# braces, the inline ARPAbet form understood by the text front end.
arpabet_by_word = {}
for tw in target_words_list:
    entries = _cmudict.lookup(tw)
    if not entries:
        # A word with no dictionary entry stays as plain text instead of
        # failing on `entries[0]`.
        print("No CMUdict entry for %r; leaving it as plain text." % tw)
        continue
    arpabet_by_word[tw] = "{%s}" % entries[0]

if arpabet_by_word:
    # Single pass over the text:
    #   * re.escape treats each word literally rather than as a regex;
    #   * the (?<!\w)/(?!\w) guards only match whole words, so "some" cannot
    #     hit the middle of "awesome";
    #   * the callable replacement inserts the ARPAbet string verbatim.
    word_pattern = re.compile(
        r"(?<!\w)(?:%s)(?!\w)"
        % "|".join(re.escape(tw) for tw in arpabet_by_word))
    text = word_pattern.sub(lambda m: arpabet_by_word[m.group(0)], text)
print(text)

# Encode the text with the 'english_cleaners' pipeline; [None, :] adds a
# leading axis of size 1.
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
# torch.autograd.Variable is a deprecated no-op wrapper (Tensors and Variables
# were merged in PyTorch 0.4), so the tensor is built directly.
sequence = torch.from_numpy(sequence).cuda().long()

They really need {S AH1 M} {TH AE1 NG K S} for that.


In [27]:
with torch.no_grad():
    # Decode the current `sequence` first.  Without this step the vocoder
    # would synthesize whatever mel-spectrogram an earlier cell left in
    # `mel_outputs_postnet`, not the text prepared in the cell above.
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
# Play the first waveform of the batch at the configured sampling rate.
ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)

#### (Optional) Remove WaveGlow bias

In [28]:
# Run the denoiser over the WaveGlow output, then keep index 0 along the
# second axis of the result.
audio_denoised = denoiser(audio, strength=0.01)
audio_denoised = audio_denoised[:, 0]

# Render the denoised waveform as an audio player (last expression of the cell).
ipd.Audio(data=audio_denoised.cpu().numpy(), rate=hparams.sampling_rate)

In [18]:
# Load an array saved on disk (presumably a mel-spectrogram -- it is fed
# straight to WaveGlow below) and turn it into a tensor.
test_mel = torch.from_numpy(np.load('text.npy'))
# Cast to half precision, add a leading axis of size 1 and move to the GPU.
test_mel = test_mel.half().unsqueeze(0).cuda()

# Vocode with gradients disabled, then play the first waveform of the batch.
with torch.no_grad():
    audio = waveglow.infer(test_mel, sigma=0.666)
ipd.Audio(data=audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)