## Tacotron 2 inference code 
Edit the variables **checkpoint_path** and **text** to match yours and run the entire code to generate plots of mel outputs, alignments and audio synthesis from the generated mel-spectrogram using Griffin-Lim.

#### Import libraries and setup matplotlib

In [1]:
import matplotlib
%matplotlib inline
import matplotlib.pylab as plt

import IPython.display as ipd
import os
import sys
sys.path.append(os.path.join(sys.path[0],'waveglow/'))
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
from denoiser import Denoiser

In [2]:
def plot_data(data, figsize=(16, 4)):
    fig, axes = plt.subplots(1, len(data), figsize=figsize)
    for i in range(len(data)):
        axes[i].imshow(data[i], aspect='auto', origin='lower', 
                       interpolation='none')

#### Setup hparams

In [3]:
hparams = create_hparams()
hparams.sampling_rate = 22050

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



#### Load model from checkpoint

In [4]:
checkpoint_path = "tacotron2_statedict.pt"
use_cuda = True
model = load_model(hparams,use_cuda)
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
model.cuda().eval().half() if use_cuda else model.eval()

Tacotron2(
  (embedding): Embedding(148, 512)
  (encoder): Encoder(
    (convolutions): ModuleList(
      (0): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (2): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (lstm): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (prenet): Prenet(
      (layers): ModuleList(
        (0): LinearNorm(
          (lin

#### Load WaveGlow for mel2audio synthesis and denoiser

In [5]:
waveglow_path = 'waveglow_256channels_universal_v5.pt'
waveglow = torch.load(waveglow_path)['model']
waveglow.use_cuda = use_cuda
waveglow.cuda().eval().half() if use_cuda else waveglow.eval()
for k in waveglow.convinv:
    k.float()
denoiser = Denoiser(waveglow,use_cuda=use_cuda)

Infer: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 16.10it/s]


#### Prepare text input

In [6]:
text = "77777777777!"
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
#sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
device = torch.device('cuda' if use_cuda else 'cpu')
sequence = torch.autograd.Variable(
    torch.from_numpy(sequence)).to(device).long()

#### Decode text input and plot results

In [7]:
mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
plot_data((mel_outputs.float().data.cpu().numpy()[0],
           mel_outputs_postnet.float().data.cpu().numpy()[0],
           alignments.float().data.cpu().numpy()[0].T))

#### Synthesize audio from spectrogram using WaveGlow

In [8]:
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)

Infer: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 151.87it/s]


#### (Optional) Remove WaveGlow bias

In [9]:
audio_denoised = denoiser(audio, strength=0.01)[:, 0]
ipd.Audio(audio_denoised.cpu().numpy(), rate=hparams.sampling_rate) 

In [10]:
wav = audio[0].data.cpu().numpy()
wav = (wav/np.amax(wav) * 32767).astype(np.int16)

In [11]:
import pygame
pygame.mixer.quit()
pygame.mixer.init(frequency=22050,size=-16, channels=1)
channel = pygame.mixer.Channel(0)
sound = pygame.mixer.Sound(wav)
channel.queue(sound)

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


In [12]:
channel.stop()

In [13]:
from PyQt5.QtCore import QCoreApplication
from PyQt5.QtMultimedia import QSound

app = QCoreApplication(sys.argv)

In [14]:
sound = QSound(wav)
sound.play()
sys.exit(app.exec_())

TypeError: QSound(str, parent: QObject = None): argument 1 has unexpected type 'numpy.ndarray'