## Tacotron 2 Variant inference code 
Edit the variables **checkpoint_path**, **waveglow_path**, **text**, and **featurepath** (the path to the emotion feature file) to match your setup, then run all cells to generate a WAV file.

#### Import libraries and setup matplotlib

In [None]:
import matplotlib
%matplotlib inline
import os
import matplotlib.pylab as plt
import IPython.display as ipd

import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
import sys
sys.path.append(f'{os.path.join("/", *os.getcwd().split(os.sep))}/waveglow')
from denoiser import Denoiser
from glow import WaveGlow

In [None]:
def plot_data(data, figsize=(16, 4)):
    """Draw every 2-D array in ``data`` as an image, side by side in one row.

    Args:
        data: Sequence of 2-D arrays; one subplot is created per entry.
        figsize: Size of the whole figure, forwarded to ``plt.subplots``.
    """
    # squeeze=False always returns a 2-D array of axes. With the default,
    # a single-element ``data`` yields a bare Axes object that cannot be
    # indexed, which made the one-panel case fail.
    fig, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    fig.show()
    for ax, image in zip(axes[0], data):
        ax.imshow(image, aspect='auto', origin='lower',
                  interpolation='none')

#### Setup hparams && Load model from checkpoint

In [None]:
# Switches describing the model variant. They are forwarded to
# create_hparams() and decide which checkpoint path and which
# model.inference() call are used further down.
multi_speaker = True
multi_emotion = True
# True: condition on an emotion feature tensor (emotion_feature=efeature).
# False: condition on an emotion id instead (emotion_id=eid).
emotion_feature = True
# emotion_feature = False

In [None]:
# Build the hyper-parameters for the selected model variant.
# Other call forms, kept for reference:
#   create_hparams()
#   create_hparams(is_multi_speaker=False, is_multi_emotion=False, is_emotion_feature=False)
#   create_hparams(is_multi_speaker=True, is_multi_emotion=True, is_emotion_feature=False)
hparams = create_hparams(
    is_multi_speaker=multi_speaker,
    is_multi_emotion=multi_emotion,
    is_emotion_feature=emotion_feature,
)
hparams.sampling_rate = 22050

# Each supported variant has its own checkpoint; edit the matching path.
if multi_speaker and multi_emotion and emotion_feature:
    checkpoint_path = "path/to/checkpoint"
elif multi_speaker and multi_emotion:
    checkpoint_path = "path/to/checkpoint"
else:
    # Without this branch an unsupported flag combination either raised a
    # NameError or silently reused a checkpoint_path left over from an
    # earlier run of this cell.
    raise ValueError(
        "No checkpoint configured for "
        f"multi_speaker={multi_speaker}, multi_emotion={multi_emotion}, "
        f"emotion_feature={emotion_feature}"
    )

# Instantiate the network, restore its weights and switch to eval mode.
model = load_model(hparams)
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
_ = model.eval()

#### Load WaveGlow for mel2audio synthesis and denoiser

In [None]:
# Restore the WaveGlow vocoder. The checkpoint stores the model object
# itself under the 'model' key, so torch.load unpickles it: only load
# checkpoints from a trusted source.
waveglow_path = 'path/to/waveglow_checkpoint'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()

# Cast every entry of waveglow.convinv to float32.
for invertible_conv in waveglow.convinv:
    invertible_conv.float()

# The denoiser is built from the loaded vocoder and is used in the optional
# bias-removal step.
denoiser = Denoiser(waveglow)

#### Prepare text input

In [None]:
text = "If they mother knew it."

# Convert the text to symbol ids and add a leading batch axis.
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
# torch.autograd.Variable is a deprecated no-op wrapper, so the tensor is
# moved to the GPU directly.
sequence = torch.from_numpy(sequence).cuda().long()

# Emotion ids and the names of the matching per-emotion feature files.
eid2efeature_name = {
    0: 'neutral',
    1: 'angry',
    2: 'happy',
    3: 'sad',
    4: 'surprise',
}

# eid, emotion id (a key of eid2efeature_name), e.g. 4.
# It is required by the emotion_id=eid inference path that is taken when
# emotion_feature is False; it used to exist only in commented-out code.
eid = torch.tensor([4]).cuda().long()
# To use the per-emotion feature file instead of a custom one:
# featurepath = f'path/to/mmefeature/{eid2efeature_name[4]}.pt'

# sid, speaker id, e.g. 3
sid = torch.tensor([3]).cuda().long()
# featurepath, path to emotion feature, e.g. EPAlign/test/implict_fused/fused_feature_name.pt
featurepath = 'path/to/emotion.pt'

# Load the emotion feature and add a leading batch axis.
efeature = torch.load(featurepath).cuda().float().unsqueeze(0)

In [None]:
# Display the prepared inputs: token ids, speaker id and emotion feature shape.
sequence, sid, efeature.shape

#### Decode text input and plot results

In [None]:
# Run the acoustic model without autograd bookkeeping, as the batch
# synthesis loop further down already does; only forward outputs are needed.
with torch.no_grad():
    if multi_speaker and multi_emotion and emotion_feature:
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
            sequence, speaker_id=sid, emotion_feature=efeature)
    elif multi_speaker and multi_emotion:
        # Expects `eid` (emotion id tensor) to be defined beforehand.
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
            sequence, speaker_id=sid, emotion_id=eid)
    else:
        # Fail loudly instead of plotting outputs left over from an earlier
        # run (or hitting a NameError) for an unsupported flag combination.
        raise ValueError(
            "No inference path for "
            f"multi_speaker={multi_speaker}, multi_emotion={multi_emotion}, "
            f"emotion_feature={emotion_feature}"
        )

# Plot the first batch item of: raw mel, post-net mel, transposed alignment.
plot_data((mel_outputs.float().data.cpu().numpy()[0],
           mel_outputs_postnet.float().data.cpu().numpy()[0],
           alignments.float().data.cpu().numpy()[0].T))

In [None]:
# Display the predicted mel spectrograms before and after the post-net.
mel_outputs, mel_outputs_postnet

#### Synthesize audio from spectrogram using WaveGlow

In [None]:
# Turn the post-net mel spectrogram into a waveform with WaveGlow;
# gradients are not needed for synthesis.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
# Last expression of the cell: shows an audio player for the first batch item.
ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)

In [None]:
import soundfile as sf

# Save the synthesized waveform (first batch item) to "speech.wav" at the
# configured sampling rate.
sf.write("speech.wav", audio[0].data.cpu().numpy(), hparams.sampling_rate)

#### (Optional) Remove WaveGlow bias

In [None]:
# Run the denoiser over the generated audio and keep index 0 along the
# second axis of its output.
audio_denoised = denoiser(audio, strength=0.01)[:, 0]
# Last expression of the cell: shows an audio player for the denoised audio.
ipd.Audio(audio_denoised.cpu().numpy(), rate=hparams.sampling_rate) 

## Generate speeches

In [None]:
from data_utils import TextMelLoader
from hparams import create_hparams

# The multi_speaker / multi_emotion / emotion_feature flags are defined
# above, in the "Setup hparams && Load model from checkpoint" section:
# multi_speaker = True
# multi_emotion = True
# emotion_feature = True

training_files = 'path/to/filelists'
validation_files = 'path/to/filelists'

# NOTE(review): this rebinds the global `hparams` with all three variant
# switches hard-coded to True and without the earlier
# `hparams.sampling_rate = 22050` override - confirm that is intended.
hparams = create_hparams(
    None,
    is_multi_speaker=True,
    is_multi_emotion=True,
    is_emotion_feature=True,
    training_files=training_files,
    validation_files=validation_files,
)

# Each item is: text, mel, sid, eid, efeature, audio_path
esd_en_dataset = TextMelLoader(
    'path/to/filelists',
    hparams=hparams,
    is_return_path=True,
)
# Last expression of the cell: shows the number of utterances in the dataset.
len(esd_en_dataset)

In [None]:
# Display the first dataset item to check what the loader returns.
esd_en_dataset[0]

In [None]:
# Imported here as well so this cell does not depend on the optional
# "save to file" cell having been run.
import soundfile as sf
from tqdm import tqdm

# Output directory per model variant; edit the matching path.
if multi_speaker and multi_emotion and emotion_feature:
    save_path = "save/path"
elif multi_speaker and multi_emotion:
    save_path = "save/path"
else:
    # Avoid a NameError (or writing into a stale save_path from an earlier
    # run) when the flag combination is not supported.
    raise ValueError(
        "No save path configured for "
        f"multi_speaker={multi_speaker}, multi_emotion={multi_emotion}, "
        f"emotion_feature={emotion_feature}"
    )

# sf.write does not create missing directories.
os.makedirs(save_path, exist_ok=True)

with torch.no_grad():
    # Wrap the dataset itself rather than enumerate(...): tqdm can then read
    # its length and show a real progress bar with a total and ETA.
    for items in tqdm(esd_en_dataset):
        text, mel, sid, eid, efeature, audio_path = items
        # Each output keeps the file name of its source recording.
        audio_name = os.path.basename(audio_path)

        # Move the inputs to the GPU; the text gets a leading batch axis.
        text = text.cuda().long().unsqueeze(0)
        sid = sid.cuda().long()
        eid = eid.cuda().long()

        if emotion_feature:
            efeature = efeature.cuda().float().unsqueeze(0)
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
                text, speaker_id=sid, emotion_id=eid, emotion_feature=efeature)
        else:
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
                text, speaker_id=sid, emotion_id=eid)

        # Vocode the mel spectrogram and write the first batch item to disk.
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
        sf.write(os.path.join(save_path, audio_name),
                 audio[0].data.cpu().numpy(), hparams.sampling_rate)

In [None]:
# Show an audio player for `audio`, which at this point holds the output of
# the most recent waveglow.infer call.
ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)