## Tacotron 2 inference code 
Edit the variables **tacotron_check**, **waveglow_path** and **text** to match yours, then run the entire notebook to generate plots of the mel outputs and alignments and to synthesize audio from the generated mel-spectrogram using WaveGlow.

#### Import libraries and setup matplotlib

In [None]:
# Mount Google Drive at /content/drive; the project, checkpoint and wav paths
# used later in this file all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install third-party packages into the notebook runtime (one shell call each).
!pip install jamo
!pip install unidecode
!pip install pillow
!pip install librosa
!pip install matplotlib

In [None]:
%cd /content/drive/MyDrive/머신러닝 Teamproj/tacotron2-master # 해당 코드 파일로 경로 지정정
import matplotlib
%matplotlib inline
import matplotlib.pylab as plt

import IPython.display as ipd

import sys
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import defaults
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
from denoiser import Denoiser

In [None]:
def load_model(hparams):
    """Build a Tacotron2 model from ``hparams`` and move it to the GPU.

    This definition shadows the ``load_model`` imported from ``train`` in the
    import cell of this file.

    Args:
        hparams: Hyper-parameter object passed to ``Tacotron2``; this function
            itself reads its ``fp16_run`` and ``distributed_run`` attributes.

    Returns:
        The CUDA-resident model, wrapped by ``apply_gradient_allreduce`` when
        ``hparams.distributed_run`` is truthy.
    """
    model = Tacotron2(hparams).cuda()
    if hparams.fp16_run:
        # Use np.finfo: a bare `finfo` is never imported in this file and
        # would raise NameError as soon as fp16_run is enabled.
        model.decoder.attention_layer.score_mask_value = np.finfo('float16').min

    if hparams.distributed_run:
        # NOTE(review): apply_gradient_allreduce is not imported anywhere in
        # this file; presumably it comes from the project's distributed
        # module -- import it before enabling distributed_run.
        model = apply_gradient_allreduce(model)

    return model

In [None]:
def plot_data(data, figsize=(16, 4)):
    """Draw every 2-D array in ``data`` as an image, side by side in one row.

    Args:
        data: Sized, indexable collection of 2-D arrays (one subplot each).
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    # squeeze=False always yields a 2-D array of Axes, so a single-item
    # ``data`` works too (otherwise subplots returns a bare Axes object).
    fig, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        # 'lower' is the valid spelling for a bottom-left origin; current
        # matplotlib rejects 'bottom'.
        ax.imshow(image, aspect='auto', origin='lower',
                  interpolation='none')

In [None]:
class Struct:
    """Plain attribute bag: each keyword argument becomes an instance attribute."""

    def __init__(self, **entries):
        namespace = self.__dict__
        for key, value in entries.items():
            namespace[key] = value

#### Setup hparams

In [None]:
# Start from the project defaults, overriding the mel-channel count and the
# audio sampling rate in the same step.
hparams = Struct(**{**defaults, 'n_mel_channels': 80, 'sampling_rate': 22050})

#### Load model from checkpoint

In [None]:
# Path of the trained Tacotron 2 checkpoint on the mounted Drive.
tacotron_check= "/content/drive/MyDrive/머신러닝 Teamproj/checkpoint_10000"
# Build the model, then restore the weights stored under the checkpoint's
# 'state_dict' key.
model = load_model(hparams)
model.load_state_dict(torch.load(tacotron_check)['state_dict'])
# Move to the GPU and switch to evaluation mode for inference.
model.cuda().eval()

#### Load WaveGlow for mel2audio synthesis and denoiser

In [None]:
!git clone https://github.com/NVIDIA/waveglow.git
!cd waveglow
!git submodule init
!git submodule update

In [None]:
%cd /content/drive/MyDrive/머신러닝 Teamproj/tacotron2-master/waveglow
waveglow_path = '/content/drive/MyDrive/머신러닝 Teamproj/tacotron2-master/waveglow_256channels_universal_v5.pt'
waveglow = torch.load(waveglow_path)['model']
waveglow.cuda().eval()
for k in waveglow.convinv:
    k.float()
denoiser = Denoiser(waveglow)

#### Prepare text input

In [None]:
text = "I can do this all day."
# Use 'english_cleaners' for the English model and 'korean_cleaners' for the
# Korean model.
symbol_ids = text_to_sequence(text, ['english_cleaners'])
# Add a leading batch axis, then hand the ids to the GPU as int64.
batch_ids = np.array(symbol_ids)[None, :]
sequence = torch.autograd.Variable(torch.from_numpy(batch_ids)).cuda().long()
print(sequence)

#### Decode text input and plot results

In [None]:
# Run the model on the encoded text; the third returned value is unused here.
mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
# Plot the first batch item of each output as a CPU numpy array.
plot_data(tuple(
    tensor.float().data.cpu().numpy()[0]
    for tensor in (mel_outputs, mel_outputs_postnet, alignments)
))

#### Synthesize audio from spectrogram using WaveGlow

In [None]:
# Vocode the post-net mel-spectrogram without tracking gradients.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
    audio = audio.cuda()
# Play the first batch item at the configured sampling rate.
waveform = audio[0].data.cpu().numpy()
ipd.Audio(waveform, rate=hparams.sampling_rate)

#### (Optional) Remove WaveGlow bias

In [None]:
# Run the denoiser at low strength and keep index 0 of its second axis.
denoiser_output = denoiser(audio, strength=0.01)
audio_denoised = denoiser_output[:, 0]
denoised_waveform = audio_denoised.cpu().numpy()
ipd.Audio(denoised_waveform, rate=hparams.sampling_rate)

#### Show spectrogram (dB) of the wav data

In [None]:
import numpy as np
import librosa, librosa.display 
import matplotlib.pyplot as plt

FIG_SIZE = (15, 10)

# Path of the wav file to analyse.
file = "/content/drive/MyDrive/ML2team3/code/tacotron2/kss_temp/1_0000.wav"

# Load the audio with librosa, requesting a 22050 Hz sampling rate.
sig, sr = librosa.load(file, sr=22050)

# FFT of the whole signal; the absolute value of the complex result is the
# magnitude.
fft = np.fft.fft(sig)
magnitude = np.abs(fft)

# Frequency axis: one evenly spaced point per FFT bin between 0 and sr.
f = np.linspace(0, sr, len(magnitude))

# The Fourier spectrum comes out symmetric, so only the first half is kept
# and the mirrored upper half is dropped.
left_spectrum = magnitude[:len(magnitude) // 2]
left_f = f[:len(magnitude) // 2]

# STFT -> spectrogram
n_fft = 2048  # samples per analysis frame
hop_length = 512  # samples between the starts of consecutive frames

# Hop length and window length expressed in seconds.
n_fft_duration = n_fft / sr
hop_length_duration = hop_length / sr

# Short-time Fourier transform, then magnitude of the complex values.
stft = librosa.stft(sig, n_fft=n_fft, hop_length=hop_length)
magnitude = np.abs(stft)

# Magnitude -> decibels.
log_spectrogram = librosa.amplitude_to_db(magnitude)

# Display the spectrogram.
plt.figure(figsize=FIG_SIZE)
librosa.display.specshow(log_spectrogram, sr=sr, hop_length=hop_length)
plt.xlabel("Time")
plt.ylabel("Frequency")
plt.colorbar(format="%+2.0f dB")
plt.title("Spectrogram (dB)")