# English(LJSpeech-1.1)

- This tutorial covers how to use [VocGAN](https://arxiv.org/abs/2007.15256) released by [rishikksh20](https://github.com/rishikksh20/VocGAN).
- It is recommended to download the pre-trained vocoder before training the non-attentive tacotron model, so that model performance can be evaluated during the training phase.
- The LJSpeech-1.1 dataset can be found at this [link](https://keithito.com/LJ-Speech-Dataset/).

In [38]:
import os
from pathlib import Path

import IPython.display as ipd
import torch
import torchaudio

from tacotron.vocgan_generator import Generator

In [39]:
## checkpoint of the pre-trained VocGAN generator (LJSpeech)
generator_path = '../checkpoints_g/ljspeech_29de09d_4000.pt'

## audio / STFT settings
sample_rate = 22050
n_fft = 1024
win_length = 1024
hop_length = 256

## mel filterbank settings
n_mels = 80
f_min = 0.0
f_max = 8000.0
norm = 'slaney'
mel_scale = 'slaney'

## spectrogram exponent, forwarded as `power=` to the mel-spectrogram transform
power = 1

In [40]:
## LJSpeech-1.1 locations: the metadata file and the directory holding the wav files
script_path = '/code/gitRepo/data/LJSpeech-1.1/metadata.csv'
data_path = '/code/gitRepo/data/LJSpeech-1.1/wavs'

In [41]:
## read every metadata line; the encoding is pinned (assumes the file is UTF-8)
## so decoding does not depend on the platform's default locale
with open(script_path, 'r', encoding='utf-8') as f:
    scripts = f.readlines()

In [48]:
## pick one metadata row; rows are '|'-separated, field 0 names the audio file
## and field 2 is the transcript shown here
script_index = 1
item = scripts[script_index].strip().split('|')
audio_path = os.path.join(data_path, item[0])
## append '.wav' when the metadata entry carries no file extension
if Path(audio_path).suffix == '':
    audio_path = '{}.wav'.format(audio_path)
transcript = item[2]
print('[audio_path]', audio_path)
print('[transcript]', transcript)

[audio_path] /code/gitRepo/data/LJSpeech-1.1/wavs/LJ001-0002.wav
[transcript] in being comparatively modern.


In [49]:
## set device: use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## build the generator; these arguments presumably have to match the
## configuration the checkpoint was trained with -- TODO confirm
generator = Generator(80, 4,
            ratios=[4, 4, 2, 2, 2, 2], mult=256,
            out_band=1)
generator.to(device)

## map_location remaps the stored tensors onto `device`, so a checkpoint that
## was saved from a GPU run also loads on a CPU-only machine.
## NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
generator_checkpoint = torch.load(generator_path, map_location=device)
generator.load_state_dict(generator_checkpoint['model_g'])
generator.eval()

In [50]:
## waveform -> mel-spectrogram transform, configured from the generator options above
mel_converter = torchaudio.transforms.MelSpectrogram(
    # STFT settings
    sample_rate=sample_rate,
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    power=power,
    normalized=False,
    # mel filterbank settings
    n_mels=n_mels,
    f_min=f_min,
    f_max=f_max,
    norm=norm,
    mel_scale=mel_scale,
)

In [51]:
## load the reference recording
waveform, temp_sample_rate = torchaudio.load(audio_path)

## keep only the first channel when the file has several
if waveform.shape[0] > 1:
    waveform = waveform[0].unsqueeze(0)

## resample when the file's rate differs from the configured sample rate
if temp_sample_rate != sample_rate:
    waveform = torchaudio.transforms.Resample(
        orig_freq=temp_sample_rate, new_freq=sample_rate
    )(waveform)

## log-mel features, moved to the device the generator lives on
## NOTE(review): bins that are exactly zero give log(0) = -inf here -- confirm
## the vocoder tolerates that for recordings containing digital silence
melspec = torch.log(mel_converter(waveform)).to(device)

In [52]:
def vocgan_infer(mel, vocoder, hop_length=256, trim_frames=10):
    """Run the vocoder on a mel-spectrogram and return 16-bit PCM samples.

    Args:
        mel: mel-spectrogram tensor. A 2-D input gets a leading batch axis
            added before it is handed to the vocoder.
        vocoder: object exposing ``infer(mel)`` that returns a waveform tensor.
        hop_length: number of samples per frame; only used to size the trim.
        trim_frames: number of frames cut from the end of the generated audio
            (presumably compensates for padding added inside ``infer`` --
            TODO confirm). ``0`` disables trimming.

    Returns:
        numpy int16 array holding the generated samples.
    """
    trim = hop_length * trim_frames

    with torch.no_grad():
        if mel.dim() == 2:
            mel = mel.unsqueeze(0)

        audio = vocoder.infer(mel).squeeze()
        if trim > 0:
            # cut along the last axis (assumed to be time); for the 1-D result
            # of a single utterance this is the same as audio[:-trim]
            audio = audio[..., :-trim]
        # scale to the int16 range and clip before the integer cast
        audio = (32768 * audio).clamp(min=-32768, max=32768 - 1)
        audio = audio.short().cpu().numpy()
    return audio

In [53]:
## vocode the log-mel features back into a waveform
reconstructed_audio = vocgan_infer(mel=melspec, vocoder=generator)

In [54]:
## player for the vocoder output
ipd.Audio(data=reconstructed_audio, rate=sample_rate)

In [55]:
## player for the original recording, for comparison
ipd.Audio(data=waveform, rate=sample_rate)

# Korean(KSS ver4)

- This tutorial covers how to use [VocGAN](https://arxiv.org/abs/2007.15256) released by [rishikksh20](https://github.com/rishikksh20/VocGAN).
- It is recommended to download the pre-trained vocoder before training the non-attentive tacotron model, so that model performance can be evaluated during the training phase.
- The KSS dataset can be found at this [link](https://www.kaggle.com/bryanpark/korean-single-speaker-speech-dataset).

In [56]:
from tacotron.vocgan_generator import Generator
import torch
import torchaudio
import os
import IPython.display as ipd
from pathlib import Path

In [57]:
## checkpoint of the pre-trained VocGAN generator (KSS)
generator_path = '../checkpoints_g/vocgan_kss_pretrained_model_epoch_4500.pt'

## audio / STFT settings
sample_rate = 22050
n_fft = 1024
win_length = 1024
hop_length = 256

## mel filterbank settings
n_mels = 80
f_min = 0.0
f_max = 8000.0
norm = 'slaney'
mel_scale = 'slaney'

## spectrogram exponent
power = 1

In [58]:
## KSS locations: the transcript file and the dataset root the audio paths are joined to
script_path = '/code/gitRepo/data/kss/transcript.v.1.4.txt'
data_path = '/code/gitRepo/data/kss'

In [59]:
## read every transcript line; the file holds Korean text, so the encoding is
## pinned (assumes UTF-8) instead of relying on the platform's default locale
with open(script_path, 'r', encoding='utf-8') as f:
    scripts = f.readlines()

In [60]:
## pick one transcript row; rows are '|'-separated, field 0 is the audio path
## relative to data_path and field 2 is the transcript shown here
script_index = 1
item = scripts[script_index].strip().split('|')
audio_path = os.path.join(data_path, item[0])
## append '.wav' only when the entry carries no file extension
if Path(audio_path).suffix == '':
    audio_path = '{}.wav'.format(audio_path)
transcript = item[2]
print('[audio_path]', audio_path)
print('[transcript]', transcript)

[audio_path] /code/gitRepo/data/kss/1/1_0001.wav
[transcript] 그녀의 사랑을 얻기 위해 애썼지만 헛수고였다.


In [61]:
## set device: use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## build the generator; these arguments presumably have to match the
## configuration the checkpoint was trained with -- TODO confirm
generator = Generator(80, 4,
            ratios=[4, 4, 2, 2, 2, 2], mult=256,
            out_band=1)
generator.to(device)

## map_location remaps the stored tensors onto `device`, so a checkpoint that
## was saved from a GPU run also loads on a CPU-only machine.
## NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
generator_checkpoint = torch.load(generator_path, map_location=device)
generator.load_state_dict(generator_checkpoint['model_g'])
generator.eval()

In [62]:
## load the reference recording
waveform, temp_sample_rate = torchaudio.load(audio_path)

## keep only the first channel when the file has several
if waveform.shape[0] > 1:
    waveform = waveform[0].unsqueeze(0)

## resample when the file's rate differs from the configured sample rate
if temp_sample_rate != sample_rate:
    waveform = torchaudio.transforms.Resample(
        orig_freq=temp_sample_rate, new_freq=sample_rate
    )(waveform)

## log-mel features, moved to the device the generator lives on;
## `mel_converter` is the transform built in the English section of this notebook
## NOTE(review): bins that are exactly zero give log(0) = -inf here -- confirm
## the vocoder tolerates that for recordings containing digital silence
melspec = torch.log(mel_converter(waveform)).to(device)

In [63]:
def vocgan_infer(mel, vocoder, hop_length=256, trim_frames=10):
    """Run the vocoder on a mel-spectrogram and return 16-bit PCM samples.

    Args:
        mel: mel-spectrogram tensor. A 2-D input gets a leading batch axis
            added before it is handed to the vocoder.
        vocoder: object exposing ``infer(mel)`` that returns a waveform tensor.
        hop_length: number of samples per frame; only used to size the trim.
        trim_frames: number of frames cut from the end of the generated audio
            (presumably compensates for padding added inside ``infer`` --
            TODO confirm). ``0`` disables trimming.

    Returns:
        numpy int16 array holding the generated samples.
    """
    trim = hop_length * trim_frames

    with torch.no_grad():
        if mel.dim() == 2:
            mel = mel.unsqueeze(0)

        audio = vocoder.infer(mel).squeeze()
        if trim > 0:
            # cut along the last axis (assumed to be time); for the 1-D result
            # of a single utterance this is the same as audio[:-trim]
            audio = audio[..., :-trim]
        # scale to the int16 range and clip before the integer cast
        audio = (32768 * audio).clamp(min=-32768, max=32768 - 1)
        audio = audio.short().cpu().numpy()
    return audio

In [64]:
## vocode the log-mel features back into a waveform
reconstructed_audio = vocgan_infer(mel=melspec, vocoder=generator)

In [65]:
## player for the vocoder output
ipd.Audio(data=reconstructed_audio, rate=sample_rate)

In [66]:
## player for the original recording, for comparison
ipd.Audio(data=waveform, rate=sample_rate)