# English
- This tutorial shows how to use a trained Non-Attentive Tacotron model to control the speaking pace of synthesized audio.

In [None]:
from tacotron.vocgan_generator import Generator
import torch
import torchaudio
import os
import IPython.display as ipd
from tacotron.model import NonAttentiveTacotron
from tacotron.tokenizer import BaseTokenizer

In [None]:
## choose the compute device: CUDA when torch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
## non-attentive tacotron model path (relative path; adjust to your own training run)
tacotron_path = '../results/kss_4/model/50000'

## build tacotron from the saved model, move it to `device`, then call eval()
## so the model is used for inference rather than training
tacotron = NonAttentiveTacotron.from_pretrained(tacotron_path)
tacotron.to(device)
tacotron.eval()

## build tokenizer from the same path as the model
tokenizer = BaseTokenizer.from_pretrained(tacotron_path)

# Korean
- This tutorial shows how to use a trained Non-Attentive Tacotron model to control the speaking pace of synthesized audio.

In [1]:
from tacotron.vocgan_generator import Generator
import torch
import torchaudio
import os
import IPython.display as ipd
from tacotron.model import NonAttentiveTacotron
from tacotron.tokenizer import BaseTokenizer

In [2]:
## choose the compute device: CUDA when torch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

### Load Non-attentive tacotron & tokenizer

In [3]:
## non-attentive tacotron model path (relative path; adjust to your own training run)
tacotron_path = '../results/kss_8/model/40000'

## build tacotron from the saved model, move it to `device`, then call eval()
## so the model is used for inference rather than training
tacotron = NonAttentiveTacotron.from_pretrained(tacotron_path)
tacotron.to(device)
tacotron.eval()

## build tokenizer from the same path as the model
tokenizer = BaseTokenizer.from_pretrained(tacotron_path)

can't use hydra lib. the path will be [../results/kss_8/model/40000]
can't use hydra lib. the path will be [../results/kss_8/model/40000]


### Load Generator

In [4]:
## generator (VocGAN) checkpoint path
generator_path = '../checkpoints_g/vocgan_kss_pretrained_model_epoch_4500.pt'

## build generator
## NOTE(review): the first argument (80) presumably has to match the number of
## mel channels produced by the tacotron model -- confirm against Generator.
generator = Generator(80, 4,
            ratios=[4, 4, 2, 2, 2, 2], mult=256,
            out_band=1)

## load the trained weights
## map_location remaps the stored tensors onto `device`, so a checkpoint that was
## saved from a GPU run still loads when this notebook falls back to the CPU.
## torch.load unpickles the file: only load checkpoints from a trusted source.
generator_checkpoint = torch.load(generator_path, map_location=device)
generator.load_state_dict(generator_checkpoint['model_g'])
generator.to(device)
generator.eval()

### Generate Audio

In [10]:
## sentence to synthesize (Korean for "This Tacotron works really well.")
sample_text = '이 타코트론은 정말 잘 작동한다.'

In [11]:
## encode text
## the result is a dict of equal-length id lists (input_ids, special_input_ids,
## pace_input_ids), as the printed output shows; with plain encode() every
## pace_input_ids entry is 1
encoded_text = tokenizer.encode(sample_text)
print(encoded_text)

{'input_ids': [8, 15, 1, 49, 3, 44, 14, 49, 9, 17, 14, 7, 8, 9, 7, 1, 31, 23, 12, 18, 3, 21, 1, 31, 3, 21, 1, 31, 3, 19, 34, 14, 12, 35, 3, 7, 34, 3], 'special_input_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 'pace_input_ids': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [12]:
## generate log mel-spectrogram
## every id list is turned into a long tensor with a leading batch axis of size 1
## and moved to `device`; the dict is then passed to inference as keyword arguments
with torch.no_grad():
    encoded_torch_text = {
        field: torch.tensor(ids, dtype=torch.long).unsqueeze(0).to(device)
        for field, ids in encoded_text.items()
    }
    tacotron_output = tacotron.inference(**encoded_torch_text)

In [13]:
## feed the tacotron output to the VocGAN generator to synthesize audio;
## gradients are disabled because this is inference only
with torch.no_grad():
    audio = generator.generate_audio(**tacotron_output)

In [14]:
## original audio (default pace: every pace_input_ids entry is 1)
ipd.Audio(audio, rate=tacotron.sampling_rate)

### Generate Audio - pace control

In [15]:
## the sentence split into consecutive segments, each mapped to a pace value;
## here only the first segment gets pace 2, the rest keep the default 1
## NOTE: segments are dict keys, so two identical segments cannot both appear
pace_text = {
    '이 ' : 2,
    '타코트론' : 1,
    '은 정말 잘 작동한다.' : 1
}

In [16]:
## encode text with per-segment pace
## as the printed output shows, input_ids match the plain encode() result and
## each segment's pace value is repeated in pace_input_ids for every token of
## that segment
encoded_text = tokenizer.encode_pace(pace_text)
print(encoded_text)

{'input_ids': [8, 15, 1, 49, 3, 44, 14, 49, 9, 17, 14, 7, 8, 9, 7, 1, 31, 23, 12, 18, 3, 21, 1, 31, 3, 21, 1, 31, 3, 19, 34, 14, 12, 35, 3, 7, 34, 3], 'special_input_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 'pace_input_ids': [2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [17]:
## generate log mel-spectrogram
## every id list is turned into a long tensor with a leading batch axis of size 1
## and moved to `device`; the dict is then passed to inference as keyword arguments
with torch.no_grad():
    encoded_torch_text = {
        field: torch.tensor(ids, dtype=torch.long).unsqueeze(0).to(device)
        for field, ids in encoded_text.items()
    }
    tacotron_output = tacotron.inference(**encoded_torch_text)

In [18]:
## feed the tacotron output to the VocGAN generator to synthesize audio;
## gradients are disabled because this is inference only
with torch.no_grad():
    audio = generator.generate_audio(**tacotron_output)

In [19]:
## pace-controlled audio (built from the encode_pace() result, not the original)
ipd.Audio(audio, rate=tacotron.sampling_rate)

In [37]:
## pace 2 on the middle segment only; the surrounding segments keep pace 1
pace_text = {
    '이 ': 1,
    '타코': 2,
    '트론은 정말 잘 작동한다.': 1,
}

## encode text with per-segment pace and show the resulting id lists
encoded_text = tokenizer.encode_pace(pace_text)
print(encoded_text)

## generate log mel-spectrogram, then audio, without tracking gradients
with torch.no_grad():
    encoded_torch_text = {
        field: torch.tensor(ids, dtype=torch.long).unsqueeze(0).to(device)
        for field, ids in encoded_text.items()
    }
    tacotron_output = tacotron.inference(**encoded_torch_text)
    audio = generator.generate_audio(**tacotron_output)

## pace-controlled audio
ipd.Audio(audio, rate=tacotron.sampling_rate)

{'input_ids': [8, 15, 1, 49, 3, 44, 14, 49, 9, 17, 14, 7, 8, 9, 7, 1, 31, 23, 12, 18, 3, 21, 1, 31, 3, 21, 1, 31, 3, 19, 34, 14, 12, 35, 3, 7, 34, 3], 'special_input_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 'pace_input_ids': [1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [32]:
## pace 3 on the single segment '잘'; every other segment keeps pace 1
pace_text = {
    '이 ': 1,
    '타코트론': 1,
    '은 정말 ': 1,
    '잘': 3,
    ' 작동한다.': 1,
}

## encode text with per-segment pace and show the resulting id lists
encoded_text = tokenizer.encode_pace(pace_text)
print(encoded_text)

## generate log mel-spectrogram, then audio, without tracking gradients
with torch.no_grad():
    encoded_torch_text = {
        field: torch.tensor(ids, dtype=torch.long).unsqueeze(0).to(device)
        for field, ids in encoded_text.items()
    }
    tacotron_output = tacotron.inference(**encoded_torch_text)
    audio = generator.generate_audio(**tacotron_output)

## pace-controlled audio
ipd.Audio(audio, rate=tacotron.sampling_rate)

{'input_ids': [8, 15, 1, 49, 3, 44, 14, 49, 9, 17, 14, 7, 8, 9, 7, 1, 31, 23, 12, 18, 3, 21, 1, 31, 3, 21, 1, 31, 3, 19, 34, 14, 12, 35, 3, 7, 34, 3], 'special_input_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 'pace_input_ids': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
