In [22]:
import sys
import os
import torch
from unidecode import unidecode
import numpy as np
from pathlib import Path
from IPython.display import Audio

# Make the sibling WaveRNN checkout importable so the `models` / `utils`
# packages imported right below resolve against it.
wavernn_root = os.path.abspath(os.path.join('..', 'WaveRNN'))
print(os.path.abspath(os.path.join('..', 'WaveRNN/models')))
sys.path.append(wavernn_root)

from models.fatchord_version import WaveRNN
from models.tacotron import Tacotron
from utils.text.symbols import symbols
from utils.text import text_to_sequence
from utils import hparams as hp

d:\Code Files\Python\Data Science\Khmer-TTS\WaveRNN\models


In [23]:
# Configure the WaveRNN project's hyperparameter module once; the guard keeps
# a re-run of this cell from configuring it a second time.
if not hp.is_configured():
    hp.configure(os.path.abspath(os.path.join('..', 'WaveRNN/hparams.py')))

# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

print('\nInitialising WaveRNN Model...\n')

# Vocoder: all constructor arguments come from the hparams module.
vocoder_config = dict(
    rnn_dims=hp.voc_rnn_dims,
    fc_dims=hp.voc_fc_dims,
    bits=hp.bits,
    pad=hp.voc_pad,
    upsample_factors=hp.voc_upsample_factors,
    feat_dims=hp.num_mels,
    compute_dims=hp.voc_compute_dims,
    res_out_dims=hp.voc_res_out_dims,
    res_blocks=hp.voc_res_blocks,
    hop_length=hp.hop_length,
    sample_rate=hp.sample_rate,
    mode=hp.voc_mode,
)
vocoder = WaveRNN(**vocoder_config).to(device)

# Alternative vocoder checkpoint (currently unused):
#   'WaveRNN/checkpoints/ljspeech_mol.wavernn/latest_weights.pyt'
vocoder_weights = os.path.abspath(
    os.path.join('..', 'WaveRNN/checkpoints/ljspeech_mol.wavernn/wave_step200k_weights.pyt')
)
vocoder.load(vocoder_weights)

print('\nInitialising Tacotron Model...\n')

# Text-to-mel model. `num_chars` follows the size of the symbol table, and
# `fft_bins` is deliberately given the same value as `n_mels`.
tacotron_config = dict(
    embed_dims=hp.tts_embed_dims,
    num_chars=len(symbols),
    encoder_dims=hp.tts_encoder_dims,
    decoder_dims=hp.tts_decoder_dims,
    n_mels=hp.num_mels,
    fft_bins=hp.num_mels,
    postnet_dims=hp.tts_postnet_dims,
    encoder_K=hp.tts_encoder_K,
    lstm_dims=hp.tts_lstm_dims,
    postnet_K=hp.tts_postnet_K,
    num_highways=hp.tts_num_highways,
    dropout=hp.tts_dropout,
    stop_threshold=hp.tts_stop_threshold,
)
tts_model = Tacotron(**tacotron_config).to(device)

# Alternative Tacotron checkpoint (currently unused):
#   'WaveRNN/checkpoints/ljspeech_lsa_smooth_attention.tacotron/taco_step20k_weights.pyt'
tacotron_weights = os.path.abspath(
    os.path.join('..', 'WaveRNN/checkpoints/ljspeech_lsa_smooth_attention.tacotron/latest_weights.pyt')
)
tts_model.load(tacotron_weights)

Using device: cuda

Initialising WaveRNN Model...

Trainable Parameters: 4.234M

Initialising Tacotron Model...

Trainable Parameters: 11.088M


In [27]:
def synthesize_text(input_text):
    """Synthesize speech for ``input_text`` and return it as a notebook audio widget.

    Pipeline: transliterate the text to ASCII with ``unidecode``, convert it to
    a symbol-ID sequence with ``text_to_sequence``, generate a mel spectrogram
    with ``tts_model`` and vocode it into a waveform with ``vocoder``.

    Args:
        input_text: The text to speak.

    Returns:
        An ``IPython.display.Audio`` object wrapping the generated waveform
        (cast to float32) at ``hp.sample_rate``.

    Raises:
        ValueError: If ``input_text`` is empty or whitespace-only, or if
            nothing is left of it after transliteration.
    """
    # Fail early with a clear message instead of deep inside the models.
    if not input_text or not input_text.strip():
        raise ValueError('input_text must contain at least one non-whitespace character')

    encoded_text = unidecode(input_text).strip()
    if not encoded_text:
        raise ValueError('input_text is empty after transliteration to ASCII')

    sequence = text_to_sequence(encoded_text, hp.tts_cleaner_names)
    # Kept from the original cell: show the encoded sequence (as a one-item list).
    print([sequence])

    # Exactly one utterance is synthesized per call, so no loop is needed.
    print('\n| Generating 1/1')
    _, mel, _ = tts_model.generate(sequence)

    # Fix mel spectrogram scaling to be from 0 to 1
    mel = (mel + 4) / 8
    np.clip(mel, 0, 1, out=mel)

    # Add a leading dimension (presumably the batch axis — TODO confirm).
    mel = torch.tensor(mel).unsqueeze(0)

    # Generate audio from mel spectrogram using WaveRNN.
    # NOTE(review): the recorded run of this notebook finished generation and then
    # raised "TypeError: No format specified and unable to get format from file
    # extension: ''". Verify whether the local `WaveRNN.generate` also expects a
    # save-path argument; the call is left unchanged here because its signature
    # is not visible from this notebook.
    output = vocoder.generate(mel, hp.voc_gen_batched, hp.voc_target, hp.voc_overlap, hp.mu_law)

    # Previously unreachable because the function returned from inside a loop.
    print('\n\nDone.\n')

    return Audio(output.astype(np.float32), rate=hp.sample_rate)

In [28]:
# Khmer sample sentence used to exercise the pipeline end to end.
# NOTE(review): the checkpoints loaded earlier in this notebook have
# 'ljspeech' in their paths — confirm they are the intended weights for
# Khmer input that has been transliterated by `unidecode`.
input_text = "ព្រះ រាជាណាចក្រ កម្ពុជា"
# Bare trailing expression so the notebook displays the returned value.
synthesize_text(input_text)

[[39, 55, 45, 11, 55, 38, 38, 47, 38, 38, 51, 51, 38, 38, 40, 48, 55, 11, 48, 50, 39, 58, 47, 38, 38]]

| Generating 1/1
| ████████████████ 72000/72600 | Batch Size: 6 | Gen Rate: 2.9kHz | 

TypeError: No format specified and unable to get format from file extension: ''