# Transformer TTS: A Text-to-Speech Transformer in TensorFlow 2
## Forward Model

In [None]:
# Clone the Transformer TTS and WaveRNN repos
!git clone https://github.com/as-ideas/TransformerTTS.git
!git clone https://github.com/fatchord/WaveRNN

In [None]:
# Install requirements
!apt-get install -y espeak
!pip install -r TransformerTTS/requirements.txt

In [3]:
# Download the pre-trained weights
! wget https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/TransformerTTS/ljspeech_forward_transformer.zip
! unzip ljspeech_forward_transformer.zip

--2020-06-09 12:23:01--  https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/TransformerTTS/ljspeech_forward_transformer.zip
Resolving public-asai-dl-models.s3.eu-central-1.amazonaws.com (public-asai-dl-models.s3.eu-central-1.amazonaws.com)... 52.219.74.81
Connecting to public-asai-dl-models.s3.eu-central-1.amazonaws.com (public-asai-dl-models.s3.eu-central-1.amazonaws.com)|52.219.74.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 210039236 (200M) [application/zip]
Saving to: ‘ljspeech_forward_transformer.zip’


2020-06-09 12:23:35 (6.20 MB/s) - ‘ljspeech_forward_transformer.zip’ saved [210039236/210039236]

Archive:  ljspeech_forward_transformer.zip
   creating: ljspeech_forward_transformer/
  inflating: __MACOSX/._ljspeech_forward_transformer  
  inflating: ljspeech_forward_transformer/.DS_Store  
  inflating: __MACOSX/ljspeech_forward_transformer/._.DS_Store  
   creating: ljspeech_forward_transformer/standard/
  inflating: __MACOSX/ljspeech_forw

In [4]:
# Set up the paths
import sys
from pathlib import Path

# Repository checkouts (relative to the notebook's working directory).
WaveRNN_path = 'WaveRNN/'
TTS_path = 'TransformerTTS/'
# Directory of the unzipped pre-trained forward model.
config_path = Path('ljspeech_forward_transformer/standard')

# Make the TransformerTTS packages importable.
# The membership check keeps this cell idempotent: list.remove() only deletes
# the first matching entry, so a duplicate added by a re-run would survive the
# later sys.path clean-up in the WaveRNN section.
if TTS_path not in sys.path:
    sys.path.append(TTS_path)

In [5]:
# Load pretrained models
import IPython.display as ipd

from utils.config_manager import ConfigManager
from utils.audio import reconstruct_waveform

# Config manager for the 'forward' model, built from the unzipped config directory.
config_loader = ConfigManager(str(config_path), model_kind='forward')

# Restore the weights from the checkpoint shipped with the pre-trained model.
checkpoint = config_path.joinpath('forward_weights', 'ckpt-133')
model = config_loader.load_model(str(checkpoint))

restored weights from ljspeech_forward_transformer/standard/forward_weights/ckpt-133 at step 665000


In [11]:
# Synthesize text
sentence = (
    'Marla philosophy of life is that she might die at any moment. '
    'The tragedy, she said, was that she didnt.'
)
# The prediction is indexed with 'mel' by the cells below.
out = model.predict(sentence)

In [12]:
# Convert spectrogram to wav (with griffin lim)
tts_config = config_loader.config
wav = reconstruct_waveform(out['mel'].numpy().T, config=tts_config)

# Play the waveform at the sampling rate stored in the model config.
player = ipd.Audio(wav, rate=tts_config['sampling_rate'])
ipd.display(player)

In [13]:
# Normalize for WaveRNN
# Shift the transposed mel by 4 and scale by 1/8; the result is clipped to
# [0, 1] later, right before it is handed to the vocoder.
tts_mel = out['mel'].numpy().T
mel = (tts_mel + 4.0) / 8.0

You can also vary the speech speed by passing the `speed_regulator` argument to `model.predict`:

In [15]:
# 20% faster
sentence = (
    'Marla philosophy of life is that she might die at any moment. '
    'The tragedy, she said, was that she didnt.'
)
out = model.predict(sentence, speed_regulator=1.20)

# Griffin-Lim reconstruction and playback, as for the default-speed sample.
tts_config = config_loader.config
wav = reconstruct_waveform(out['mel'].numpy().T, config=tts_config)
ipd.display(ipd.Audio(wav, rate=tts_config['sampling_rate']))

In [16]:
# 10% slower
sentence = (
    'Marla philosophy of life is that she might die at any moment. '
    'The tragedy, she said, was that she didnt.'
)
out = model.predict(sentence, speed_regulator=0.9)

# Griffin-Lim reconstruction and playback, as for the default-speed sample.
tts_config = config_loader.config
wav = reconstruct_waveform(out['mel'].numpy().T, config=tts_config)
ipd.display(ipd.Audio(wav, rate=tts_config['sampling_rate']))

### WaveRNN

In [19]:
# Do some sys cleaning and imports
# TransformerTTS and WaveRNN are both imported through a top-level `utils`
# package, so the TransformerTTS one has to be unloaded before WaveRNN's can
# be imported.

# Guarded so that re-running the cell does not raise ValueError.
if TTS_path in sys.path:
    sys.path.remove(TTS_path)

# Drop the cached submodules as well: a leftover `utils.<name>` entry would
# otherwise be reused for a WaveRNN module of the same name.
for stale_name in [name for name in sys.modules if name.startswith('utils.')]:
    del sys.modules[stale_name]

# The default avoids a KeyError when `utils` has already been removed.
sys.modules.pop('utils', None)

<module 'utils' from 'TransformerTTS/utils/__init__.py'>

In [20]:
# Turn the checkout location into a Path first; Path() accepts both the
# original string and an existing Path, so this cell can be re-run.
WaveRNN_path = Path(WaveRNN_path)

# sys.path entries should be plain strings, and the same entry should not be
# added twice when the cell is executed again.
if str(WaveRNN_path) not in sys.path:
    sys.path.append(str(WaveRNN_path))

# These imports resolve against the WaveRNN checkout added above.
from utils.dsp import hp
from models.fatchord_version import WaveRNN
import torch
import numpy as np

In [21]:
# Unzip the pretrained model
!unzip WaveRNN/pretrained/ljspeech.wavernn.mol.800k.zip -d WaveRNN/pretrained/

Archive:  WaveRNN/pretrained/ljspeech.wavernn.mol.800k.zip
replace WaveRNN/pretrained/latest_weights.pyt? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [22]:
# Load pretrained model
hp.configure(WaveRNN_path / 'hparams.py')  # Load hparams from file
device = torch.device('cpu')

# Constructor arguments, all read from the hparams loaded above.
vocoder_kwargs = dict(
    rnn_dims=hp.voc_rnn_dims,
    fc_dims=hp.voc_fc_dims,
    bits=hp.bits,
    pad=hp.voc_pad,
    upsample_factors=hp.voc_upsample_factors,
    feat_dims=hp.num_mels,
    compute_dims=hp.voc_compute_dims,
    res_out_dims=hp.voc_res_out_dims,
    res_blocks=hp.voc_res_blocks,
    hop_length=hp.hop_length,
    sample_rate=hp.sample_rate,
    mode=hp.voc_mode,
)

# NOTE: this rebinds `model`; from here on the name refers to the WaveRNN
# vocoder, not to the TransformerTTS model loaded earlier.
model = WaveRNN(**vocoder_kwargs).to(device)

# Restore the pre-trained vocoder weights from the WaveRNN checkout.
weights_file = WaveRNN_path / 'pretrained/latest_weights.pyt'
model.load(str(weights_file))

Trainable Parameters: 4.234M


In [23]:
# Ignore some TF warnings
import tensorflow as tf

# Raise the TensorFlow logger threshold to ERROR.
tf_logger = tf.get_logger()
tf_logger.setLevel('ERROR')

In [24]:
# Generate sample with pre-trained WaveRNN vocoder
batch_pred = True # False is slower but possibly better

# Clip the normalized mel to [0, 1] and add a leading axis of size 1.
vocoder_input = mel.clip(0, 1)[np.newaxis, :, :]

# The audio is written to 'scientists.wav'; the return value is not needed.
_ = model.generate(
    vocoder_input,
    'scientists.wav',
    batch_pred,
    11_000,
    hp.voc_overlap,
    hp.mu_law,
)

| ████████████████ 180000/181500 | Batch Size: 15 | Gen Rate: 16.6kHz | 

In [25]:
# Load wav file
# Play back the file written by the vocoder in the previous cell.
generated_audio = ipd.Audio('scientists.wav')
ipd.display(generated_audio)