# Transformer TTS: A Text-to-Speech Transformer in TensorFlow 2
## Autoregressive Model

In [1]:
# Clone the Transformer TTS and WaveRNN repos
!git clone https://github.com/as-ideas/TransformerTTS.git
!git clone https://github.com/fatchord/WaveRNN

Cloning into 'TransformerTTS'...
remote: Enumerating objects: 49, done.[K
remote: Counting objects: 100% (49/49), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 2451 (delta 19), reused 25 (delta 9), pack-reused 2402[K
Receiving objects: 100% (2451/2451), 4.24 MiB | 3.35 MiB/s, done.
Resolving deltas: 100% (1656/1656), done.
Cloning into 'WaveRNN'...
remote: Enumerating objects: 928, done.[K
remote: Total 928 (delta 0), reused 0 (delta 0), pack-reused 928[K
Receiving objects: 100% (928/928), 241.65 MiB | 6.76 MiB/s, done.
Resolving deltas: 100% (540/540), done.


In [2]:
# Install requirements
!apt-get install -y espeak
!pip install -r TransformerTTS/requirements.txt

E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?




In [3]:
# Download the pre-trained weights
! wget https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/TransformerTTS/ljspeech_autoregressive_transformer.zip
! unzip ljspeech_autoregressive_transformer.zip

--2020-06-04 00:15:12--  https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/TransformerTTS/ljspeech_autoregressive_transformer.zip
Resolving public-asai-dl-models.s3.eu-central-1.amazonaws.com (public-asai-dl-models.s3.eu-central-1.amazonaws.com)... 52.219.73.89
Connecting to public-asai-dl-models.s3.eu-central-1.amazonaws.com (public-asai-dl-models.s3.eu-central-1.amazonaws.com)|52.219.73.89|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 177657069 (169M) [application/zip]
Saving to: ‘ljspeech_autoregressive_transformer.zip’


2020-06-04 00:15:43 (5.48 MB/s) - ‘ljspeech_autoregressive_transformer.zip’ saved [177657069/177657069]

Archive:  ljspeech_autoregressive_transformer.zip
   creating: ljspeech_autoregressive_transformer/
  inflating: __MACOSX/._ljspeech_autoregressive_transformer  
  inflating: ljspeech_autoregressive_transformer/.DS_Store  
  inflating: __MACOSX/ljspeech_autoregressive_transformer/._.DS_Store  
   creating: ljspeech_autoregre

In [1]:
# Set up the paths
from pathlib import Path
WaveRNN_path = 'WaveRNN/'
TTS_path = 'TransformerTTS/'
config_path = Path('ljspeech_autoregressive_transformer/standard')

import sys
sys.path.append(TTS_path)

In [2]:
# Load pretrained models
from utils.config_manager import ConfigManager
from utils.audio import reconstruct_waveform

import IPython.display as ipd

config_loader = ConfigManager(str(config_path), model_kind='autoregressive')
model = config_loader.load_model(str(config_path / 'autoregressive_weights/ckpt-40'))

Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


restored weights from ljspeech_autoregressive_transformer/standard/autoregressive_weights/ckpt-40 at step 400000


In [11]:
# Synthesize text
sentence = 'Marla philosophy of life is that she might die at any moment. The tragedy, she said, was that she didnt.'
out = model.predict(sentence)

pred text mel: 701 stop out: -6.156545639038086Stopping


In [12]:
# Convert spectrogram to wav (with griffin lim)
wav = reconstruct_waveform(out['mel'].numpy().T, config=config_loader.config)
print('rate', config_loader.config['sampling_rate'])
ipd.display(ipd.Audio(wav, rate=config_loader.config['sampling_rate']))

rate 22050


In [13]:
# Normalize for WaveRNN
mel = (out['mel'].numpy().T+4.)/8.

### WaveRNN

In [14]:
!pip install torch==1.5.0

Collecting torch==1.5.0
  Using cached torch-1.5.0-cp36-cp36m-manylinux1_x86_64.whl (752.0 MB)
Processing /home/malchul/.cache/pip/wheels/8b/99/a0/81daf51dcd359a9377b110a8a886b3895921802d2fc1b2397e/future-0.18.2-cp36-none-any.whl
Installing collected packages: future, torch
Successfully installed future-0.18.2 torch-1.5.0


In [14]:
# Do some sys cleaning and imports
sys.path.remove(TTS_path)
sys.modules.pop('utils')

ValueError: list.remove(x): x not in list

In [15]:
sys.path.append(WaveRNN_path)
from utils.dsp import hp
from models.fatchord_version import WaveRNN
import torch
import numpy as np
WaveRNN_path = Path(WaveRNN_path)

In [8]:
# Unzip the pretrained model
!unzip WaveRNN/pretrained/ljspeech.wavernn.mol.800k.zip -d WaveRNN/pretrained/

Archive:  WaveRNN/pretrained/ljspeech.wavernn.mol.800k.zip
replace WaveRNN/pretrained/latest_weights.pyt? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [16]:
# Load pretrained model
hp.configure(WaveRNN_path / 'hparams.py')  # Load hparams from file
device = torch.device('cpu')
model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                fc_dims=hp.voc_fc_dims,
                bits=hp.bits,
                pad=hp.voc_pad,
                upsample_factors=hp.voc_upsample_factors,
                feat_dims=hp.num_mels,
                compute_dims=hp.voc_compute_dims,
                res_out_dims=hp.voc_res_out_dims,
                res_blocks=hp.voc_res_blocks,
                hop_length=hp.hop_length,
                sample_rate=hp.sample_rate,
                mode=hp.voc_mode).to(device)

model.load(str(WaveRNN_path / 'pretrained/latest_weights.pyt'))

Trainable Parameters: 4.234M


In [17]:
# Ignore some TF warnings
import tensorflow as tf
tf.get_logger().setLevel('ERROR')

In [18]:
# Generate sample with pre-trained WaveRNN vocoder
batch_pred = True # False is slower but possibly better
_ = model.generate(mel.clip(0,1)[np.newaxis,:,:], 'scientists.wav', batch_pred, 11_000, hp.voc_overlap, hp.mu_law)

| ████████████████ 204000/205700 | Batch Size: 17 | Gen Rate: 17.8kHz | 

In [19]:
# Load wav file
ipd.display(ipd.Audio('scientists.wav'))