<a href="https://colab.research.google.com/github/Laxmaan/Emotional-Speech-Synthesis/blob/main/Tacotron2_and_WaveNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tacotron2: WaveNet-based text-to-speech demo

- Tacotron2 (mel-spectrogram prediction part): https://github.com/Rayhane-mamah/Tacotron-2
- WaveNet: https://github.com/r9y9/wavenet_vocoder

This is a proof of concept for Tacotron2 text-to-speech synthesis. Models used here were trained on [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/).

**Notice**: Waveform generation is very slow because it uses naive autoregressive generation. It does not use the parallel generation method described in [Parallel WaveNet](https://arxiv.org/abs/1711.10433).

**Estimated time to complete**: 2 ~ 3 hours.

## Setup

### Install dependencies

In [None]:
# Colab line magic: request the TensorFlow 1.x runtime. Keep this cell ahead of any `import tensorflow`.
%tensorflow_version 1.x

In [None]:
import os
import os.path as osp
from os.path import exists, join, expanduser
from google.colab import drive
import glob
from PIL import Image
import matplotlib.pyplot as plt
import librosa
# Mount Google Drive at /gdrive; every later cell works inside the folder below.
drive.mount('/gdrive')
# Alternative workspace root when Drive is not wanted:
#base_path = expanduser('~')
base_path = join('/gdrive', 'My Drive', 'IST597')

In [None]:


# Work from the workspace root so both repositories are cloned side by side in it.
os.chdir(base_path)

# WaveNet vocoder sources, checked out at tag v0.1.1. Skipped when the directory already exists.
wavenet_dir = "wavenet_vocoder"
if not exists(wavenet_dir):
  ! git clone https://github.com/r9y9/wavenet_vocoder
  ! cd wavenet_vocoder && git checkout v0.1.1 && cd -
    
# Tacotron-2 sources from the r9y9 repository, on branch wavenet3. Skipped when the directory already exists.
taco2_dir = "Tacotron-2"
if not exists(taco2_dir):
  ! git clone https://github.com/r9y9/$taco2_dir
  ! cd $taco2_dir && git checkout -B wavenet3 origin/wavenet3

In [None]:
# Install dependencies.
# First constrain TensorFlow, NumPy, pysptk and Keras to the versions given below.
! pip install -q -U "tensorflow<=1.9.0"
! pip install -q -U "numpy<1.16"
! pip install -q -U "pysptk<=0.1.14"
! pip install -q -U keras==2.2.4
# Then install the Tacotron-2 requirements from inside its checkout.
os.chdir(join(base_path, taco2_dir))
! pip install -q -r requirements.txt

# Finally install wavenet_vocoder in editable mode with its "train" extras.
os.chdir(join(base_path, wavenet_dir))
! pip install -q -e '.[train]'

In [None]:
import torch
import tensorflow
import pysptk
import numpy as np
# Show the versions that were actually imported so they can be compared with the pins in the install cell.
tensorflow.__version__, pysptk.__version__, np.__version__

### Download pretrained models

#### Tacotron2 (mel-spectrogram prediction part)

In [None]:
# Download and unpack the pretrained Tacotron-2 archive into logs-Tacotron/pretrained.
# The download is skipped when that directory already exists.
os.chdir(join(base_path, taco2_dir))
! mkdir -p logs-Tacotron
if not exists("logs-Tacotron/pretrained"):
  ! curl -O -L "https://www.dropbox.com/s/vx7y4qqs732sqgg/pretrained.tar.gz"
  ! tar xzvf pretrained.tar.gz
  ! mv pretrained logs-Tacotron

#### WaveNet

In [None]:
# Download the WaveNet preset (JSON) and checkpoint (.pth) into the wavenet_vocoder checkout.
# Both file names are relative to that directory and are reused by later cells.
os.chdir(join(base_path, wavenet_dir))
wn_preset = "20180510_mixture_lj_checkpoint_step000320000_ema.json"
wn_checkpoint_path = "20180510_mixture_lj_checkpoint_step000320000_ema.pth"

# Each file is fetched only when it is not already present.
if not exists(wn_preset):
  !curl -O -L "https://www.dropbox.com/s/0vsd7973w20eskz/20180510_mixture_lj_checkpoint_step000320000_ema.json"
if not exists(wn_checkpoint_path):
  !curl -O -L "https://www.dropbox.com/s/zdbfprugbagfp2w/20180510_mixture_lj_checkpoint_step000320000_ema.pth"

## Input texts to be synthesized

Choose your favorite sentences :)

In [None]:
# Switch to the Tacotron-2 checkout: text_list.txt is written to, and synthesize.py is run from, the current directory.
os.chdir(join(base_path, taco2_dir))

In [None]:
%%bash
# Write the sentences to synthesize, one per line, into text_list.txt in the current directory.
# This is the file later passed to synthesize.py through --text_list.
cat << EOS > text_list.txt
This is really awesome!
This is text-to-speech online demonstration by Tacotron 2 and WaveNet.
Thanks for your patience.
Will you desert me in the desert?
EOS

# Echo the file back so the cell output shows what will be synthesized.
cat text_list.txt

# Choose Modes

In [None]:
# When True, make_mel() is called right after it is defined to predict mel-spectrograms with Tacotron-2.
make_mels = True
# NOTE(review): no cell visible in this notebook reads this flag; confirm whether the WaveNet loop should be gated on it.
make_waveforms = True
# When True, the two RAVDESS preprocessing cells run; otherwise they are skipped.
process_ravdess = False

## Mel-spectrogram prediction by Tacotron2

In [None]:
def make_mel():
    """Run Tacotron-2's ``synthesize.py`` in eval mode on ``./text_list.txt``.

    Any existing ``tacotron_output`` directory is deleted first. Both shell
    commands use paths relative to the current working directory, so the
    caller must already be inside the Tacotron-2 checkout.
    """
    # Remove old files if exist
    ! rm -rf tacotron_output
    ! python synthesize.py --model='Tacotron' --mode='eval' \
    --hparams='symmetric_mels=False,max_abs_value=4.0,power=1.1,outputs_per_step=16' \
    --text_list=./text_list.txt

# Only synthesize mels when the mode flag asks for it.
if make_mels:
    make_mel()

# Preprocess RAVDESS AUDIO


In [None]:
if process_ravdess:
    import warnings
    # Installs a process-wide "ignore" filter, so warnings stay silenced in later cells too.
    warnings.filterwarnings('ignore')

    # The preprocessing script in the next cell is run from the wavenet_vocoder checkout.
    os.chdir(join(base_path, wavenet_dir))
    dataset_path = join(base_path,"VAE","data")

    # glob is called without recursive=True, so "**" matches a single directory
    # level: only .wav files exactly one folder below dataset_path are found.
    x = glob.glob(dataset_path+"/**/*.wav")
    out_dir = join(base_path,"wavenet_processed_data")

    # Report the count and a short sample instead of dumping every path.
    # (A bare `len(x)` inside this `if` block is never displayed by the notebook.)
    print("{} wav files found under {}".format(len(x), dataset_path))
    print(x[:5])

In [None]:
# Run the preprocessing script over the audio in dataset_path and write its output to out_dir.
# NOTE(review): the preset file name is repeated literally here; it has the same value as wn_preset.
if process_ravdess:
    !python preprocess_latest.py --num_workers=8 --preset=20180510_mixture_lj_checkpoint_step000320000_ema.json wavallin "$dataset_path" "$out_dir" 

### Convert mels to images

In [None]:
def get_mel_img(path, num_mels=80, max_abs_value=4):
    """Save a mel-spectrogram ``.npy`` file as an 8-bit grayscale PNG next to it.

    The array is loaded from ``path``, transposed if needed so that its second
    axis has ``num_mels`` entries, linearly mapped from ``[0, max_abs_value]``
    to ``[0, 255]`` and written to ``<path without extension>.png``. The raw
    and quantized arrays are plotted side by side, and their shapes plus the
    PNG size are printed.

    Args:
        path: Path to a 2-D array saved with ``np.save``.
        num_mels: Expected size of the mel axis (default 80, the value that
            was previously hard-coded).
        max_abs_value: Upper end of the input value range (default 4, the
            value that was previously hard-coded).

    Returns:
        None.
    """
    mel = np.load(path)

    # Put the mel axis last; arrays stored the other way round are transposed.
    if mel.shape[1] != num_mels:
        mel = np.swapaxes(mel, 0, 1)
    print(mel.shape)

    # np.interp clamps anything outside [0, max_abs_value] to the end values,
    # so the uint8 conversion below cannot wrap around.
    mel_u8 = np.uint8(np.round(np.interp(mel, (0, max_abs_value), (0, 255))))
    print(mel_u8.shape)

    fig, (ax_raw, ax_quantized) = plt.subplots(1, 2, sharey=True)
    ax_raw.imshow(mel)
    ax_raw.set_title("mel (raw)")
    ax_quantized.imshow(mel_u8)
    ax_quantized.set_title("mel (uint8)")

    # Build the PIL image once and reuse it for both saving and reporting.
    png = Image.fromarray(mel_u8)
    png.save(osp.splitext(path)[0] + '.png')
    print(png.size)

In [None]:
def img_to_npy(path):
    """Convert an image file into a ``[0, 1]``-scaled array saved beside it as ``.npy``.

    The image at ``path`` is opened and converted to mode ``'L'``, then its
    pixel values are linearly mapped from ``[0, 255]`` to ``[0, 1]``. Both
    array shapes are printed, both arrays are plotted side by side, and the
    rescaled array is written to ``<path without extension>.npy``.

    Args:
        path: Path to an image readable by PIL.

    Returns:
        None.
    """
    pixels = np.array(Image.open(path).convert('L'))
    scaled = np.interp(pixels, (0, 255), (0, 1))
    for arr in (pixels, scaled):
        print(arr.shape)

    fig, axes = plt.subplots(1, 2, sharey=True)
    axes[0].imshow(pixels)
    axes[1].imshow(scaled)

    npy_path = osp.splitext(path)[0] + '.npy'
    np.save(npy_path, scaled)

In [None]:
# Convert every PNG matched under mike_images into a .npy file stored beside it.
mike_images = glob.glob(join(base_path,"mike_images/**/*.png"))
print(mike_images)
for png_path in mike_images:
    img_to_npy(png_path)

# Collect the .npy files now present under mike_images; these are the mels WaveNet will vocode.
mike_mels = glob.glob(join(base_path,"mike_images/**/*.npy"))

## Waveform synthesis by WaveNet

In [None]:
import librosa.display
import IPython
from IPython.display import Audio
import numpy as np
import torch

In [None]:
# The imports below (hparams, train, synthesis) are modules of the wavenet_vocoder
# checkout, so the working directory has to be that checkout before they run.
os.chdir(join(base_path, wavenet_dir))

# Setup WaveNet vocoder hparams
from hparams import hparams
with open(wn_preset) as f:
    s = f.read()
    #print(s)
    # Overwrite the default hyperparameters with the values from the preset JSON.
    hparams.parse_json(s)

# Setup WaveNet vocoder
# NOTE(review): these imports deliberately stay after parse_json; confirm whether
# `train` reads hparams at import time before reordering them.
from train import build_model
from synthesis import wavegen
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = build_model().to(device)

# map_location lets the checkpoint be loaded onto whichever device was selected above.
print("Load checkpoint from {}".format(wn_checkpoint_path))
checkpoint = torch.load(wn_checkpoint_path,map_location=device)
model.load_state_dict(checkpoint["state_dict"])

In [None]:

from tqdm import tqdm

In [None]:


# Read Tacotron-2's eval map file; each line is split on "|" into its fields
# (presumably "text|mel path", going by how the entries are unpacked later).
with open("../Tacotron-2/tacotron_output/eval/map.txt") as f:
  records = [line.strip().split("|") for line in f]
# Keep only the well-formed records, i.e. those with exactly two fields.
maps = [record for record in records if len(record) == 2]

print("List of texts to be synthesized")
for idx, (text,_) in enumerate(maps):
  print(idx, text)

In [None]:
# Split the mel paths by the emotion name that appears anywhere in the path.
happy = list(filter(lambda mel_file: "happy" in mel_file, mike_mels))
angry = list(filter(lambda mel_file: "angry" in mel_file, mike_mels))
disgust = list(filter(lambda mel_file: "disgust" in mel_file, mike_mels))

# Interleave as happy, disgust, angry, happy, ... zip stops at the shortest list,
# and paths matching none of the three emotions are dropped.
interleaved = []
for trio in zip(happy, disgust, angry):
    interleaved.extend(trio)
mike_mels = interleaved

In [None]:
# Replace `maps` with (label, mel path) pairs for the image-derived mels;
# the label is simply the mel path without its extension.
maps = [(osp.splitext(mel_file)[0], mel_file) for mel_file in mike_mels]

print(maps[:2])

### Waveform generation

**Note**: This will take hours to finish, depending on the number and length of the texts. Try short sentences first if you would like to hear samples quickly.

In [None]:
# Synthesize one waveform per (label, mel path) entry in `maps`.
# `waveforms` is kept index-aligned with `maps`: the summary cell reads
# `waveforms[idx]`, so entries whose .wav already exists on disk are loaded
# and appended instead of being skipped.
waveforms = []

for idx, (text, mel) in enumerate(maps):
  print("\n", idx, text)
  mel_path = mel
  audio_pth = osp.splitext(mel_path)[0]+".wav"

  if os.path.exists(audio_pth):
    # Already vocoded in an earlier run: reuse the saved audio rather than
    # spending the time to regenerate it.
    waveform = librosa.load(audio_pth, sr=hparams.sample_rate)[0]
    waveforms.append(waveform)
    continue

  c = np.load(mel_path)
  print(c.shape)
  # wavegen is fed arrays whose second axis has num_mels entries; transpose otherwise.
  if c.shape[1] != hparams.num_mels:
    c = np.swapaxes(c, 0, 1)
  print(c.max())
  # Rescale this array's own [min, max] to [0, 1], the range the WaveNet
  # vocoder assumes (Tacotron2 itself was trained with range [0, 4]).
  c = np.interp(c, (c.min(), c.max()), (0, 1))

  # Generate
  waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)
  # Audio
  IPython.display.display(Audio(waveform, rate=hparams.sample_rate))
  waveforms.append(waveform)
  # Save
  # NOTE(review): librosa.output was removed in librosa 0.8; this call needs an older librosa.
  librosa.output.write_wav(audio_pth, waveform, hparams.sample_rate)

 

In [None]:
 # Not executed as Python: a bare string holding a JavaScript snippet that clicks "#top-menubar" every 60000 ms.
 # NOTE(review): presumably meant to be pasted into the browser console as a keep-alive during long runs; confirm.
 '''
 function ConnectButton(){
    console.log("Working"); 
    document.querySelector("#top-menubar").click() 
}
setInterval(ConnectButton,60000);
'''

## Summary: audio samples

In [None]:
# Replay each clip beside its label. `waveforms` is indexed by position in
# `maps`, so it must hold one entry per `maps` entry.
for idx, pair in enumerate(maps):
  text, mel = pair
  print(idx, text)
  clip = Audio(waveforms[idx], rate=hparams.sample_rate)
  IPython.display.display(clip)

# Save Audio

For more information, please visit https://github.com/r9y9/wavenet_vocoder. More samples can be found at https://r9y9.github.io/wavenet_vocoder/.