<h1>Installation</h1>
pytorch-p38 image

In [None]:
!pip install -q espnet==0.10.6 
!echo 'espnet installed'
!pip install -q pypinyin 
!echo 'pypinyin install'
!pip install -qU parallel-wavegan
!echo 'PWGAN installed'

<h1>Do Imports</h1>

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display
import IPython.display as ipd


In [1]:
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

<h1>Load model artifacts using bash</h1>

In [2]:
%%bash


# Artifacts structure:
# --------------------------------- Mel2text -----------------------------------------------
# Folder with the data for mel2text. It has two sub-folders: one named after the
# training run (it holds the config and the model) and one that holds the stats.
# We use the stats of the training data.
#
#        mel2text_folder/
#                         tacotron_training_folder_name/
#                                     config.yaml    # config file
#                                     <N>epoch.pth   # model checkpoint
#                         tacotron_stats_folder_name/
#                                     train/
#                                         feats_stats.npz
# -------------------------------------------------------------------------------------------
# ---------------------------------- Vocoder --------------------------------------------------
#      vocoder_training_folder_name/
#                                   stats.h5
#                                   checkpoint-<N>steps.pkl   # model checkpoint
#                                   config.yml
# --------------------------------------------------------------------------------------------

language='German_MEL'

# --- Tacotron (mel2text) artifacts on S3 ------------------------------------
# Bucket prefix, folder names and file names of the acoustic model.
taco_source='s3://commonvoicesdataset/Text2Speech/GermanMale/German_Male_char_22050Hz'
taco_root_fldr='exp'
taco_train_fldr='tts_train_tacotron2_raw_char_tacotron'
taco_stats_fldr='tts_stats_raw_char_tacotron'
taco_stats_subf='train'
mel2text_model='200epoch.pth'
mel2text_stats='feats_stats.npz'
met2text_config='config.yaml'

# Full S3 URIs of the three Tacotron files (model, config, feature stats).
taco_exp_uri="${taco_source}/${taco_root_fldr}"
source_taco_model="${taco_exp_uri}/${taco_train_fldr}/${mel2text_model}"
source_taco_config="${taco_exp_uri}/${taco_train_fldr}/${met2text_config}"
source_taco_stats="${taco_exp_uri}/${taco_stats_fldr}/${taco_stats_subf}/${mel2text_stats}"

# --- Vocoder artifacts on S3 -------------------------------------------------
source_vocoder='s3://commonvoicesdataset/Text2Speech/GermanMale'

vocoder_root_fldr='vocoder_MELGAN'
vocoder_stats='stats.h5'
vocoder_config='config.yml'
vocoder_model='checkpoint-1000000steps.pkl'

# Full S3 URIs of the three vocoder files (model, stats, config).
vocoder_uri="${source_vocoder}/${vocoder_root_fldr}"
source_vocoder_model="${vocoder_uri}/${vocoder_model}"
source_vocoder_stats="${vocoder_uri}/${vocoder_stats}"
source_vocoder_config="${vocoder_uri}/${vocoder_config}"


# Local directory layout that receives the downloaded artifacts.
language='German_MEL'

mel2txt_folder_name='exp'
vocoder_folder_name='vocoder'
tacotron_training_folder='train'   # not referenced anywhere else in this cell

# Local folder names. taco_train_fldr is re-assigned here; from this point on
# it holds the local name, which differs from the S3 folder name set earlier.
taco_train_fldr='tts_train_raw_char_tacotron'
taco_stats_fldr='tts_stats_raw_char_tacotron'

root="/home/ec2-user/SageMaker/${language}_artifacts"
vocoder="${root}/${vocoder_folder_name}"

mel2txt="${root}/${mel2txt_folder_name}"
exp_train="${mel2txt}/${taco_train_fldr}"
exp_stats="${mel2txt}/${taco_stats_fldr}"
exp_s_tr="${exp_stats}/train"

# Create every directory that is missing, parents before children; report the
# ones that are already present.
for dir in "${root}" "${vocoder}" "${mel2txt}" "${exp_train}" "${exp_stats}" "${exp_s_tr}"; do
    if [[ -d "${dir}" ]]; then
        echo "Path: ${dir} exists nothing created"
    else
        mkdir -v "${dir}"
    fi
done

# Three blank-looking lines to separate the mkdir log from the download log.
echo " "
echo " "
echo " "

# copy artifacts
#
# Stop the cell on the first failed download. Without this, a failed
# `aws s3 cp` is ignored and the rest of the cell still records paths to files
# that were never downloaded.
fetch_artifact() {
    # $1: source S3 URI, $2: local destination directory
    aws s3 cp "$1" "$2" || { echo "ERROR: failed to download $1" >&2; exit 1; }
}

fetch_artifact "${source_taco_model}"  "${exp_train}"
fetch_artifact "${source_taco_stats}"  "${exp_s_tr}"
fetch_artifact "${source_taco_config}" "${exp_train}"

fetch_artifact "${source_vocoder_model}"  "${vocoder}"
fetch_artifact "${source_vocoder_stats}"  "${vocoder}"
fetch_artifact "${source_vocoder_config}" "${vocoder}"

# inference path
# Local paths of the downloaded files, as they will be used for inference.
inf_taco_model="${exp_train}/${mel2text_model}"
inf_taco_config="${exp_train}/${met2text_config}"
inf_taco_stats="${exp_s_tr}/${mel2text_stats}"

inf_vocoder_model="${vocoder}/${vocoder_model}"
inf_vocoder_stats="${vocoder}/${vocoder_stats}"
inf_vocoder_config="${vocoder}/${vocoder_config}"

info_path="${root}/inf_path.txt"

# Write one "key,path" record per line.
# - The single `>` redirect truncates/creates the file, so no separate
#   rm/touch step is needed and re-running the cell never appends duplicates.
# - The paths are passed as printf ARGUMENTS, not spliced into the format
#   string, so a '%' or '\' inside a path cannot corrupt the record.
{
    # tacotron files
    printf '%s,%s\n' 'taco_model'  "${inf_taco_model}"
    printf '%s,%s\n' 'taco_config' "${inf_taco_config}"
    printf '%s,%s\n' 'taco_stats'  "${inf_taco_stats}"

    # vocoder files
    printf '%s,%s\n' 'vocoder_model'  "${inf_vocoder_model}"
    printf '%s,%s\n' 'vocoder_stats'  "${inf_vocoder_stats}"
    printf '%s,%s\n' 'vocoder_config' "${inf_vocoder_config}"
} > "${info_path}"


mkdir: created directory ‘/home/ec2-user/SageMaker/German_MEL_artifacts’
mkdir: created directory ‘/home/ec2-user/SageMaker/German_MEL_artifacts/vocoder’
mkdir: created directory ‘/home/ec2-user/SageMaker/German_MEL_artifacts/exp’
mkdir: created directory ‘/home/ec2-user/SageMaker/German_MEL_artifacts/exp/tts_train_raw_char_tacotron’
mkdir: created directory ‘/home/ec2-user/SageMaker/German_MEL_artifacts/exp/tts_stats_raw_char_tacotron’
mkdir: created directory ‘/home/ec2-user/SageMaker/German_MEL_artifacts/exp/tts_stats_raw_char_tacotron/train’
 
 
 
download: s3://commonvoicesdataset/Text2Speech/GermanMale/German_Male_char_22050Hz/exp/tts_train_tacotron2_raw_char_tacotron/200epoch.pth to German_MEL_artifacts/exp/tts_train_raw_char_tacotron/200epoch.pth
download: s3://commonvoicesdataset/Text2Speech/GermanMale/German_Male_char_22050Hz/exp/tts_stats_raw_char_tacotron/train/feats_stats.npz to German_MEL_artifacts/exp/tts_stats_raw_char_tacotron/train/feats_stats.npz
download: s3://commo

<h1>Load model</h1>

<h3>Read the artifact paths written by the bash cell</h3>

In [5]:
# Read the "key,path" records of inf_path.txt into the dict `p`
# (e.g. p['taco_model'], p['vocoder_model']).

path='/home/ec2-user/SageMaker/German_MEL_artifacts/inf_path.txt'
p={}
with open(path,mode='r',encoding='utf8',newline='\n') as file:
    for line in file:
        record = line.rstrip('\n')
        if not record:
            # Tolerate blank lines (e.g. a trailing empty line).
            continue
        # Split on the FIRST comma only, so a comma inside the path is kept.
        key, sep, value = record.partition(',')
        if not sep:
            raise ValueError(f"Malformed line in {path!r}: {record!r}")
        p[key] = value


In [6]:
# Make the artifacts folder the working directory for the following cells;
# relative paths used later (e.g. 'results.wav') resolve against it.
# NOTE(review): presumably also required so that relative paths inside the
# model's config file resolve — confirm.
%cd '/home/ec2-user/SageMaker/German_MEL_artifacts/'

/home/ec2-user/SageMaker/German_MEL_artifacts


<h3>Inference with CPU</h3>

In [7]:
# Build two synthesizers from the same acoustic-model checkpoint:
#   tts_GL   - no vocoder file is passed
#              (presumably falls back to Griffin-Lim — TODO confirm)
#   tts_WGAN - additionally loads the vocoder checkpoint from the path file
taco_model_file = p['taco_model']
tts_GL = Text2Speech.from_pretrained(model_file=taco_model_file)
tts_WGAN = Text2Speech.from_pretrained(
    model_file=taco_model_file,
    vocoder_file=p['vocoder_model'],
)

In [8]:
# Synthesize one utterance with the vocoder-backed model and play it inline.
text = 'In Afganestan ma ro gaeedi va in Afhanestan Bia ino bekhor'
wav = tts_WGAN(text)["wav"]

# Take the sampling rate from the loaded model instead of hard-coding 22050,
# so playback (and the later save to disk) stays correct if another model is
# loaded. This matches how the other inference cells use `.fs`.
sr = tts_WGAN.fs

# Listen to the result; this must remain the last expression of the cell so
# that the audio widget is displayed.
ipd.Audio(wav, rate=sr)

In [115]:
import soundfile as sf

# Save the utterance synthesized above. The relative file name is resolved
# against the current working directory.
sf.write(file='results.wav', data=wav, samplerate=sr)


<h3>Inference with GPU</h3>

In [None]:
# Decoding options, grouped so they can be tuned in one place.
gpu_decode_options = dict(
    # Only for Tacotron 2 & Transformer
    threshold=0.5,
    # Only for Tacotron 2
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=True,
    backward_window=1,
    forward_window=3,
)

# Same acoustic model and vocoder as the CPU variant, placed on the GPU.
text2speech = Text2Speech.from_pretrained(
    model_file=str_or_none(p['taco_model']),
    vocoder_file=str_or_none(p['vocoder_model']),
    device="cuda",
    **gpu_decode_options,
)

In [None]:
import time
import torch


In [None]:
# Synthesize one utterance on the GPU and report the real-time factor
# (RTF = synthesis time / duration of the generated audio).
text= 'In Afganestan ma ro gaeedi va in Afhanestan Bia ino bekhor'
with torch.no_grad():
    start = time.time()
    wav = text2speech(text)["wav"]
    # CUDA work is queued asynchronously; wait for it to finish before reading
    # the clock, otherwise the measured time can be too short.
    torch.cuda.synchronize()
elapsed = time.time() - start
rtf = elapsed / (len(wav) / text2speech.fs)
print(f"RTF = {rtf:5f}")


# let us listen to generated samples
from IPython.display import display, Audio
display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))

<h1>Batch inference on several samples for evaluation</h1>

In [9]:
# wavs_folder_path = '/home/ec2-user/SageMaker/espnet/egs2/German_Male/tts1/wavs'
import os
import time
import soundfile as sf
import json
import shutil
import torch

# Output locations for the batch run.
result_folder = '/home/ec2-user/SageMaker/German_MEL_results'
result_batch = os.path.join(result_folder, 'batch_result')

# Always start from an empty batch folder: results of a previous run are
# deleted, then the folder (and any missing parent) is created again.
stale_results_present = os.path.exists(result_batch)
if stale_results_present:
    shutil.rmtree(result_batch)
os.makedirs(result_batch)

In [12]:
%%writefile /home/ec2-user/SageMaker/German_MEL_results/samples.txt
German_TTS_000015|Es ist genau vierzehn Minuten vor neun!
German_TTS_000016|Beim ersten Einsatz bekommt sie beide Bälle.
German_TTS_000017|Finnland wirft nun alles nach vorne.

Writing /home/ec2-user/SageMaker/German_MEL_results/samples.txt


<h3>CPU inference</h3>

In [13]:
# Batch synthesis with the CPU model: one wav file per "id|text" line of
# samples.txt, plus times.json mapping each wav file name to its real-time
# factor (synthesis time / audio duration).
# The sample texts contain non-ASCII characters, so read the file as UTF-8
# explicitly instead of relying on the platform default.
with open(os.path.join(result_folder,'samples.txt'),'r',encoding='utf8') as file:
    lines = file.readlines()

sample_num = []
# Placeholders that this cell does not use.
wave_name  = []
wave_path  = []
data_dict  = {}
rtf        = {}
for idx,line in enumerate(lines):
    line = line.rstrip('\n')
    if not line.strip():
        # Skip blank lines (e.g. a trailing empty line in samples.txt).
        continue
    print(idx)
    # Split on the first '|' only, so a '|' inside the sentence is kept.
    utt_id, text = line.split('|', 1)
    sample_num.append(utt_id)
    with torch.no_grad():
        start = time.time()
        wav = tts_WGAN(text)["wav"]
        # Stop the clock before writing to disk: RTF should measure the
        # synthesis only, not file I/O.
        elapsed = time.time() - start
    file_name = utt_id + '.wav'
    # Use the model's own sampling rate for the file, the same value that is
    # used for the RTF below, instead of a hard-coded 22050.
    sf.write(os.path.join(result_batch,file_name),wav,samplerate=tts_WGAN.fs)
    rtf[file_name] = elapsed / (len(wav) / tts_WGAN.fs)
with open(os.path.join(result_batch,'times.json'),'w') as file:
    json.dump(rtf,file)

0
1
2


<h3>GPU inference </h3>

In [None]:

# Batch synthesis with the GPU model: one wav file per "id|text" line of
# samples.txt, plus times.json mapping each wav file name to its real-time
# factor (synthesis time / audio duration).
# NOTE(review): this writes into the same `result_batch` folder and uses the
# same file names as the CPU batch cell, so it overwrites those results.
# The sample texts contain non-ASCII characters, so read the file as UTF-8
# explicitly instead of relying on the platform default.
with open(os.path.join(result_folder,'samples.txt'),'r',encoding='utf8') as file:
    lines = file.readlines()

sample_num = []
# Placeholders that this cell does not use.
wave_name  = []
wave_path  = []
data_dict  = {}
rtf        = {}
for idx,line in enumerate(lines):
    line = line.rstrip('\n')
    if not line.strip():
        # Skip blank lines (e.g. a trailing empty line in samples.txt).
        continue
    # Split on the first '|' only, so a '|' inside the sentence is kept.
    utt_id, text = line.split('|', 1)
    sample_num.append(utt_id)
    with torch.no_grad():
        start = time.time()
        wav = text2speech(text)["wav"]
        # CUDA work is queued asynchronously; wait for it to finish before
        # reading the clock, otherwise the measured time can be too short.
        torch.cuda.synchronize()
        # Stop the clock before the device-to-host copy and the disk write:
        # RTF should measure the synthesis only.
        elapsed = time.time() - start
    file_name = utt_id + '.wav'
    # Use the model's own sampling rate for the file, the same value that is
    # used for the RTF below, instead of a hard-coded 22050.
    sf.write(os.path.join(result_batch,file_name),wav.view(-1).cpu().numpy(),samplerate=text2speech.fs)
    rtf[file_name] = elapsed / (len(wav) / text2speech.fs)
with open(os.path.join(result_batch,'times.json'),'w') as file:
    json.dump(rtf,file)