# Setup Code

In [None]:
# Only needed for setup
# Creates directory for experiment
%cd '/content/drive/'
import os
if not os.path.exists('My Drive/disso/'):
    os.makedirs('My Drive/disso/')
else:
    print("\nDirectory already exists")

In [None]:
# Only need to run for setup
# Clones tacotron2 repository
%cd '/content/drive/My Drive/disso/'
!git clone https://github.com/NVIDIA/tacotron2.git

%cd '/content/drive/My Drive/disso/tacotron2'
# Discards any local changes and moves the checkout to the tip of origin/master
!git fetch --all
!git reset --hard origin/master

# installs WaveGlow
# (fetched as a git submodule of the tacotron2 checkout)
!git submodule init
!git submodule update

In [None]:
# Only need to run for setup
# verifies and updates WaveGlow
%cd "/content/drive/My Drive/disso/tacotron2/waveglow"
!git submodule init
!git submodule update
# Hard-resets the WaveGlow checkout to its own origin/master, dropping local changes.
# NOTE(review): this may move WaveGlow off the commit pinned by the parent repository - confirm this is intended.
!git fetch --all
!git reset --hard origin/master

# Runtime Code

In [None]:
# Lists the GPUs visible to this runtime
!nvidia-smi -L
#P100 or V100 needed
#T4, P4, k80 slow

In [2]:
# # only needed for gpu stats
# !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
# !pip install gputil
# !pip install psutil
# !pip install humanize

In [3]:
# # GPU stats
# import psutil
# import humanize
# import os
# import GPUtil as GPU
# GPUs = GPU.getGPUs()
# # XXX: only one GPU on Colab and isn’t guaranteed
# gpu = GPUs[0]
# def printm():
#  process = psutil.Process(os.getpid())
#  print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
#  print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
# printm()

In [None]:
# Execute before each run time
# Installs requirements
%cd "/content/drive/My Drive/disso/tacotron2"
!apt-get install sox
!pip install -r requirements.txt

In [None]:
# Pins numba to version 0.48.
# NOTE(review): presumably needed for compatibility with the librosa version from requirements.txt - confirm.
!pip install numba==0.48

# tacotron2 training

In [None]:
# Code for training models
# Runs train.py from the repository root, starting from outdir/checkpoint_8000
%cd "/content/drive/My Drive/disso/tacotron2"
!python train.py --output_directory=outdir --log_directory=logdir -c outdir/checkpoint_8000 --warm_start
# for starting from a checkpoint, -c 'chk_point' --warm_start
# change directory of checkpoint as required
# NOTE(review): --warm_start presumably loads model weights only (no optimizer state or
# iteration counter); to resume an interrupted run, -c without --warm_start may be what
# is wanted - confirm against train.py.

37k+ warm start cori, 18k+ phil

# tensorboard for monitoring training

In [None]:
# Launches a tensorboard to view training progress
%load_ext tensorboard
import tensorflow as tf
import datetime, os
%cd "/content/drive/My Drive/disso/tacotron2"
# The log path is relative to the tacotron2 directory entered above
%tensorboard --logdir=outdir/logdir
# Change directory of logs as required

# audio files processing

In [None]:
# #run this cell to extract tar files
# #code to unzip your tar audio files, change directories as required
# !tar -xvf "/content/drive/My Drive/files/audio.tar" -C "/content/drive/My Drive/test/"    

In [None]:
pip install wavio

In [None]:
# Collects the names of the audio files whose sample rate will be converted.
# Repeat for every directory containing audio files; change directories as required.
import os
# change .flac to .wav for wav files with different sr
wav_list = [
    name
    for _root, _dirs, names in os.walk('/content/drive/My Drive/disso/tacotron2/audio/6097_clean/15326/')
    for name in names
    if name.endswith('.flac')
]
#print(len(wav_list))

In [None]:
# only used initially to correct the sample rate of files to example sr
# change directories as required

%cd "/content/drive/My Drive/disso/tacotron2/audio/6097_clean/15326/"
import wavio
import soundfile as sf
import librosa
from tqdm import tqdm   
for i in tqdm(wav_list):
  y, s = librosa.load(i, sr=22050)
  wavio.write('/content/drive/My Drive/disso/tacotron2/audio/6097_clean/15326/'+i[:-5]+'.wav', y, 22050,sampwidth=2)
  #sf.write('/content/drive/My Drive/disso/tacotron2/audio/6097_clean/9575/'+i[:-5]+'.wav', y, 22050)
# Use either wavio or sf depending on system limitations

In [None]:
# #removes unnecessary files, only required if the source file is non-wav
# %cd "/content/drive/My Drive/disso/tacotron2/audio/6097_clean/15326"
# for i in tqdm(wav_list):
#   %rm "$i"

# padding

In [None]:
!sudo apt-get install sox

In [None]:
# Change directory here as required; the padding loop below works on the *.wav files in the current directory
%cd "/content/drive/My Drive/disso/tacotron2/audio/6097_clean/15326"

In [None]:
%%bash
# The %%bash cell magic must be the very first line of the cell to be recognised.
# Batch process to remove silence from the start of files and to add silences at the end for padding.
# NOTE(review): sox applies effects left to right, so both `silence` passes run after `pad`
# and may trim the padding again - confirm the output really keeps the 0.2 s pad.

for file in *.wav; do
    # An unmatched glob stays as the literal "*.wav"; skip it instead of failing in cp.
    [ -e "$file" ] || continue
    # A scratch file left over from an earlier run must never be used as input.
    [ "$file" = "tmp.wav" ] && continue
    cp "$file" "tmp.wav";
    sox -q "tmp.wav" "$file" pad .2 .2 silence 1 0.1 0.1% reverse silence 1 0.1 0.1% reverse;
done

In [None]:
# Deletes the scratch copy (tmp.wav) left behind by the padding loop
%rm tmp.wav

# Audio synthesis test

In [None]:
# Only required if a unidecode error appears
# (for example an import error naming unidecode when the import cell below is run)
!pip install unidecode

#### Import libraries and setup matplotlib

In [2]:
%cd "/content/drive/My Drive/disso/tacotron2"

import matplotlib
%matplotlib inline
import matplotlib.pylab as plt

import IPython.display as ipd

import sys
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
%cd "/content/drive/My Drive/disso/tacotron2/waveglow"
from denoiser import Denoiser

/content/drive/My Drive/disso/tacotron2
/content/drive/My Drive/disso/tacotron2/waveglow


In [3]:
# plot function to view mel outputs
def plot_data(data, figsize=(16, 4)):
    """Show each 2-D array in ``data`` as an image, side by side in one row.

    Args:
        data: Sequence of 2-D arrays; one subplot is drawn per element.
        figsize: Size of the whole figure, passed through to ``plt.subplots``.
    """
    # squeeze=False keeps ``axes`` two-dimensional, so a single panel can be
    # indexed the same way as several panels.
    fig, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        # 'lower' places row 0 at the bottom of the plot; the legacy spelling
        # 'bottom' is rejected by current Matplotlib releases.
        ax.imshow(image, aspect='auto', origin='lower',
                  interpolation='none')

#### Setup hparams

In [None]:
# initiates hyper parameters
hparams = create_hparams()
# 22050 Hz matches the rate the audio files are resampled to in the processing cells above
hparams.sampling_rate = 22050

#### Load model from checkpoint

In [None]:
# change path to the desired checkpoint
%cd "/content/drive/My Drive/disso/tacotron2/outdir"
# Relative to the outdir directory entered above
checkpoint_path = 'warm_start/10000'
model = load_model(hparams)
# The checkpoint file stores the model weights under the 'state_dict' key
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
# Move to the GPU, switch to eval mode and cast to half precision
_ = model.cuda().eval().half()

#### Load WaveGlow for mel2audio synthesis and denoiser

In [None]:
# loads and primes WaveGlow model
%cd "/content/drive/My Drive/disso/tacotron2"
# NOTE(review): this path is under 'Disso_package', not the 'disso' folder used by the other cells - confirm.
waveglow_path = '/content/drive/My Drive/Disso_package/files/waveglow_256channels_universal_v5.pt'
# torch.load unpickles the file, so only load checkpoints from a trusted source
waveglow = torch.load(waveglow_path)['model']
waveglow.cuda().eval().half()
# Cast every entry of waveglow.convinv back to float after the half() call above
for k in waveglow.convinv:
    k.float()
denoiser = Denoiser(waveglow)

#### Prepare text input

In [19]:
# Input text to be vocalised into audio
# Change as required
text = "HELLO! I WAS CREATED TO TEST THE EFFECT OF TRANSFER LEARNING ON AUTOMATED AUDIO SYNTHESIS. CHANGE THIS TEXT TO TEST ME"
# Encode the text with the English cleaners and add a leading batch axis of size 1
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
# torch.autograd.Variable is a deprecated no-op wrapper, so the tensor is used directly
sequence = torch.from_numpy(sequence).cuda().long()

#### Decode text input and plot results

In [20]:
# Displays mel spectrum outputs and alignment
# Inference needs no gradients; no_grad avoids building an autograd graph for the outputs
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
# Cast to float and move to the CPU for plotting; the alignment is transposed for display
plot_data((mel_outputs.float().data.cpu().numpy()[0],
           mel_outputs_postnet.float().data.cpu().numpy()[0],
           alignments.float().data.cpu().numpy()[0].T))

#### Synthesize audio from spectrogram using WaveGlow

In [None]:
# Produces raw audio
# Gradients are not needed for synthesis, so it runs under no_grad
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=1)
# Plays the first item of the batch at the configured sampling rate
ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)

#### (Optional) Remove WaveGlow bias

In [None]:
# Denoises the raw audio
# [:, 0] keeps index 0 along the second axis of the denoiser output
# NOTE(review): strength presumably scales how much of the estimated bias is removed - confirm in denoiser.py
audio_denoised = denoiser(audio, strength=0.005)[:, 0]
ipd.Audio(audio_denoised.cpu().numpy(), rate=hparams.sampling_rate) 