<a href="https://colab.research.google.com/github/JeanMichelDeveloper/Training/blob/main/Text_to_speech_with_PyTorch%2C_Tacotron_2_and_WaveGlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load model

In [5]:
#install unidecode
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 33.5 MB/s 
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6


In [18]:
#import necessary libraries
import torch
from scipy.io.wavfile import write
from IPython.display import Audio

In [7]:
#load waveglow
cuda_is_available = torch.cuda.is_available()

device = torch.device("cuda" if cuda_is_available else "cpu")

def load_waveglow():
  waveglow = torch.hub.load("nvidia/DeepLearningExamples:torchhub",
                            "nvidia_waveglow")
  
  waveglow = waveglow.remove_weightnorm(waveglow)

  waveglow = waveglow.to(device)

  waveglow.eval()

  return waveglow

In [10]:
waveglow = load_waveglow()

Using cache found in /root/.cache/torch/hub/nvidia_DeepLearningExamples_torchhub
Downloading checkpoint from https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ckpt_fp32/versions/19.09.0/files/nvidia_waveglowpyt_fp32_20190427


In [8]:
#load Tacotron
tacotron2 = torch.hub.load("nvidia/DeepLearningExamples:torchhub",
                           "nvidia_tacotron2",
                           model_math = "fp16")

tacotron2 = tacotron2.to(device)

tacotron2.eval()

Using cache found in /root/.cache/torch/hub/nvidia_DeepLearningExamples_torchhub


Tacotron2(
  (embedding): Embedding(148, 512)
  (encoder): Encoder(
    (convolutions): ModuleList(
      (0): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (2): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (lstm): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (prenet): Prenet(
      (layers): ModuleList(
        (0): LinearNorm(
          (lin

convert text to speech

In [9]:
text = "This is a sample piece of text converted to speech by Waveglow and Tacotron2"

#prepare text for model
utils = torch.hub.load("nvidia/DeepLearningExamples:torchhub",
                       "nvidia_tts_utils")

sequences, lengths = utils.prepare_input_sequence([text])

Using cache found in /root/.cache/torch/hub/nvidia_DeepLearningExamples_torchhub
  return s in _symbol_to_id and s is not '_' and s is not '~'
  return s in _symbol_to_id and s is not '_' and s is not '~'


In [12]:
#send text to model
with torch.no_grad():
  mel, _, _, = tacotron2.infer(sequences, lengths)

  audio = waveglow.infer(mel)

  print(audio)

tensor([[0.0005, 0.0005, 0.0001,  ..., 0.0002, 0.0006, 0.0006]],
       device='cuda:0')


In [15]:
#convert result to numpy array
audio_numpy = audio[0].data.cpu().numpy()

In [16]:
#create audio file
rate = 22050
write("textToSpeechWithTorch.wav", rate, audio_numpy)

In [19]:
#read audio file
Audio(audio_numpy, rate = rate)