**Vocoder Script**

In [None]:
!pip install numpy scipy librosa unidecode inflect librosa
!apt-get update
!apt-get install -y libsndfile1

In [None]:
import torch
import numpy as np
import librosa

# Fetch the WaveGlow model published in NVIDIA's torch.hub repository,
# requesting the fp32 variant.
waveglow = torch.hub.load(
    'NVIDIA/DeepLearningExamples:torchhub',
    'nvidia_waveglow',
    model_math='fp32',
)

In [None]:
# Prepare WaveGlow for inference: strip weight normalization, move the model
# to the GPU and switch it to evaluation mode.
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to('cuda')
# Trailing ';' keeps the notebook from echoing the whole module repr.
waveglow.eval();

In [None]:
!pip install speechbrain

In [None]:
from speechbrain.inference.vocoders import HIFIGAN

# Load the pretrained HiFi-GAN vocoder; its files are stored under `savedir`.
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="pretrained_models/tts-hifigan-ljspeech",
)

In [4]:
def librosaWAV2MEL(path): # From a file path
  """Compute an 80-band mel spectrogram of an audio file with librosa.

  Args:
    path: location of the audio file to read.

  Returns:
    A float32 torch tensor on the CUDA device holding the mel spectrogram.
  """
  # NOTE(review): only n_mels is set here; every other framing parameter is
  # librosa's default -- confirm they match whatever consumes this output.
  waveform, sample_rate = librosa.load(path)
  mel_np = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=80)
  return torch.from_numpy(mel_np).to(device='cuda', dtype=torch.float32)

In [53]:
import torchaudio
from speechbrain.lobes.models.FastSpeech2 import mel_spectogram

def HIFIWAV2MEL(path, sample_rate=22050):
  """Compute a mel spectrogram of an audio file with SpeechBrain's front-end.

  The audio is loaded with librosa, resampled to ``sample_rate``, and passed
  to ``mel_spectogram`` with the fixed analysis settings below.

  Args:
    path: location of the audio file to read.
    sample_rate: rate the audio is resampled to on load and that is reported
      to ``mel_spectogram``. The default keeps the previously hard-coded
      value of 22050.

  Returns:
    The first value returned by ``mel_spectogram`` (the spectrogram tensor);
    the second returned value is discarded.
  """
  # Load at exactly the rate handed to mel_spectogram so the two can never
  # drift apart (previously they only agreed via librosa's default rate).
  signal, _ = librosa.load(path, sr=sample_rate)
  signal = torch.tensor(signal)

  # NOTE(review): these settings presumably mirror the configuration the
  # pretrained HiFi-GAN vocoder was trained with -- confirm before changing.
  spectrogram, _ = mel_spectogram(
      audio=signal.squeeze(),
      sample_rate=sample_rate,
      hop_length=256,
      win_length=None,
      n_mels=80,
      n_fft=1024,
      f_min=0.0,
      f_max=8000.0,
      power=1,
      normalized=False,
      min_max_energy_norm=True,
      norm="slaney",
      mel_scale="slaney",
      compression=True
  )

  return spectrogram

In [5]:
def readMEL(path):
    """Read an array saved in a ``.npy`` file and return it as a torch tensor.

    Args:
        path: location of the ``.npy`` file.

    Returns:
        A torch tensor created from the loaded NumPy array (same dtype/shape).
    """
    return torch.from_numpy(np.load(path))

In [17]:
def waveglowMEL2WAV(MEL, sigma = 1):
  """Synthesize a waveform from a mel spectrogram with the global WaveGlow model.

  Args:
    MEL: mel spectrogram tensor handed to ``waveglow.infer``.
    sigma: second argument forwarded to ``waveglow.infer``
      (presumably the sampling noise scale -- confirm against the model docs).

  Returns:
    NumPy array with the first item of the batch returned by the model.
  """
  # Inference only: without no_grad() an autograd graph would be recorded
  # and kept alive on the GPU for no benefit.
  with torch.no_grad():
    audio = waveglow.infer(MEL, sigma)
  return audio[0].detach().cpu().numpy()

In [18]:
def HIFIGANMEL2WAV(MEL):
  """Synthesize a waveform from a mel spectrogram with the global HiFi-GAN model.

  Args:
    MEL: batch of mel spectrograms handed to ``hifi_gan.decode_batch``.

  Returns:
    1-D NumPy array: the decoded output flattened across all dimensions.
  """
  y_hat = hifi_gan.decode_batch(MEL)
  # Tensor.numpy() raises for CUDA tensors and for tensors that require grad,
  # so detach and copy to host memory first; both are no-ops on a plain
  # CPU tensor.
  return y_hat.detach().cpu().flatten().numpy()

In [19]:
def griffinLimMEL2WAV(MEL, sr = 22050, n = 1024, hop_length = 256, n_iter = 32):
    """Invert a mel spectrogram to audio using Griffin-Lim phase estimation.

    Args:
        MEL: mel spectrogram as a torch tensor or NumPy array. Tensors are
            detached, copied to the CPU and cast to float32 first.
        sr: sample rate used to build the mel filter bank for the inversion.
        n: FFT size; also used as the window length.
        hop_length: hop between frames, in samples (default 256, the value
            that was previously hard-coded).
        n_iter: number of Griffin-Lim iterations.

    Returns:
        1-D NumPy array containing the reconstructed waveform.
    """
    if isinstance(MEL, torch.Tensor):
        MEL = MEL.detach().cpu().numpy()
        MEL = MEL.astype(np.float32)
    # Approximate the linear-frequency magnitude spectrogram from the mel bands.
    # NOTE(review): mel_to_stft is called with its default `power`; confirm the
    # input matches it (HIFIWAV2MEL uses power=1 with compression=True).
    S = librosa.feature.inverse.mel_to_stft(MEL, sr=sr, n_fft=n)
    # S holds magnitudes only. Griffin-Lim estimates the missing phase;
    # feeding S straight to istft would reconstruct with zero phase instead.
    y_hat = librosa.griffinlim(
        S, n_iter=n_iter, hop_length=hop_length, win_length=n, n_fft=n
    )
    return y_hat.flatten()

In [20]:
def boost(audio, gain_db):
    """Scale ``audio`` by a gain expressed in decibels.

    Args:
        audio: value or array of samples to scale.
        gain_db: gain in dB; converted to a linear factor 10**(gain_db/20).

    Returns:
        ``audio`` multiplied by the linear gain factor.
    """
    linear_gain = 10 ** (gain_db / 20)
    return audio * linear_gain

In [24]:
# Load the stored mel spectrogram and keep only its first 400 columns so the
# vocoders do not run out of CUDA memory.
path = "mel1.npy"
MEL = readMEL(path)[:, :400]

print(MEL.size())
# Move to the GPU, then prepend a batch dimension.
MEL = MEL.to('cuda').unsqueeze(dim=0)

torch.Size([80, 400])


In [57]:
# Compute a mel spectrogram directly from a reference recording.
path_wav = "reference4.wav"
MEL2 = HIFIWAV2MEL(path_wav)

print(MEL2.size())
# Move to the GPU, then prepend a batch dimension.
MEL2 = MEL2.to('cuda').unsqueeze(dim=0)

torch.Size([80, 211])


In [54]:
# Vocode the same mel spectrogram with WaveGlow (sigma 0.6) and HiFi-GAN,
# then report the length of each result.
y_hat = waveglowMEL2WAV(MEL, sigma=0.6)
y_hat2 = HIFIGANMEL2WAV(MEL)
print(y_hat.shape, y_hat2.shape, sep="\n")

(102400,)
(104960,)


In [58]:
from scipy.io.wavfile import write
# Save both vocoder outputs for the stored mel as 22050 Hz WAV files.
write(filename="mel1test11sigma06.wav", rate=22050, data=y_hat)
write(filename="mel1test1HIFI.wav", rate=22050, data=y_hat2)

# Reconstruct the reference recording from its own mel and save it as well.
y_hat3 = HIFIGANMEL2WAV(MEL2)
write(filename="ref4reconstruction.wav", rate=22050, data=y_hat3)