# Speech Synthesis

In [80]:
# Taken from: https://github.com/dgaddy/silent_speech
import os
import json
import sys
import numpy as np

import torch

sys.path.append('./hifi_gan')
from env import AttrDict
from models import Generator

# from absl import flags
# FLAGS = flags.FLAGS
# flags.DEFINE_string('hifigan_checkpoint', None, 'filename of hifi-gan generator checkpoint')

class Vocoder(object):
    def __init__(self, hifigan_checkpoint=None, device='cuda'):
        assert hifigan_checkpoint is not None
        checkpoint_file = hifigan_checkpoint
        config_file = os.path.join(os.path.split(checkpoint_file)[0], 'config.json')
        with open(config_file) as f:
            hparams = AttrDict(json.load(f))
        self.generator = Generator(hparams).to(device)
        self.generator.load_state_dict(torch.load(checkpoint_file)['generator'])
        self.generator.eval()
        self.generator.remove_weight_norm()

    def __call__(self, mel_spectrogram):
        '''
            mel_spectrogram should be a tensor of shape (seq_len, 80)
            returns 1d tensor of audio
        '''
        with torch.no_grad():
            mel_spectrogram = mel_spectrogram.T[np.newaxis,:,:]
            audio = self.generator(mel_spectrogram)
        return audio.squeeze()

In [81]:
vocoder = Vocoder("./hifigan_finetuned/checkpoint")

Removing weight norm...


## Visualise Mel Spectrogram

In [82]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

def plot_mel_spectrograms(pred, text):
    fig, ax = plt.subplots(1) # nrows=1, ncols=2)

    # ax[0].set_title(f"Mel Spectogram (Predicted)")
    pred = np.swapaxes(pred, 0, 1)
    cax = ax.imshow(pred, interpolation='nearest', cmap=cm.coolwarm, origin='lower')

    ax.set_title(text)
    
    return fig, ax

## Synthesis Ground Truth JP Shorts Video

In [4]:
from lib import load_audio, get_audio_feats

test_audio_path = "./dataset/wsDmwoOrpR8/The False Appeal of Communism.mp3"

audio_arr   = load_audio(test_audio_path)
audio_feats = get_audio_feats(audio_arr, n_mel_channels=80)

  "class": algorithms.Blowfish,
 -4.07649505e-05  0.00000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  audio_features = librosa.feature.melspectrogram(


In [5]:
audio_feats.shape

(5136, 80)

In [6]:
# audio_feats = np.expand_dims(audio_feats, axis=0)

In [7]:
audio_feats = torch.tensor(audio_feats).to("cuda")

In [8]:
pred_audio = vocoder(audio_feats)

In [9]:
pred_audio.shape

torch.Size([1314816])

In [10]:
import soundfile as sf
sf.write('pred_output.wav', pred_audio.cpu().numpy(), 16_000)

## Synthesis Mel Spectrogram Prediction

In [114]:
# audio_feats = torch.load("./overfit_lecture_speech_features_melchannel80.pt").to("cpu")
#] audio_feats = torch.load("./general_lecture_speech_features_melchannel80.pt").to("cpu")
# audio_feats = torch.load("./validate_valid_lecture_speech_features_melchannel80.pt").to("cpu")
audio_feats = torch.load("./full_model_valid_mel_pred.pt").to("cpu")
# audio_feats = torch.load("./full_model_valid_mel_ground.pt").to("cpu")

In [115]:
_ = plot_mel_spectrograms(audio_feats, "")

In [116]:
print(audio_feats.device)

cpu


In [117]:
audio_feats = torch.tensor(audio_feats).to("cuda")

  audio_feats = torch.tensor(audio_feats).to("cuda")


In [118]:
print(audio_feats.shape)

torch.Size([2078, 80])


In [119]:
audio_feats_np = audio_feats.detach().cpu().numpy()
#fig, ax = plot_mel_spectrograms(audio_feats_np, "Pre-Interpolated Mel Spectrogram")
##plt.plot()

### Interpolate Predicted Mel Spectrogram

In [120]:
import numpy as np
from scipy import interpolate

def interpolate_spectrogram(spectrogram, original_hop_length, target_hop_length):
    original_time_steps, num_freqs = spectrogram.shape

    # Calculate the number of time steps for the target hop length
    target_time_steps = int(original_time_steps * (original_hop_length / target_hop_length))

    # Define a function for the interpolation
    x = np.arange(original_time_steps)
    f = interpolate.interp1d(x, spectrogram, axis=0, kind='linear')

    # Generate the new time steps
    x_new = np.linspace(0, original_time_steps-1, target_time_steps)

    # Apply the interpolation function
    new_spectrogram = f(x_new)

    return new_spectrogram

In [121]:
stretched_audio_feats = interpolate_spectrogram(
    audio_feats_np,
    original_hop_length=534,
    target_hop_length=int(534//2))
    # target_hop_length=160)

In [122]:
stretched_audio_feats.shape

(4156, 80)

In [123]:
final_audio_feats = torch.tensor(stretched_audio_feats).float().to("cuda")

In [124]:
final_audio_feats_10pc = final_audio_feats[:int(final_audio_feats.shape[0] * 0.10), :]

In [125]:
final_audio_feats_10pc.shape

torch.Size([415, 80])

In [126]:
final_audio_feats.shape

torch.Size([4156, 80])

In [127]:
pred_audio = vocoder(final_audio_feats_10pc)
# pred_audio = vocoder(final_audio_feats)

In [128]:
import soundfile as sf
sf.write('pred_output.wav', pred_audio.cpu().numpy(), 16_000)

## Speech Recognition

In [129]:
import whisper

model = whisper.load_model("base")
result = model.transcribe("pred_output.wav")
print(result["text"])

 you should quit πάocrons or the Street Times, so that all these Citocrons are changed. Well, I did quite a lot from your perspective.
