In [1]:
import torch
from tacotron import Tacotron
from pathlib import Path
from typing import Union, List
import numpy as np
import librosa
from synthesizer import Synthesizer
from speech_encoder import SpeechEncoder
from speech_encoder_v2 import SpeechEncoderV2
from data_preprocessing import *
import torchaudio
from embed import Embed

In [2]:

# Reference utterance whose voice will be cloned.
# Raw string: the path contains Windows backslashes (`\C`, `\S`, `\d`, `\L`) that are
# invalid escape sequences in a normal string literal; the path value itself is unchanged.
reference_wav_path = r"D:\CODING\SpeechEncoder\data\LibriSpeech/train-clean-100/2764/36616/2764-36616-0000.flac"
wav, sample_rate = torchaudio.load(reference_wav_path)

# `preprocess_audio` is presumably provided by the `data_preprocessing` star import;
# it receives the loaded waveform together with its sample rate.
wav = preprocess_audio(wav, sample_rate)

In [3]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Only referenced by the commented-out SpeechEncoder (v1) variant below.
loss_device = torch.device("cpu")

# encoder = SpeechEncoder(device, loss_device)
encoder = SpeechEncoderV2(device, device)

# checkpoints = torch.load(r"models\speech_encoder_lstm\encoder.pt")
# Raw string: `\s` and `\e` are invalid escape sequences in a normal literal (same path value).
# map_location: remap stored tensors onto `device`, so a checkpoint saved on a CUDA machine
# still loads when this notebook runs on a CPU-only machine.
checkpoints = torch.load(
    r"models\speech_encoder_transformer\encoder(0.096).pt",
    map_location=device,
)

# The checkpoint is a dict; the model weights live under the 'model_state' key.
encoder.load_state_dict(checkpoints['model_state'])
embedder = Embed(encoder)



In [None]:
# Embed the reference utterance. With return_partials=True the call yields three values;
# the third one is not needed here and is discarded.
embedding, partial_embeds, _ = embedder.embed_utterance(wav, return_partials=True)
# embedding = np.expand_dims(embedding, 0)

# Text to synthesize, split into one entry per line (this passage has no newline,
# so the list holds a single entry).
text = "Last weekend, I went to the zoo with my family. We saw lions, elephants, and monkeys. The birds were colorful and sang beautiful songs. It was exciting to see so many animals in one place.".split("\n")

# Pair every text entry with the same speaker embedding.
embeddings = [embedding for _line in text]

In [None]:
# Build the synthesizer from its checkpoint path and the per-entry speaker embeddings,
# then load it before use.
synthesizer_model_path = Path("models/synthesizer/synthesizer.pt")
synthesizer = Synthesizer(synthesizer_model_path, embeddings)
synthesizer.load()

# Presumably one spectrogram per entry of `text` (confirm in Synthesizer).
specs = synthesizer.synthesize_spectrograms(text)

# Axis-1 length of each individual spectrogram; a later cell multiplies these by the
# hop size to locate the utterance boundaries in the generated waveform.
breaks = [single_spec.shape[1] for single_spec in specs]

# All spectrograms joined along axis 1 into a single array.
spec = np.concatenate(specs, axis=1)

## Generating WAV using the Griffin-Lim Algorithm

In [None]:
# Turn the concatenated spectrogram `spec` into a waveform with the synthesizer's
# `griffin_lim` method (no separate vocoder model is used in this step).
wav_og = synthesizer.griffin_lim(spec)

In [None]:
# Convert the per-utterance frame counts in `breaks` into sample positions:
# each utterance ends at the cumulative sum of (frames * hop size) and starts where
# the previous one ended.
b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
b_starts = np.concatenate(([0], b_ends[:-1]))
wavs = [wav_og[start:end] for start, end in zip(b_starts, b_ends)]

# 0.15 s of silence appended after every utterance.
# `breaks` is deliberately NOT rebound to the silence buffers, so it keeps holding the
# frame counts and this cell gives the same result when it is run again.
silence = np.zeros(int(0.15 * Synthesizer.sample_rate))

# NOTE: this rebinds `wav`, which earlier in the notebook held the preprocessed
# reference utterance; the playback cell below reads the new value.
wav = np.concatenate([piece for segment in wavs for piece in (segment, silence)])

In [None]:
# wav_og = preprocess_audio(wav_og, Synthesizer.sample_rate)
# Peak-normalise the raw Griffin-Lim output to 0.97 of full scale.
# NOTE(review): `wav_processed` is computed but not the signal played below — confirm
# which of the two is meant to be auditioned.
wav_processed = wav_og / np.abs(wav_og).max() * 0.97

import IPython.display as ipd
# Play at the synthesizer's own sample rate (the rate used to size the inserted silences)
# instead of a hard-coded 16000, so playback stays correct if that setting changes.
ipd.Audio(wav, rate=Synthesizer.sample_rate)

## Generating WAV using the Vocoder (credits: Corentin Jemine)

In [None]:
from vocoder import Vocoder

# Create the vocoder and hand it the saved model file.
# `load_model` is project code — presumably it restores the trained weights
# (confirm in the vocoder module).
vocoder = Vocoder()
vocoder.load_model("models/vocoder/vocoder.pt")

In [None]:
# Generate a waveform from the concatenated spectrogram with the loaded vocoder.
wav_new = vocoder.infer_waveform(spec)

# Disabled post-processing, kept for reference: split the vocoder output at the utterance
# boundaries, insert 0.15 s silences, and peak-normalise to 0.97.
# NOTE(review): re-enabling it requires `breaks` to still hold the per-utterance frame counts.
# b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
# b_starts = np.concatenate(([0], b_ends[:-1]))
# wavs = [wav_new[start:end] for start, end, in zip(b_starts, b_ends)]
# breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
# wav_vocoder = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

# wav_vocoder = wav_new / np.abs(wav_new).max() * 0.97

import IPython.display as ipd
# Play at the synthesizer's own sample rate rather than a hard-coded 16000, matching the
# rate the rest of the notebook reads from `Synthesizer.sample_rate`.
ipd.Audio(wav_new, rate=Synthesizer.sample_rate)