## Importing all the required modules and dependencies

In [3]:
# Make the parent directory importable — presumably so the project packages
# imported below (scripts, data_preprocessing) resolve when the notebook is
# run from its own sub-folder; confirm against the repository layout.
import sys
sys.path.append('..')

import torch
from pathlib import Path
import numpy as np
import librosa
from scripts.synthesizer import Synthesizer
from scripts.speech_encoder_v2_updated import SpeechEncoderV2
# NOTE(review): the wildcard import hides which names this notebook actually
# takes from data_preprocessing; consider importing them explicitly.
from data_preprocessing import *
from scripts.embed import Embed

## Preprocessing the audio

In [5]:
from temp.audio import preprocess_wav

# Load the reference utterance at 16 kHz explicitly. Without `sr`, librosa.load()
# resamples to its 22050 Hz default, which would not match the 16000 that this
# cell hands to preprocess_wav.
wav, sample_rate = librosa.load(r"../test/audio_1.mp3", sr=16000)

# Pass the rate the audio really has (16000) instead of a separate literal, so
# the loaded signal and the declared rate cannot drift apart.
wav = preprocess_wav(wav, sample_rate)

## Initializing the speaker encoder model and loading the checkpoints

In [6]:
# Run the encoder on the GPU when one is available; the loss device is the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_device = torch.device("cpu")

encoder = SpeechEncoderV2(device, loss_device)

# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU machine still loads on a CPU-only one.
checkpoints = torch.load(
    r"../models/speech_encoder_transformer_updated/encoder_073500_loss_0.0724.pt",
    map_location=device
)

encoder.load_state_dict(checkpoints['model_state'])

# This notebook only runs inference: put the model in evaluation mode so that
# train-time layers (e.g. dropout, if the encoder has any) are disabled and the
# embeddings are deterministic. Harmless if Embed already does this.
encoder.eval()

embedder = Embed(encoder)



## Generating the speaker embeddings

In [7]:
# Embed the preprocessed reference utterance. With return_partials=True the
# call returns three values; only the first (utterance embedding) is used below.
embedding, partial_embeds, _ = embedder.embed_utterance(wav, return_partials=True)

# Each line of the input text becomes one utterance to synthesize.
text = "Last weekend, I went to the zoo with my family. We saw lions, elephants, and monkeys. The birds were colorful and sang beautiful songs. It was exciting to see so many animals in one place.".split("\n")

# Every utterance is conditioned on the same speaker embedding.
embeddings = [embedding for _utterance in text]

In [8]:
# Build the synthesizer from its checkpoint path and load the weights.
synthesizer_model_path = Path("../models/synthesizer/synthesizer.pt")
synthesizer = Synthesizer(synthesizer_model_path)
synthesizer.load()

# One spectrogram per input text, each conditioned on its paired embedding.
specs = synthesizer.synthesize_spectrograms(text, embeddings)

# Remember each spectrogram's length along axis 1 before joining them, so the
# per-utterance boundaries can be recovered after vocoding.
breaks = [utterance_spec.shape[1] for utterance_spec in specs]
spec = np.concatenate(specs, axis=1)

Synthesizer using device: cpu
Trainable Parameters: 30.870M
Loaded synthesizer "synthesizer.pt" trained to step 295000

| Generating 1/1


Done.



## Generating WAV using the Vocoder (Corentin Jemine)

In [13]:
from scripts.vocoder import Vocoder

# Location of the pretrained vocoder weights, relative to this notebook.
vocoder_weights_path = "../models/vocoder/vocoder.pt"

# Create the vocoder wrapper, then load the weights into it.
vocoder = Vocoder()
vocoder.load_model(vocoder_weights_path)

Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at ../models/vocoder/vocoder.pt


In [14]:
import IPython.display as ipd

# Vocode the concatenated spectrogram into one continuous waveform.
wav_new = vocoder.infer_waveform(spec)

# `breaks` holds the frame count of each utterance's spectrogram; multiplying
# by the hop size turns the cumulative frame counts into sample offsets.
b_ends = np.cumsum(np.array(breaks) * Synthesizer.params.hop_size)
b_starts = np.concatenate(([0], b_ends[:-1]))
wavs = [wav_new[start:end] for start, end in zip(b_starts, b_ends)]

# Append 0.15 s of silence after every utterance. A separate name is used
# (instead of rebinding `breaks`) so this cell can be re-run safely.
silences = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(wavs)
wav_vocoder = np.concatenate([i for w, s in zip(wavs, silences) for i in (w, s)])

# Peak-normalise the waveform that actually contains the inserted pauses
# (previously the raw vocoder output was normalised and the pauses discarded).
# Skip the division for an all-zero signal to avoid producing NaNs.
peak = np.abs(wav_vocoder).max()
if peak > 0:
    wav_vocoder = wav_vocoder / peak * 0.97

# Play the processed audio at the same rate used to size the silences above
# (previously a hard-coded 16000 — confirm the two agree).
ipd.Audio(wav_vocoder, rate=Synthesizer.sample_rate)

{| ████████████████ 142500/144000 | Batch Size: 15 | Gen Rate: 7.6kHz | }