In [1]:
import os, sys
from IPython.display import Audio

sys.path.append("../")

import torch
import torchaudio
import librosa
import numpy as np

from src.spk_embedding.StyleEmbedding import StyleEmbedding
from src.tts.vocoders.hifigan.HiFiGAN import HiFiGANGenerator
from src.tts.models.fastspeech2.FastSpeech2 import FastSpeech2
from src.datasets.fastspeech_dataset import (
    FastSpeechDataset,
    build_path_to_transcript_dict_libri_tts,
)
from src.pipelines.gst_fastspeech2.train_loop import collate_and_pad

device = "cuda" if torch.cuda.is_available() else "cpu"
device

  from .autonotebook import tqdm as notebook_tqdm


'cuda'

In [2]:
TEST_CLEAN_PATH = '../data/test-clean'

AVOCODO_CHECKPOINT = "../saved_models/Avocodo.pt"
ALIGNER_CHECKPOINT = "../saved_models/aligner.pt"
FASTSPEECH2_CHECKPOINT = "../saved_models/best_train_loss_34_steps.pt"
STYLE_EMBED_CHECKPOINT = "../saved_models/embedding_function.pt"

In [3]:
transcript_dict = build_path_to_transcript_dict_libri_tts(TEST_CLEAN_PATH)

In [4]:
dataset = FastSpeechDataset(
    path_to_transcript_dict=None,
    acoustic_checkpoint_path=ALIGNER_CHECKPOINT,  # path to aligner.pt
    cache_dir="./librispeech",
    lang="en",
    loading_processes=2,  # depended on how many CPU you have
    device=device,
)

Prepared a FastSpeech dataset with 18 datapoints in ./librispeech.


In [5]:
vocoder = HiFiGANGenerator().to(device)
avocodo_check_dict = torch.load(AVOCODO_CHECKPOINT, map_location=device)
vocoder.load_state_dict(avocodo_check_dict["generator"])
vocoder.eval()

style_embed_function = StyleEmbedding().to(device)
style_embed_check_dict = torch.load(STYLE_EMBED_CHECKPOINT, map_location=device)
style_embed_function.load_state_dict(style_embed_check_dict["style_emb_func"])
style_embed_function.eval()
style_embed_function.requires_grad_(False)

acoustic_model = FastSpeech2().to(device)
fastspeech2_check_dict = torch.load(FASTSPEECH2_CHECKPOINT, map_location=device)
acoustic_model.load_state_dict(fastspeech2_check_dict["model"])
acoustic_model.eval()



FastSpeech2(
  (encoder): Conformer(
    (embed): Sequential(
      (0): Linear(in_features=62, out_features=100, bias=True)
      (1): Tanh()
      (2): Linear(in_features=100, out_features=384, bias=True)
    )
    (pos_enc): RelPositionalEncoding(
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (output_norm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (hs_emb_projection): Linear(in_features=448, out_features=384, bias=True)
    (language_embedding): Embedding(8000, 384)
    (encoders): MultiSequential(
      (0): EncoderLayer(
        (self_attn): RelPositionMultiHeadedAttention(
          (linear_q): Linear(in_features=384, out_features=384, bias=True)
          (linear_k): Linear(in_features=384, out_features=384, bias=True)
          (linear_v): Linear(in_features=384, out_features=384, bias=True)
          (linear_out): Linear(in_features=384, out_features=384, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (linear_pos): Linear(in

In [6]:
sample_id = 1
sample = dataset[sample_id]
input_audio_path = sample[-1]
input_wave, _ = librosa.load(input_audio_path)
input_text = transcript_dict[input_audio_path]
batch = collate_and_pad([sample])

print(input_audio_path)
Audio(data=input_wave, rate=16000)

../data/test-clean/61/70968/61-70968-0001.flac


In [7]:
style_embedding = style_embed_function(
    batch_of_spectrograms=batch[2].to(device),
    batch_of_spectrogram_lengths=batch[3].to(device),
)

mel = acoustic_model.inference(
    text=batch[0][0].to(device),
    speech=None,
    alpha=1.0,
    # utterance_embedding=style_embedding[0],
    return_duration_pitch_energy=False,
    lang_id=batch[8][0].to(device),
)

waveform = vocoder(mel.transpose(1, 0))[0]
waveform = waveform.detach().cpu()

AttributeError: 'NoneType' object has no attribute 'unsqueeze'

In [None]:
print(input_text)
Audio(data=waveform, rate=16000)

give not so earnest a mind to these mummeries child



In [None]:
# torchaudio.save(
#     'synth.wav',
#     src=waveform,
#     sample_rate=16000
# )

# torchaudio.save(
#     'origin.wav',
#     src=torch.Tensor(input_wave).unsqueeze(0),
#     sample_rate=16000
# )