## FastSpeech 2 Variant inference code

Edit the variables **checkpoint_path**, **text** and **emotion** to match your setup, then run the entire code to generate a wav file.

### Config & Utils

In [None]:
import os
import sys
import numpy as np
import pandas as pd
# Source root: the current working directory with its last two path
# components dropped. The leading "/" makes the joined path absolute.
src_path = os.path.join("/", *os.getcwd().split(os.sep)[:-2])
# Put the source root on sys.path; presumably this is what lets the
# `src.*` and `config.*` imports below resolve.
sys.path.append(src_path)
# Make the source root the working directory for the rest of the session.
# NOTE(review): re-running this cell moves two more levels up each time,
# because src_path is derived from the current working directory.
os.chdir(src_path)
from src.dataset.dataset import Dataset
from config.config import TrainConfig
import torch
from lightning import seed_everything
from src.models import Generator, TorchSTFT
from dataclasses import asdict
from src.models.acoustic_model.fastspeech.lightning_model import FastSpeechLightning
from src.utils.vocoder_utils import load_checkpoint, synthesize_wav_from_mel
import IPython.display as ipd

# Directory two levels above the current working directory. This runs after
# the os.chdir(src_path) call above, so it is relative to the source root
# rather than to the directory the notebook was started from.
PROJECT_PATH = os.path.join("/", *os.getcwd().split(os.sep)[:-2])
config = TrainConfig()
# Dataset name; it selects the checkpoint sub-directory used below.
DATASET = "ESD"
# DATASET = "MEADTTS"
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# set checkpoint path, e.g. "ckpt/ESD/model.ckpt"
config.testing_checkpoint = f"{PROJECT_PATH}/EMITTS/FastSpeech2/ckpt/{DATASET}/model.ckpt"
# config.testing_checkpoint = f"{PROJECT_PATH}/EMITTS/FastSpeech2/ckpt/MEADTTS/epoch=559-step=217280.ckpt"
# NOTE(review): this points at the MEADTTS phones file even though DATASET is
# set to "ESD" above — confirm this is intended.
config.phones_path = "../../data/MEADTTS_MFA_preprocessed/phones.json"

def load_data(datalist):
    """Read a pipe-delimited file list into rows of string fields.

    Args:
        datalist: Path to a UTF-8 text file with one ``|``-separated record
            per line.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        fields obtained by stripping the line and splitting it on ``|``.
    """
    with open(datalist, encoding='utf-8') as f:
        # Skip blank lines (e.g. an extra newline at the end of the file) so
        # they do not turn into bogus single-field rows like [''].
        return [row.split("|") for row in map(str.strip, f) if row]

### Load Dataset

In [None]:
# Build the test split and draw one sample from it at a random index.
test_data = Dataset(
    filename="test.txt",
    cfg=config,
    batch_size=config.val_batch_size,
    is_emotion_feature=config.is_emotion_feature,
)
test_sample = test_data[np.random.randint(0, len(test_data))]

# Load the test file list so the sample's text, speaker and emotion-feature
# file can be looked up by its id.
test_data_text = load_data(f"../filelist/{DATASET}/esd_en_audio_sid_text_efeature_test_filelist.txt")
# test_data_text = load_data(f"../filelist/{DATASET}/MEADTTS_audio_sid_text_efeature_test_filelist.txt")
test_df = pd.DataFrame(test_data_text, columns=["file_name", "speaker", "text", "emotion_feature"])
# The sample id is a plain substring, not a pattern: match it literally so
# characters such as "." or "+" in an id are not interpreted as regex syntax.
test_item = test_df[test_df["file_name"].str.contains(test_sample["id"], regex=False)]

# MEADTTS
# test_id = ""
# if test_sample["id"][1] == "_":
#     test_id = "W" + str(int(test_sample["id"][:1])).zfill(3) + test_sample["id"][1:]
# elif int(test_sample["id"][:2]) <= 40:
#     test_id = "W" + str(int(test_sample["id"][:2])).zfill(3) + test_sample["id"][2:]
# else:
#     test_id = "M" + str(int(test_sample["id"][:2]) - 40).zfill(3) + test_sample["id"][2:]

# test_item = test_df[test_df["file_name"].str.contains(test_id, regex=False)]

# Last expression of the cell, so the notebook displays it: the text, the
# speaker, and the emotion-feature file name (last path component with its
# final three characters removed — presumably the file extension).
test_item["text"].values[0], test_item["speaker"].values[0], test_item["emotion_feature"].values[0].split("/")[-1][:-3]

In [None]:
# Single-sample batch for the acoustic model. Only the inputs taken from
# `test_sample` are filled in; every other entry is passed as None.
test_sample_input = dict(
    ids=[test_sample["id"]],
    speakers=torch.Tensor([test_sample["speaker"]]).long(),
    emotions=torch.Tensor([test_sample["emotion"]]).long(),
    texts=torch.Tensor([test_sample["text"]]).long(),
    text_lens=torch.Tensor([len(test_sample["text"])]).long(),
    mels=None,
    mel_lens=None,
    pitches=None,
    energies=None,
    durations=None,
    egemap_features=None,
    # unsqueeze(0) adds the leading batch dimension.
    emotion_features=torch.Tensor(test_sample["emotion_feature"]).float().unsqueeze(0),
)

### Load Model

In [None]:
# Seed the random number generators through Lightning's seed_everything,
# using the seed stored in the config.
seed_everything(config.seed)
# The vocoder and the STFT module are both constructed from the config's
# fields expanded as keyword arguments.
vocoder = Generator(**asdict(config))
stft = TorchSTFT(**asdict(config))
# Restore the vocoder weights from the "generator" entry of its checkpoint.
vocoder_state_dict = load_checkpoint(config.vocoder_checkpoint_path)
vocoder.load_state_dict(vocoder_state_dict["generator"])
# NOTE(review): presumably strips weight normalization for inference — confirm
# against Generator.remove_weight_norm.
vocoder.remove_weight_norm()
vocoder.eval()
# Restore the acoustic model from config.testing_checkpoint, passing the
# config, vocoder and STFT module through to it, with train=False.
model = FastSpeechLightning.load_from_checkpoint(
    config.testing_checkpoint,
    config=config,
    vocoder=vocoder,
    stft=stft,
    train=False,
)
model.eval()

### Inference

In [None]:
# Inference only: run the forward pass and the vocoder without gradient tracking.
with torch.no_grad():
    # The wrapped model is called with the Lightning module's device and the
    # single-sample batch built above.
    model_output = model.model(model.device, test_sample_input)
    # Index 0 selects the only item in the batch.
    predicted_mel_len = model_output["mel_len"][0]
    # Keep only the first `predicted_mel_len` entries along the second axis
    # (presumably the frame axis, which drops the padding — confirm).
    predicted_mel_no_padding = model_output["predicted_mel"][0, :predicted_mel_len]
    generated_wav = synthesize_wav_from_mel(
        predicted_mel_no_padding, model.vocoder, model.stft
    )
# Last expression of the cell, so the notebook renders it as an audio player.
ipd.Audio(generated_wav, rate=config.sample_rate)

### Inference All

In [None]:
from tqdm import tqdm

# Synthesize audio for samples of the test set, one sample per batch, and
# render an audio player for each synthesized sample.
for sample in tqdm(test_data):
    # Single-sample batch: only the inputs taken from `sample` are filled in;
    # every other entry is passed as None.
    sample_input = {
        "ids": [sample["id"]],
        "speakers": torch.Tensor([sample["speaker"]]).long(),
        "emotions": torch.Tensor([sample["emotion"]]).long(),
        "texts": torch.Tensor([sample["text"]]).long(),
        "text_lens": torch.Tensor([len(sample["text"])]).long(),
        "mels": None,
        "mel_lens": None,
        "pitches": None,
        "energies": None,
        "durations": None,
        "egemap_features": None,
        "emotion_features": torch.Tensor(sample["emotion_feature"]).float().unsqueeze(0),
    }
    # Inference only: no gradient tracking for the forward pass and vocoder.
    with torch.no_grad():
        model_output = model.model(model.device, sample_input)
        # Index 0 selects the only item in the batch.
        predicted_mel_len = model_output["mel_len"][0]
        predicted_mel_no_padding = model_output["predicted_mel"][0, :predicted_mel_len]
        generated_wav = synthesize_wav_from_mel(
            predicted_mel_no_padding, model.vocoder, model.stft
        )
    # A bare `ipd.Audio(...)` expression is only rendered when it is the last
    # expression of a cell; inside a loop body it must be displayed explicitly,
    # otherwise the cell produces no audio output at all.
    ipd.display(ipd.Audio(generated_wav, rate=config.sample_rate))
    # NOTE(review): the loop stops after the first sample; remove this `break`
    # to synthesize the whole test set.
    break