## VITS Variant inference code
Edit the variables **checkpoint_path**, **text** and **emotion_f_path** (the emotion feature file) to match your setup, then run all cells to generate the wav audio.

### Config & Utils

In [None]:
import os
import sys
sys.path.append(os.path.join("/", *os.getcwd().split(os.sep)))
from utils.text_utils import text_to_sequence
import torch
import utils.commons as commons
from utils.utils import get_hparams_from_file, load_checkpoint
from model.models import SynthesizerTrn
from utils.text.symbols import symbols
import IPython.display as ipd

# Project root: two directory levels above the current working directory.
# Built with os.path helpers rather than splitting on os.sep and re-joining
# under "/", so the result is also a valid absolute path on Windows (where
# the drive component would otherwise produce a drive-relative path).
PROJECT_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
# hps_file, hyperparams file, e.g. "EMITTS/VITS/config/esd_en_e5.json"
hps_file = f"{PROJECT_PATH}/EMITTS/VITS/config/esd_en_e5.json"
hps = get_hparams_from_file(hps_file)
# checkpoint_path, checkpoint file path, e.g. "EMITTS/VITS/ckpt/checkpoint.pth"
checkpoint_path = f"{PROJECT_PATH}/EMITTS/VITS/ckpt/checkpoint.pth"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

def get_text(text, hps):
    """Turn a raw text string into a ``torch.LongTensor`` of ids.

    Args:
        text: The text to convert.
        hps: Hyperparameters object; ``hps.data.text_cleaners`` is passed to
            ``text_to_sequence`` and ``hps.data.add_blank`` toggles the
            interspersing step.

    Returns:
        A ``torch.LongTensor`` built from the resulting id sequence.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    # NOTE(review): presumably inserts id 0 as a blank token between the
    # symbols -- confirm against commons.intersperse.
    sequence = commons.intersperse(sequence, 0) if hps.data.add_blank else sequence
    return torch.LongTensor(sequence)


### Load Model

In [None]:
# Build the synthesizer from the loaded hyperparameters.
model = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
# Move the model to the selected device and switch it to evaluation mode.
model = model.to(device).eval()
# Restore the weights from the checkpoint; None is passed as the third
# argument (presumably the optimizer slot -- confirm against load_checkpoint).
load_checkpoint(checkpoint_path, model, None)

In [None]:
# sid, speaker id, e.g. 0
sid = 4
# text, text to synthesize, e.g. "That I owe my thanks to you."
text = "That I owe my thanks to you."
# emotion_f_path, emotion feature file path
emotion_f_path = f"{PROJECT_PATH}/EPAlign/mmefeature/tmp/explict/happy.pt"

# Convert the raw inputs above into tensors with a leading batch dimension
# of size 1, placed on the selected device.
sid = torch.LongTensor([sid]).to(device)
text = get_text(text, hps).unsqueeze(0).to(device)
text_length = torch.LongTensor([text.size(1)]).to(device)
# map_location remaps the stored tensor onto `device` while loading, so a
# feature file that was saved from a CUDA tensor can still be read on a
# CPU-only machine (the case the `device` fallback is meant to support).
emotion_f = torch.load(emotion_f_path, map_location=device).float().unsqueeze(0).to(device)

### Inference

In [None]:
# Synthesize without tracking gradients, then take element [0, 0] of the
# first returned value and bring it back to the CPU as a float tensor.
with torch.no_grad():
    audio = model.infer(
        text,
        text_length,
        sid,
        emotion_f,
        noise_scale=0.667,
        noise_scale_w=0.8,
        length_scale=1,
    )[0][0, 0].data.cpu().float()
# Kept as the last expression so the notebook renders the audio player.
ipd.Audio(audio.numpy(), rate=hps.data.sampling_rate)