# In [None]:
# Inference Notebook for Persian VITS (Single Speaker - Amir)
# Setup environment

import os
import sys
import torch
import matplotlib.pyplot as plt
import IPython.display as ipd

sys.path.append("vits")

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence
from text.cleaners_fa import persian_cleaners
from text.symbols_fa import symbols

# Pick the compute device: CUDA when PyTorch reports it available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# Load config & model checkpoint
# Change paths as needed
config_path = "configs/fa_single_speaker.json"
checkpoint_path = "logs/fa_amir/G_10000.pth"   # replace with latest G_*.pth

hps = utils.get_hparams_from_file(config_path)

# Positional constructor arguments, derived from the symbol table and config.
n_symbols = len(symbols)
spec_channels = hps.data.filter_length // 2 + 1
segment_frames = hps.train.segment_size // hps.data.hop_length

net_g = SynthesizerTrn(n_symbols, spec_channels, segment_frames, **hps.model)
net_g = net_g.to(device)

# Switch to evaluation mode, then load the checkpoint into the model
# (the third argument to load_checkpoint is left as None).
_ = net_g.eval()
_ = utils.load_checkpoint(checkpoint_path, net_g, None)

# Text processing (Persian)
def infer_text(text, noise_scale=0.667, length_scale=1.0):
    """Convert Persian text to a symbol sequence and synthesize speech.

    The text is run through ``persian_cleaners`` and converted to symbol ids
    by ``text_to_sequence`` with the cleaners named in
    ``hps.data.text_cleaners``.  If the config enables ``add_blank``, a blank
    id (0) is interspersed between the ids, mirroring the upstream VITS
    ``get_text`` helper, so the input matches the format used in training.

    Args:
        text: Text to synthesize.
        noise_scale: Forwarded to ``net_g.infer`` as ``noise_scale``.
        length_scale: Forwarded to ``net_g.infer`` as ``length_scale``.

    Returns:
        The ``[0][0]`` slice of the model output, moved to the CPU and
        converted to a NumPy array.

    Raises:
        ValueError: If the text produces no symbol ids.
    """
    text = persian_cleaners(text)
    # NOTE(review): text_to_sequence also applies hps.data.text_cleaners; if
    # that list contains persian_cleaners the text is cleaned twice - confirm
    # this is intended.
    seq = text_to_sequence(text, hps.data.text_cleaners)
    if not seq:
        # Fail early with a clear message instead of an opaque error from
        # inside the model on a zero-length input.
        raise ValueError("Text produced an empty symbol sequence; nothing to synthesize.")

    # Models trained with add_blank expect a blank token between symbols;
    # skipping this step at inference yields badly degraded audio.
    if getattr(hps.data, "add_blank", False):
        seq = commons.intersperse(seq, 0)

    x = torch.LongTensor(seq).unsqueeze(0).to(device)
    x_lengths = torch.LongTensor([x.size(1)]).to(device)

    with torch.no_grad():
        y_hat, *_ = net_g.infer(
            x,
            x_lengths,
            noise_scale=noise_scale,
            length_scale=length_scale,
        )
    return y_hat[0][0].cpu().numpy()

# Run inference
# Sample sentences; each one is synthesized and rendered as an audio player.
sample_texts = [
    "سلام! حال شما چطوره؟",
    "امروز یک روز خوب برای آزمایش مدل گفتار است.",
    "این یک نمونه صدا از مدل وی‌آی‌تی‌اس فارسی است."
]

for i, txt in enumerate(sample_texts, 1):
    print(f"📝 Text {i}: {txt}")
    audio = infer_text(txt)
    # Call display through the imported IPython.display module: the bare
    # `display` name only exists as an injected builtin inside an IPython
    # kernel and is undefined elsewhere.
    ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate))
