# **Sovits (inference)**

Author: Francis Hu

E-mail: francishr@whu.edu.cn

QQ: 2235306122

BILIBILI: https://space.bilibili.com/636704927

Repository: https://github.com/Francis-Komizu/VITS-Yosuga

# Set up

In [None]:
!git clone https://github.com/Francis-Komizu/Sovits
%cd Sovits
!pip install -r requirements.txt
%cd monotonic_align!python setup.py build_ext --inplace%cd ..!mkdir results

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
import torchaudio
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import UnitAudioLoader, UnitAudioCollate
from models import SynthesizerTrn
import requests

from scipy.io.wavfile import write

# Load models

## Load content encoder

In [None]:
# Fetch the "hubert_soft" entry point from the bshall/hubert repository (main
# branch) through torch.hub; it is used as the content encoder further down.
hubert = torch.hub.load(
    repo_or_dir="bshall/hubert:main",
    model="hubert_soft",
)

  "You are about to download and run code from an untrusted repository. In a future release, this won't "
Downloading: "https://github.com/bshall/hubert/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt" to /root/.cache/torch/hub/checkpoints/hubert-soft-0d54a1f4.pt


  0%|          | 0.00/361M [00:00<?, ?B/s]

## Load generator

In [None]:
!gdown --id 'your model link' --output generator_accelerator.pth # you may change it

hps = utils.get_hparams_from_file("path/to/config")

net_g = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
_ = net_g.eval()

_ = utils.load_checkpoint("path/to/generator", net_g, None)

## Load audio

In [None]:
# Only available inside a Google Colab runtime.
from google.colab import files

# Upload the audio file to convert; the call's result is kept in `uploaded`.
uploaded = files.upload()

Saving natsume_0030.wav to natsume_0030.wav


In [None]:
# Load the clip to convert; `sr` is the sampling rate reported by torchaudio.
source, sr = torchaudio.load("path/to/wav")
# torchaudio.load returns a (channels, frames) tensor. Average multi-channel
# recordings down to one channel so the tensor passed to `hubert.units` below
# is (1, 1, frames); a mono file is left untouched.
# NOTE(review): presumably the encoder only accepts single-channel input --
# confirm against the bshall/hubert documentation.
if source.size(0) > 1:
    source = source.mean(dim=0, keepdim=True)
# Add the leading batch dimension.
source = source.unsqueeze(0)

## Convert voice

In [None]:
# Run the content encoder and the generator without autograd bookkeeping.
with torch.inference_mode():
    # Extract speech units
    unit = hubert.units(source)
    # Length of the unit sequence (dimension 1 of `unit`), wrapped as a
    # one-element batch of lengths.
    unit_lengths = torch.LongTensor([unit.size(1)])
    # for multi-speaker inference
    # sid = torch.LongTensor([4])

    # Synthesize audio: take the first output of `infer`, pick element [0, 0]
    # and convert it to a float NumPy array for playback.
    audio = net_g.infer(
        unit, unit_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1.0
    )[0][0, 0].data.float().numpy()
    # for multi-speaker inference
    # audio = net_g.infer(unit, unit_lengths, sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1.0)[0][0,0].data.float().numpy()


print("Source:")
# Play the input at the rate it was loaded with (`sr` from torchaudio.load),
# not at the model's output rate; otherwise a file recorded at a different
# rate is played back at the wrong speed and pitch.
ipd.display(ipd.Audio(source.squeeze(), rate=sr))
print("Converted:")
# The generated waveform is played at the sampling rate from the model config.
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate))

torch.Size([1, 269, 256])
tensor([269])
Source:


Converted:


# References

https://github.com/bshall/acoustic-model

https://github.com/jaywalnut310/vits