# Requirment

In [1]:
%cd G:\AI\VITS_WebUI\monotonic_align
!python setup.py build_ext --inplace
%cd ..

G:\AI\VITS_WebUI\monotonic_align
running build_ext
copying build\lib.win-amd64-3.9\monotonic_align\core.cp39-win_amd64.pyd -> monotonic_align
G:\AI\VITS_WebUI


In [2]:
!nvidia-smi

Thu Mar  9 20:32:31 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.14                 Driver Version: 531.14       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060       WDDM | 00000000:01:00.0  On |                  N/A |
|  0%   30C    P8               19W / 170W|   1508MiB / 12288MiB |      3%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Settings

In [3]:
#@title  Edit config
import json
batchsize = 16  #@param {type:"number"}
training_files = "filelists/yuuka_train.txt.cleaned" #@param {type:"string"}
validation_files = "filelists/yuuka_val.txt.cleaned" #@param {type:"string"}
config = json.load(open("configs/config.json"))
config['train']['batch_size'] = batchsize
config['data']['training_files'] = training_files
config['data']['validation_files'] = validation_files
with open("configs/config.json", 'w+') as f:
    json.dump(config, f, indent=4)

# GUI

In [4]:
import gradio as gr
import numpy as np
import tempfile

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd
import os
import json
import math
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from scipy.io.wavfile import write


In [6]:
def get_text(text, hps):
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

In [7]:
config_path = "configs/config.json" 
model_path = "G:\AI\Model\VITS\Yuuka\G_4000.pth"

hps = utils.get_hparams_from_file(config_path)
net_g = SynthesizerTrn(
    len(hps.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()

model = net_g.eval()
model = utils.load_checkpoint(model_path, net_g, None)

INFO:root:Loaded checkpoint 'G:\AI\Model\VITS\Yuuka\G_4000.pth' (iteration 445)


In [8]:
import json


with open("models/model_info.json") as f:
    models_index = json.load(f)

models = []

for model_info in models_index:
    #model_path = model_info["model_path"]
    #config_path = model_info["config_path"]
    model_name = model_info["name_en"]
    #model_description = model_info["model_description"]
    model_cover = model_info["cover"]
    model = load_model(model_path)
    models.append({"model_name": model_name, "model_description": model_description, "model_image": model_image, "model": model})





UnicodeDecodeError: 'gbk' codec can't decode byte 0xa6 in position 57: illegal multibyte sequence

In [9]:
LANGUAGES = ['EN','CN','JP']
speaker_id = 0

In [10]:
def tts_fn(text, noise_scale, noise_scale_w, length_scale):  
  stn_tst = get_text(text, hps)
  with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([speaker_id]).cuda()
    audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0,0].data.cpu().float().numpy()
  return  (22050, audio)

In [11]:
download_audio_js = """
() =>{{
    let root = document.querySelector("body > gradio-app");
    if (root.shadowRoot != null)
        root = root.shadowRoot;
    let audio = root.querySelector("#tts-audio-{audio_id}").querySelector("audio");
    let text = root.querySelector("#input-text-{audio_id}").querySelector("textarea");
    if (audio == undefined)
        return;
    text = text.value;
    if (text == undefined)
        text = Math.floor(Math.random()*100000000);
    audio = audio.src;
    let oA = document.createElement("a");
    oA.download = text.substr(0, 20)+'.wav';
    oA.href = audio;
    document.body.appendChild(oA);
    oA.click();
    oA.remove();
}}
"""

In [12]:
from gradio.processing_utils import download_tmp_copy_of_file

with gr.Blocks() as interface:
  with gr.Tab("Text to Speech"):
    with gr.Row():
      input_text = gr.Textbox(
          label="Input", 
          lines=5,
          placeholder="Enter the text you want to process here")
      gen_button = gr.Button("Generate")
    with gr.Row():
      with gr.Column():
        lan = [gr.Radio(label="Language", choices=LANGUAGES, value="JP")]
        noise_scale = gr.Slider(minimum=0.1, maximum=1.0, step=0.1, label = "Noise Scale (情感变化程度)", value = 0.6)
        noise_scale_w = gr.Slider(minimum=0.1, maximum=1.0, step=0.1, label = "Noise Scale w (发音长度)", value = 0.668)
        length_scale = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, label = "Length Scale (语速)", value=1.0)

      with gr.Column():
        output_audio = gr.Audio(label="Output")
        download_button = gr.Button("Download")
    
    gen_button.click(
        tts_fn,
        inputs = [input_text, noise_scale, noise_scale_w, length_scale],
        outputs = output_audio
        )
    #download_button.click(None, [], [], _js=download_audio_js.format(audio_id=f"en-{name_en.replace(' ', '')}"))


  with gr.Tab("TTS with ChatGPT"):
    input_text_gpt = gr.Textbox()


  with gr.Tab("Settings"):
    model_name = gr.Dropdown(label = "model")



interface.launch()

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): checkip.amazonaws.com:443
DEBUG:urllib3.connectionpool:https://checkip.amazonaws.com:443 "GET / HTTP/1.1" 200 14
DEBUG:charset_normalizer:Encoding detection: ascii is most likely the one.
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.gradio.app:443
DEBUG:asyncio:Using selector: SelectSelector
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 127.0.0.1:7860
DEBUG:urllib3.connectionpool:http://127.0.0.1:7860 "GET /startup-events HTTP/1.1" 200 5
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 127.0.0.1:7860
DEBUG:urllib3.connectionpool:http://127.0.0.1:7860 "HEAD / HTTP/1.1" 200 0
Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.gradio.app:443


In [None]:
css_style = """
    .gradio-input { grid-area: input; }
    .gradio-output { grid-area: output; }
    .gradio-control { grid-area: control; }
    .gradio-interface { 
        display: grid;
        grid-template-columns: 1fr 1fr;
        grid-template-rows: 1fr auto;
        grid-template-areas: 
            "input input"
            "control output";
    }
"""

# 载入模型

In [None]:
config_path = "configs/config.json" #@param {type:"string"}
model_path = "../drive/MyDrive/vits-finetune/checkpoints/G_4000.pth" #@param {type:"string"}

hps = utils.get_hparams_from_file(config_path)
net_g = SynthesizerTrn(
    len(hps.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()

model = net_g.eval()
model = utils.load_checkpoint(model_path, net_g, None)

In [None]:
speaker_id = 0 #@param {type:"number"}
text = "\u30CB\u30B8\u30E7\u30B9\u30B0\u30B8\u30D0\uFF0C\u30D3\u30A8\u30B6\u30A4\u30BC\u30EA\u30D5\u30A1\u30C7\u30F3\u5148\u751F\u3002" #@param {type:"string"}
noise_scale=0.7 #@param {type:"number"}
noise_scale_w=0.668 #@param {type:"number"}
length_scale=1.0 #@param {type:"number"}
stn_tst = get_text(text, hps)
with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([speaker_id]).cuda()
    audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0,0].data.cpu().float().numpy()

ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

In [None]:
text_to_speech = Category(
    title="Text to Speech",

)


categories = [text_to_speech,settings]

NameError: ignored

In [None]:
def text_to_speech(x):
    return x[::-1]


def flip_image(x):
    return np.fliplr(x)

with gr.Blocks() as interface:
    with gr.Tab("Text to Speech"):
        text_input = gr.Textbox(lines=5, placeholder="Enter the text you want to process here")
        text_output = gr.Textbox()
        text_button = gr.Button("Flip")
    with gr.Tab("Flip Image"):
        with gr.Row():
            image_input = gr.Image()
            image_output = gr.Image()
        image_button = gr.Button("Flip")

    text_button.click(text_to_speech, inputs=text_input, outputs=text_output)
    image_button.click(flip_image, inputs=image_input, outputs=image_output)



In [None]:
interface.launch()