# Requirment

In [1]:
%cd G:\AI\VITS_WebUI\monotonic_align
!python setup.py build_ext --inplace
%cd ..

G:\AI\VITS_WebUI\monotonic_align
running build_ext
copying build\lib.win-amd64-3.9\monotonic_align\core.cp39-win_amd64.pyd -> monotonic_align
G:\AI\VITS_WebUI


In [2]:
!nvidia-smi

Mon Apr  3 13:57:21 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.14                 Driver Version: 531.14       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060       WDDM | 00000000:01:00.0  On |                  N/A |
|  0%   29C    P8               19W / 170W|   5118MiB / 12288MiB |     17%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Settings

In [3]:
#@title  Edit config
import json
batchsize = 16  #@param {type:"number"}
training_files = "filelists/yuuka_train.txt.cleaned" #@param {type:"string"}
validation_files = "filelists/yuuka_val.txt.cleaned" #@param {type:"string"}
config = json.load(open("configs/config.json"))
config['train']['batch_size'] = batchsize
config['data']['training_files'] = training_files
config['data']['validation_files'] = validation_files
with open("configs/config.json", 'w+') as f:
    json.dump(config, f, indent=4)

# GUI

In [4]:
import gradio as gr
import numpy as np
import tempfile

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd
import os
import json
import math
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from scipy.io.wavfile import write
from gradio.processing_utils import download_tmp_copy_of_file

In [6]:
def get_text(text, hps):
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

In [7]:
config_path = "configs/config.json" 
model_path = "G:\AI\Model\VITS\Yuuka\G_4000.pth"

hps = utils.get_hparams_from_file(config_path)
net_g = SynthesizerTrn(
    len(hps.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()

model = net_g.eval()
model = utils.load_checkpoint(model_path, net_g, None)

INFO:root:Loaded checkpoint 'G:\AI\Model\VITS\Yuuka\G_4000.pth' (iteration 445)


In [8]:
LANGUAGES = ['EN','CN','JP']
speaker_id = 0
cover = "models/Yuuka/cover.png"


In [9]:
with open("models/model_info.json", "r", encoding="utf-8") as f:
    models_info = json.load(f)

for i,model_info in models_info.items():
    name_en = model_info['name_en']

print(name_en)


Mika


In [10]:
print(models_info['yuuka']['sid'])

0


In [11]:
download_audio_js = """
() =>{{
    let root = document.querySelector("body > gradio-app");
    if (root.shadowRoot != null)
        root = root.shadowRoot;
    let audio = root.querySelector("#tts-audio-{audio_id}").querySelector("audio");
    let text = root.querySelector("#input-text-{audio_id}").querySelector("textarea");
    if (audio == undefined)
        return;
    text = text.value;
    if (text == undefined)
        text = Math.floor(Math.random()*100000000);
    audio = audio.src;
    let oA = document.createElement("a");
    oA.download = text.substr(0, 20)+'.wav';
    oA.href = audio;
    document.body.appendChild(oA);
    oA.click();
    oA.remove();
}}
"""

In [12]:
def tts_fn(text, noise_scale, noise_scale_w, length_scale):
  stn_tst = get_text(text, hps)
  with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([speaker_id]).cuda()
    audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0,0].data.cpu().float().numpy()
  return  (22050, audio)

In [17]:
from werkzeug.utils import secure_filename


def add_model_fn(model_path, Image, SpeakerID, name_en, name_cn, language):

    #获取用户输入
    new_model = {
        "name_en": name_en,
        "name_zh": name_cn,
        "cover": Image,
        "sid": SpeakerID,
        "example": "それに新しいお菓子屋さんも出来てみんな買いものを楽しんでいます！",
        "language": language,
        "type": "single",
        "model_path": model_path
    }

    # 检查必填字段是否为空
    if not file.data or not speaker_id or not name_en or not language:
        gr.Interface.error("Please fill in all required fields!")
        return




    # 保存上传的文件
    file_data = file.data[0]
    filename = secure_filename(file_data.name)
    filepath = os.path.join("models", name_en, filename)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)


    with open("models/model_info.json", "r", encoding="utf-8") as f:
        models_info = json.load(f)

    models_info[name_en] = new_model
    with open("models.json", "w") as f:
        json.dump(models_info, f)


In [30]:
theme = gr.themes.Base()

with gr.Blocks(theme=theme) as interface:
    with gr.Tab("Text to Speech"):
        with gr.Column():
            gr.Markdown(
                '<div align="center">'
                f'<img style="width:auto;height:512px;" src="file/{cover}">' if cover else ""
                                                                                           '</div>')

            with gr.Row():
                with gr.Column(scale = 4):
                    input_text = gr.Textbox(
                        label="Input",
                        lines=2,
                        placeholder="Enter the text you want to process here",
                        elem_id=f"input-text-en-{name_en.replace(' ','')}",
                        scale = 2
                    )
                with gr.Column(scale = 1):
                    gen_button = gr.Button("Generate", variant="primary")
                    clear_input_button = gr.Button("Clear")

            with gr.Row():
                with gr.Column(scale = 2):
                    lan = [gr.Radio(label="Language", choices=LANGUAGES, value="JP")]
                    noise_scale = gr.Slider(minimum=0.1, maximum=1.0, step=0.1, label = "Noise Scale (情感变化程度)", value = 0.6)
                    noise_scale_w = gr.Slider(minimum=0.1, maximum=1.0, step=0.1, label = "Noise Scale w (发音长度)", value = 0.668)
                    length_scale = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, label = "Length Scale (语速)", value=1.0)

                with gr.Column(scale = 1):
                    output_audio = gr.Audio(label="Output", elem_id=f"tts-audio-en-{name_en.replace(' ','')}")
                    download_button = gr.Button("Download")

        #clear_input_button.click()
        gen_button.click(
                    tts_fn,
                    inputs = [input_text, noise_scale, noise_scale_w, length_scale],
                    outputs = output_audio)
        download_button.click(None, [], [], _js=download_audio_js.format(audio_id=f"en-{name_en.replace(' ', '')}"))

#------------------------------------------------------------------------------------------------------------------------
    with gr.Tab("AI Singer"):
        input_text_singer = gr.Textbox()


#------------------------------------------------------------------------------------------------------------------------
    with gr.Tab("TTS with ChatGPT"):
        input_text_gpt = gr.Textbox()



#------------------------------------------------------------------------------------------------------------------------
    with gr.Tab("Settings"):
        with gr.Box():
            gr.Markdown("""# Select Model""")
            with gr.Row():
                model_choice = gr.Dropdown(label = "Model",
                                           choices=[(model["name_en"]) for name, model in models_info.items()],
                                           interactive=True,
                                           value=models_info['yuuka']['name_en']
                                         )
                speaker_id = gr.Dropdown(label = "Speaker ID",
                                         choices=[(str(model["sid"])) for name, model in models_info.items()],
                                         interactive=True,
                                         value=str(models_info['yuuka']['sid'])
                                         )
        with gr.Box():
            gr.Markdown("# Add Model\n"
                        "> *为必填选项"
                        )


            with gr.Row():
                file = gr.Files(label = "VITS Model*")
                model_cover = gr.Image()

                with gr.Column():
                    model_speaker_id = gr.Textbox(label = "Speaker List*",
                                                  placeholder="Single speaker model default=0")
                    model_name_en = gr.Textbox(label = "name_en*")
                    model_name_cn = gr.Textbox(label = "name_cn")
                    model_language = gr.Dropdown(label = "Language*",
                                               choices=LANGUAGES,
                                               interactive=True)
                    with gr.Row():
                        add_model_button = gr.Button("Add Model", variant="primary")
                        clear_add_model_button = gr.Button("Clear")

        add_model_button.click(add_model_fn,
                               inputs = [file, model_cover, model_speaker_id, model_name_en, model_name_cn, model_language]
                               )








interface.queue(concurrency_count=1).launch()


DEBUG:markdown_it.rules_block.code:entering code: StateBlock(line=0,level=0,tokens=0), 0, 1, False
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.gradio.app:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:markdown_it.rules_block.fence:entering fence: StateBlock(line=0,level=0,tokens=0), 0, 1, False
DEBUG:markdown_it.rules_block.blockquote:entering blockquote: StateBlock(line=0,level=0,tokens=0), 0, 1, False
DEBUG:markdown_it.rules_block.hr:entering hr: StateBlock(line=0,level=0,tokens=0), 0, 1, False
DEBUG:markdown_it.rules_block.list:entering list: StateBlock(line=0,level=0,tokens=0), 0, 1, False
DEBUG:markdown_it.rules_block.reference:entering reference: StateBlock(line=0,level=0,tokens=0), 0, 1, False
DEBUG:markdown_it.rules_block.html_block:entering html_block: StateBlock(line=0,level=0,tokens=0), 0, 1, False




TypeError: __call__() missing 1 required positional argument: 'fn'

In [11]:
# 定义函数，用于从 json 文件中读取所有模型的名称
def get_model_names():
    with open("models.json", "r") as f:
        models = json.load(f)
    return list(models.keys())



# 定义函数，用于向 json 文件中添加一个新的模型
def add_model(model_name, model_info):
    with open("models.json", "r") as f:
        models = json.load(f)
    models[model_name] = model_info
    with open("models.json", "w") as f:
        json.dump(models, f)

model_names = get_model_names()







FileNotFoundError: [Errno 2] No such file or directory: 'models.json'

In [None]:
css_style = """
    .gradio-input { grid-area: input; }
    .gradio-output { grid-area: output; }
    .gradio-control { grid-area: control; }
    .gradio-interface { 
        display: grid;
        grid-template-columns: 1fr 1fr;
        grid-template-rows: 1fr auto;
        grid-template-areas: 
            "input input"
            "control output";
    }
"""

# 载入模型

In [None]:
config_path = "configs/config.json" #@param {type:"string"}
model_path = "../drive/MyDrive/vits-finetune/checkpoints/G_4000.pth" #@param {type:"string"}

hps = utils.get_hparams_from_file(config_path)
net_g = SynthesizerTrn(
    len(hps.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()

model = net_g.eval()
model = utils.load_checkpoint(model_path, net_g, None)

In [None]:
speaker_id = 0 #@param {type:"number"}
text = "\u30CB\u30B8\u30E7\u30B9\u30B0\u30B8\u30D0\uFF0C\u30D3\u30A8\u30B6\u30A4\u30BC\u30EA\u30D5\u30A1\u30C7\u30F3\u5148\u751F\u3002" #@param {type:"string"}
noise_scale=0.7 #@param {type:"number"}
noise_scale_w=0.668 #@param {type:"number"}
length_scale=1.0 #@param {type:"number"}
stn_tst = get_text(text, hps)
with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([speaker_id]).cuda()
    audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0,0].data.cpu().float().numpy()

ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

In [None]:
def text_to_speech(x):
    return x[::-1]


def flip_image(x):
    return np.fliplr(x)

with gr.Blocks() as interface:
    with gr.Tab("Text to Speech"):
        text_input = gr.Textbox(lines=5, placeholder="Enter the text you want to process here")
        text_output = gr.Textbox()
        text_button = gr.Button("Flip")
    with gr.Tab("Flip Image"):
        with gr.Row():
            image_input = gr.Image()
            image_output = gr.Image()
        image_button = gr.Button("Flip")

    text_button.click(text_to_speech, inputs=text_input, outputs=text_output)
    image_button.click(flip_image, inputs=image_input, outputs=image_output)



In [None]:
interface.launch()