In [None]:
# Check GPU configuration (prints the `nvidia-smi` report).
!nvidia-smi

##(2023/07/23) 这个笔记本参考[HWcomss](https://github.com/HWcomss)的版本修改而成，现已可以正常工作。
##(23/07/2023) This notebook is a slightly modified version of [HWcomss](https://github.com/HWcomss)'s notebook; it is working fine now. Many thanks!


In [None]:
#@title STEP 1 复制代码库并安装运行环境
#@markdown #STEP 1 (6 min)
#@markdown ##复制代码库并安装运行环境
#@markdown ##Clone repository & Build environment

!git clone https://github.com/Plachtaa/VITS-fast-fine-tuning.git
!python -m pip install --upgrade --force-reinstall regex
!python -m pip install --force-reinstall soundfile
!python -m pip install --force-reinstall gradio
!python -m pip install imageio==2.4.1
!python -m pip install --upgrade youtube-dl
!python -m pip install moviepy
%cd VITS-fast-fine-tuning

!python -m pip install --no-build-isolation -r requirements.txt
!python -m pip install --upgrade numpy
!python -m pip install --upgrade --force-reinstall numba
!python -m pip install --upgrade Cython

!python -m pip install --upgrade pyzmq
!python -m pip install pydantic==1.10.4
!python -m pip install ruamel.yaml
!python -m pip install git+https://github.com/openai/whisper.git

# build monotonic align
%cd monotonic_align/
!mkdir monotonic_align
!python setup.py build_ext --inplace
%cd ..
!mkdir pretrained_models
# download data for fine-tuning
!wget https://huggingface.co/datasets/Plachta/sampled_audio4ft/resolve/main/sampled_audio4ft_v2.zip
!unzip sampled_audio4ft_v2.zip
# create necessary directories
!mkdir video_data
!mkdir raw_audio
!mkdir denoised_audio
!mkdir custom_character_voice
!mkdir segmented_character_voice

In [None]:
#@title STEP 1.5 选择预训练模型
#@markdown ###STEP 1.5 选择预训练模型
#@markdown ###Choose pretrained model to start
#@markdown CJE为中日英三语模型，CJ为中日双语模型，C为纯中文模型

#@markdown CJE for Chinese, Japanese & English model，CJ for Chinese & Japanese model
PRETRAINED_MODEL = "CJE" #@param ["CJE","CJ","C"]
if PRETRAINED_MODEL == "CJ":
  !wget https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/model/D_0-p.pth -O ./pretrained_models/D_0.pth
  !wget https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/model/G_0-p.pth -O ./pretrained_models/G_0.pth
  !wget https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/model/config.json -O ./configs/finetune_speaker.json
elif PRETRAINED_MODEL == "CJE":
  !wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/D_trilingual.pth -O ./pretrained_models/D_0.pth
  !wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/G_trilingual.pth -O ./pretrained_models/G_0.pth
  !wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/configs/uma_trilingual.json -O ./configs/finetune_speaker.json
elif PRETRAINED_MODEL == "C":
  !wget https://huggingface.co/datasets/Plachta/sampled_audio4ft/resolve/main/VITS-Chinese/D_0.pth -O ./pretrained_models/D_0.pth
  !wget https://huggingface.co/datasets/Plachta/sampled_audio4ft/resolve/main/VITS-Chinese/G_0.pth -O ./pretrained_models/G_0.pth
  !wget https://huggingface.co/datasets/Plachta/sampled_audio4ft/resolve/main/VITS-Chinese/config.json -O ./configs/finetune_speaker.json

In [None]:
#@title （可选）加载Google云端硬盘 / Mount Google drive
#@title (optional)

#@markdown 加载Google云端硬盘（更快地上传数据集文件）

#@markdown Mount Google drive for faster data upload
# Mount the user's Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

## STEP 3 自动处理所有上传的数据 / Automatically process all uploaded data

In [None]:
# Replace configs/finetune_speaker.json with the contents of
# configs/modified_finetune_speaker.json.
!cp /content/VITS-fast-fine-tuning/configs/modified_finetune_speaker.json \
    /content/VITS-fast-fine-tuning/configs/finetune_speaker.json


In [None]:
# Count the .wav files found anywhere under ./custom_character_voice.
!find ./custom_character_voice -name "*.wav" | wc -l


In [None]:
# Delete the local checkout and replace it with the copy stored on Google
# Drive (Drive must already be mounted at /content/drive).
# NOTE(review): this discards everything earlier cells created inside
# /content/VITS-fast-fine-tuning; presumably the Drive copy already contains
# the built extension and pretrained models. Confirm before running.
!rm -rf /content/VITS-fast-fine-tuning
!cp -r "/content/drive/MyDrive/dissertation project/dissertation_note/VITS-fast-fine-tuning" /content/


In [None]:
# Make the repository the working directory; the relative paths used by the
# following cells are resolved from here.
%cd /content/VITS-fast-fine-tuning



In [None]:
# Remove everything currently inside custom_character_voice/ (the directory itself is kept).
!rm -rf custom_character_voice/*


In [None]:
!rm -rf custom_character_voice/vits_ready_data/
!mv "/content/drive/MyDrive/dissertation project/vits_ready_data" custom_character_voice/



In [None]:
# List the entries of custom_character_voice/ to check what was imported.
!ls custom_character_voice

In [None]:
# Move every spk* entry up one level, directly under custom_character_voice/,
# then delete the vits_ready_data/ wrapper directory.
!mv custom_character_voice/vits_ready_data/spk* custom_character_voice/
!rm -r custom_character_voice/vits_ready_data/


In [None]:
# Count the .wav files found anywhere under custom_character_voice.
!find custom_character_voice -name "*.wav" | wc -l


In [None]:
import os

root_dir = "custom_character_voice"
output_file = "short_character_anno.txt"


def build_short_annotations(root_dir, output_file):
    """Merge every speaker's ``metadata.csv`` into one annotation file.

    Each immediate sub-directory of ``root_dir`` is treated as one speaker.
    Its ``metadata.csv`` is read as ``filename|text`` records, and every
    valid record is written to ``output_file`` as
    ``<speaker_dir>/<filename>|<speaker>|[EN]<text>[EN]``.

    Lines that do not hold exactly two non-empty ``|``-separated fields are
    reported and skipped. Speakers without a ``metadata.csv`` are reported
    and skipped as well.

    Args:
        root_dir: Directory containing one sub-directory per speaker.
        output_file: Path of the annotation file to (over)write.

    Returns:
        The list of annotation lines that were written, in output order.
    """
    all_lines = []
    # os.listdir order is arbitrary; sorting keeps the output stable between runs.
    speaker_names = sorted(
        d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))
    )

    for speaker in speaker_names:
        speaker_dir = os.path.join(root_dir, speaker)
        metadata_path = os.path.join(speaker_dir, "metadata.csv")
        if not os.path.exists(metadata_path):
            print(f"！缺少 metadata.csv：{metadata_path}")
            continue

        # "utf-8-sig" drops a leading byte-order mark, which would otherwise
        # become part of the first filename and yield a non-existent path.
        with open(metadata_path, "r", encoding="utf-8-sig") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                # Strip each field so "0000.wav | text" does not produce a
                # filename with a trailing space or text with a leading one.
                fields = [field.strip() for field in line.split("|")]
                if len(fields) != 2 or not all(fields):
                    print(f"！跳过格式错误的行：{line}")
                    continue
                filename, text = fields
                audio_path = os.path.join(speaker_dir, filename)
                all_lines.append(f"{audio_path}|{speaker}|[EN]{text}[EN]\n")

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(all_lines)

    print(f"已写入 {len(all_lines)} 条标注到 {output_file}")
    return all_lines


# In a notebook kernel __name__ is "__main__", so the cell still runs the job.
if __name__ == "__main__":
    all_lines = build_short_annotations(root_dir, output_file)


In [None]:
import torchaudio
# Load one sample clip and print its sampling rate
# (the printed label reads "current sampling rate").
waveform, sr = torchaudio.load("custom_character_voice/spk1/0000.wav")
print("当前采样率:", sr)


In [None]:
# Run the repository's resampling script in a subprocess.
!python scripts/resample.py

In [None]:
#@markdown 运行该单元格会对所有上传的数据进行自动去背景音&标注。
#@markdown 由于需要调用Whisper和Demucs，运行时间可能较长。

#@markdown Running this codeblock will perform automatic vocal separation & annotation.
#@markdown Since this step uses Whisper & Demucs, it may take a while to complete.
# Only the short-audio transcription and the resampling step below are
# active; the video, denoise and long-audio steps are commented out.
# Extract audio from all videos (uploaded or downloaded; must be .mp4)
# %run scripts/video2audio.py
# # Denoise all audio (uploaded or extracted from video; must be .wav)
# !python scripts/denoise_audio.py
# # Split and annotate long audio
# !python scripts/long_audio_transcribe.py --languages "{PRETRAINED_MODEL}" --whisper_size large-v2
# Annotate short audio
# !python scripts/short_audio_transcribe.py --languages "{PRETRAINED_MODEL}" --whisper_size large-v2
# NOTE(review): every other call in this notebook passes "{PRETRAINED_MODEL}"
# (CJE / CJ / C) as --languages; confirm that the script accepts `en`.
!python scripts/short_audio_transcribe.py --languages en --whisper_size large-v2


# The base model's sampling rate may differ from the auxiliary data, so resample
!python scripts/resample.py

In [None]:
import os
import json
import torchaudio
from tqdm import tqdm

def resample_all_custom_voice():
    """Resample every speaker's .wav files in place to the configured rate.

    The target rate is ``data.sampling_rate`` from
    ./configs/finetune_speaker.json. Each sub-directory of
    ./custom_character_voice is handled as one speaker. Files already at the
    target rate are left untouched, and a failure on one file is reported
    and does not stop the run.
    """
    # Look up the sampling rate the audio has to match (e.g. 22050).
    with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as config_file:
        target_sr = json.load(config_file)['data']['sampling_rate']

    voice_root = "./custom_character_voice"

    # Keep only directories; each one is a speaker.
    speaker_dirs = []
    for entry in os.listdir(voice_root):
        candidate = os.path.join(voice_root, entry)
        if os.path.isdir(candidate):
            speaker_dirs.append((entry, candidate))

    for speaker, speaker_path in speaker_dirs:
        clips = [name for name in os.listdir(speaker_path) if name.endswith(".wav")]

        print(f"\n📢 正在处理 speaker：{speaker}，共 {len(clips)} 个音频文件")
        for clip in tqdm(clips):
            clip_path = os.path.join(speaker_path, clip)
            try:
                audio, source_sr = torchaudio.load(clip_path)
                if source_sr == target_sr:
                    continue  # already at the target rate
                resampler = torchaudio.transforms.Resample(orig_freq=source_sr, new_freq=target_sr)
                torchaudio.save(clip_path, resampler(audio), target_sr)
            except Exception as err:
                print(f"！处理失败: {clip_path}，原因: {err}")

    print("Yes！ 所有音频重采样完成！")

if __name__ == "__main__":
    resample_all_custom_voice()


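#！！！训练质量相关：实验发现目前使用CJ模型+勾选ADD_AUXILIARY，对于中/日均能训练出最好的效果，第一次训练建议默认使用该组合！！！

#!!! Training quality note: in our experiments, the CJ model with ADD_AUXILIARY enabled currently gives the best results for both Chinese and Japanese; this combination is the recommended default for a first training run !!!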

In [None]:
# Install or upgrade the listed packages to their latest versions (no versions are pinned).
!pip install -U numpy unidecode pyopenjtalk jamo ko-pron cn2an pypinyin jieba indic-transliteration librosa scipy tqdm inflect num2words eng-to-ipa opencc-python-reimplemented beautifulsoup4 requests phonemizer


In [None]:
# Install the num-thai package (no version is pinned).
! pip install num-thai


In [None]:
#@markdown ##STEP 3.5
#@markdown 运行该单元格会生成划分好训练/测试集的最终标注，以及配置文件

#@markdown Running this block will generate final annotations for training & validation, as well as config file.

#@markdown 选择是否加入辅助训练数据：/ Choose whether to add auxiliary data:
ADD_AUXILIARY = False #@param {type:"boolean"}
#@markdown 辅助训练数据是从预训练的大数据集抽样得到的，作用在于防止模型在标注不准确的数据上形成错误映射。

#@markdown Auxiliary data is to prevent overfitting when the audio samples are small or with low quality.

#@markdown 以下情况请勾选：

#@markdown 总样本少于100条/样本质量一般或较差/样本来自爬取的视频

#@markdown 以下情况可以不勾选：

#@markdown 总样本量很大/样本质量很高/希望加速训练/只有二次元角色

# Run preprocess_v2.py inside the kernel via %run, passing
# `--add_auxiliary_data True` only when ADD_AUXILIARY is checked.
# PRETRAINED_MODEL is interpolated into the command line, so the STEP 1.5
# cell that defines it must have been run in this kernel session.
# assert(not (ADD_AUXILIARY and PRETRAINED_MODEL != "CJE")), "add auxiliary data is available only available for CJE model!"
if ADD_AUXILIARY:
  %run preprocess_v2.py --add_auxiliary_data True --languages "{PRETRAINED_MODEL}"
else:
  %run preprocess_v2.py --languages "{PRETRAINED_MODEL}"

## STEP 4 开始训练 / Start training

In [None]:
# Make sure the pretrained_models directory exists (no error if it already does).
!mkdir -p /content/VITS-fast-fine-tuning/pretrained_models


In [None]:
# Remove the damaged file
!rm -f /content/VITS-fast-fine-tuning/pretrained_models/G_0.pth

# Re-download the CJE model (generator checkpoint, saved as G_0.pth)
!wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/G_trilingual.pth \
     -O /content/VITS-fast-fine-tuning/pretrained_models/G_0.pth


In [None]:
# Show the size of the downloaded G_0.pth as a quick sanity check.
!ls -lh /content/VITS-fast-fine-tuning/pretrained_models/G_0.pth


In [None]:
# Overwrite configs/finetune_speaker.json with the trilingual config
# (path is relative to the current working directory).
!wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/configs/uma_trilingual.json \
     -O configs/finetune_speaker.json


In [None]:
# Download the trilingual discriminator checkpoint and save it as
# ./pretrained_models/D_0.pth (path is relative to the working directory).
!wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/D_trilingual.pth \
     -O ./pretrained_models/D_0.pth


In [None]:
# Delete cached bytecode: first the top-level __pycache__, then every
# __pycache__ directory anywhere under the repository.
!rm -rf /content/VITS-fast-fine-tuning/__pycache__
!find /content/VITS-fast-fine-tuning -type d -name "__pycache__" -exec rm -rf {} +


In [None]:
#@markdown #STEP 4 (>=20 min)
#@markdown 开始微调模型。
#@markdown 训练时长取决于你录入/上传的音频总数。

#@markdown 根据声线和样本质量的不同，所需的训练epochs数也不同。

#@markdown 你也可以在Tensorboard中预览合成效果，若效果满意可提前停止。

#@markdown Model fine-tuning
#@markdown Total time cost depends on the number of voices you recorded/uploaded.

#@markdown Best epoch number varies depending on different uploaded voices / sample quality.

#@markdown You can also preview synthesized audio in Tensorboard, it's OK to shut down training manually if you find the quality is satisfying.
import os
os.environ['TENSORBOARD_BINARY'] = '/usr/local/bin/tensorboard'

# When Drive is mounted, run rearrange_speaker.py and back up both config
# files to Drive. The copy targets are relative (`../drive/MyDrive`), so they
# match the absolute path checked here only while the working directory is
# /content/VITS-fast-fine-tuning.
if os.path.exists("/content/drive/MyDrive/"):
  !python scripts/rearrange_speaker.py
  !cp ./finetune_speaker.json ../drive/MyDrive/finetune_speaker.json
  !cp ./moegoe_config.json ../drive/MyDrive/moegoe_config.json

%reload_ext tensorboard
%tensorboard --logdir "./OUTPUT_MODEL"
Maximum_epochs = "200" #@param {type:"string"}
#@markdown 继续之前的模型训练/Continue training from previous checkpoint
# NOTE(review): CONTINUE defaults to True, which passes `--cont True`;
# presumably that expects an existing checkpoint under ./OUTPUT_MODEL.
# Confirm before a first run.
CONTINUE = True #@param {type:"boolean"}
if CONTINUE:
  !python finetune_speaker_v2.py -m "./OUTPUT_MODEL" --max_epochs "{Maximum_epochs}" --drop_speaker_embed False --cont True
else:
  !python finetune_speaker_v2.py -m "./OUTPUT_MODEL" --max_epochs "{Maximum_epochs}" --drop_speaker_embed True

In [None]:
# Force-reinstall a pinned pydantic, then install a pinned gradio.
!pip install pydantic==1.10.13 --force-reinstall
!pip install gradio==3.41.2  # Optional: used to ensure a matching version


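# STEP 5 下载模型 / Download the model
## 本地部署方法请见[README](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/README_ZH.md) / For local deployment, see the [README](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/README_ZH.md)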

In [None]:
#@markdown ### 下载选项1：运行该单元格，浏览器会自动下载模型和配置文件
#@markdown ### Download option 1: Running this codeblock will download model & config files by your browser.
# rearrange_speaker.py runs in a subprocess; download_model.py runs inside
# the kernel via %run.
!python scripts/rearrange_speaker.py
%run scripts/download_model.py

In [None]:
#@markdown ### 下载选项2：运行该单元格会将模型和配置文件保存到Google云端硬盘
#@markdown ### Download option 2: Running this codeblock will save the model & config files to your Google drive.
# The targets are relative: `../drive/MyDrive` is the Drive mount only while
# the working directory is /content/VITS-fast-fine-tuning.
!python scripts/rearrange_speaker.py
!cp ./G_latest.pth ../drive/MyDrive/G_latest.pth
!cp ./finetune_speaker.json ../drive/MyDrive/finetune_speaker.json
!cp ./moegoe_config.json ../drive/MyDrive/moegoe_config.json

In [None]:
#@markdown ### 运行该单元格会清空所有已上传的样本，需要时可使用
#@markdown ### Running this codeblock will delete all voice samples you have uploaded. Use it if you need.
# Empty each sample directory (the directories themselves are kept) and
# delete both annotation files. This cannot be undone.
!rm -rf ./custom_character_voice/*
!rm -rf ./video_data/*
!rm -rf ./raw_audio/*
!rm -rf ./denoised_audio/*
!rm -rf ./segmented_character_voice/*
!rm -rf long_character_anno.txt
!rm -rf short_character_anno.txt

In [None]:
#@markdown ### 运行该单元格会将切片和标注复制到谷歌云端硬盘根目录下名为`voice_data`的文件夹下以用作其它用途
#@markdown ### Running this codeblock will copy all processed voices & annotations to a folder named `voice_data` under the root of Google Drive for other purpose of usage
!mkdir ../drive/MyDrive/voice_data/
!cp -rf ./custom_character_voice/ ../drive/MyDrive/voice_data/custom_character_voice/
!cp -rf ./segmented_character_voice/ ../drive/MyDrive/voice_data/segmented_character_voice/
!cp long_character_anno.txt ../drive/MyDrive/voice_data/long_character_anno.txt
!cp short_character_anno.txt ../drive/MyDrive/voice_data/short_character_anno.txt