<a href="https://colab.research.google.com/github/KevinWang676/Retrieval-based-Voice-Conversion-New/blob/main/Voice_Cloning_for_Chinese_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 中文声音克隆 Voice Cloning for Chinese

## 配置环境 Set up

In [None]:
! nvidia-smi # a GPU runtime is required to run this notebook

In [1]:
# Install the Python dependencies used by the slicing, auto-labelling, training and inference cells.
! pip install openai-whisper
! pip install modelscope
! pip install tts-autolabel -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
# typeguard is pinned after tts-autolabel so the pin replaces the newer version pulled in as its dependency.
! pip install typeguard==2.3.1
! pip install sox
! pip install bitstring
! pip install kantts -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
! pip install pytorch_wavelets
! pip install tensorboardX
# Also install pytorch_wavelets from source. Installing the cloned directory here replaces the
# manual terminal steps (cd pytorch_wavelets, then pip install .) so the cell runs unattended.
! git clone https://github.com/fbcotter/pytorch_wavelets
! pip install ./pytorch_wavelets

Collecting openai-whisper
  Downloading openai-whisper-20230314.tar.gz (792 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m792.9/792.9 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken==0.3.1 (from openai-whisper)
  Downloading tiktoken-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpeg-python==0.2.0 (from openai-whisper)
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20230314-py3-none-any.whl size=796910 sha256=ff47

Looking in links: https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
Collecting tts-autolabel
  Downloading https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/kantts/tts_autolabel-1.1.2-py3-none-any.whl (80 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.3/80.3 kB[0m [31m592.5 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnxruntime (from tts-autolabel)
  Downloading onnxruntime-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard (from tts-autolabel)
  Downloading typeguard-4.0.0-py3-none-any.whl (33 kB)
Collecting kaldi-native-fbank (from tts-autolabel)
  Downloading kaldi_native_fbank-1.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (189 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.5/189.5 kB[0m [31m23.4 MB/s[0m eta [36m0:

Collecting typeguard==2.3.1
  Downloading typeguard-2.3.1-py3-none-any.whl (10 kB)
Installing collected packages: typeguard
  Attempting uninstall: typeguard
    Found existing installation: typeguard 4.0.0
    Uninstalling typeguard-4.0.0:
      Successfully uninstalled typeguard-4.0.0
Successfully installed typeguard-2.3.1
Collecting sox
  Downloading sox-1.4.1-py2.py3-none-any.whl (39 kB)
Installing collected packages: sox
Successfully installed sox-1.4.1
Collecting bitstring
  Downloading bitstring-4.0.2-py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitstring
Successfully installed bitstring-4.0.2
Looking in links: https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
Collecting kantts
  Downloading https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/kantts/kantts-1.0.1-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━

In [None]:
import os
os._exit(00) # restart the notebook: terminates the current kernel process immediately

## 对音频切片处理 Audio slicing

In [2]:
import subprocess
from pathlib import Path
import librosa
from scipy.io import wavfile
import numpy as np
import torch
import csv
import whisper

def split_long_audio(model, filepaths, character_name, save_dir="data_dir", out_sr=44100):
    """Cut audio file(s) into one WAV clip per transcribed segment.

    Each file is transcribed with ``model.transcribe`` and the ``start``/``end``
    times (seconds) of every returned segment are used to slice the audio.
    Clips are written as 16-bit PCM to
    ``<save_dir>/<character_name>/<character_name>_<file_idx>_<segment_idx>.wav``.

    Args:
        model: Object whose ``transcribe(path, ...)`` returns a dict with a
            ``'segments'`` list; each item must provide ``'start'`` and ``'end'``.
        filepaths: One path string or a list of path strings.
        character_name: Sub-directory name under ``save_dir`` and clip name prefix.
        save_dir: Root output directory (created if missing).
        out_sr: Sample rate of the written clips.

    Returns:
        None. The clips on disk are the only result.
    """
    if isinstance(filepaths, str):
        filepaths = [filepaths]

    for file_idx, filepath in enumerate(filepaths):

        save_path = Path(save_dir) / character_name
        save_path.mkdir(exist_ok=True, parents=True)

        print(f"Transcribing file {file_idx}: '{filepath}' to segments...")
        result = model.transcribe(filepath, word_timestamps=True, task="transcribe", beam_size=5, best_of=5)
        segments = result['segments']

        wav, sr = librosa.load(filepath, sr=None, offset=0, duration=None, mono=True)
        # The transcription timestamps refer to the untrimmed file. trim() drops
        # leading/trailing silence, so remember where the kept region starts and
        # shift every segment by that amount; otherwise all clips are misaligned.
        wav, trim_interval = librosa.effects.trim(wav, top_db=20)
        trim_offset = trim_interval[0] / sr
        if wav.size == 0:
            # Nothing left after trimming: no clip can be cut from this file.
            continue

        peak = np.abs(wav).max()
        if peak > 1.0:
            wav = 0.98 * wav / peak
        wav2 = librosa.resample(wav, orig_sr=sr, target_sr=out_sr)
        # Normalise to full scale, but never divide by zero on an all-zero signal.
        peak2 = np.abs(wav2).max()
        if peak2 > 0:
            wav2 = wav2 / peak2

        n_samples = len(wav2)
        int16_max = np.iinfo(np.int16).max
        for i, seg in enumerate(segments):
            # Clamp to the available samples: a negative start would silently
            # wrap around when used as a slice index.
            start_idx = max(int((seg['start'] - trim_offset) * out_sr), 0)
            end_idx = min(int((seg['end'] - trim_offset) * out_sr), n_samples)
            if end_idx <= start_idx:
                # Segment lies entirely inside the trimmed silence (or is empty);
                # skip it instead of writing a zero-length file. The index `i`
                # is kept so the names of the remaining clips stay stable.
                continue
            wav_seg = wav2[start_idx:end_idx]
            wav_seg_name = f"{character_name}_{file_idx}_{i}.wav"
            out_fpath = save_path / wav_seg_name
            wavfile.write(out_fpath, rate=out_sr, data=(wav_seg * int16_max).astype(np.int16))

In [3]:
# Name of the Whisper checkpoint handed to whisper.load_model.
whisper_size = "medium"
# Loaded model; passed to split_long_audio as its `model` argument.
whisper_model = whisper.load_model(whisper_size)

100%|██████████████████████████████████████| 1.42G/1.42G [00:13<00:00, 113MiB/s]


In [4]:
split_long_audio(whisper_model, "nana_speech.wav", "test", "dataset_raw") # replace "nana_speech.wav" with the name of the wav file you uploaded

Transcribing file 0: 'nana_speech.wav' to segments...


## 需要新建三个文件夹，分别是："test_wavs", "output_training_data", "pretrain_work_dir" (Create three folders: "test_wavs", "output_training_data", "pretrain_work_dir")

In [5]:
# Make sure the working folders used by this and the following cells exist (no-op when they already do).
! mkdir -p ./test_wavs ./output_training_data ./pretrain_work_dir
# Move the sliced clips into the folder that the auto-labelling cell reads from.
! mv  -v ./dataset_raw/test/* ./test_wavs/

renamed './dataset_raw/test/test_0_0.wav' -> './test_wavs/test_0_0.wav'
renamed './dataset_raw/test/test_0_10.wav' -> './test_wavs/test_0_10.wav'
renamed './dataset_raw/test/test_0_11.wav' -> './test_wavs/test_0_11.wav'
renamed './dataset_raw/test/test_0_12.wav' -> './test_wavs/test_0_12.wav'
renamed './dataset_raw/test/test_0_13.wav' -> './test_wavs/test_0_13.wav'
renamed './dataset_raw/test/test_0_14.wav' -> './test_wavs/test_0_14.wav'
renamed './dataset_raw/test/test_0_15.wav' -> './test_wavs/test_0_15.wav'
renamed './dataset_raw/test/test_0_16.wav' -> './test_wavs/test_0_16.wav'
renamed './dataset_raw/test/test_0_17.wav' -> './test_wavs/test_0_17.wav'
renamed './dataset_raw/test/test_0_18.wav' -> './test_wavs/test_0_18.wav'
renamed './dataset_raw/test/test_0_19.wav' -> './test_wavs/test_0_19.wav'
renamed './dataset_raw/test/test_0_1.wav' -> './test_wavs/test_0_1.wav'
renamed './dataset_raw/test/test_0_2.wav' -> './test_wavs/test_0_2.wav'
renamed './dataset_raw/test/test_0_3.wav' ->

In [6]:
from modelscope.tools import run_auto_label

In [9]:
# Folder holding the sliced clips, and the work directory handed to run_auto_label.
input_wav = "./test_wavs/"
output_data = "./output_training_data/"

# NOTE(review): the recorded run of this cell ended in FileNotFoundError; the cause is not
# visible here — confirm that both folders exist and that input_wav contains wav files.
ret, report = run_auto_label(input_wav=input_wav, work_dir=output_data, resource_revision="v1.0.5")

2023-07-07 03:20:25,390 - modelscope - INFO - Use user-specified model revision: v1.0.5


---  There is this folder!  ---



  0%|          | 0/20 [00:00<?, ?it/s]


FileNotFoundError: ignored

## 训练 Training

In [15]:
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.utils.audio.audio_utils import TtsTrainType

# Pretrained model that will be fine-tuned.
pretrained_model_id = 'damo/speech_personal_sambert-hifigan_nsf_tts_zh-cn_pretrain_16k'

# Labelled training data and the temporary working directory for the trainer.
dataset_id = "./output_training_data/"
pretrain_work_dir = "./pretrain_work_dir/"

# Training plan: selects which model(s) to train and their schedule.
# Only the acoustic model (SAMBERT) is configured here.
# Currently supported types: TtsTrainType.TRAIN_TYPE_SAMBERT, TtsTrainType.TRAIN_TYPE_VOC.
# SAMBERT training fine-tunes starting from the model's latest step.
train_info = {
    TtsTrainType.TRAIN_TYPE_SAMBERT: dict(
        train_steps=202,          # number of steps to train
        save_interval_steps=200,  # save a checkpoint every this many steps
        log_interval=10,          # print a training log line every this many steps
    ),
}

# Trainer arguments: model to fine-tune, its revision, work directory, dataset and training plan.
kwargs = {
    "model": pretrained_model_id,
    "model_revision": "v1.0.6",
    "work_dir": pretrain_work_dir,
    "train_dataset": dataset_id,
    "train_type": train_info,
}

trainer = build_trainer(Trainers.speech_kantts_trainer, default_args=kwargs)

trainer.train()


2023-07-07 02:27:25,380 - modelscope - INFO - Use user-specified model revision: v1.0.6
2023-07-07 02:27:27,688 - modelscope - INFO - Use user-specified model revision: v1.0.6
2023-07-07 02:27:28,655 - modelscope - INFO - Set workdir to ./pretrain_work_dir/
2023-07-07 02:27:28,899 - modelscope - INFO - load ./output_training_data/
2023-07-07 02:27:29,531 - modelscope - INFO - Use user-specified model revision: v1.0.6
2023-07-07 02:27:34,139 - modelscope - INFO - am_config=./pretrain_work_dir/orig_model/basemodel_16k/sambert/config.yaml voc_config=./pretrain_work_dir/orig_model/basemodel_16k/hifigan/config.yaml
2023-07-07 02:27:34,140 - modelscope - INFO - audio_config=./pretrain_work_dir/orig_model/basemodel_16k/audio_config_se_16k.yaml
2023-07-07 02:27:34,142 - modelscope - INFO - am_ckpts=OrderedDict([(2400000, './pretrain_work_dir/orig_model/basemodel_16k/sambert/ckpt/checkpoint_2400000.pth')])
2023-07-07 02:27:34,145 - modelscope - INFO - voc_ckpts=OrderedDict([(2400000, './pretrai

## 推理 Inference

In [17]:
import os
from modelscope.models.audio.tts import SambertHifigan
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Absolute path of the training work directory; every artifact below is located under it.
model_dir = os.path.abspath("./pretrain_work_dir")

# Checkpoint/config locations handed to SambertHifigan: the acoustic model comes from
# tmp_am, the vocoder from orig_model/basemodel_16k/hifigan, the rest from data/.
custom_infer_abs = dict(
    voice_name='F7',
    am_ckpt=os.path.join(model_dir, 'tmp_am', 'ckpt'),
    am_config=os.path.join(model_dir, 'tmp_am', 'config.yaml'),
    voc_ckpt=os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'ckpt'),
    voc_config=os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'config.yaml'),
    audio_config=os.path.join(model_dir, 'data', 'audio_config.yaml'),
    se_file=os.path.join(model_dir, 'data', 'se', 'se.npy'),
)
kwargs = dict(custom_ckpt=custom_infer_abs)

model_id = SambertHifigan(os.path.join(model_dir, "orig_model"), **kwargs)

# Build the text-to-speech pipeline around the customised model and synthesise one sentence.
inference = pipeline(task=Tasks.text_to_speech, model=model_id)
output = inference(input="大家好呀，我是欧阳娜娜，欢迎使用滔滔智能的声音克隆产品！")

# Play the synthesised audio inline; this must stay the last expression of the cell.
import IPython.display as ipd
ipd.Audio(output["output_wav"], rate=16000)


2023-07-07 02:32:01,122 - modelscope - INFO - am_config=/content/pretrain_work_dir/tmp_am/config.yaml voc_config=/content/pretrain_work_dir/orig_model/basemodel_16k/hifigan/config.yaml
2023-07-07 02:32:01,123 - modelscope - INFO - audio_config=/content/pretrain_work_dir/data/audio_config.yaml
2023-07-07 02:32:01,126 - modelscope - INFO - am_ckpts=OrderedDict([(2400000, '/content/pretrain_work_dir/tmp_am/ckpt/checkpoint_2400000.pth'), (2400200, '/content/pretrain_work_dir/tmp_am/ckpt/checkpoint_2400200.pth')])
2023-07-07 02:32:01,127 - modelscope - INFO - voc_ckpts=OrderedDict([(2400000, '/content/pretrain_work_dir/orig_model/basemodel_16k/hifigan/ckpt/checkpoint_2400000.pth')])
2023-07-07 02:32:01,128 - modelscope - INFO - se_path=/content/pretrain_work_dir/data/se/se.npy se_model_path=/content/pretrain_work_dir/orig_model/se.onnx
2023-07-07 02:32:01,129 - modelscope - INFO - mvn_path=/content/pretrain_work_dir/orig_model/mvn.npy


Removing weight norm...
