### **spk2info.pt 微调**

In [None]:
import os

from hyperpyyaml import load_hyperpyyaml  # type: ignore

import torch
import librosa  # type: ignore

from torchaudio import transforms  # type: ignore
from cosyvoice.cli.frontend import CosyVoiceFrontEnd  # type: ignore
from cosyvoice.utils.file_utils import load_wav  # type: ignore
from config_tts import TTSConfig

# Peak-amplitude ceiling: postprocess() rescales a clip whose absolute peak exceeds it.
max_val = 0.8
# Sample rate passed to load_wav() when the prompt clip is read in add_spk().
prompt_sr = 16000
# Sample rate postprocess() uses to size the 0.2 s of trailing zeros it appends.
target_sr = 22050

def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim, peak-limit and zero-pad a prompt waveform.

    The clip is passed through ``librosa.effects.trim``, rescaled so that its
    absolute peak equals the module-level ``max_val`` whenever the peak exceeds
    that value, and finally extended along dim 1 with
    ``int(target_sr * 0.2)`` zero samples.

    Args:
        speech: Waveform tensor. It is concatenated with a ``(1, N)`` zero
            tensor along dim 1, so it presumably has shape ``(1, samples)``
            -- TODO confirm against ``load_wav``.
        top_db: Forwarded to ``librosa.effects.trim``.
        hop_length: Forwarded to ``librosa.effects.trim``.
        win_length: Forwarded to ``librosa.effects.trim`` as ``frame_length``.

    Returns:
        The trimmed, peak-limited and padded waveform tensor.
    """
    trimmed, _ = librosa.effects.trim(
        speech, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )

    # Only attenuate: clips already at or below the ceiling are left untouched.
    peak = trimmed.abs().max()
    if peak > max_val:
        trimmed = trimmed / peak * max_val

    tail = torch.zeros(1, int(target_sr * 0.2))
    return torch.concat([trimmed, tail], dim=1)


# Load the front end
# The model directory is resolved from the TTS configuration ("~" is expanded).
TTS_DIR = os.path.expanduser(TTSConfig.MODELPATH)
MODEL_DIR = os.path.join(TTS_DIR, TTSConfig.MODEL)

# Decode the YAML as UTF-8 explicitly: without `encoding`, open() falls back to
# the locale encoding, which is not UTF-8 on many Windows installations.
with open(f"{MODEL_DIR}/cosyvoice.yaml", "r", encoding="utf-8") as f:
    configs = load_hyperpyyaml(f)

# Build the front end from the objects declared in the YAML plus the model
# files that sit next to it in MODEL_DIR.
cosyvoice = CosyVoiceFrontEnd(
    configs["get_tokenizer"],
    configs["feat_extractor"],
    f"{MODEL_DIR}/campplus.onnx",
    f"{MODEL_DIR}/speech_tokenizer_v1.onnx",
    f"{MODEL_DIR}/spk2info.pt",
    configs["allowed_special"],
)

# Extract a speaker embedding and write it into the speaker table
def add_spk(cosyvoice: CosyVoiceFrontEnd, wav_path: str, speaker_name: str):
    """Register ``speaker_name`` in ``cosyvoice.spk2info`` from a reference clip.

    The audio at ``wav_path`` is loaded at ``prompt_sr``, run through
    ``postprocess`` and handed to the front end's ``_extract_spk_embedding``.
    The result is stored under the ``"embedding"`` key; an existing entry for
    ``speaker_name`` is replaced.

    Args:
        cosyvoice: Front end whose ``spk2info`` mapping is updated in place.
        wav_path: Path of the reference audio file.
        speaker_name: Key under which the embedding is stored.
    """
    waveform = load_wav(wav_path, prompt_sr)
    prompt_speech = postprocess(waveform)
    cosyvoice.spk2info[speaker_name] = {
        "embedding": cosyvoice._extract_spk_embedding(prompt_speech),
    }

# Reference clips and the speaker names they are registered under; the two
# lists are paired by position.
wav_paths = [r"D:\Segment_b\Code\code\swarmclone.cosyvoice\example\output\1.2n_-0.2x.mp3"]
speaker_names = ["知络_1.2"]

for wave_path, speaker_name in zip(wav_paths, speaker_names):
    print(f" * 正在添加 {speaker_name} 到 spk2info.pt")
    add_spk(cosyvoice, wave_path, speaker_name)

# Write the updated speaker table, then read it back from disk so the check
# below looks at what was actually saved rather than at the in-memory object.
spk2info_path = r"C:\Users\11327\.swarmclone\tts_cosy_voice\CosyVoice-300M-Instruct\spk2info.pt"
torch.save(cosyvoice.spk2info, spk2info_path)
spk2info = torch.load(spk2info_path)

for spk in speaker_names:
    if spk in spk2info:
        print(f" * 添加 {spk} 成功！")

### **speaker interpolation**

In [None]:
import torch

# Load the speaker table that holds the voices to blend.
spk2info = torch.load(
    r"D:\Segment_b\Code\code\swarmclone.cosyvoice\example\spk2info_finetune\spk2info.pt"
)

# The two source voices. Each entry is a dict whose values are blended
# arithmetically below (presumably tensors -- their .shape is printed later).
mix_spk_1 = spk2info["zhiluo_1.1neuro_-0.1xiaoling"]
mix_spk_2 = spk2info["中文女"]

for ratio in [0.9]:
    mixed_name = f"zhiluo_{ratio:.1f}mixer_0.9_{(1-ratio):.1f}nv"
    # Pair the values by key rather than by position so that a different key
    # order in the two entries cannot blend unrelated values; a key missing
    # from the second voice raises KeyError instead of being silently dropped.
    spk2info[mixed_name] = {
        k: ratio * v_1 + (1 - ratio) * mix_spk_2[k] for k, v_1 in mix_spk_1.items()
    }

# Sanity check: list every speaker together with the shape of each stored value.
for spk, info in spk2info.items():
    print(spk)
    for k, v in info.items():
        print(k, v.shape)

torch.save(spk2info, r"C:\Users\11327\.swarmclone\tts_cosy_voice\CosyVoice-300M-Instruct\spk2info.pt")


### **Rename Keys**

In [None]:
import torch

spk2info = torch.load(
    r"D:\Segment_b\Code\code\swarmclone.cosyvoice\example\spk2info_finetune\spk2info.pt"
)

# Mixed voices to keep, mapped to the names they are saved under. Every other
# entry in the loaded table is dropped.
renames = {
    "zhiluo_0.7neuro_0.3xiaoling": "知络_0.7",
    "zhiluo_1.2neuro_-0.2xiaoling": "知络_1.2",
}

# Build a fresh table instead of popping and re-inserting while walking a key
# snapshot: entries absent from the file are simply skipped, and a renamed
# entry can never be removed again by a later iteration.
spk2info = {renames[name]: info for name, info in spk2info.items() if name in renames}

# Show the resulting speaker names; a bare expression in the middle of a cell
# would not be displayed.
print(list(spk2info))

torch.save(spk2info, r"D:\Segment_b\Code\code\swarmclone.cosyvoice\example\spk2info_finetune\mixer.pt")


In [None]:
import torch
# Reload the saved mixer table and show its speaker names as the cell result.
mixer_table = torch.load(r"D:\Segment_b\Code\code\swarmclone.cosyvoice\example\spk2info_finetune\mixer.pt")
mixer_table.keys()