In [None]:
import os
import sys
import shutil
import warnings

from time import time

import playsound
import torchaudio

from config_tts import TTSConfig
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav
sys.path.insert(0, os.path.abspath(os.path.join("..")))
from mfa.align import download_model_and_dict, init_mfa_models, align

if __name__ == "__main__":
    # Silence a fixed set of warnings (matched by message regex, two of them
    # restricted to FutureWarning) before the model is constructed.
    for filter_kwargs in (
        {"message": ".*LoRACompatibleLinear.*"},
        {"message": ".*torch.nn.utils.weight_norm is deprecated.*"},
        {"category": FutureWarning, "message": r".*weights_only=False.*"},
        {"category": FutureWarning, "message": r".*weights_norm.*"},
    ):
        warnings.filterwarnings("ignore", **filter_kwargs)

    # Build the CosyVoice model from the configured directory and list the
    # speakers it reports.
    try:
        model_path = os.path.expanduser(
            os.path.join(TTSConfig.MODELPATH, TTSConfig.MODEL)
        )
        cosyvoice = CosyVoice(model_path, fp16=TTSConfig.FLOAT16)
        print(cosyvoice.list_available_spks())
    except Exception as e:
        lowered_error = str(e).lower()
        # Only errors whose text contains "file", "doesn't" and "exist" get the
        # interactive cleanup offer; everything else propagates untouched.
        if not all(token in lowered_error for token in ("file", "doesn't", "exist")):
            raise
        answer = input(
            " * CosyVoice TTS 发生了错误，这可能是由于模型下载不完全导致的，是否清理缓存TTS模型？[y/n] "
        )
        # Anything other than "y" re-raises the original error.
        if answer.strip().lower() != "y":
            raise
        # Removes the whole configured model directory, not just this model.
        shutil.rmtree(os.path.expanduser(TTSConfig.MODELPATH), ignore_errors=True)
        print(" * 清理完成，请重新运行该模块。")
        sys.exit(0)

    # The aligner assets are downloaded unless the "mfa" directory and both
    # expected files inside it are already present.
    mfa_dir = os.path.expanduser(os.path.join(TTSConfig.MODELPATH, "mfa"))
    required_mfa_paths = (
        mfa_dir,
        os.path.join(mfa_dir, "mandarin_china_mfa.dict"),
        os.path.join(mfa_dir, "mandarin_mfa.zip"),
    )
    if not all(os.path.exists(path) for path in required_mfa_paths):
        print(" * SwarmClone 使用 Montreal Forced Aligner 进行对齐，开始下载: ")
        download_model_and_dict(TTSConfig)
    acoustic_model, lexicon_compiler, tokenizer, pretrained_aligner = init_mfa_models(TTSConfig)

In [None]:
# Text to synthesise and the emotion weights used to pick the synthesis branch.
s = "这是一段测试的语音，用来体验在说话人微调下的表现效果，你喜欢吗"
emotions = {
    "喜爱": 0.5,
    "悲伤": 0.1,
    "厌恶": 0.1,
    "愤怒": 0.1,
    "高兴": 0.1,
    "无情绪": 0.1,
}

s = s.strip()
# Speaker id passed to both inference calls and used to name the output file.
tune = "知络_1.2"
# torchaudio.save does not create missing folders, so make sure the
# destination directory exists before either branch writes into it.
os.makedirs("output", exist_ok=True)
# Emotion-prompted synthesis (taken when the "no emotion" weight is below 0.3)
if emotions["无情绪"] < 0.3:
    # Two highest-weighted emotions. Not consumed below: the prompt is still
    # hard-coded.
    emotions_top2 = sorted(emotions.items(), key=lambda x: x[1], reverse=True)[:2]
    # Alternative prompts:
    # prompt =  "With a tone of slightly happy"
    # prompt =  "With a tone of sad and cry"
    # prompt =  "With a tone of huge disgust"
    # prompt =  "With a tone of strong anger"

    # prompt =  "With a tone of disgust and anger"
    # prompt =  "With a tone of disgust and sad"
    prompt =  "With a tone of sad and anger"

    # NOTE(review): only the first yielded item is kept — confirm that
    # non-streaming inference yields a single item for this text.
    outputs = list(cosyvoice.inference_instruct(s, tune, prompt, stream=False))[0]["tts_speech"]

    audio_name = os.path.join("output", f"{tune}_{prompt}.mp3".replace("<|endofprompt|>", "").replace(" ", ""))
# Synthesis without an emotion prompt
else:
    outputs = list(cosyvoice.inference_sft(s, tune, stream=False))[0]["tts_speech"]
    # Name the file after the speaker that was actually used (`tune`), the same
    # way the prompted branch does, so the label always matches the audio.
    audio_name = os.path.join("output", f"{tune}_sft.mp3")
# Audio file
# NOTE(review): 22050 is assumed to be the model's output sample rate — confirm.
torchaudio.save(audio_name, outputs, 22050)
# Subtitle file
# txt_name = audio_name.replace(".mp3", ".txt")
# if "<|endofprompt|>" in s:
#     open(txt_name, "w", encoding="utf-8").write(s.split("<|endofprompt|>")[1])
# else:
#     open(txt_name, "w", encoding="utf-8").write(s)
# Alignment file
# align(
#     audio_name,
#     txt_name,
#     acoustic_model,
#     lexicon_compiler,
#     tokenizer,
#     pretrained_aligner,
# )
# align_name = audio_name.replace(".mp3", ".TextGrid")

playsound.playsound(audio_name)
# Delete the generated files
# os.remove(audio_name)
# os.remove(txt_name)
# os.remove(align_name)

In [None]:
import torch
# Load the speaker tables shipped with the Instruct and SFT checkpoints.
# NOTE(review): both locations are absolute, machine-specific paths.
ins, sft = (
    torch.load(spk2info_path)
    for spk2info_path in (
        r"D:\Segment_b\Code\.swarmclone\tts_cosy_voice\CosyVoice-300M-Instruct\spk2info.pt",
        r"D:\Segment_b\Code\.swarmclone\tts_cosy_voice\CosyVoice-300M-SFT\spk2info.pt",
    )
)

# Cell output: the keys of each loaded object, side by side.
(ins.keys(), sft.keys())

In [None]:
import torch
import torchaudio

def enhance_high_freq(waveform: torch.Tensor, window_size: int = 3, alpha: float = 0.5):
    """
    Boost (or attenuate) the high-frequency content of an audio signal.

    A zero-padded moving average of each channel serves as a low-pass
    estimate; the residual ``waveform - moving_average`` is scaled by
    ``alpha`` and added back, and the result is clamped to [-1.0, 1.0].

    Args:
        waveform: Audio tensor of shape (channels, samples). The filter kernel
            is created with the same dtype and device as this tensor.
        window_size: Moving-average window length; must be a positive odd
            integer so the filtered signal has the same length as the input.
        alpha: Gain applied to the high-frequency residual. Positive values
            boost it; negative values subtract it, i.e. attenuate it.
    Returns:
        Tensor with the same shape as ``waveform``, clamped to [-1.0, 1.0].
    Raises:
        ValueError: If ``waveform`` is not 2-D, or ``window_size`` is not a
            positive odd integer.
    """
    if waveform.dim() != 2:
        raise ValueError(
            f"waveform must have shape (channels, samples), got {tuple(waveform.shape)}"
        )
    # With padding = window_size // 2, an even window yields samples + 1
    # outputs, which cannot be subtracted from the input below.
    if window_size < 1 or window_size % 2 == 0:
        raise ValueError(f"window_size must be a positive odd integer, got {window_size}")

    # Moving-average kernel (low-pass). Matching the input's dtype and device
    # keeps conv1d working for float64 or GPU-resident waveforms.
    kernel = torch.ones(
        1, 1, window_size, dtype=waveform.dtype, device=waveform.device
    ) / window_size
    # Treat every channel as its own batch item so one single-channel kernel
    # filters all of them independently.
    moving_avg = torch.nn.functional.conv1d(
        waveform.unsqueeze(1),
        kernel,
        padding=window_size // 2,
    ).squeeze(1)

    # Isolate the high-frequency residual and mix it back in.
    high_freq = waveform - moving_avg
    enhanced = waveform + alpha * high_freq

    # Clamp to the [-1, 1] range.
    return torch.clamp(enhanced, -1.0, 1.0)

# Usage example: load a previously synthesised clip.
# The path is a raw string so the Windows backslashes (\S, \C, \e, ...) are
# taken literally instead of being parsed as invalid escape sequences.
waveform, sample_rate = torchaudio.load(r"D:\Segment_b\Code\code\swarmclone.cosyvoice\example\output\知络_1.2_sft.mp3")

# Filter the audio with window size 3 and alpha = -0.5. The negative alpha
# subtracts half of the high-frequency residual, so this call attenuates the
# highs rather than boosting them.
processed = enhance_high_freq(waveform, window_size=3, alpha=-0.5)

# Save the result
torchaudio.save("output.mp3", processed, sample_rate, format="mp3")