# N46Whisper TTS (CosyVoice)

**文本转语音工具 / Text-to-Speech Tool**

基于阿里通义实验室 CosyVoice 2.0 语音合成大模型

Based on Alibaba Tongyi Lab CosyVoice 2.0 Speech Synthesis Model

---

**支持语言 / Supported Languages:** 中文、英语、日语、韩语、粤语等 / Chinese, English, Japanese, Korean, Cantonese, etc.

**使用步骤 / Usage Steps:**
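1. 安装依赖并下载模型（首次运行约需 5-10 分钟） / Install dependencies and download the model (the first run takes about 5-10 minutes)
2. 设置 TTS 参数 / Configure the TTS settings
3. 运行 TTS / Run TTS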

**【重要】** 请在"修改"->"笔记本设置"->"硬件加速器"中选择 **GPU (T4)**

**[Important]** In "Edit" -> "Notebook settings" -> "Hardware accelerator", select **GPU (T4)**

In [None]:
#@title **1. 安装依赖并下载模型 / Install Dependencies & Download Model**

# @markdown <font size="2">首次运行需要下载模型（约 1GB），请耐心等待</font>
# @markdown <br/><font size="2">First run requires model download (~1GB), please wait patiently</font>

import os

# 克隆 CosyVoice 仓库
if not os.path.exists('CosyVoice'):
    !git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
    %cd CosyVoice
    !git submodule update --init --recursive
else:
    %cd CosyVoice

# 安装依赖
!pip install -q -r requirements.txt
!pip install -q modelscope pydub

# 安装系统依赖
!apt-get install -q -y sox libsox-dev

# 下载模型
from modelscope import snapshot_download
import os

model_dir = 'pretrained_models/CosyVoice2-0.5B'
if not os.path.exists(model_dir):
    snapshot_download('iic/CosyVoice2-0.5B', local_dir=model_dir)

from IPython.display import clear_output
clear_output()

print("✅ 依赖安装完成，模型下载完成！")
print("✅ Dependencies installed and model downloaded!")
print("请继续执行下一个单元格 / Please continue to the next cell")

In [None]:
#@title **2. TTS 参数设置 / TTS Settings**

# @markdown **选择语音语言 / Select Voice Language**
tts_language = "中文"  # @param ["中文", "英语", "日语", "韩语", "粤语"]

# @markdown **输出音频格式 / Output Audio Format**
tts_output_format = "wav"  # @param ["wav", "mp3"]

# @markdown **使用指令模式 / Use Instruct Mode**
# @markdown <font size="2">可添加情感、语速等指令，如"用开心的语气说"</font>
use_instruct = False  # @param {type:"boolean"}

# @markdown **指令内容 / Instruction (if instruct mode enabled)**
instruct_text = "用温柔的语气说"  # @param {type:"string"}

# Echo the chosen settings so they can be checked before the TTS cell is run.
# The instruction line is only shown when instruct mode is switched on.
_settings_summary = [
    "TTS 配置完成 / TTS Settings configured:",
    f"  语言/Language: {tts_language}",
    f"  输出格式/Format: {tts_output_format}",
    f"  指令模式/Instruct: {use_instruct}",
]
if use_instruct:
    _settings_summary.append(f"  指令/Instruction: {instruct_text}")
print("\n".join(_settings_summary))

In [None]:
#@title **3. 运行 TTS / Run TTS**

# @markdown <font size="2">上传文本文件并执行文本转语音转换</font>
# @markdown <br/><font size="2">Upload text file and execute text-to-speech conversion</font>

import os
import re
import sys
from pathlib import Path
from google.colab import files
from IPython.display import clear_output, Audio, display

# Make sure the working directory is the CosyVoice checkout: its root holds the
# `cosyvoice` package that is imported further down in this cell.
if not os.path.exists('cosyvoice'):
    if not os.path.isdir('CosyVoice'):
        # Same exception type os.chdir would raise, but with an actionable message.
        raise FileNotFoundError(
            "未找到 CosyVoice 目录，请先运行第 1 步 / "
            "CosyVoice directory not found, please run step 1 first"
        )
    os.chdir('CosyVoice')

# Put the checkout root and third_party/Matcha-TTS at the front of the import
# path (Matcha-TTS ends up first, as before). Entries already present are left
# alone so re-running the cell does not pile up duplicates.
for _repo_path in ('.', 'third_party/Matcha-TTS'):
    if _repo_path not in sys.path:
        sys.path.insert(0, _repo_path)

# Upload the text file
print("请上传文本文件 (.txt, .srt, .ass) / Please upload text file (.txt, .srt, .ass)")
uploaded = files.upload()
if not uploaded:
    # A cancelled upload returns an empty mapping; fail with a clear message
    # instead of an IndexError on the lookup below.
    raise ValueError("未上传任何文件 / No file was uploaded")
# Only the first uploaded file is processed.
tts_input_file = next(iter(uploaded))
tts_basename = Path(tts_input_file).stem

clear_output()
print(f"已加载文件 / File loaded: {tts_input_file}")

# 读取并预处理文本
def read_text_file(filepath):
    """Read a .txt, .srt or .ass file and return the plain text to synthesize.

    Args:
        filepath: Path of the file to read. Its suffix (case-insensitive)
            selects how the content is interpreted.

    Returns:
        For ``.srt``: the subtitle text lines joined by single spaces, with
        blank lines, timing lines (containing ``-->``) and cue counters
        removed. For ``.ass``: the text field of every ``Dialogue:`` line with
        ``{...}`` override blocks removed and ``\\N`` / ``\\n`` / ``\\h``
        replaced by spaces, joined by single spaces. For any other suffix: the
        decoded file content unchanged.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # utf-8-sig decodes plain UTF-8 identically but drops a leading BOM, which
    # would otherwise stick to the first line (e.g. "\ufeff1" in an .srt file is
    # not recognised as a cue counter and would be read aloud).
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        content = f.read()

    ext = Path(filepath).suffix.lower()

    if ext == '.srt':
        lines = [raw.strip() for raw in content.split('\n')]
        text_lines = []
        for idx, line in enumerate(lines):
            if not line or '-->' in line:
                continue
            # A cue counter is a digit-only line directly followed by its timing
            # line; digit-only subtitle text (e.g. "2024") must be kept.
            following = lines[idx + 1] if idx + 1 < len(lines) else ''
            if line.isdigit() and '-->' in following:
                continue
            text_lines.append(line)
        return ' '.join(text_lines)

    if ext == '.ass':
        text_lines = []
        for line in content.split('\n'):
            if not line.startswith('Dialogue:'):
                continue
            # The text is the 10th comma-separated field; limiting the split to
            # 9 keeps commas inside the text itself intact.
            parts = line.split(',', 9)
            if len(parts) < 10:
                continue
            text = re.sub(r'\{[^}]*\}', '', parts[9])
            # \N and \n are line breaks, \h is a hard space in ASS markup.
            for marker in ('\\N', '\\n', '\\h'):
                text = text.replace(marker, ' ')
            text_lines.append(text.strip())
        return ' '.join(text_lines)

    return content

# Turn the uploaded file into plain text and report its size.
text_content = read_text_file(tts_input_file)
print("文本长度 / Text length: {} 字符/characters".format(len(text_content)))

# Load the CosyVoice model
print("加载 CosyVoice 模型 / Loading CosyVoice model...")
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
import torchaudio

# Both optional loader switches are turned off explicitly.
cosyvoice = CosyVoice2(
    'pretrained_models/CosyVoice2-0.5B',
    load_jit=False,
    load_trt=False,
)

# Replace the loading log with a single status line.
clear_output()
print("✅ 模型加载完成 / Model loaded!")

# 文本分段处理
def split_text(text, max_length=300):
    """Split text into segments of at most ``max_length`` characters.

    The text is cut after sentence-ending punctuation (``。！？.!?``) and
    newlines; consecutive sentences are packed together while they fit. A
    single sentence longer than ``max_length`` is cut into fixed-size pieces.
    Text after the last punctuation mark is kept.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per segment (at least 1).

    Returns:
        A list of non-empty segments with surrounding whitespace stripped;
        empty when ``text`` is empty or whitespace only.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    # The capturing group keeps the delimiters, so the result alternates
    # text, delimiter, text, ..., text and always ends with a text piece.
    parts = re.split(r'([。！？.!?\n])', text)
    # Glue each text piece to the delimiter that follows it. The step-2 range
    # runs to the end of the list so the final, unterminated piece is included.
    sentences = [
        parts[i] + (parts[i + 1] if i + 1 < len(parts) else '')
        for i in range(0, len(parts), 2)
    ]

    segments = []
    current_segment = ""

    def flush(chunk):
        # Store a finished chunk unless it is only whitespace.
        chunk = chunk.strip()
        if chunk:
            segments.append(chunk)

    for sentence in sentences:
        if len(current_segment) + len(sentence) <= max_length:
            current_segment += sentence
            continue

        flush(current_segment)
        # A sentence that cannot fit on its own is cut into fixed-size pieces;
        # the remainder starts the next segment.
        while len(sentence) > max_length:
            flush(sentence[:max_length])
            sentence = sentence[max_length:]
        current_segment = sentence

    flush(current_segment)
    return segments

text_segments = split_text(text_content)
print(f"分段数量 / Number of segments: {len(text_segments)}")

# Generate the audio
print("生成音频中 / Generating audio...")
import torch
from tqdm import tqdm

audio_segments = []
# Use the output rate reported by the loaded model; 24000 (the value this cell
# used to hard-code) is the fallback when the attribute is missing.
sample_rate = getattr(cosyvoice, 'sample_rate', 24000)

# Reference voice for cloning. It is loop-invariant, so load it once. The
# CosyVoice usage examples pass this prompt as 16 kHz speech
# (`prompt_speech_16k`), independent of the output sample rate.
prompt_text = "希望你以后能够做的比我还好呦。"
prompt_speech_16k = load_wav('asset/zero_shot_prompt.wav', 16000)

# Resolve the mode outside the try block so a missing settings cell surfaces as
# a NameError instead of being reported as a per-segment failure.
instruct_mode = bool(use_instruct and instruct_text.strip())

for i, segment in enumerate(tqdm(text_segments, desc="TTS Progress")):
    if not segment.strip():
        continue

    try:
        if instruct_mode:
            # Instruct mode: same reference voice, steered by the instruction
            # text from the settings cell.
            results = cosyvoice.inference_instruct2(
                segment, instruct_text, prompt_speech_16k, stream=False
            )
        else:
            # Zero-shot mode
            results = cosyvoice.inference_zero_shot(
                segment, prompt_text, prompt_speech_16k, stream=False
            )
        # The call is a generator that can yield more than one result for a
        # single segment; keep every piece, otherwise the rest of the segment
        # is silently missing from the output.
        pieces = [result['tts_speech'] for result in results]
    except Exception as e:
        # Best effort: report the failure and carry on with the next segment.
        print(f"段落 {i} 生成失败 / Segment {i} failed: {e}")
        continue

    if pieces:
        # One tensor per text segment, so the count printed below stays a
        # segment count. Pieces are joined along dim 1, like the final merge.
        audio_segments.append(torch.cat(pieces, dim=1))

print(f"生成了 {len(audio_segments)} 个音频段落 / Generated {len(audio_segments)} audio segments")

if not audio_segments:
    # torch.cat on an empty list fails with an unhelpful message; stop here
    # with an explanation instead.
    raise RuntimeError(
        "没有生成任何音频，请检查输入文本和上面的错误信息 / "
        "No audio was generated, check the input text and the errors above"
    )

# Merge the audio
print("合并音频文件 / Merging audio files...")
combined_audio = torch.cat(audio_segments, dim=1)

# Save the audio
output_filename = f"{tts_basename}_tts.wav"
torchaudio.save(output_filename, combined_audio, sample_rate)

# Convert the format (only when mp3 was requested); the intermediate wav is removed
if tts_output_format == "mp3":
    from pydub import AudioSegment
    audio = AudioSegment.from_wav(output_filename)
    mp3_filename = f"{tts_basename}_tts.mp3"
    audio.export(mp3_filename, format="mp3", bitrate="192k")
    os.remove(output_filename)
    output_filename = mp3_filename

print(f"音频生成完成 / Audio generation complete: {output_filename}")

# Show an audio preview
print("音频预览 / Audio preview:")
display(Audio(output_filename))

# Trigger the download
files.download(output_filename)

print("✅ TTS 转换完成！/ TTS conversion complete!")