# N46Whisper TTS

**文本转语音工具 / Text-to-Speech Tool**

此工具将文本内容转换为语音音频文件，支持中文和日语。

This tool converts text content to speech audio files, supporting Chinese and Japanese.

---

**使用步骤 / Usage Steps:**
1. 设置 TTS 参数 / Configure the TTS settings
2. 安装依赖 / Install the dependencies
3. 运行 TTS / Run TTS

**【重要】** 务必在"修改"->"笔记本设置"->"硬件加速器"中选择GPU！

**【IMPORTANT】** Make sure you select GPU as hardware accelerator in notebook settings!

In [None]:
#@title **1. TTS 参数设置 / TTS Settings**

# The "#@title", "# @markdown" and "# @param" comments below are Colab form
# directives: they drive the form UI of this cell and must stay as they are.

# @markdown **选择语音语言 / Select Voice Language**
tts_language = "Chinese"  # @param ["Chinese", "Japanese"]

# @markdown **选择语音类型 / Select Voice Type**
tts_voice_type = "Female"  # @param ["Female", "Male"]

# @markdown **输出音频格式 / Output Audio Format**
tts_output_format = "wav"  # @param ["wav", "mp3"]

# @markdown **语速调节 / Speech Rate (0.5-2.0)**
tts_speed = 1.0  # @param {type:"slider", min:0.5, max:2.0, step:0.1}

# Echo the selected values so the user can verify them before moving on.
_settings_summary = (
    ("语言/Language", tts_language),
    ("语音类型/Voice", tts_voice_type),
    ("输出格式/Format", tts_output_format),
    ("语速/Speed", tts_speed),
)
print("TTS 配置完成 / TTS Settings configured:")
for _label, _value in _settings_summary:
    print(f"  {_label}: {_value}")

In [None]:
#@title **2. 安装 TTS 依赖 / Install TTS Dependencies**

# @markdown <font size="2">安装 TTS 模型所需的依赖库</font>
# @markdown <br/><font size="2">Install dependencies required for TTS model</font>

# Python packages: TTS (imported as `TTS.api` by the synthesis cell) and
# pydub (imported by the synthesis cell to merge and export the audio).
!pip install -q TTS
!pip install -q pydub
# System package. NOTE(review): presumably required by pydub for mp3
# export -- confirm.
!apt-get install -q -y ffmpeg

# Wipe the installer output from the cell. Be aware that this also removes
# any warning or error text the commands above may have printed.
from IPython.display import clear_output
clear_output()

# These messages are printed unconditionally; they do not check whether the
# installation commands actually succeeded.
print("TTS 依赖安装完成 / TTS dependencies installed successfully!")
print("请继续执行下一个单元格 / Please continue to the next cell")

In [None]:
#@title **3. 运行 TTS / Run TTS**

# @markdown <font size="2">上传文本文件并执行文本转语音转换</font>
# @markdown <br/><font size="2">Upload text file and execute text-to-speech conversion</font>

import os
import re
import torch
from pathlib import Path
from tqdm import tqdm
from google.colab import files
from IPython.display import clear_output, Audio, display

# Upload the text file through the Colab upload widget.
print("请上传文本文件 (.txt, .srt, .ass) / Please upload text file (.txt, .srt, .ass)")
uploaded = files.upload()

# If the upload produced no files (for example the dialog was dismissed),
# stop here with an actionable message instead of an opaque IndexError.
if not uploaded:
    raise RuntimeError(
        "未上传任何文件，请重新运行此单元格 / "
        "No file was uploaded, please re-run this cell"
    )

# Only the first uploaded file is converted; its stem names the output file.
tts_input_file = next(iter(uploaded))
tts_basename = Path(tts_input_file).stem

clear_output()
print(f"已加载文件 / File loaded: {tts_input_file}")
if len(uploaded) > 1:
    # Make the "first file only" behaviour visible rather than silent.
    print("仅处理第一个文件 / Only the first uploaded file will be processed")

# Read and preprocess the text
def read_text_file(filepath):
    """Load a text or subtitle file and return its speakable text.

    The handling depends on the file extension (case-insensitive):

    * ``.srt`` -- cue index lines and timestamp lines are removed and the
      remaining lines are joined with single spaces.
    * ``.ass`` -- only ``Dialogue:`` events are used; override blocks
      (``{...}``) are removed and the ``\\N`` / ``\\n`` / ``\\h`` escapes
      become spaces. Events are joined with single spaces.
    * anything else -- the file content is returned unchanged.

    Args:
        filepath: Path of the file to read. It is decoded as UTF-8; a
            leading byte-order mark is discarded.

    Returns:
        The extracted text as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' reads plain UTF-8 as well, but strips a leading BOM.
    # Otherwise the BOM ends up in the spoken text and keeps the first SRT
    # index line ("\ufeff1") from being recognised as a number.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        content = f.read()

    ext = Path(filepath).suffix.lower()

    if ext == '.srt':
        lines = [line.strip() for line in content.split('\n')]
        text_lines = []
        for idx, line in enumerate(lines):
            if not line or '-->' in line:
                continue
            # A numeric line is a cue index only when the timestamp line
            # follows it; a numeric subtitle (e.g. "2024") must be kept.
            next_line = lines[idx + 1] if idx + 1 < len(lines) else ''
            if line.isdigit() and '-->' in next_line:
                continue
            text_lines.append(line)
        return ' '.join(text_lines)

    if ext == '.ass':
        text_lines = []
        for line in content.split('\n'):
            if not line.startswith('Dialogue:'):
                continue
            # The text is the 10th comma-separated field; limiting the split
            # keeps commas inside the text itself intact.
            parts = line.split(',', 9)
            if len(parts) < 10:
                continue
            text = re.sub(r'\{[^}]*\}', '', parts[9])
            for escape in ('\\N', '\\n', '\\h'):
                text = text.replace(escape, ' ')
            text_lines.append(text.strip())
        return ' '.join(text_lines)

    return content

# Extract the speakable text from the uploaded file and report its size.
text_content = read_text_file(tts_input_file)
_text_length = len(text_content)
print(f"文本长度 / Text length: {_text_length} 字符/characters")

# Split the text into segments
def split_text(text, max_length=200):
    """Split text into segments of at most ``max_length`` characters.

    The text is cut after each sentence terminator (``。！？.!?``) and whole
    sentences are packed greedily into segments. Text after the last
    terminator is kept as a final sentence. A sentence that is longer than
    ``max_length`` on its own is cut into ``max_length``-sized pieces.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per segment (at least 1).

    Returns:
        The list of segments in order; empty when ``text`` is empty.
        Concatenating the segments gives back ``text``.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    # With a capturing group, re.split returns
    # [text, delimiter, text, delimiter, ..., tail]; the tail has no
    # delimiter, so it is paired with '' to keep it from being dropped.
    parts = re.split(r'([。！？.!?])', text)
    sentences = [
        body + delimiter
        for body, delimiter in zip(parts[0::2], parts[1::2] + [''])
        if body or delimiter
    ]

    segments = []
    current_segment = ""
    for sentence in sentences:
        if len(current_segment) + len(sentence) <= max_length:
            current_segment += sentence
            continue

        if current_segment:
            segments.append(current_segment)

        # An over-long sentence cannot fit in any segment: cut it hard.
        while len(sentence) > max_length:
            segments.append(sentence[:max_length])
            sentence = sentence[max_length:]
        current_segment = sentence

    if current_segment:
        segments.append(current_segment)

    return segments

# Break the text into pieces that are synthesised one at a time.
text_segments = split_text(text_content)
print(f"分段数量 / Number of segments: {len(text_segments)}")

# Load the TTS model
print("加载 TTS 模型 / Loading TTS model...")
from TTS.api import TTS

# One model per selectable language; any other value uses the English model.
_MODEL_BY_LANGUAGE = {
    "Chinese": "tts_models/zh-CN/baker/tacotron2-DDC-GST",
    "Japanese": "tts_models/ja/kokoro/tacotron2-DDC",
}
tts_model_name = _MODEL_BY_LANGUAGE.get(
    tts_language, "tts_models/en/ljspeech/tacotron2-DDC"
)

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"使用设备 / Using device: {device}")

_model = TTS(model_name=tts_model_name, progress_bar=True)
tts = _model.to(device)

# Replace the loading output with a one-line confirmation.
clear_output()
print(f"TTS 模型加载完成 / TTS model loaded: {tts_model_name}")

# Generate audio: one temporary wav file per non-blank text segment.
print("生成音频中 / Generating audio...")
output_dir = "tts_output"
os.makedirs(output_dir, exist_ok=True)
audio_segments = []

# NOTE(review): whether `speed` has any effect on this model depends on the
# installed TTS version -- confirm against its API documentation.
for i, segment in enumerate(tqdm(text_segments, desc="TTS Progress")):
    if segment.strip():
        segment_file = f"{output_dir}/segment_{i:04d}.wav"
        try:
            tts.tts_to_file(text=segment, file_path=segment_file, speed=tts_speed)
        except Exception as e:
            # Best effort: report the failed segment and carry on.
            print(f"段落 {i} 生成失败 / Segment {i} failed: {e}")
        else:
            # Only successfully written files take part in the merge.
            audio_segments.append(segment_file)

print(f"生成了 {len(audio_segments)} 个音频段落 / Generated {len(audio_segments)} audio segments")

# Merge audio
from pydub import AudioSegment

# With no synthesised segment the export below would only produce an empty
# audio file; stop with a clear message instead.
if not audio_segments:
    raise RuntimeError(
        "没有成功生成任何音频段落 / No audio segments were generated"
    )

print("合并音频文件 / Merging audio files...")
combined = AudioSegment.empty()

# Append the per-segment wav files in generation order.
for seg_file in audio_segments:
    combined += AudioSegment.from_wav(seg_file)

# Export the final audio
output_filename = f"{tts_basename}_tts.{tts_output_format}"
if tts_output_format == "mp3":
    exported = combined.export(output_filename, format="mp3", bitrate="192k")
else:
    exported = combined.export(output_filename, format="wav")
# export() hands back the output file object still open; close it so the
# handle is released before the file is previewed and downloaded.
exported.close()

print(f"音频生成完成 / Audio generation complete: {output_filename}")

# Show an inline player for the exported file.
print("音频预览 / Audio preview:")
_preview = Audio(output_filename)
display(_preview)

# Remove the temporary per-segment files; a failed removal is ignored.
import shutil
shutil.rmtree(output_dir, ignore_errors=True)

# Trigger the browser download of the final audio file.
files.download(output_filename)

print("TTS 转换完成！/ TTS conversion complete!")