In [None]:
pip install datasets torch numpy

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
!pip install transformers datasets torchaudio soundfile
!pip install git+https://github.com/huggingface/transformers.git  # 确保最新版

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-8uo_ax00
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-8uo_ax00
  Resolved https://github.com/huggingface/transformers.git to commit 51d732709e5ae424e8fb6c4e58b72057a3e413c2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.53.0.dev0-py3-none-any.whl size=11363407 sha256=84db875625ed4ee6fc2684f44b7305ec436822a45988202f755d292281f3ecda
  Stored in directory: /tmp/pip-ephem-wheel-cache-c2g6flaw/wheels/32/4b/78/f195c684dd3a9ed21f3b39fe8f85b48df7918581b6437be143
Successfully b

In [11]:
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import soundfile as sf
from IPython.display import Audio
import numpy as np
import os
import zipfile
import requests

# 1. 清理缓存
!rm -rf ~/.cache/huggingface/datasets

# 2. 下载并解压数据集（确保完整下载）
dataset_url = "https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors/resolve/main/spkrec-xvect.zip"
download_path = "spkrec-xvect.zip"
extract_path = "embeddings"

# 创建目录
os.makedirs(extract_path, exist_ok=True)

# 下载文件（显示进度）
if not os.path.exists(download_path):
    print("正在下载数据集...")
    response = requests.get(dataset_url, stream=True)
    total_size = int(response.headers.get('content-length', 0))

    with open(download_path, 'wb') as f:
        downloaded = 0
        for data in response.iter_content(chunk_size=1024):
            downloaded += len(data)
            f.write(data)
            # 显示进度
            print(f"下载进度: {downloaded/total_size*100:.1f}%", end='\r')
    print("\n下载完成!")

# 解压文件
if not os.listdir(extract_path):
    print("正在解压文件...")
    with zipfile.ZipFile(download_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print("解压完成!")

# 3. 获取嵌入文件列表
embedding_files = []
for root, dirs, files in os.walk(extract_path):
    for file in files:
        if file.endswith(".npy"):
            embedding_files.append(os.path.join(root, file))

# 检查文件数量
if len(embedding_files) == 0:
    raise RuntimeError("未找到任何嵌入文件，请检查下载和解压过程")

print(f"找到 {len(embedding_files)} 个说话人嵌入文件")

# 4. 加载模型
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")



找到 7931 个说话人嵌入文件


In [29]:
# Choose which embedding file to use by its position in embedding_files.
# The index is clamped so it can never exceed the last valid position.
SPEAKER_INDEX = 200
new_index = min(SPEAKER_INDEX, len(embedding_files) - 1)  # safe index
sample_file = embedding_files[new_index]

# Load the embedding from disk and add a leading dimension of size 1
speaker_embedding = torch.tensor(np.load(sample_file)).unsqueeze(0)

In [30]:

# 6. Generate speech
text = "Memento mori!!!"

# Convert the text into model inputs
inputs = processor(text=text, return_tensors="pt")

# Run on the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
vocoder = vocoder.to(device)


with torch.no_grad():
    speech = model.generate_speech(
        inputs["input_ids"].to(device),
        # The embedding was created on the CPU; move it to the same device as
        # the model and the input ids, otherwise generation fails on CUDA.
        speaker_embeddings=speaker_embedding.to(device),
        vocoder=vocoder
    )

# 7. Save the waveform and play it back
speech_np = speech.cpu().numpy().squeeze()
sf.write("output.wav", speech_np, samplerate=16000)
print("\n生成完成! 播放音频:")
Audio("output.wav", autoplay=True)



生成完成! 播放音频:


In [31]:
import os

# Download models from the https://hf-mirror.com mirror; without this setting
# they are fetched from huggingface.co.
# NOTE(review): transformers is already imported by an earlier cell, and the
# Hugging Face hub client appears to read HF_ENDPOINT when it is first imported,
# so this assignment may only take effect if it runs before that import
# (e.g. at the very top of the notebook after a kernel restart) - confirm.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import torch
import librosa # pip install librosa
from transformers import pipeline

# Sampling rate passed to the speech-recognition pipeline (Whisper works on
# 16 kHz audio). librosa.load resamples to 22050 Hz unless `sr` is given, and
# a bare array handed to the pipeline is assumed to already be at the model's
# rate, so the rate is fixed here and passed along explicitly.
ASR_SAMPLE_RATE = 16000

# Load the audio written by the speech-generation cell (same relative path)
audio, sample_rate = librosa.load("output.wav", sr=ASR_SAMPLE_RATE)

# Build the speech-recognition pipeline. It is bound to its own name so the
# imported `pipeline` factory is not shadowed.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
)

# Transcribe and show the result
result = asr_pipeline({"raw": audio, "sampling_rate": sample_rate})
print(result)

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Device set to use cpu
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


{'text': ' Memento Mori.'}
