# YouTube Speaker Diarization (Kaggle, faster-whisper版)

本版本使用 **faster-whisper + pyannote**，句级时间戳，重点保证 Kaggle 稳定性。


## 1) 一键初始化
运行后务必 **Restart Session**，让重新安装的 torch / numpy 等依赖在新的内核中生效，然后再继续执行后面的单元格。


In [None]:
# One-time environment setup: install ffmpeg, take a fresh clone of the project,
# reinstall the Python dependencies, and point the library caches at CACHE_DIR.
import os
REPO_URL = 'https://github.com/Hana19951208/youtube-speaker-diarization.git'
REPO_DIR = '/kaggle/working/youtube-speaker-diarization'
CACHE_DIR = '/kaggle/working/cache'

# Install ffmpeg through apt.
!apt-get update -y
!apt-get install -y ffmpeg

# Always start from a fresh checkout: delete any previous clone, clone again
# into /kaggle/working, then change into the repository directory.
%cd /kaggle/working
if os.path.exists(REPO_DIR):
    !rm -rf {REPO_DIR}
!git clone {REPO_URL}
%cd {REPO_DIR}

# Remove the key conflicting packages
!pip uninstall -y whisperx faster-whisper pyannote.audio transformers accelerate numpy pandas torch torchvision torchaudio -q

# Install the pinned torch / torchvision / torchaudio combination first
!pip install -q --upgrade --force-reinstall torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1

# Install the project dependencies (already switched to faster-whisper)
!pip install -q --upgrade --force-reinstall -r requirements.txt

# Create the cache root and direct the Hugging Face, torch and XDG caches to it.
# NOTE(review): os.environ changes only affect the current kernel process, so
# they are presumably lost after the Restart Session requested below — confirm
# whether these variables need to be set again in a post-restart cell.
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ['HF_HOME'] = f'{CACHE_DIR}/huggingface'
os.environ['HF_HUB_CACHE'] = f'{CACHE_DIR}/huggingface/hub'
os.environ['TORCH_HOME'] = f'{CACHE_DIR}/torch'
os.environ['XDG_CACHE_HOME'] = f'{CACHE_DIR}/xdg'
print('✅ Init done. 现在请 Restart Session，再继续。')


## 1.1) 重启后健康检查


In [None]:
# Post-restart health check: import the core stack, report each package
# version, and confirm that a faster-whisper model can be constructed on CPU.
import torch
import transformers
import accelerate
import numpy
import pandas
from faster_whisper import WhisperModel

# One row per package: the label to print and the imported module.
for label, module in (
    ('torch:', torch),
    ('transformers:', transformers),
    ('accelerate:', accelerate),
    ('numpy:', numpy),
    ('pandas:', pandas),
):
    print(label, module.__version__)

# Building the 'tiny' model with int8 on CPU exercises the import path end to end.
_ = WhisperModel('tiny', device='cpu', compute_type='int8')
print('✅ faster-whisper import ok')


## 2) 设置 HF_TOKEN
你可以在下方直接填写 `HF_TOKEN`；若留空，则会尝试从 Kaggle Secrets 中读取名为 `HF_TOKEN` 的密钥。


In [None]:
# Resolve the Hugging Face token: use the value pasted below, otherwise fall
# back to the Kaggle secret named 'HF_TOKEN'. The lookup is best-effort, but a
# failure is reported instead of being silently swallowed.
import os

HF_TOKEN = ''  # paste the token here, or leave empty to read it from Kaggle Secrets
HF_TOKEN = HF_TOKEN.strip()  # tolerate whitespace picked up when pasting

if not HF_TOKEN:
    try:
        from kaggle_secrets import UserSecretsClient
        # `or ''` keeps HF_TOKEN a string even if the lookup yields None,
        # which os.environ would otherwise reject with a TypeError.
        HF_TOKEN = (UserSecretsClient().get_secret('HF_TOKEN') or '').strip()
    except ImportError:
        print('⚠️ kaggle_secrets is not available; HF_TOKEN stays empty.')
    except Exception as exc:  # notebook boundary: report the reason and continue
        print(f'⚠️ Could not read Kaggle secret HF_TOKEN: {type(exc).__name__}: {exc}')

# Export the token (possibly empty) to the process environment as well.
os.environ['HF_TOKEN'] = HF_TOKEN
print('HF_TOKEN set:', bool(HF_TOKEN))


## 3) 上传参考音频


In [None]:
import ipywidgets as widgets
from IPython.display import display

# Single-file upload widget limited to the listed audio extensions; the next
# cell reads `uploader.value` to save the uploaded file.
uploader = widgets.FileUpload(
    accept='.wav,.mp3,.m4a',
    multiple=False,
)
display(uploader)
print('上传后运行下一格保存文件')


In [None]:
import os


def _first_upload(value):
    """Return ``(filename, content_bytes)`` for the first upload, or ``None``.

    Accepts both shapes of ``FileUpload.value``:
    a dict mapping filename to metadata (ipywidgets 7.x) and a sequence of
    per-file dicts carrying ``'name'`` and ``'content'`` (ipywidgets 8.x).
    """
    if not value:
        return None
    if isinstance(value, dict):
        name, meta = next(iter(value.items()))
    else:
        meta = value[0]
        name = meta['name']
    # Keep only the final path component so the upload cannot escape the
    # target directory; bytes() also normalises a memoryview payload.
    return os.path.basename(name), bytes(meta['content'])


# Save the first uploaded file under /kaggle/working and remember its path.
# ref_audio_path stays None when nothing has been uploaded.
ref_audio_path = None
_upload = _first_upload(uploader.value)
if _upload is not None:
    _name, _content = _upload
    ref_audio_path = f'/kaggle/working/{_name}'
    with open(ref_audio_path, 'wb') as f:
        f.write(_content)
print('ref_audio_path =', ref_audio_path)


## 4) 配置参数


In [None]:
# Run settings. 'youtube_url' and 'language' are passed to pipeline.process();
# every other entry is passed to the YouTubeSpeakerPipeline constructor.
CONFIG = {
    # Input
    "youtube_url": "https://www.youtube.com/watch?v=Zs8jUFaqtCI",
    "playlist_mode": "single",  # single | all
    "language": "zh",
    # Models
    "max_speakers": 3,
    "whisper_model": "large-v3",
    # Optional processing switches (all disabled here)
    "do_separation": False,
    "do_vad": False,
    "do_enhance": False,
    # Matching and output
    "similarity_threshold": 0.25,
    "output_dir": "./output",
}
CONFIG


## 5) 运行 Pipeline


In [None]:
import os
import sys

# The init cell requires a session restart, and a restarted kernel does not
# keep the `%cd` into the cloned repository. Put the checkout on sys.path
# explicitly so `pipeline` can be imported regardless of the working directory.
REPO_DIR = '/kaggle/working/youtube-speaker-diarization'
if os.path.isdir(REPO_DIR) and REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)

from pipeline import YouTubeSpeakerPipeline

# Build the pipeline from the settings in CONFIG and the token resolved earlier.
pipeline = YouTubeSpeakerPipeline(
    hf_token=HF_TOKEN,
    output_dir=CONFIG['output_dir'],
    whisper_model=CONFIG['whisper_model'],
    max_speakers=CONFIG['max_speakers'],
    do_separation=CONFIG['do_separation'],
    do_vad=CONFIG['do_vad'],
    do_enhance=CONFIG['do_enhance'],
    similarity_threshold=CONFIG['similarity_threshold'],
    playlist_mode=CONFIG.get('playlist_mode', 'single'),
)

# Process the configured URL; ref_audio_path is None when no reference audio
# was uploaded in the previous step.
results = pipeline.process(
    youtube_url=CONFIG['youtube_url'],
    ref_audio_path=ref_audio_path,
    language=CONFIG['language'],
)
print('✅ Done')


## 6) 查看输出


In [None]:
import glob
import os

# List the subtitle (.srt) files first, then the .json files, found directly
# inside the configured output directory.
files = []
for pattern in ('*.srt', '*.json'):
    files.extend(glob.glob(os.path.join(CONFIG['output_dir'], pattern)))

for path in files:
    print('-', path)
