# YouTube Speaker Diarization Pipeline (Kaggle Edition)

这个版本专门给 Kaggle Notebook 使用，不影响 Colab 版本。


## 1) 一键初始化（Kaggle）


In [None]:
# One-shot environment bootstrap for a Kaggle Notebook session: installs
# ffmpeg, takes a fresh clone of the project, replaces the ML packages with a
# pinned stack, and points several cache directories at /kaggle/working/cache.
import os, sys

# Repository to clone, where the checkout lives, and the root for all caches.
REPO_URL = 'https://github.com/Hana19951208/youtube-speaker-diarization.git'
REPO_DIR = '/kaggle/working/youtube-speaker-diarization'
CACHE_DIR = '/kaggle/working/cache'

# System dependency: ffmpeg is installed through apt.
!apt-get update -y
!apt-get install -y ffmpeg

# Always start from a fresh clone: an existing checkout is deleted first,
# then the notebook's working directory is switched into the new checkout.
%cd /kaggle/working
if os.path.exists(REPO_DIR):
    !rm -rf {REPO_DIR}
!git clone {REPO_URL}
%cd {REPO_DIR}

# Clean conflicting stack from base Kaggle image
!pip uninstall -y torch torchvision torchaudio whisperx transformers accelerate pyannote.audio numpy -q

# Stable pinned stack (important: force-reinstall to avoid partial package mix)
!pip install -q --upgrade --force-reinstall torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1
!pip install -q --upgrade --force-reinstall numpy==1.26.4 transformers==4.46.3 accelerate==0.34.2 pyannote.audio==3.1.1 whisperx==3.1.1
# Runs last, against the requirements.txt of the checkout entered above.
# NOTE(review): --upgrade --force-reinstall here may replace versions pinned
# by the two commands above if requirements.txt disagrees -- confirm.
!pip install -q --upgrade --force-reinstall -r requirements.txt

# Redirect Hugging Face, torch, XDG and pip caches under CACHE_DIR.
# NOTE(review): these assignments happen after the pip commands above, so
# PIP_CACHE_DIR does not influence them. They also only modify the current
# process environment; since the message below asks for a session restart,
# presumably they need to be applied again afterwards -- TODO confirm.
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ['HF_HOME'] = f'{CACHE_DIR}/huggingface'
os.environ['TRANSFORMERS_CACHE'] = f'{CACHE_DIR}/huggingface/transformers'
os.environ['HF_HUB_CACHE'] = f'{CACHE_DIR}/huggingface/hub'
os.environ['TORCH_HOME'] = f'{CACHE_DIR}/torch'
os.environ['XDG_CACHE_HOME'] = f'{CACHE_DIR}/xdg'
os.environ['PIP_CACHE_DIR'] = f'{CACHE_DIR}/pip'

# Tell the user the session must be restarted before continuing.
print('✅ Kaggle init done. 必须 Restart Session 后继续。')


## 1.1) 环境健康检查（重启后运行）


In [None]:
# Environment health check: print the versions of the key packages and fail
# with an AssertionError if torch, transformers or accelerate is not on the
# expected pinned release.
import torch
import transformers
import accelerate
import whisperx
import numpy

# whisperx may not expose __version__, so fall back to a placeholder.
for label, version in (
    ('torch:', torch.__version__),
    ('transformers:', transformers.__version__),
    ('accelerate:', accelerate.__version__),
    ('whisperx:', getattr(whisperx, '__version__', 'unknown')),
    ('numpy:', numpy.__version__),
):
    print(label, version)

# Prefix match rather than equality, so a version string with a trailing
# suffix after the pinned number still passes.
for module, pinned in (
    (torch, '2.5.1'),
    (transformers, '4.46.3'),
    (accelerate, '0.34.2'),
):
    assert module.__version__.startswith(pinned)

print('✅ Version check passed')


## 2) 设置 HF_TOKEN
先在 HuggingFace 上登录并接受 pyannote/speaker-diarization-3.1 以及它依赖的 pyannote/segmentation-3.0 两个模型的使用协议，然后在下方填入你的 Access Token。


In [None]:
# Resolve the Hugging Face access token used by the later cells.
#
# The value typed below wins. If it is left blank, an HF_TOKEN that is already
# present in the environment is used instead of being overwritten with an
# empty string. Surrounding whitespace from copy/paste is stripped in both
# cases.
import os

HF_TOKEN = ''  # paste your token here

HF_TOKEN = HF_TOKEN.strip() or os.environ.get('HF_TOKEN', '').strip()

# Always export the resolved value so HF_TOKEN and os.environ agree.
os.environ['HF_TOKEN'] = HF_TOKEN
print('HF_TOKEN set:', bool(HF_TOKEN))


## 3) 上传参考音频


In [None]:
# Show a single-file upload widget for the reference audio. The uploaded
# payload is read from `uploader.value` by the next cell.
import ipywidgets as widgets
from IPython.display import display

uploader = widgets.FileUpload(
    accept='.wav,.mp3',  # file picker filter: WAV and MP3
    multiple=False,
)
display(uploader)

# Prompt the user to run the next cell once the upload has finished.
print('上传完成后运行下一格保存文件。')


In [None]:
# Save the first uploaded file under /kaggle/working and remember its path in
# `ref_audio_path` (left as None when nothing was uploaded).
import os

ref_audio_path = None
uploaded = uploader.value
if uploaded:
    if isinstance(uploaded, dict):
        # ipywidgets 7.x payload: {filename: {'content': bytes, ...}}
        fname, fmeta = next(iter(uploaded.items()))
    else:
        # ipywidgets 8.x payload: a tuple of dicts carrying 'name' and
        # 'content' (a memoryview, which file.write accepts directly).
        fmeta = uploaded[0]
        fname = fmeta['name']
    # Keep only the final path component so a crafted upload name cannot
    # write outside /kaggle/working.
    out = os.path.join('/kaggle/working', os.path.basename(fname))
    with open(out, 'wb') as fp:
        fp.write(fmeta['content'])
    ref_audio_path = out
print('ref_audio_path =', ref_audio_path)


## 4) 配置参数


In [None]:
# Run settings read by the pipeline and export cells below.
CONFIG = dict(
    youtube_url='https://www.youtube.com/watch?v=Zs8jUFaqtCI',
    playlist_mode='single',  # single | all
    language='zh',
    max_speakers=3,
    whisper_model='large-v3',
    do_separation=False,
    do_vad=False,
    do_enhance=False,
    similarity_threshold=0.25,
    output_dir='./output',
)
# Bare expression so the notebook displays the resulting dict.
CONFIG


## 5) 运行 Pipeline


In [None]:
# Build the pipeline from CONFIG, run it on the configured YouTube URL with
# the uploaded reference audio, then print a completion banner.
from pipeline import YouTubeSpeakerPipeline

pipeline = YouTubeSpeakerPipeline(
    hf_token=HF_TOKEN,
    output_dir=CONFIG['output_dir'],
    # These options are forwarded under the same name they have in CONFIG.
    **{
        option: CONFIG[option]
        for option in (
            'whisper_model',
            'max_speakers',
            'do_separation',
            'do_vad',
            'do_enhance',
            'similarity_threshold',
        )
    },
    # playlist_mode is optional in CONFIG and defaults to 'single'.
    playlist_mode=CONFIG.get('playlist_mode', 'single'),
)

results = pipeline.process(
    youtube_url=CONFIG['youtube_url'],
    ref_audio_path=ref_audio_path,
    language=CONFIG['language'],
)

# Blank line, rule, message, rule -- one item per output line.
print('\n' + '=' * 60, 'PROCESSING COMPLETE!', '=' * 60, sep='\n')


## 6) 导出结果


In [None]:
# List the subtitle (.srt) and JSON files found directly inside the
# configured output directory, .srt matches first.
import os, glob

files = [
    path
    for pattern in ('*.srt', '*.json')
    for path in glob.glob(os.path.join(CONFIG['output_dir'], pattern))
]
print('Output files:')
for f in files:
    print('-', f)
