# YouTube Speaker Diarization (Colab v2, faster-whisper)

- 带依赖缓存（`/content/cache`）
- 重启后可跳过重复下载依赖
- 支持可选双GPU并行（多任务场景）


## 1) 一键初始化（带缓存）
首次运行会下载 wheel 并缓存到 `/content/cache`；之后在同一运行时内重启（Restart runtime）可跳过安装。注意：`/content` 不会跨会话保留，运行时被回收后需要重新下载。


In [None]:
import os
from pathlib import Path

REPO_URL = 'https://github.com/Hana19951208/youtube-speaker-diarization.git'
REPO_DIR = '/content/youtube-speaker-diarization'
CACHE_DIR = Path('/content/cache')
WHEELHOUSE = CACHE_DIR / 'wheelhouse'
STAMP = CACHE_DIR / 'deps_installed_colab_v2.flag'

!apt-get update -y
!apt-get install -y ffmpeg

%cd /content
if os.path.exists(REPO_DIR):
    !rm -rf {REPO_DIR}
!git clone {REPO_URL}
%cd {REPO_DIR}

CACHE_DIR.mkdir(parents=True, exist_ok=True)
WHEELHOUSE.mkdir(parents=True, exist_ok=True)
os.environ['PIP_CACHE_DIR'] = str(CACHE_DIR / 'pip')
os.environ['HF_HOME'] = str(CACHE_DIR / 'huggingface')
os.environ['HF_HUB_CACHE'] = str(CACHE_DIR / 'huggingface' / 'hub')
os.environ['TORCH_HOME'] = str(CACHE_DIR / 'torch')

if STAMP.exists():
    print('✅ 检测到缓存标记，跳过依赖安装。')
else:
    print('⏬ 首次安装：下载并缓存 wheels ...')
    !pip download -q -r requirements.txt -d {WHEELHOUSE}
    !pip download -q torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 -d {WHEELHOUSE}

    !pip uninstall -y whisperx faster-whisper pyannote.audio transformers accelerate numpy pandas torch torchvision torchaudio -q
    !pip install -q --no-index --find-links {WHEELHOUSE} torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1
    !pip install -q --no-index --find-links {WHEELHOUSE} -r requirements.txt
    STAMP.write_text('ok')
    print('✅ 依赖安装完成并已缓存。')

print('⚠️ 建议现在 Runtime -> Restart runtime，然后从 1.1 继续。')


## 1.1) 重启后健康检查


In [None]:
import os
from pathlib import Path

# Environment variables are per-process and are lost by the runtime restart
# recommended in step 1. Point the caches back at /content/cache *before*
# importing the libraries below, so model downloads reuse the cache.
_CACHE_DIR = Path('/content/cache')
for _name, _path in {
    'PIP_CACHE_DIR': _CACHE_DIR / 'pip',
    'HF_HOME': _CACHE_DIR / 'huggingface',
    'HF_HUB_CACHE': _CACHE_DIR / 'huggingface' / 'hub',
    'TORCH_HOME': _CACHE_DIR / 'torch',
}.items():
    # setdefault: keep any value that is already set in this session.
    os.environ.setdefault(_name, str(_path))

import torch, numpy, pandas, transformers, accelerate, yt_dlp
from faster_whisper import WhisperModel

# Report the versions that actually got imported after the reinstall.
print('torch:', torch.__version__)
print('transformers:', transformers.__version__)
print('accelerate:', accelerate.__version__)
print('numpy:', numpy.__version__)
print('pandas:', pandas.__version__)
print('yt-dlp:', yt_dlp.version.__version__)
print('cuda available:', torch.cuda.is_available())
print('gpu count:', torch.cuda.device_count())
# Smoke test: construct the smallest model on CPU to prove faster-whisper loads.
_ = WhisperModel('tiny', device='cpu', compute_type='int8')
print('✅ import check passed')


## 2) 设置 HF_TOKEN


In [None]:
from google.colab import userdata

# userdata.get() accepts only the secret name and raises instead of returning
# a default, so a missing or unshared secret is mapped to an empty token here.
try:
    HF_TOKEN = userdata.get('HF_TOKEN') or ''
except (userdata.SecretNotFoundError, userdata.NotebookAccessError) as err:
    print(f'⚠️ 无法读取 Colab Secret HF_TOKEN（{type(err).__name__}），将使用空 token。')
    HF_TOKEN = ''
print('HF_TOKEN set:', bool(HF_TOKEN))


## 3) 设置参考音频（默认使用仓库内的 `biao.mp3`；如需自定义，请先上传文件并修改路径）


In [None]:
# Path to the reference recording of the target speaker; it points inside
# the cloned repository directory. Change it to use a different file.
ref_audio_path = '/content/youtube-speaker-diarization/biao.mp3'

# Echo the chosen path so it is visible in the cell output.
print(f'ref_audio_path = {ref_audio_path}')


## 4) 配置参数


In [None]:
# Run parameters shared by the later cells. Most entries are forwarded to
# YouTubeSpeakerPipeline in step 5; `language` is also passed to the
# dual-GPU batch in step 6.
CONFIG = dict(
    # Input selection.
    youtube_url='https://www.youtube.com/watch?v=Zs8jUFaqtCI',
    playlist_mode='single',
    language='zh',
    # Model / processing options (exact semantics live in the pipeline code).
    max_speakers=3,
    whisper_model='large-v3',
    do_separation=False,
    do_vad=False,
    do_enhance=False,
    similarity_threshold=0.25,
    # Where results are written, relative to the current working directory.
    output_dir='./output',
    # Flag for the optional multi-URL dual-GPU step.
    enable_dual_gpu_multi_task=True,
)
# Bare expression: display the resulting dict as the cell output.
CONFIG


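## 4.5) Step 5 前同步最新代码 + 补齐依赖

> 注意：下面的 cell 只执行 `git fetch` / `git pull`，不会安装依赖；如果 `requirements.txt` 有更新，请删除 `/content/cache/deps_installed_colab_v2.flag` 后重新运行步骤 1。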


In [None]:
%cd /content/youtube-speaker-diarization
!git fetch origin
!git pull --rebase origin master || git pull origin master
print('✅ synced')


## 5) 单视频运行（默认）


In [None]:
from pipeline import YouTubeSpeakerPipeline

# Constructor options that are copied verbatim from CONFIG; each key name
# matches the keyword argument it is passed as.
_PIPELINE_KEYS = (
    'output_dir',
    'whisper_model',
    'max_speakers',
    'do_separation',
    'do_vad',
    'do_enhance',
    'similarity_threshold',
    'playlist_mode',
)
pipeline_options = {key: CONFIG[key] for key in _PIPELINE_KEYS}

# Build the pipeline with the Hugging Face token from step 2.
pipeline = YouTubeSpeakerPipeline(hf_token=HF_TOKEN, **pipeline_options)

# Process the configured URL against the reference audio from step 3.
results = pipeline.process(
    youtube_url=CONFIG['youtube_url'],
    ref_audio_path=ref_audio_path,
    language=CONFIG['language'],
)
print('✅ done')


## 6) 双GPU并行（可选，多任务时有效）
说明：单个视频的同一条流水线很难高效吃满2块GPU。
这个cell用于**多个URL并行**，每个进程绑定一张GPU。


In [None]:
import os, subprocess, torch

# One `main.py` subprocess is started per URL; jobs alternate between GPU 0
# and GPU 1 via CUDA_VISIBLE_DEVICES.
URLS = []  # e.g. fill in 2 video links; fewer than that cannot use both GPUs
if torch.cuda.device_count() < 2:
    print('当前不足2块GPU，跳过')
elif not URLS:
    print('请先填 URLS 再运行')
elif not CONFIG.get('enable_dual_gpu_multi_task', True):
    # Honor the switch declared in CONFIG instead of silently ignoring it.
    print('CONFIG["enable_dual_gpu_multi_task"] 为 False，跳过')
else:
    procs = []
    for i, u in enumerate(URLS):
        gpu = i % 2
        # Each child sees only its assigned GPU.
        env = os.environ.copy()
        env['CUDA_VISIBLE_DEVICES'] = str(gpu)
        cmd = [
            'python', 'main.py',
            '--youtube_url', u,
            '--ref_audio', ref_audio_path,
            # A separate output directory per job avoids clobbering results.
            '--output_dir', f'./output_gpu{gpu}_{i}',
            '--language', CONFIG['language'],
            '--max_speakers', str(CONFIG['max_speakers']),
            '--whisper_model', CONFIG['whisper_model'],
            '--playlist_mode', CONFIG['playlist_mode'],
            '--no_separation',
        ]
        procs.append((u, subprocess.Popen(cmd, env=env)))

    # Wait for every job and collect the ones that exited non-zero, so a
    # failed run is not reported as success.
    failed = []
    for u, p in procs:
        returncode = p.wait()
        if returncode != 0:
            failed.append((u, returncode))

    if failed:
        for u, returncode in failed:
            print(f'❌ failed (exit code {returncode}): {u}')
        print(f'⚠️ dual-gpu batch finished with {len(failed)}/{len(procs)} failed job(s)')
    else:
        print('✅ dual-gpu batch done')


## 7) 查看输出


In [None]:
import glob, os

# Look in the directory the pipeline was configured to write to; fall back to
# the default './output' when CONFIG has not been defined in this session.
output_dir = globals().get('CONFIG', {}).get('output_dir', './output')
found = sorted(
    glob.glob(os.path.join(output_dir, '*.srt'))
    + glob.glob(os.path.join(output_dir, '*.json'))
)
for f in found:
    print('-', f)
if not found:
    # Make an empty result explicit rather than printing nothing at all.
    print(f'（{output_dir} 下暂无 .srt / .json 输出）')
