[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Isotr0py/Sakura-Subtitle/blob/main/whisper/Faster-Whisper-Colab.ipynb)

In [None]:
#@title 初始化环境
#@markdown 挂载Google网盘
from pathlib import Path
Mount_GDrive = True # @param {type:"boolean"}
if Mount_GDrive:
  from google.colab import drive

  drive.mount('/content/gdrive')
  ROOT_PATH = "/content/gdrive/MyDrive"
else:
  ROOT_PATH = "/content"

!nvidia-smi

Mounted at /content/gdrive
Mon Jan 22 14:20:00 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                         

In [None]:
#@title 安装faster-whisper
from IPython.display import clear_output
!pip install -q faster-whisper
!sudo apt-get update
!sudo apt install -qq nvidia-cuda-toolkit
clear_output()

In [None]:
#@title 设置工作区
#@markdown 音频文件存放文件夹
from pathlib import Path
from google.colab import files

work_dir = "Media/" # @param {type:"string"}
work_dir = Path(ROOT_PATH).joinpath(work_dir)
work_dir.mkdir(parents=True, exist_ok=True)
%cd $work_dir

#@markdown 上传文件至工作区
upload_file = False  # @param {type:"boolean"}
if upload_file:
  uploaded = files.upload()

/content/gdrive/MyDrive/Media


In [None]:
#@title 语音转录
from datetime import datetime
from glob import glob
from pathlib import Path

from faster_whisper import WhisperModel


#@markdown 模型设置
# Model settings (Colab form fields).
model_size = "large-v3" #@param ["large-v1", "large-v2", "large-v3"]
device = "cuda" #@param ["cpu", "cuda"]

#@markdown 转录设置
# Transcription settings; "auto" is mapped to None before being passed on.
language = "ja" #@param ["auto", "en", "zh", "ja"]
language = None if language == "auto" else language
beam_size = 5 # @param {type:"integer"}
vad_filter = True #@param {type:"boolean"}

min_speech_duration_ms = 250 # @param {type:"integer"}
min_silence_duration_ms = 2000 # @param {type:"integer"}
vad_parameters = {
  "min_speech_duration_ms": min_speech_duration_ms,
  "min_silence_duration_ms": min_silence_duration_ms,
}

#@markdown 输出设置
# Output settings: target folder (relative to the current directory) and format.
output_dir = "./outputs/" #@param {type:"string"}
output_format = "vtt" #@param ["lrc", "vtt"]


def format_timestamp(seconds, with_hours=True):
  """Format an offset in seconds as a subtitle timestamp.

  Args:
    seconds: Offset from the start of the audio, in seconds.
    with_hours: If True return "HH:MM:SS.mmm"; if False return "MM:SS.mmm",
      where the minutes field keeps counting past 59 instead of wrapping.

  Returns:
    The formatted timestamp string, rounded to the nearest millisecond.
  """
  # Pure arithmetic: the result does not depend on the runtime's time zone
  # and does not wrap around for long recordings.
  total_ms = max(0, int(round(float(seconds) * 1000)))
  minutes, remainder_ms = divmod(total_ms, 60_000)
  secs, millis = divmod(remainder_ms, 1000)
  if with_hours:
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
  return f"{minutes:02d}:{secs:02d}.{millis:03d}"


# Fail early instead of hitting an undefined output path inside the loop.
if output_format not in ("lrc", "vtt"):
  raise ValueError(f"Unsupported output format: {output_format!r}")

# compute_type is not set here, so the library's default applies for the
# selected device.
model = WhisperModel(model_size, device=device)

# Create the output folder once, before processing any file.
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)

# Named audio_files so it does not shadow `files` from google.colab, which
# the workspace cell imports.
audio_files = glob("*.wav") + glob("*.mp3") + glob("*.flac")
for audio_file in audio_files:
  # LRC output drops the audio extension ("a.wav" -> "a.lrc");
  # VTT output keeps it ("a.wav" -> "a.wav.vtt").
  if output_format == "lrc":
    base_name = Path(audio_file).stem
  else:
    base_name = Path(audio_file).name
  output_file = output_path.joinpath(f"{base_name}.{output_format}")

  # Pass the form values through, so changing beam size or the VAD switch
  # in the form actually takes effect.
  segments, info = model.transcribe(
    audio_file,
    language=language,
    beam_size=beam_size,
    vad_filter=vad_filter,
    vad_parameters=vad_parameters,
  )
  print("%s: Detected language '%s' with probability %f" % (audio_file, info.language, info.language_probability))

  # Explicit UTF-8 so non-ASCII transcripts are written the same way on
  # every platform.
  with open(output_file, "w", encoding="utf-8") as f:
    if output_format == "lrc":
      f.write("[Transcribed by Whisper]\n")
    else:
      f.write("WEBVTT - Transcribed by Whisper\n\n")
    for segment in segments:
      text = segment.text
      start = format_timestamp(segment.start)
      end = format_timestamp(segment.end)
      print(f"[{start} --> {end}] {text}")
      if output_format == "lrc":
        # One tagged line with the text, then a bare tag at the segment end.
        lrc_start = format_timestamp(segment.start, with_hours=False)
        lrc_end = format_timestamp(segment.end, with_hours=False)
        f.write(f"[{lrc_start}]{text}\n[{lrc_end}]\n")
      else:
        f.write(f"{start} --> {end}\n{text}\n\n")