<a href="https://colab.research.google.com/github/sgnoob/subtitles/blob/main/subtitles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to use
* https://github.com/sgnoob/subtitles?tab=readme-ov-file#how-to-use-the-colab-file


# Settings

In [None]:
# User-facing settings for the whole notebook.
# NOTE: the `# @markdown ...` and `#@param ...` comments are Colab form
# annotations, not ordinary comments - keep them intact when editing values.

# @markdown # Input
# "yt-dlp" downloads VIDEO_URL in a later cell; any other value uses INPUT_FILE.
INPUT_SOURCE = "yt-dlp" # @param ["yt-dlp", "GoogleDrive"]

# @markdown ### YouTube Only
VIDEO_URL ='https://www.youtube.com/watch?v=ERIxQaoEbXs' #@param {type:"string"}
# When True, the downloaded video is also copied into OUTPUT_DIR.
COPY_VIDEO_TO_DRIVE = True # @param {type:"boolean"}

# @markdown ### Google Drive Only
# @markdown Please don't use names with special characters such as space.
# Path relative to the Drive's MyDrive folder; the mount prefix is added further down.
INPUT_FILE = "input/test.mp4" #@param {type:"string"}

# @markdown ---
# @markdown # Output
# Also relative to MyDrive; receives the generated subtitle files.
OUTPUT_DIR = 'output' #@param {type:"string"}

# @markdown ---
# @markdown # Mel-Band-Roformer Settings
# Length, in seconds, of the audio chunks the input is split into before vocal extraction.
MEL_SEGMENT_TIME = 3600 #@param {type:"integer"}

# @markdown ---
# @markdown # Whisper Settings
# These three values are passed to faster-whisper-xxl as --model, --vad_method and --vad_threshold.
MODEL = 'large-v2' #@param {type:"string"}
VAD_METHOD = 'pyannote_v3' #@param ["silero_v4_fw", "silero_v5_fw", "silero_v3", "silero_v4", "silero_v5", "pyannote_v3", "pyannote_onnx_v3", "auditok", "webrtc"]
VAD_THRESHOLD = 0.1 #@param {type:"number"}
# VAD_THRESHOLD default is 0.45

# @markdown ---
# @markdown # Gemini Keys
# 'NONE' (any case) or a blank value means "not set".
# A Colab Secret with the same name, when available, replaces the value entered here.
GEMINI_API_KEY_1 = 'NONE' #@param {type:"string"}
GEMINI_API_KEY_2 = 'NONE' #@param {type:"string"}

from google.colab import userdata


def _resolve_gemini_key(secret_name, form_value):
  """Return the Gemini API key to use for *secret_name*, or None when unset.

  A non-blank Colab Secret named *secret_name* wins over *form_value* (the
  value typed into the form above). The chosen value is stripped, and a blank
  value or the placeholder "none" (any case) is reported as None.
  """
  try:
    secret = userdata.get(secret_name)
  except Exception as exc:
    # Best effort: a missing secret or denied access must not stop the
    # notebook, but `except Exception` (unlike a bare `except`) still lets
    # KeyboardInterrupt through.
    print(f"{secret_name} not read from Colab Secrets ({type(exc).__name__}); using the form value.")
    secret = None

  if isinstance(secret, str) and secret.strip():
    print(f"Obtained {secret_name} from Colab Secrets.")
    chosen = secret
  else:
    # Nothing usable in Colab Secrets: keep whatever the form provided.
    chosen = form_value

  chosen = (chosen or "").strip()
  if not chosen or chosen.lower() == "none":
    return None
  return chosen


GEMINI_API_KEY_1 = _resolve_gemini_key('GEMINI_API_KEY_1', GEMINI_API_KEY_1)
GEMINI_API_KEY_2 = _resolve_gemini_key('GEMINI_API_KEY_2', GEMINI_API_KEY_2)

# Only the first key is mandatory; the second one is an optional extra key.
if GEMINI_API_KEY_1 is None:
  print("ERROR. GEMINI_API_KEY_1 is empty. How are we going to translate?")

# @markdown ---
# @markdown # Others
# When True, TMP_DIR is wiped every time this cell runs (see the `rm -rf` below),
# so intermediate files such as vocals.flac are regenerated.
DELETE_INTERMEDIATE_FILES = True # @param {type:"boolean"}


# Internal use only: derived paths that are not exposed in the form.
# Mount point handed to drive.mount() in the next cell.
MOUNT_DIR = '/content/drive'
# Turn the Drive-relative form values into absolute paths under the mount.
# INPUT_FILE is replaced again by the download cell when INPUT_SOURCE is "yt-dlp".
INPUT_FILE = f"{MOUNT_DIR}/MyDrive/{INPUT_FILE}"
OUTPUT_DIR = f"{MOUNT_DIR}/MyDrive/{OUTPUT_DIR}"

# EXEC_DIR holds the downloaded tools and model checkpoint;
# TMP_DIR is scratch space for intermediate audio and subtitle files.
EXEC_DIR = '/content/exec'
TMP_DIR = '/content/tmp'
! mkdir -p {EXEC_DIR}
# Clear the scratch directory first (if requested), then make sure it exists.
if DELETE_INTERMEDIATE_FILES:
  ! rm -rf {TMP_DIR}
! mkdir -p {TMP_DIR}

# Mount Google Drive



In [None]:
from google.colab import drive
drive.mount(MOUNT_DIR)

! mkdir -p {OUTPUT_DIR}

# Install libraries

In [None]:
import os
import base64

os.environ['MPLBACKEND'] = 'Agg' # Copied from https://github.com/Purfview/whisper-standalone-win/discussions/385

! apt-get install -y aria2
! apt-get install -y ffmpeg
! pip3 install -U gemini-srt-translator pysubs2 yt-dlp

# Download Faster-Whisper-XXL
# I find the latest 7zip to be slightly faster in decompressing compared to apt-get
if not os.path.exists(f'{EXEC_DIR}/Faster-Whisper-XXL/faster-whisper-xxl'):
  ! cd {EXEC_DIR}; wget -N https://www.7-zip.org/a/7z2409-linux-x64.tar.xz
  ! cd {EXEC_DIR}; tar -xvf 7z2409-linux-x64.tar.xz 7zz
  ! cd {EXEC_DIR}; aria2c -x4 -s4 https://github.com/Purfview/whisper-standalone-win/releases/download/Faster-Whisper-XXL/Faster-Whisper-XXL_r245.4_linux.7z -o whisper.7z
  ! cd {EXEC_DIR}; ./7zz x whisper.7z -aoa
  ! chmod +x {EXEC_DIR}/Faster-Whisper-XXL/faster-whisper-xxl
else:
  print('Faster-Whisper-XXL/faster-whisper-xxl is already extracted, not extracting again.')

# Setup Music-Source-Separation-Training (to extract vocals)
! cd {EXEC_DIR}; git clone https://github.com/sgnoob/Mel-Band-Roformer-Vocal-Model.git
! cd {EXEC_DIR}; pip3 install -r Mel-Band-Roformer-Vocal-Model/requirements.txt
if not os.path.exists(f'{EXEC_DIR}/MelBandRoformer.ckpt'):
  ! cd {EXEC_DIR}; aria2c -x4 -s4 https://huggingface.co/KimberleyJSN/melbandroformer/resolve/main/MelBandRoformer.ckpt -o MelBandRoformer.ckpt

# Download video using yt-dlp if needed

In [None]:
if INPUT_SOURCE == "yt-dlp":
  ! yt-dlp --list-formats --extractor-arg "youtube:player_client=default,android_vr" "{VIDEO_URL}"
  RAW = ! yt-dlp --extractor-arg "youtube:player_client=default,android_vr" -f "bv+ba/b" -o "%(id)s.%(ext)s" "{VIDEO_URL}" --print filename --skip-download
  YTDLP_FILENAME = RAW[0]
  ! cd {TMP_DIR}; yt-dlp --extractor-arg "youtube:player_client=default,android_vr" -f "bv+ba/b" -o "%(id)s.%(ext)s" "{VIDEO_URL}"
  INPUT_FILE = f"{TMP_DIR}/{YTDLP_FILENAME}"
  ! ls -lRt {TMP_DIR}

  if COPY_VIDEO_TO_DRIVE:
    ! cp "{INPUT_FILE}" "{OUTPUT_DIR}/."
else:
  print(f"No need to download using yt-dlp. INPUT_SOURCE: {INPUT_SOURCE}")

# Extract vocals from input for better transcription

Using Mel-Band-Roformer for this.
* https://github.com/KimberleyJensen/Mel-Band-Roformer-Vocal-Model


In [None]:
from pathlib import Path
p = Path(INPUT_FILE)
FILENAME_STEM = f"{p.stem}"

print(f"FILENAME_STEM: {FILENAME_STEM}")
print(f"TMP_DIR: {TMP_DIR}")
print(f"OUTPUT_DIR: {OUTPUT_DIR}")

if os.path.exists(f"{OUTPUT_DIR}/{FILENAME_STEM}.jp.srt"):
  print(f"{FILENAME_STEM}.jp.srt already exists, not re-running Mel-Band-Roformer.")
  print(f"Delete {FILENAME_STEM}.jp.srt from your google drive if you want to re-run.")
elif os.path.exists(f"{TMP_DIR}/vocals.flac"):
  print(f"{TMP_DIR}/vocals.flac already exists, not re-running Mel-Band-Roformer")
else:
  # We need to split the input audio into chunks to prevent Out of Memory.
  # System RAM is the limiting factor on colab.
  # TODO: Find some better method of splitting the input audio.
  # We could be splitting the audio right in the middle of a word.
  # Shouldn't be too bad since we are only splitting once every hour.
  # For MEL_SEGMENT_TIME=3600, peak system is roughly RAM: 8.1 GiB, GPU RAM: 7.0 GiB
  print(f'Using ffmpeg to split audio into {MEL_SEGMENT_TIME} seconds chunks')
  command = f"cd {TMP_DIR}; ffmpeg"
  command += f" -n -i '{INPUT_FILE}'"
  command += f" -vn -af 'aresample=44100' -acodec pcm_s16le -ac 2"
  command += f" -f segment -segment_time {MEL_SEGMENT_TIME} -reset_timestamps 1"
  command += f" part%03d.wav"
  print(f"Command: {command}")
  ! {command}

  # Run Mel-Band-Roformer to extract vocals
  print('Running Mel-Band-Roformer to extract vocals')
  command = f"cd {EXEC_DIR}/Mel-Band-Roformer-Vocal-Model; python inference.py"
  command += f" --model_type mel_band_roformer"
  command += f" --config_path configs/config_vocals_mel_band_roformer.yaml"
  command += f" --model_path {EXEC_DIR}/MelBandRoformer.ckpt"
  command += f" --input_folder {TMP_DIR}"
  command += f" --store_dir {TMP_DIR}"
  print(f"Command: {command}")
  ! {command}

  print(f'Using ffmpeg to recombine vocal chunks')
  ! cd {TMP_DIR}; printf "file '%s'\n" part*_vocals.wav | sort -V > mylist.txt
  command = f"cd {TMP_DIR}; ffmpeg"
  command += f" -n -f concat -safe 0 -i mylist.txt -acodec flac -compression_level 0 vocals.flac"
  print(f"Command: {command}")
  ! {command}
  ! ls -lRt {TMP_DIR}

# Generate Transcription
* Using https://github.com/Purfview/whisper-standalone-win

In [None]:
if os.path.exists(f"{OUTPUT_DIR}/{FILENAME_STEM}.jp.srt"):
  print(f"{FILENAME_STEM}.jp.srt already exists, not re-running whisper transcription.")
  print(f"Delete {FILENAME_STEM}.jp.srt from your google drive if you want to re-run.")
else:
  print('Preparing whisper command')
  command = f"{EXEC_DIR}/Faster-Whisper-XXL/faster-whisper-xxl"
  command += f" --model {MODEL}"
  command += f" --language ja"
  command += f" --vad_method {VAD_METHOD}"
  command += f" --vad_threshold {VAD_THRESHOLD}"
  command += f" --task transcribe"
  command += f" --output_format srt"
  command += f" --output_dir \"{TMP_DIR}\""
  command += f" -- \"{TMP_DIR}/vocals.flac\""
  print(f"Command: {command}")

  ! {command}
  ! cp {TMP_DIR}/vocals.srt {OUTPUT_DIR}/{FILENAME_STEM}.jp.srt

# Translate to English
* Using https://github.com/MaKTaiL/gemini-srt-translator

In [None]:
# Filenames: final results live in OUTPUT_DIR, working copies in TMP_DIR.
_out_prefix = f"{OUTPUT_DIR}/{FILENAME_STEM}"
_tmp_prefix = f"{TMP_DIR}/{FILENAME_STEM}"
OUT_EN_FILE = f"{_out_prefix}.en.srt"
OUT_JP_FILE = f"{_out_prefix}.jp.srt"
TMP_EN_FILE = f"{_tmp_prefix}.en.srt"
TMP_JP_FILE = f"{_tmp_prefix}.jp.srt"
OUT_EN_JP_FILE = f"{_out_prefix}.en_jp.ass"

if not os.path.exists(OUT_EN_FILE):
  import gemini_srt_translator as gst
  # The translator is configured through module-level attributes:
  # read the Japanese subtitles from Drive and write the English ones next to them.
  translator_settings = {
    "gemini_api_key": GEMINI_API_KEY_1,
    "gemini_api_key2": GEMINI_API_KEY_2,
    "target_language": "English",
    "input_file": OUT_JP_FILE,
    "output_file": OUT_EN_FILE,
  }
  for option, setting in translator_settings.items():
    setattr(gst, option, setting)
  gst.translate()
else:
  # An existing translation is never overwritten.
  print(f"{OUT_EN_FILE} already exists, not re-running gemini-srt-translator.")
  print(f"Delete {OUT_EN_FILE} from your google drive if you want to re-run.")

# Combine the EN and JP subtitles into an EN_JP ASS file.
# JP subtitles on top, EN subtitles below.
if os.path.exists(OUT_JP_FILE) and os.path.exists(OUT_EN_FILE):
  ! cp "{TMP_JP_FILE}" "{TMP_JP_FILE}"
  ! cp "{OUT_EN_FILE}" "{TMP_EN_FILE}"
  import pysubs2
  subs_jp = pysubs2.load(TMP_JP_FILE)
  subs_en = pysubs2.load(TMP_EN_FILE)
  subs_en_jp = pysubs2.SSAFile()
  subs_en_jp.styles = {
    "b": pysubs2.SSAStyle(alignment=pysubs2.Alignment.BOTTOM_CENTER),
    "t": pysubs2.SSAStyle(alignment=pysubs2.Alignment.TOP_CENTER),
  }
  for e in subs_en:
      e.style = "b"
      subs_en_jp.append(e)
  for e in subs_jp:
      e.style = "t"
      subs_en_jp.append(e)

  subs_en_jp.save(OUT_EN_JP_FILE)

print("DONE!!!")