<a href="https://colab.research.google.com/github/sgnoob/subtitles/blob/main/subtitles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to use
* https://github.com/sgnoob/subtitles?tab=readme-ov-file#how-to-use-the-colab-file


# Settings

In [None]:
# Colab form cell: user-editable settings for the whole subtitle pipeline.
# @markdown # Input
# Selects where the input video comes from: downloaded with yt-dlp, or read from Google Drive.
INPUT_SOURCE = "yt-dlp" # @param ["yt-dlp", "GoogleDrive"]

# @markdown ### YouTube Only
VIDEO_URL ='https://youtu.be/eXDGNKKoxZs' #@param {type:"string"}
# When True, the downloaded video is also copied into OUTPUT_DIR.
COPY_VIDEO_TO_DRIVE = True # @param {type:"boolean"}

# @markdown ### Google Drive Only
# @markdown Please don't use names with special characters such as space.
# Path relative to the Drive's MyDrive folder; it is prefixed with the mount path further down.
INPUT_FILE = "input/test.mp4" #@param {type:"string"}

# @markdown ---
# @markdown # Output
# Folder relative to the Drive's MyDrive folder; it is prefixed with the mount path further down.
OUTPUT_DIR = 'output' #@param {type:"string"}

# @markdown ---
# @markdown # Mel-Band-Roformer Settings
# Target length, in seconds, of the audio chunks produced before vocal extraction.
MEL_SEGMENT_TIME = 3600 #@param {type:"integer"}

# @markdown ---
# @markdown # Whisper Settings
# These three values are passed to faster-whisper-xxl as --model, --vad_method and --vad_threshold.
MODEL = 'large-v2' #@param {type:"string"}
VAD_METHOD = 'pyannote_v3' #@param ["silero_v4_fw", "silero_v5_fw", "silero_v3", "silero_v4", "silero_v5", "pyannote_v3", "pyannote_onnx_v3", "auditok", "webrtc"]
VAD_THRESHOLD = 0.1 #@param {type:"number"}
# VAD_THRESHOLD default is 0.45

# @markdown ---
# @markdown # Gemini SRT Translator Settings
# 'NONE' (any case) or a blank value means "no key entered"; a Colab Secret with the
# same name, when readable, replaces the value typed here.
GEMINI_API_KEY_1 = 'NONE' #@param {type:"string"}
GEMINI_API_KEY_2 = 'NONE' #@param {type:"string"}
GEMINI_MODEL = 'gemini-2.5-flash' #@param {type:"string"}
GEMINI_THINKING = True # @param {type:"boolean"}
GEMINI_THINKING_BUDGET = 24576 # @param {type:"slider", min:0, max:24576, step:1}
# When True, the extracted vocals file is handed to the translator as its audio file.
USE_AUDIO_CONTEXT = False # @param {type:"boolean"}

from google.colab import userdata


def _resolve_gemini_key(secret_name, form_value):
    """Return the Gemini API key to use for `secret_name`, or None when unset.

    A non-blank Colab Secret named `secret_name` takes precedence over
    `form_value` (the value typed into the form). Blank values and the
    placeholder "none" (any case) count as "not set", so a blank secret no
    longer hides a valid key entered in the form.
    """
    def _clean(value):
        # Normalise to a stripped string, mapping blank / "none" to None.
        text = value.strip() if isinstance(value, str) else ""
        return None if not text or text.lower() == "none" else text

    try:
        secret = _clean(userdata.get(secret_name))
    except Exception as err:
        # Best effort: a missing secret or denied notebook access is not fatal,
        # we simply fall back to the form value. `Exception` (not a bare
        # `except`) keeps Ctrl-C / kernel interrupts working.
        print(f"{secret_name} not read from Colab Secrets ({type(err).__name__}); using the form value.")
        secret = None

    if secret is not None:
        print(f"Obtained {secret_name} from Colab Secrets.")
        return secret
    return _clean(form_value)


GEMINI_API_KEY_1 = _resolve_gemini_key('GEMINI_API_KEY_1', GEMINI_API_KEY_1)
GEMINI_API_KEY_2 = _resolve_gemini_key('GEMINI_API_KEY_2', GEMINI_API_KEY_2)

# The second key is optional; the first one is required for translation.
if GEMINI_API_KEY_1 is None:
  print("ERROR. GEMINI_API_KEY_1 is empty. How are we going to translate?")

# @markdown ---
# @markdown # Others
# When True, TMP_DIR is wiped below, discarding intermediate files left by a previous run.
DELETE_INTERMEDIATE_FILES = True # @param {type:"boolean"}


# Internal Use only
# Google Drive mount point; the user-supplied relative paths are resolved under MyDrive.
MOUNT_DIR = '/content/drive'
INPUT_FILE = f"{MOUNT_DIR}/MyDrive/{INPUT_FILE}"
OUTPUT_DIR = f"{MOUNT_DIR}/MyDrive/{OUTPUT_DIR}"

# EXEC_DIR holds downloaded tools and model weights; TMP_DIR holds intermediate files.
EXEC_DIR = '/content/exec'
TMP_DIR = '/content/tmp'
! mkdir -p {EXEC_DIR}
if DELETE_INTERMEDIATE_FILES:
  # Remove the whole directory first so the mkdir below recreates it empty.
  ! rm -rf {TMP_DIR}
! mkdir -p {TMP_DIR}

# Mount Google Drive



In [None]:
from google.colab import drive
drive.mount(MOUNT_DIR)

! mkdir -p {OUTPUT_DIR}

# Install libraries

In [None]:
import os
import base64

# Select matplotlib's 'Agg' backend via the environment; workaround taken from the linked discussion.
os.environ['MPLBACKEND'] = 'Agg' # Copied from https://github.com/Purfview/whisper-standalone-win/discussions/385

# System tools (aria2 for downloads, ffmpeg for audio) and the Python packages used by later cells.
! apt-get install -y aria2
! apt-get install -y ffmpeg
! pip3 install -U gemini-srt-translator pysubs2 yt-dlp pydub

# Download Faster-Whisper-XXL
# I find the latest 7zip to be slightly faster in decompressing compared to apt-get
# Skipped entirely when the executable is already present in EXEC_DIR.
if not os.path.exists(f'{EXEC_DIR}/Faster-Whisper-XXL/faster-whisper-xxl'):
  ! cd {EXEC_DIR}; wget -N https://www.7-zip.org/a/7z2409-linux-x64.tar.xz
  ! cd {EXEC_DIR}; tar -xvf 7z2409-linux-x64.tar.xz 7zz
  ! cd {EXEC_DIR}; aria2c -x4 -s4 https://github.com/Purfview/whisper-standalone-win/releases/download/Faster-Whisper-XXL/Faster-Whisper-XXL_r245.4_linux.7z -o whisper.7z
  ! cd {EXEC_DIR}; ./7zz x whisper.7z -aoa
  ! chmod +x {EXEC_DIR}/Faster-Whisper-XXL/faster-whisper-xxl
else:
  print('Faster-Whisper-XXL/faster-whisper-xxl is already extracted, not extracting again.')

# Setup Music-Source-Separation-Training (to extract vocals)
# NOTE(review): the clone is not guarded, so re-running this cell presumably makes
# git report that the destination already exists - confirm this is harmless.
! cd {EXEC_DIR}; git clone https://github.com/sgnoob/Mel-Band-Roformer-Vocal-Model.git
! cd {EXEC_DIR}; pip3 install -r Mel-Band-Roformer-Vocal-Model/requirements.txt
# The model checkpoint is only downloaded once.
if not os.path.exists(f'{EXEC_DIR}/MelBandRoformer.ckpt'):
  ! cd {EXEC_DIR}; aria2c -x4 -s4 https://huggingface.co/KimberleyJSN/melbandroformer/resolve/main/MelBandRoformer.ckpt -o MelBandRoformer.ckpt

# Download video using yt-dlp if needed

In [None]:
# Download the video with yt-dlp into TMP_DIR and point INPUT_FILE at it.
# Nothing is downloaded when the input comes from Google Drive.
if INPUT_SOURCE == "yt-dlp":
  # Show the available formats (informational only).
  ! yt-dlp --list-formats --extractor-arg "youtube:player_client=default,android_vr" "{VIDEO_URL}"
  # Ask yt-dlp for the output filename it would use, without downloading anything yet.
  RAW = ! yt-dlp --extractor-arg "youtube:player_client=default,android_vr" -f "bv+ba/b" -o "%(id)s.%(ext)s" "{VIDEO_URL}" --print filename --skip-download
  # NOTE(review): this assumes the filename is the first captured output line; if
  # yt-dlp emits a warning first, RAW[0] would presumably be that warning - confirm.
  YTDLP_FILENAME = RAW[0]
  # Actual download, using the same format selection and output template as above.
  ! cd {TMP_DIR}; yt-dlp --extractor-arg "youtube:player_client=default,android_vr" -f "bv+ba/b" -o "%(id)s.%(ext)s" "{VIDEO_URL}"
  # From here on the pipeline reads the downloaded file instead of the Drive path.
  INPUT_FILE = f"{TMP_DIR}/{YTDLP_FILENAME}"
  ! ls -lRt {TMP_DIR}

  if COPY_VIDEO_TO_DRIVE:
    ! cp "{INPUT_FILE}" "{OUTPUT_DIR}/."
else:
  print(f"No need to download using yt-dlp. INPUT_SOURCE: {INPUT_SOURCE}")

## Sort out filenames

In [None]:
# Derive every intermediate and output path from the input file's stem.
from pathlib import Path

p = Path(INPUT_FILE)
FILENAME_STEM = p.stem

# Recombined vocals track produced by the vocal-extraction step.
MEL_VOCALS_FILE = TMP_DIR + "/vocals.wav"

# Subtitle outputs: Japanese transcript, English translation, and the combined ASS file.
OUT_JP_FILE_BASE = FILENAME_STEM + ".jp.srt"
OUT_EN_FILE_BASE = FILENAME_STEM + ".en.srt"
OUT_JP_FILE = OUTPUT_DIR + "/" + OUT_JP_FILE_BASE
OUT_EN_FILE = OUTPUT_DIR + "/" + OUT_EN_FILE_BASE
OUT_EN_JP_FILE = OUTPUT_DIR + "/" + FILENAME_STEM + ".en_jp.ass"

# Echo the resolved paths, one per line, with a blank-line separator between groups.
for _line in [
    f"TMP_DIR: {TMP_DIR}",
    f"OUTPUT_DIR: {OUTPUT_DIR}",
    f"FILENAME_STEM: {FILENAME_STEM}",
    '\n',
    f"MEL_VOCALS_FILE: {MEL_VOCALS_FILE}",
    '\n',
    f"OUT_EN_FILE: {OUT_EN_FILE}",
    f"OUT_JP_FILE: {OUT_JP_FILE}",
    f"OUT_EN_JP_FILE: {OUT_EN_JP_FILE}",
]:
    print(_line)

# Extract vocals from input for better transcription

Mel-Band-Roformer is used to isolate the vocal track from the input audio before transcription.
* https://github.com/KimberleyJensen/Mel-Band-Roformer-Vocal-Model


In [None]:
from pydub import AudioSegment

def find_best_split_point(audio, max_time_ms, search_window_ms=60000, analysis_window_ms=100):
    """
    Pick a split position at or before max_time_ms where the audio is quietest.

    The `search_window_ms` of audio ending at `max_time_ms` is examined in
    consecutive windows of `analysis_window_ms`, walking backwards from the
    end. The centre of the window with the lowest RMS is returned; on ties the
    window closest to `max_time_ms` wins. If the searchable range is shorter
    than one analysis window, `max_time_ms` itself is returned.

    `audio` is sliced by position and each slice must offer
    `get_array_of_samples()` (the caller passes a pydub AudioSegment).
    """
    range_start = max(0, max_time_ms - search_window_ms)
    range_end = min(len(audio), max_time_ms)

    # Not even one full analysis window fits: keep the hard limit as the cut.
    if range_end - range_start < analysis_window_ms:
        return max_time_ms

    lowest_rms = float('inf')
    split_at = max_time_ms
    half_window = analysis_window_ms // 2

    for window_start in range(range_end - analysis_window_ms, range_start - 1, -analysis_window_ms):
        window = audio[window_start:window_start + analysis_window_ms]
        if len(window) != analysis_window_ms:
            continue

        samples = window.get_array_of_samples()
        if len(samples) == 0:
            continue

        # Root mean square of the raw samples; lower means quieter.
        mean_square = sum(value * value for value in samples) / len(samples)
        window_rms = mean_square ** 0.5

        # Strict comparison: the first (latest) window wins among equals.
        if window_rms < lowest_rms:
            lowest_rms, split_at = window_rms, window_start + half_window

    return split_at

def smart_split_audio(input_file, output_dir, segment_time_seconds=3600, search_window_ms=60000, analysis_window_ms=100):
    """
    Split an audio/video file into WAV segments, cutting at quiet moments.

    The file is loaded with pydub and cut into consecutive segments of at most
    `segment_time_seconds`. Each cut is placed at the quietest point found by
    `find_best_split_point` within the search window that ends at the
    segment's maximum length. Segments are written to `output_dir` as
    `part000.wav`, `part001.wav`, ... in 44100 Hz, 16-bit, stereo.

    Args:
        input_file: Path of the media file to split.
        output_dir: Directory that receives the `partNNN.wav` files.
        segment_time_seconds: Maximum segment length in seconds (must be > 0).
        search_window_ms: How far back from the maximum length to look for a
            quiet split point. It is capped at half a segment so a cut can
            never land at or before the start of the segment being written.
        analysis_window_ms: Size of the windows whose loudness is compared.

    Returns:
        The number of segments written.

    Raises:
        ValueError: If `segment_time_seconds` is not positive.
    """
    segment_time_ms = int(segment_time_seconds * 1000)
    if segment_time_ms <= 0:
        # A zero-length target would never advance through the audio.
        raise ValueError(f"segment_time_seconds must be positive, got {segment_time_seconds!r}")

    print(f"Loading audio file for smart splitting: {input_file}")

    # Load audio file
    audio = AudioSegment.from_file(input_file)
    total_duration_ms = len(audio)

    print(f"Total duration: {total_duration_ms / 1000:.2f} seconds")
    print(f"Target segment length: {segment_time_seconds} seconds")

    # With a window longer than the segment, the backwards search would reach
    # before the segment start and the loop could stall or emit empty files.
    # Capping at half a segment keeps every segment at least half as long as
    # the target; the default 60 s window with 3600 s segments is unaffected.
    effective_search_ms = max(0, min(search_window_ms, segment_time_ms // 2))

    current_pos = 0
    segment_num = 0

    while current_pos < total_duration_ms:
        # Maximum end time for this segment
        target_end = current_pos + segment_time_ms

        if target_end >= total_duration_ms:
            # Last segment - take everything remaining
            actual_end = total_duration_ms
        else:
            # Find best split point before target (treat target as maximum)
            actual_end = find_best_split_point(
                audio, target_end,
                search_window_ms=effective_search_ms,
                analysis_window_ms=analysis_window_ms
            )
            # Safety net: always make forward progress and never overshoot.
            if not current_pos < actual_end <= target_end:
                actual_end = target_end

        # Extract segment
        segment = audio[current_pos:actual_end]
        segment_duration = (actual_end - current_pos) / 1000

        # Generate output filename (matching the original ffmpeg format)
        output_file = os.path.join(output_dir, f"part{segment_num:03d}.wav")

        # Export segment with the required format (44100 Hz, 16-bit, stereo)
        segment = segment.set_frame_rate(44100).set_channels(2).set_sample_width(2)
        segment.export(output_file, format="wav")

        print(f"Segment {segment_num}: {segment_duration:.2f}s -> part{segment_num:03d}.wav")

        current_pos = actual_end
        segment_num += 1

    print(f"Smart splitting complete! Created {segment_num} segments.")
    return segment_num

# Vocal extraction driver: split the input into chunks, run Mel-Band-Roformer on
# them, then join the per-chunk vocals into MEL_VOCALS_FILE. The whole step is
# skipped when the Japanese SRT or the combined vocals file already exists.
if os.path.exists(OUT_JP_FILE):
  print(f"{OUT_JP_FILE_BASE} already exists, not re-running Mel-Band-Roformer.")
  print(f"Delete {OUT_JP_FILE_BASE} from your google drive if you want to re-run.")
elif os.path.exists(MEL_VOCALS_FILE):
  print(f"{MEL_VOCALS_FILE} already exists, not re-running Mel-Band-Roformer")
else:
  # Smart split the input audio into chunks to prevent Out of Memory
  # This will split at silence gaps to avoid cutting words in half
  print(f'Using smart splitting to split audio into ~{MEL_SEGMENT_TIME} seconds chunks')
  
  try:
    # Try smart splitting first
    num_segments = smart_split_audio(
        INPUT_FILE, 
        TMP_DIR, 
        segment_time_seconds=MEL_SEGMENT_TIME,
        search_window_ms=60000,  # Default 60s search window for split point
        analysis_window_ms=100   # Analyze 100ms chunks for quietness
    )
    print(f"Smart splitting successful! Created {num_segments} segments.")
    
  except Exception as e:
    # Any failure in the pydub path falls back to fixed-length ffmpeg segments.
    print(f"Smart splitting failed ({e}), falling back to ffmpeg time-based splitting...")
    # Fallback to original ffmpeg method
    # The command produces the same partNNN.wav naming and the same audio format
    # (44100 Hz, 16-bit PCM, stereo) as the smart splitter. `-n` makes ffmpeg
    # refuse to overwrite files that already exist.
    command = f"cd {TMP_DIR}; ffmpeg"
    command += f" -n -i '{INPUT_FILE}'"
    command += f" -vn -af 'aresample=44100' -acodec pcm_s16le -ac 2"
    command += f" -f segment -segment_time {MEL_SEGMENT_TIME} -reset_timestamps 1"
    command += f" part%03d.wav"
    print(f"Fallback command: {command}")
    ! {command}

  # Run Mel-Band-Roformer to extract vocals
  # TMP_DIR is both the input folder and the output folder of the inference script.
  print('Running Mel-Band-Roformer to extract vocals')
  command = f"cd {EXEC_DIR}/Mel-Band-Roformer-Vocal-Model; python inference.py"
  command += f" --model_type mel_band_roformer"
  command += f" --config_path configs/config_vocals_mel_band_roformer.yaml"
  command += f" --model_path {EXEC_DIR}/MelBandRoformer.ckpt"
  command += f" --input_folder {TMP_DIR}"
  command += f" --store_dir {TMP_DIR}"
  print(f"Command: {command}")
  ! {command}

  print(f'Using ffmpeg to recombine vocal chunks')
  # Build an ffmpeg concat list of the part*_vocals.wav files in version-sorted order.
  ! cd {TMP_DIR}; printf "file '%s'\n" part*_vocals.wav | sort -V > mylist.txt
  # Concatenate the chunks into one 16 kHz, 16-bit PCM file.
  command = f"cd {TMP_DIR}; ffmpeg"
  command += f" -n -f concat -safe 0 -i mylist.txt -ar 16000 -acodec pcm_s16le {MEL_VOCALS_FILE}"
  print(f"Command: {command}")
  ! {command}
  ! ls -lRt {TMP_DIR}

# Generate Transcription
* Using https://github.com/Purfview/whisper-standalone-win

In [None]:
if os.path.exists(f"{OUT_JP_FILE}"):
  print(f"{OUT_JP_FILE_BASE} already exists, not re-running whisper transcription.")
  print(f"Delete {OUT_JP_FILE_BASE} from your google drive if you want to re-run.")
else:
  print('Preparing whisper command')
  # Process each vocal part file
  part_files = sorted([f for f in os.listdir(TMP_DIR) if f.startswith('part') and f.endswith('_vocals.wav')])
  total_parts = len(part_files)
  print(f"Total parts to process: {total_parts}")

  for i, part_file in enumerate(part_files):
      print(f"Processing part {i+1}/{total_parts}: {part_file}")
      part_input_file = os.path.join(TMP_DIR, part_file)

      command = f"{EXEC_DIR}/Faster-Whisper-XXL/faster-whisper-xxl"
      command += f" --model {MODEL}"
      command += f" --language ja"
      command += f" --vad_method {VAD_METHOD}"
      command += f" --vad_threshold {VAD_THRESHOLD}"
      command += f" --task transcribe"
      command += f" --output_format srt"
      command += f" --output_dir \"{TMP_DIR}\""
      command += f" -- \"{part_input_file}\""
      print(f"Command: {command}")
      ! {command}

  # Merge the generated SRT files
  print('Merging SRT files')
  merged_srt_file = os.path.join(TMP_DIR, "merged.srt")
  with open(merged_srt_file, 'w') as outfile:
      for part_file in part_files:
          part_srt_file = os.path.join(TMP_DIR, part_file.replace('.wav', '.srt'))
          if os.path.exists(part_srt_file):
              with open(part_srt_file, 'r') as infile:
                  outfile.write(infile.read())
              # Add a newline between files if needed (adjust based on desired output)
              # outfile.write('\n')
          else:
              print(f"WARNING: SRT file not found for {part_file}.")

  # Copy the merged SRT to the output directory
  ! cp {merged_srt_file} {OUTPUT_DIR}/{FILENAME_STEM}.jp.srt

# Translate to English
* Using https://github.com/MaKTaiL/gemini-srt-translator

In [None]:
# Translate the Japanese SRT to English unless the English SRT is already there.
if not os.path.exists(OUT_EN_FILE):
  import gemini_srt_translator as gst

  # The translator is configured through module-level attributes.
  translator_settings = {
      "gemini_api_key": GEMINI_API_KEY_1,
      "gemini_api_key2": GEMINI_API_KEY_2,
      "model_name": GEMINI_MODEL,
      "target_language": "English",
      "input_file": OUT_JP_FILE,
      "output_file": OUT_EN_FILE,
      "thinking": GEMINI_THINKING,
      "thinking_budget": GEMINI_THINKING_BUDGET,
      "progress_log": True,
      "thoughts_log": True,
  }
  # The audio file is only supplied when audio context was requested.
  if USE_AUDIO_CONTEXT:
    translator_settings["audio_file"] = MEL_VOCALS_FILE

  for option_name, option_value in translator_settings.items():
    setattr(gst, option_name, option_value)

  gst.translate()
else:
  print(f"{OUT_EN_FILE_BASE} already exists, not re-running gemini-srt-translator.")
  print(f"Delete {OUT_EN_FILE_BASE} from your google drive if you want to re-run.")

## Combine Japanese and English subtitles into a single .ass file
JP subtitles on top, EN subtitles below.

In [None]:
# Build a bilingual ASS file from the two SRT files once both are available:
# Japanese lines are aligned to the top of the frame, English lines to the bottom.
if all(os.path.exists(srt_path) for srt_path in (OUT_JP_FILE, OUT_EN_FILE)):
  import pysubs2

  subs_jp = pysubs2.load(OUT_JP_FILE)
  subs_en = pysubs2.load(OUT_EN_FILE)

  subs_en_jp = pysubs2.SSAFile()
  # "b" = bottom-centre style (English), "t" = top-centre style (Japanese).
  subs_en_jp.styles = {
    "b": pysubs2.SSAStyle(alignment=pysubs2.Alignment.BOTTOM_CENTER),
    "t": pysubs2.SSAStyle(alignment=pysubs2.Alignment.TOP_CENTER),
  }

  # English events are appended first, then Japanese, each tagged with its style.
  for style_name, track in (("b", subs_en), ("t", subs_jp)):
      for line in track:
          line.style = style_name
          subs_en_jp.append(line)

  subs_en_jp.save(OUT_EN_JP_FILE)

print("DONE!!!")