# 🎙️ Japanese Audio → SRT Subtitles (JP + EN)

This notebook:
1. **Transcribes** uploaded Japanese audio using [Qwen/Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B) with word-level timestamps via the ForcedAligner
2. **Generates** an SRT subtitle file from the transcription
3. **Translates** the Japanese SRT to English using [Helsinki-NLP/opus-mt-ja-en](https://huggingface.co/Helsinki-NLP/opus-mt-ja-en)

**Requirements:** A Colab runtime with a **T4 GPU** (free tier works).

> ⚠️ Make sure you've selected **Runtime → Change runtime type → T4 GPU** before running.

## 1 · Install Dependencies

In [None]:
!pip install -q qwen-asr transformers sentencepiece sacremoses

## 2 · Upload Japanese Audio File

Supported formats: `.wav`, `.mp3`, `.flac`, `.ogg`, `.m4a`, etc.

In [None]:
import ipywidgets as widgets
from IPython.display import display, Audio, HTML
import os

# Path of the most recently uploaded file; filled in by the upload callback.
AUDIO_PATH = None

# Status line displayed together with the upload button.
status = widgets.HTML(value="<i>No file selected.</i>")

# Single-file picker restricted to common audio extensions.
uploader = widgets.FileUpload(
    description="Select audio",
    button_style="primary",
    accept=".wav,.mp3,.flac,.ogg,.m4a,.aac,.wma,.opus",
    multiple=False,
    layout=widgets.Layout(width="300px"),
)


def on_upload(change):
    """Save the newly uploaded file under /content and remember its path.

    Registered as an observer on the FileUpload widget's ``value`` trait.
    Both payload layouts used by ipywidgets are accepted:

    * ipywidgets 8: a sequence of dicts carrying ``name`` and ``content``.
    * ipywidgets 7: a dict keyed by filename whose entries carry the name
      under ``metadata`` and the bytes under ``content``.

    Args:
        change: the trait-change dict; only ``change["new"]`` is read.

    Side effects:
        Writes the file to disk, sets the global ``AUDIO_PATH`` and updates
        ``status.value``. Does nothing when the new value is empty.
    """
    global AUDIO_PATH
    uploaded = change["new"]
    if not uploaded:
        return

    if isinstance(uploaded, dict):
        # ipywidgets 7.x layout: {filename: {"metadata": {...}, "content": ...}}
        file_info = next(iter(uploaded.values()))
        name = file_info["metadata"]["name"]
    else:
        # ipywidgets 8.x layout: ({"name": ..., "content": ...}, ...)
        file_info = uploaded[0]
        name = file_info["name"]
    content = file_info["content"]

    # Keep only the final path component so the write stays inside /content.
    name = os.path.basename(name)
    AUDIO_PATH = os.path.join("/content", name)
    with open(AUDIO_PATH, "wb") as f:
        f.write(content)
    size_mb = len(content) / (1024 * 1024)
    status.value = f"‚úÖ <b>{name}</b> uploaded ({size_mb:.1f} MB)"


# Invoke on_upload whenever the widget's value trait changes, then render the
# button with the status line stacked beneath it.
uploader.observe(on_upload, names="value")
display(widgets.VBox(children=[uploader, status]))

In [None]:
# Show an inline audio player for the upload, or a hint when nothing has been
# uploaded yet.
if not (AUDIO_PATH and os.path.exists(AUDIO_PATH)):
    print("‚ö†Ô∏è  Please upload an audio file in the cell above first.")
else:
    display(Audio(AUDIO_PATH))

## 3 · Transcribe with Qwen3-ASR-1.7B

Loads the ASR model and the ForcedAligner to get word-level timestamps.

In [None]:
import torch
from qwen_asr import Qwen3ASRModel

# Stop early if the upload cell has not produced a file on disk yet.
assert AUDIO_PATH and os.path.exists(AUDIO_PATH), (
    "No audio file found. Run the upload cell above first."
)

print("Loading Qwen3-ASR-1.7B + ForcedAligner ‚Ä¶")
# The ASR model and the forced aligner are both requested in bfloat16 on the
# first CUDA device; the aligner is what makes timestamps available later.
# NOTE(review): exact meaning of max_inference_batch_size / max_new_tokens is
# defined by qwen-asr — confirm against its documentation.
asr_model = Qwen3ASRModel.from_pretrained(
    "Qwen/Qwen3-ASR-1.7B",
    dtype=torch.bfloat16,
    device_map="cuda:0",
    max_inference_batch_size=32,
    max_new_tokens=4096,  # long audio support
    forced_aligner="Qwen/Qwen3-ForcedAligner-0.6B",
    forced_aligner_kwargs=dict(
        dtype=torch.bfloat16,
        device_map="cuda:0",
    ),
)
print("‚úÖ Models loaded.")

In [None]:
# Progress message showing only the file name of the uploaded audio.
print(f"Transcribing: {os.path.basename(AUDIO_PATH)} ‚Ä¶")

# Timestamps are requested so the SRT step has timing data to group.
results = asr_model.transcribe(
    audio=AUDIO_PATH,
    language="Japanese",
    return_time_stamps=True,
)

# Only the first result is used — presumably one entry per input audio;
# confirm against the qwen-asr documentation.
result = results[0]
print(f"\nDetected language: {result.language}")
print(f"Full text:\n{result.text}")
# NOTE(review): assumes time_stamps[0] is the sized collection of aligned
# segments for this audio — confirm against the qwen-asr return type.
print(f"\nTimestamp segments: {len(result.time_stamps[0])}")

## 4 · Generate SRT Subtitles

In [None]:
from datetime import timedelta


def format_srt_time(seconds: float) -> str:
    """Convert seconds to SRT timestamp format: HH:MM:SS,mmm

    Args:
        seconds: offset from the start of the media, in seconds. Negative
            values are clamped to zero, because a negative offset cannot be
            written as an SRT timestamp.

    Returns:
        The timestamp string. Hours are not wrapped at 24, and the
        millisecond field is truncated (not rounded) after ``timedelta`` has
        normalised the value to whole microseconds.
    """
    td = timedelta(seconds=max(seconds, 0.0))
    # Work in whole milliseconds so every field comes from integer arithmetic.
    total_ms = (td.days * 86_400 + td.seconds) * 1000 + td.microseconds // 1000
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def group_timestamps_to_subtitles(
    stamps, max_chars: int = 40, max_duration: float = 7.0, gap_threshold: float = 0.6
):
    """
    Group word-level timestamps into subtitle segments.

    Stamps are concatenated in order. A new segment starts when the silence
    before the next stamp exceeds ``gap_threshold``, or when adding it would
    push the segment past ``max_duration`` or ``max_chars``. Segments whose
    text is empty after stripping whitespace are dropped.

    Args:
        stamps: list of timestamp objects with .text, .start_time, .end_time
        max_chars: max characters per subtitle line
        max_duration: max duration (seconds) per subtitle
        gap_threshold: silence gap (seconds) that forces a new subtitle

    Returns:
        A list of ``(start_time, end_time, text)`` tuples in input order;
        empty when ``stamps`` is empty.
    """
    if not stamps:
        return []

    subtitles = []

    def flush(start, end, text):
        # A whitespace-only group would produce a cue with no text; skip it.
        text = text.strip()
        if text:
            subtitles.append((start, end, text))

    stamp_iter = iter(stamps)
    first = next(stamp_iter)
    current_text = first.text
    current_start = first.start_time
    current_end = first.end_time

    for stamp in stamp_iter:
        # Measure what the segment would look like with this stamp added.
        gap = stamp.start_time - current_end
        new_duration = stamp.end_time - current_start
        new_len = len(current_text) + len(stamp.text)

        if gap > gap_threshold or new_duration > max_duration or new_len > max_chars:
            flush(current_start, current_end, current_text)
            current_text = stamp.text
            current_start = stamp.start_time
            current_end = stamp.end_time
        else:
            current_text += stamp.text
            current_end = stamp.end_time

    # Emit whatever is still being accumulated.
    flush(current_start, current_end, current_text)
    return subtitles


def build_srt(subtitles) -> str:
    """Render (start, end, text) tuples as the text of an SRT file.

    Each entry is a 1-based index line, a ``start --> end`` timing line and
    the caption text; entries are separated by one blank line. An empty input
    yields an empty string.
    """
    entries = [
        f"{number}\n{format_srt_time(begin)} --> {format_srt_time(finish)}\n{caption}\n"
        for number, (begin, finish, caption) in enumerate(subtitles, start=1)
    ]
    # Every entry already ends with a newline, so joining on "\n" inserts the
    # blank separator line between consecutive entries.
    return "\n".join(entries)


# Build Japanese SRT: group the aligned stamps into caption-sized segments
# (default limits) and render them as SRT text.
stamps = result.time_stamps[0]
subtitles_ja = group_timestamps_to_subtitles(stamps)

srt_ja = build_srt(subtitles_ja)

# Save under /content, named after the uploaded file without its extension.
base_name = os.path.splitext(os.path.basename(AUDIO_PATH))[0]
srt_ja_path = f"/content/{base_name}_ja.srt"
with open(srt_ja_path, "w", encoding="utf-8") as f:
    f.write(srt_ja)

print(f"‚úÖ Japanese SRT saved to: {srt_ja_path}")
print(f"   {len(subtitles_ja)} subtitle segments\n")
print("--- Preview (first 10 segments) ---")
# Each entry occupies four lines (index, timing, text, blank) when the text
# has no line breaks, so the first 40 lines cover ten entries.
print("\n".join(srt_ja.split("\n")[:40]))

## 5 · Translate Subtitles to English

Uses [Helsinki-NLP/opus-mt-ja-en](https://huggingface.co/Helsinki-NLP/opus-mt-ja-en) — a lightweight MarianMT model specifically trained for Japanese → English translation. It runs comfortably on Colab alongside the ASR model.

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Hugging Face model id used for both the tokenizer and the model.
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-ja-en"

print(f"Loading translation model: {TRANSLATION_MODEL} ‚Ä¶")
trans_tokenizer = MarianTokenizer.from_pretrained(TRANSLATION_MODEL)
# The model is moved to the GPU; the tokenizer is left as loaded.
trans_model = MarianMTModel.from_pretrained(TRANSLATION_MODEL).to("cuda")
print("‚úÖ Translation model loaded.")

In [None]:
def translate_texts(texts: list[str], batch_size: int = 32) -> list[str]:
    """Translate Japanese strings to English, ``batch_size`` at a time.

    Uses the module-level ``trans_tokenizer`` and ``trans_model``. Inputs are
    padded per batch and truncated to 512 tokens; generation is capped at a
    length of 512 as well. Returns one translation per input, in input order.
    """
    english: list[str] = []
    for offset in range(0, len(texts), batch_size):
        chunk = texts[offset : offset + batch_size]
        encoded = trans_tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        encoded = encoded.to("cuda")
        # Inference only: no gradients needed.
        with torch.no_grad():
            generated = trans_model.generate(**encoded, max_length=512)
        english += trans_tokenizer.batch_decode(generated, skip_special_tokens=True)
    return english


# Extract Japanese texts from subtitles (third field of each tuple)
ja_texts = [text for _, _, text in subtitles_ja]

print(f"Translating {len(ja_texts)} subtitle segments ‚Ä¶")
en_texts = translate_texts(ja_texts)
print("‚úÖ Translation complete.")

# Build English subtitles with original timings: each translation is paired
# positionally with the Japanese segment it came from.
subtitles_en = [
    (start, end, en_text) for (start, end, _), en_text in zip(subtitles_ja, en_texts)
]

srt_en = build_srt(subtitles_en)

# Save next to the Japanese file, reusing base_name from the Japanese SRT cell.
srt_en_path = f"/content/{base_name}_en.srt"
with open(srt_en_path, "w", encoding="utf-8") as f:
    f.write(srt_en)

print(f"\n‚úÖ English SRT saved to: {srt_en_path}")
print(f"   {len(subtitles_en)} subtitle segments\n")
print("--- Preview (first 10 segments) ---")
# Each entry occupies four lines (index, timing, text, blank) when the text
# has no line breaks, so the first 40 lines cover ten entries.
print("\n".join(srt_en.split("\n")[:40]))

## 6 · Side-by-Side Comparison

In [None]:
# Fixed-width table pairing each Japanese caption with its English
# translation; at most 30 rows are printed.
print(f"{'#':>3}  {'Time':^27}  {'Japanese':<40}  {'English':<40}")
print("‚îÄ" * 115)
for i, (ja_row, en_row) in enumerate(zip(subtitles_ja, subtitles_en), start=1):
    s, e, ja = ja_row
    _, _, en = en_row
    time_str = f"{format_srt_time(s)} ‚Üí {format_srt_time(e)}"
    print(f"{i:>3}  {time_str}  {ja:<40}  {en:<40}")
    if i < 30:
        continue
    # Row limit reached: report how many captions were left out, then stop.
    remaining = len(subtitles_ja) - 30
    if remaining > 0:
        print(f"\n... and {remaining} more segments.")
    break

## 7 · Download SRT Files

In [None]:
# Offer both SRT files as browser downloads when running in Colab; elsewhere
# just report where they were written. Only the import sits inside the `try`,
# so an ImportError raised while downloading is not mistaken for "not in Colab".
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab ‚Äî files saved at:")
    print(f"  Japanese: {srt_ja_path}")
    print(f"  English:  {srt_en_path}")
else:
    print("Downloading Japanese SRT ‚Ä¶")
    files.download(srt_ja_path)
    print("Downloading English SRT ‚Ä¶")
    files.download(srt_en_path)

## 8 · Cleanup (Optional)

Free GPU memory if you want to run other things in this session.

In [None]:
import gc

# Drop the model references if they exist. Popping from globals() rather than
# using `del` keeps this cell re-runnable, and usable after a run in which one
# of the models was never loaded (`del` would raise NameError).
for model_name in ("asr_model", "trans_model", "trans_tokenizer"):
    globals().pop(model_name, None)
# Collect the now-unreferenced objects first, then ask PyTorch to release its
# cached GPU memory.
gc.collect()
torch.cuda.empty_cache()
print("‚úÖ GPU memory freed.")