# In [None]:
# -*- coding: utf-8 -*-
# TRANSCRIBE: ONE SENTENCE PER LINE (optional: pre-convert to WAV)

import re, shutil, subprocess, sys
from pathlib import Path
from faster_whisper import WhisperModel

# ---------- CONFIG ----------
# Module-level settings read by make_wav() and main().
INPUT_PATH = "ENG_TEST_07.mp4"  # media file to transcribe
MODEL_NAME = "large-v3"      # on low-spec machines "small"/"base" is recommended
DEVICE = "cpu"               # passed to WhisperModel(device=...)
COMPUTE_TYPE = "int8"        # "int8" | "int8_float32" | "float32"
PRECONVERT_TO_WAV = True     # True is recommended for stability (requires ffmpeg)
TARGET_SR = 16000            # sample rate (Hz) of the converted WAV: 16 kHz mono
LANGUAGE = None              # if known in advance, set "th" or "en"; None = auto

# ---------- Helpers ----------
def has_ffmpeg():
    """Return True when an ``ffmpeg`` executable can be located on PATH."""
    ffmpeg_location = shutil.which("ffmpeg")
    if ffmpeg_location is None:
        return False
    return True

def make_wav(in_path: str) -> str:
    """Convert *in_path* to a mono, 16-bit PCM WAV with ffmpeg.

    The output is written next to the input as ``<input without suffix>_16k_mono.wav``
    (overwriting any existing file, because of ``-y``) and resampled to
    ``TARGET_SR`` Hz.

    Args:
        in_path: Path of the audio/video file to convert.

    Returns:
        The path of the generated WAV file.

    Raises:
        SystemExit: With exit code 1 when ffmpeg is not installed or when the
            conversion fails. On failure the tail of ffmpeg's own error output
            is written to stderr so the cause is visible.
    """
    if not has_ffmpeg():
        sys.stderr.write("[ERROR] ไม่พบ ffmpeg — PRECONVERT_TO_WAV=True ต้องมี ffmpeg\n")
        sys.exit(1)

    p = Path(in_path)
    out_wav = p.with_suffix("").as_posix() + "_16k_mono.wav"
    cmd = [
        "ffmpeg", "-y",          # overwrite the output without prompting
        "-i", in_path,
        "-vn",                   # drop any video stream
        "-acodec", "pcm_s16le",
        "-ac", "1",              # downmix to mono
        "-ar", str(TARGET_SR),
        "-sample_fmt", "s16",
        out_wav,
    ]
    print("[FFmpeg]", " ".join(cmd))
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        sys.stderr.write(f"[ERROR] FFmpeg ล้มเหลว: {e}\n")
        # stderr was captured above; without echoing it the user only sees the
        # exit status. Show the last lines, where ffmpeg reports the real error.
        captured = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        if captured:
            tail = captured.splitlines()[-15:]
            sys.stderr.write("\n".join(tail) + "\n")
        sys.exit(1)
    return out_wav

# Thai sentence tokenizer (optional dependency: pythainlp)
# Simple fallback rule: split on whitespace that follows ., !, ?, … or ฯ
_TH_FALLBACK_SPLIT_RE = re.compile(r'(?<=[\.!?…]|ฯ)\s+')

try:
    from pythainlp.tokenize import sent_tokenize as th_sent_tokenize
except ImportError:
    def split_th_sentences(text: str):
        """Split Thai text into sentences with a punctuation-based regex.

        Used when pythainlp is not installed. Returns a list of stripped,
        non-empty sentence strings.
        """
        return [s.strip() for s in _TH_FALLBACK_SPLIT_RE.split(text) if s.strip()]
    TH_BACKEND = "fallback"
else:
    def split_th_sentences(text: str):
        """Split Thai text into sentences using pythainlp.

        Returns a list of stripped, non-empty sentence strings.
        """
        # "newmm" is a *word* tokenizer engine; per the PyThaiNLP docs
        # sent_tokenize() does not accept it, so use the sentence engine
        # "crfcut" (pythainlp's default for sent_tokenize).
        sents = th_sent_tokenize(text, engine="crfcut")
        return [s.strip() for s in sents if s.strip()]
    TH_BACKEND = "pythainlp"

# Basic English splitter with tiny abbreviation handling
EN_ABBR = {"mr.", "mrs.", "ms.", "dr.", "prof.", "sr.", "jr.", "vs.", "etc.", "e.g.", "i.e.", "u.s.", "u.k."}

# Opening brackets/quotes that may be glued to the front of an abbreviation,
# e.g. "(e.g." or '"Dr.'. They are stripped before the EN_ABBR lookup.
_EN_LEADING_OPENERS = "\"'([{\u201c\u2018"


def split_en_sentences(text: str):
    """Split English text into sentences on ``.``, ``!`` and ``?``.

    Whitespace is collapsed first. A boundary is terminal punctuation
    (optionally followed by closing quotes/brackets) and then whitespace.
    A boundary is ignored when the token right before it is a known
    abbreviation from ``EN_ABBR`` (case-insensitive, leading opening
    quotes/brackets ignored), so "Dr. Smith" and "(e.g. apples)" stay intact.

    Args:
        text: Raw text; may contain arbitrary whitespace.

    Returns:
        A list of stripped sentence strings; ``[]`` for empty/blank input.
    """
    t = re.sub(r"\s+", " ", text).strip()
    if not t:
        return []

    # The capturing group keeps delimiters: even indexes are text, odd indexes
    # are the punctuation+whitespace that followed it.
    parts = re.split(r'(\s*[.!?]+["\')\]]*\s+)', t)
    chunks, buf = [], ""
    for i, part in enumerate(parts):
        buf += part
        if i % 2 == 0:
            continue  # plain text; wait for its delimiter

        cand = buf.strip()
        tokens = cand.split()
        last_tok = tokens[-1].lower().lstrip(_EN_LEADING_OPENERS) if tokens else ""
        if last_tok in EN_ABBR:
            continue  # not a real sentence end; keep accumulating
        chunks.append(cand)
        buf = ""

    tail = buf.strip()
    if tail:
        chunks.append(tail)
    return chunks

def split_sentences(text: str, lang_hint: str|None):
    txt = re.sub(r"\s+", " ", text).strip()
    if not txt: return []
    has_th = re.search(r"[\u0e00-\u0e7f]", txt) is not None
    lang = lang_hint or ("th" if has_th else "en")
    if lang == "th":
        return split_th_sentences(txt)
    else:
        return split_en_sentences(txt)

# ---------- MAIN ----------
def main():
    """Transcribe INPUT_PATH and save the transcript, one sentence per line.

    Steps: optionally pre-convert the input to WAV (PRECONVERT_TO_WAV), load
    the Whisper model, transcribe with VAD filtering enabled, join all segment
    texts, split the result into sentences, print them, and write them to
    ``<input name without suffix>_sentences.txt`` in the current directory.
    """
    stem = Path(INPUT_PATH).with_suffix("").name
    out_txt = f"{stem}_sentences.txt"

    src = make_wav(INPUT_PATH) if PRECONVERT_TO_WAV else INPUT_PATH

    print("Loading model...")
    model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)

    print(f"Transcribing: {src}")
    transcribe_options = dict(
        vad_filter=True,
        vad_parameters={"min_silence_duration_ms": 300},
        beam_size=1,
        language=LANGUAGE,  # set to "th"/"en" when the language is known
        condition_on_previous_text=False,
        temperature=0.0,
    )
    segments, _info = model.transcribe(src, **transcribe_options)

    # Merge every segment first, then cut into sentences -> one sentence per line.
    pieces = [piece for seg in segments if (piece := (seg.text or "").strip())]
    raw_text = re.sub(r"\s+", " ", " ".join(pieces)).strip()

    one_sentence_per_line = "\n".join(split_sentences(raw_text, LANGUAGE))

    print("\n=== TRANSCRIPT (ONE SENTENCE PER LINE) ===\n")
    print(one_sentence_per_line)

    Path(out_txt).write_text(one_sentence_per_line + "\n", encoding="utf-8")

    print(f"\nSaved TXT: {out_txt}")
    if PRECONVERT_TO_WAV:
        print(f"(ใช้ WAV ชั่วคราว: {src})")

# Run the transcription only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
