# IMPORT LIBRARY

In [1]:
import whisper
import os
import time
import pandas as pd
from pyannote.audio import Pipeline
import torch
from huggingface_hub import HfFolder
import subprocess
from jiwer import wer
import numpy as np
np.NAN = np.nan
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


# CONFIG

In [2]:
FILE_PATH = "data/wawancara2.mp4"
WAV_PATH = FILE_PATH.replace(".mp4", ".wav")
MODEL_SIZE = "base.en"
DEVICE = "cpu"

# PREPROCESSING

In [3]:
if not os.path.exists(WAV_PATH):
    print("Mengonversi file ke format WAV...")
    subprocess.run([
        "ffmpeg", "-i", FILE_PATH, "-ar", "16000", "-ac", "1", WAV_PATH
    ], check=True)

In [4]:
def format_time(seconds):
    """Konversi detik ke format H:M:S,ms"""
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds - int(seconds)) * 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

# LOAD MODEL WHISPER & PYANNOTE

In [5]:
def run_full_process():
    print("--- Memulai Proses Transkripsi & Diarisasi ---")
    
    # 1. Validasi File
    print(f"Mengecek file di: {FILE_PATH}")
    if not os.path.exists(FILE_PATH):
        print("\n" + "!"*50 + f"\n  ERROR: FILE '{FILE_PATH}' TIDAK DITEMUKAN!\n" + "!"*50 + "\n")
        return

    print("File ditemukan. Lanjut...")


In [6]:
# 2. Muat Model Diarisasi (pyannote.audio)
try:
    print(f"\nMemuat pipeline diarisasi 'pyannote/speaker-diarization-3.1'...")

    token = HfFolder.get_token()  # Ambil token HF
    diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=token
    )

    diarization_pipeline.to(torch.device(DEVICE))
    print("Pipeline diarisasi berhasil dimuat.")
except Exception as e:
    print(f"\n!!! ERROR saat memuat pipeline diarisasi: {e}")
    print("Pastikan token HF di-set dengan benar: huggingface-cli login")



Memuat pipeline diarisasi 'pyannote/speaker-diarization-3.1'...


The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
torchvision is not available - cannot save figures
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.



!!! ERROR saat memuat pipeline diarisasi: 'introspection'
Pastikan token HF di-set dengan benar: huggingface-cli login




In [7]:
# 3. Muat Model Transkripsi (Whisper)
try:
    print(f"\nMemuat model Whisper '{MODEL_SIZE}' ke '{DEVICE}'...")
    whisper_model = whisper.load_model(MODEL_SIZE, device=DEVICE)
    print("Model Whisper berhasil dimuat.")
except Exception as e:
    print(f"\n!!! ERROR saat memuat model Whisper: {e}")



Memuat model Whisper 'base.en' ke 'cpu'...
Model Whisper berhasil dimuat.


# PROSES ASR + DIARIZATION

In [8]:
# 4. Proses Diarisasi
print("\nMemulai diarisasi (mendeteksi siapa bicara kapan)...")
start_diarize_time = time.time()
diarization_result = diarization_pipeline(WAV_PATH)
end_diarize_time = time.time()
print(f"Diarisasi selesai dalam {end_diarize_time - start_diarize_time:.2f} detik.")


Memulai diarisasi (mendeteksi siapa bicara kapan)...


NameError: name 'diarization_pipeline' is not defined

# TRANSKRIPSI

In [9]:
# 5. Proses Transkripsi dengan Word Timestamps
print("\nMemulai transkripsi (mengubah audio ke teks)...")
start_transcribe_time = time.time()
transcription_result = whisper_model.transcribe(
    WAV_PATH,
    language="en",
    word_timestamps=True
)
end_transcribe_time = time.time()
print(f"Transkripsi selesai dalam {end_transcribe_time - start_transcribe_time:.2f} detik.")


Memulai transkripsi (mengubah audio ke teks)...




Transkripsi selesai dalam 32.23 detik.


# GABUNG ASR + DIARIZATION

In [None]:
print("\nMenggabungkan hasil transkripsi dan diarisasi...")

# Ambil intervals speaker
speaker_turns = []
for turn, _, speaker in diarization_result.itertracks(yield_label=True):
    speaker_turns.append({
        'start': turn.start,
        'end': turn.end,
        'speaker': speaker
    })

speaker_df = pd.DataFrame(speaker_turns)

# Ambil semua kata dari Whisper
all_words = []
for segment in transcription_result['segments']:
    all_words.extend(segment['words'])



Menggabungkan hasil transkripsi dan diarisasi...


# MAPPING WORD

In [None]:
word_speaker_mapping = []

for word in all_words:
    word_start = word['start']

    match = speaker_df[
        (speaker_df['start'] <= word_start) &
        (speaker_df['end'] >= word_start)
    ]

    if not match.empty:
        word['speaker'] = match.iloc[0]['speaker']
    else:
        word['speaker'] = "UNKNOWN"

    word_speaker_mapping.append(word)

# GABUNGKAN JADI SEGMEN

In [None]:
final_transcript = []
current = None

for word in word_speaker_mapping:
    if current is None:
        current = {
            'start': word['start'],
            'end': word['end'],
            'speaker': word['speaker'],
            'text': word['word'] + " "
        }
    else:
        if word['speaker'] == current['speaker']:
            current['text'] += word['word'] + " "
            current['end'] = word['end']
        else:
            final_transcript.append(current)
            current = {
                'start': word['start'],
                'end': word['end'],
                'speaker': word['speaker'],
                'text': word['word'] + " "
            }

if current:
    final_transcript.append(current)


In [None]:
print("\n" + "="*80)
print("                      HASIL TRANSKRIPSI DENGAN DIARISASI")
print("="*80 + "\n")

for seg in final_transcript:
    start = format_time(seg['start'])
    end = format_time(seg['end'])
    speaker = seg['speaker']
    text = seg['text'].strip()
    print(f"[{start} --> {end}] {speaker}: {text}")

print("\n" + "="*80)
print("--- Proses Selesai ---")



                      HASIL TRANSKRIPSI DENGAN DIARISASI

[00:00:00,000 --> 00:00:00,360] UNKNOWN: Welcome
[00:00:00,360 --> 00:00:01,820] SPEAKER_01: to  English  Learner's  Academy.
[00:00:04,800 --> 00:00:05,339] UNKNOWN: Job
[00:00:05,339 --> 00:00:05,780] SPEAKER_01: Interview
[00:00:08,700 --> 00:00:09,240] UNKNOWN: Hello,
[00:00:09,720 --> 00:00:10,419] SPEAKER_01: why  are  you  here?
[00:00:11,420 --> 00:00:11,960] UNKNOWN: Hello,
[00:00:12,300 --> 00:00:13,859] SPEAKER_00: I  am  here  for  a  job  interview.
[00:00:15,480 --> 00:00:16,019] UNKNOWN: What
[00:00:16,019 --> 00:00:16,660] SPEAKER_00: is  your  name?
[00:00:17,679 --> 00:00:18,219] UNKNOWN: My
[00:00:18,219 --> 00:00:18,960] SPEAKER_00: name  is  Lucas.
[00:00:20,220 --> 00:00:20,760] UNKNOWN: How
[00:00:20,760 --> 00:00:22,199] SPEAKER_00: did  you  find  out  about  this  job?
[00:00:23,399 --> 00:00:23,699] UNKNOWN: I
[00:00:23,699 --> 00:00:24,879] SPEAKER_00: saw  an  ad  on  your  website.
[00:00:26,179 --

In [None]:
run_full_process()

--- Memulai Proses Transkripsi & Diarisasi ---
Mengecek file di: data/wawancara2.mp4
File ditemukan. Lanjut...


# EVALUATION WER