# ASR + Speaker Diarization + WER/CER + JSON Output

Pipeline ini melakukan:
1. Konversi video ke WAV (16kHz mono)
2. ASR dengan Whisper
3. Speaker diarization dengan `pyannote/speaker-diarization-3.1`
4. Penggabungan kata + speaker jadi kalimat lengkap dengan timestamp
5. Evaluasi WER & CER menggunakan dataset dari Hugging Face
6. Menyimpan output akhir dalam format JSON

## 1. Import Library

In [1]:
import os
import time
import json
import subprocess

import whisper
import torch
import pandas as pd
from pyannote.audio import Pipeline
from jiwer import wer, cer
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


## 2. Konfigurasi Utama

In [18]:
# Path file utama (video / audio)
FILE_PATH = '../../data/interview_question_1.webm'  # ganti sesuai kebutuhan
WAV_PATH = FILE_PATH.rsplit('.', 1)[0] + '.wav'

# Model Whisper dan device
MODEL_SIZE = 'base.en'  # misal: tiny, base, small, medium
DEVICE = 'cpu'          # atau 'cuda' kalau ada GPU

# Konfigurasi evaluasi WER/CER
DATASET_DIR = "../../data/openlsr"  # ganti sesuai foldermu
MAX_SAMPLES = 100


## 3. Fungsi Utility

In [3]:
def format_time(seconds: float) -> str:
    """Konversi detik ke format H:M:S,ms (00:00:00,000)."""
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds - int(seconds)) * 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

def ensure_wav_16k_mono(input_path: str) -> str:
    """Pastikan ada file WAV 16kHz mono. Jika belum ada, konversi dengan ffmpeg."""
    wav_path = input_path.rsplit('.', 1)[0] + '.wav'
    if not os.path.exists(wav_path):
        print(f'Mengonversi {input_path} ke {wav_path} (16kHz, mono)...')
        subprocess.run([
            'ffmpeg', '-y', '-i', input_path,
            '-ar', '16000', '-ac', '1', wav_path
        ], check=True)
    else:
        print(f'File WAV sudah ada: {wav_path}')
    return wav_path


## 4. Load Model Whisper & Diarization

In [4]:
def load_diarization_pipeline(device: str = 'cpu'):
    """Load diarization pipeline PyAnnote 3.x."""
    print("Memuat pipeline diarisasi 'pyannote/speaker-diarization-3.1'...")

    # karena kamu sudah huggingface-cli login, tidak perlu token di sini
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1"
    )

    pipeline.to(torch.device(device))
    print("Pipeline diarisasi berhasil dimuat.")
    return pipeline


def load_whisper_model(model_size: str = "base.en", device: str = "cpu"):
    """Load Whisper model."""
    print(f"Memuat model Whisper '{model_size}' di device '{device}'...")

    model = whisper.load_model(model_size, device=device)

    print("Model Whisper berhasil dimuat.")
    return model


# Eksekusi
diarization_pipeline = load_diarization_pipeline(DEVICE)
whisper_model = load_whisper_model(MODEL_SIZE, DEVICE)


Memuat pipeline diarisasi 'pyannote/speaker-diarization-3.1'...


  from speechbrain.pretrained import (


Pipeline diarisasi berhasil dimuat.
Memuat model Whisper 'base.en' di device 'cpu'...
Model Whisper berhasil dimuat.


## 5. Proses Diarization

In [12]:
print("Current working directory:", os.getcwd())
print("Trying to load:", FILE_PATH)
print("Exists?", os.path.exists(FILE_PATH))


wav_path = ensure_wav_16k_mono(FILE_PATH)

print('\nMenjalankan diarization...')
start_diar = time.time()
diarization_result = diarization_pipeline(wav_path)
end_diar = time.time()
print(f'Diarization selesai dalam {end_diar - start_diar:.2f} detik.')

speaker_turns = []
for turn, _, speaker in diarization_result.itertracks(yield_label=True):
    speaker_turns.append({
        'start': turn.start,
        'end': turn.end,
        'speaker': speaker
    })

speaker_df = pd.DataFrame(speaker_turns)
speaker_df.head()


Current working directory: d:\05_Personal\Asah by Dicoding\capstone-project\src\asr
Trying to load: ../../data/interview_question_1.webm
Exists? True
File WAV sudah ada: ../../data/interview_question_1.wav

Menjalankan diarization...


  std = sequences.std(dim=-1, correction=1)


Diarization selesai dalam 40.68 detik.


Unnamed: 0,start,end,speaker
0,1.364094,6.342219,SPEAKER_00
1,7.169094,11.725344,SPEAKER_00
2,13.024719,25.326594,SPEAKER_00
3,25.967844,31.671594,SPEAKER_00
4,33.072219,33.882219,SPEAKER_00


## 6. Proses ASR dengan Whisper (Word Timestamps)

In [13]:
print('\nMenjalankan transkripsi Whisper...')
start_asr = time.time()
asr_result = whisper_model.transcribe(
    FILE_PATH,
    language='en',
    word_timestamps=True
)
end_asr = time.time()
print(f'Transkripsi selesai dalam {end_asr - start_asr:.2f} detik.')

asr_result.keys()



Menjalankan transkripsi Whisper...




Transkripsi selesai dalam 9.05 detik.


dict_keys(['text', 'segments', 'language'])

## 7. Menggabungkan Kata + Speaker menjadi Kalimat Bertimestamp

In [14]:
all_words = []
for seg in asr_result.get('segments', []):
    for w in seg.get('words', []):
        all_words.append(w)

print(f'Total kata: {len(all_words)}')

word_speaker_mapping = []
for word in all_words:
    w_start = word['start']
    match = speaker_df[(speaker_df['start'] <= w_start) & (speaker_df['end'] >= w_start)]
    if not match.empty:
        spk = match.iloc[0]['speaker']
    else:
        spk = 'UNKNOWN'
    word_speaker_mapping.append({
        'start': word['start'],
        'end': word['end'],
        'word': word['word'],
        'speaker': spk
    })

final_segments = []
current = None

for w in word_speaker_mapping:
    if current is None:
        current = {
            'start': w['start'],
            'end': w['end'],
            'speaker': w['speaker'],
            'text': w['word']
        }
    else:
        if w['speaker'] == current['speaker']:
            if not current['text'].endswith(' '):
                current['text'] += ' '
            current['text'] += w['word']
            current['end'] = w['end']
        else:
            final_segments.append(current)
            current = {
                'start': w['start'],
                'end': w['end'],
                'speaker': w['speaker'],
                'text': w['word']
            }

if current is not None:
    final_segments.append(current)

print(f'Total segmen kalimat: {len(final_segments)}')

for seg in final_segments[:5]:
    print(f"[{format_time(seg['start'])} --> {format_time(seg['end'])}] {seg['speaker']}: {seg['text']}")


Total kata: 123
Total segmen kalimat: 6
[00:00:00,740 --> 00:00:01,419] UNKNOWN:  Can
[00:00:01,419 --> 00:00:06,160] SPEAKER_00:  you  share  any  specific  challenges  you  face  when  working  on  certification  and  how  you  are  coming  in?
[00:00:06,879 --> 00:00:07,440] UNKNOWN:  Ah,
[00:00:07,820 --> 00:00:54,100] SPEAKER_00:  okay  actually,  for  these  challenges,  there  are  some  challenges  when  I  took  the  certifications,  especially  for  the  projects  I  mentioned  that  I  already  working  with  it.  The  first  one  is  actually  to  meet  the  specific  accuracy  or  the  calculation  loss  for  the  application  matrix.  Actually,  that's  just  a  need  to  take  some  trial  and  error  with
[00:00:54,100 --> 00:00:55,219] UNKNOWN:  different


## 8. Evaluasi WER & CER dengan Dataset Hugging Face

In [15]:
import re

# =========================================================
# NORMALISASI TEKS (WAJIB BIAR WER REALISTIC)
# =========================================================
def normalize(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)    # hapus punctuation
    text = re.sub(r"\s+", " ", text)       # rapikan spasi
    return text.strip()

In [16]:
import os
import librosa
import numpy as np
from jiwer import wer, cer
from tqdm import tqdm

# =========================================================
# HITUNG TOTAL FILE UNTUK PROGRESS BAR
# =========================================================
def count_total_samples(root_dir):
    total = 0
    for speaker in os.listdir(root_dir):
        spk_path = os.path.join(root_dir, speaker)
        if not os.path.isdir(spk_path):
            continue
        for chapter in os.listdir(spk_path):
            trans_file = os.path.join(spk_path, chapter, f"{speaker}-{chapter}.trans.txt")
            if os.path.exists(trans_file):
                with open(trans_file, "r", encoding="utf8") as f:
                    total += len(f.readlines())
    return total


# =========================================================
# EVALUASI FULL DATASET TANPA max_samples
# =========================================================
def evaluate_dataset_full(root_dir, whisper_model):
    refs = []
    hyps = []

    print(f"\nEvaluasi full dataset lokal: {root_dir}\n")

    total_samples = count_total_samples(root_dir)
    print(f"Total sampel ditemukan: {total_samples}\n")

    pbar = tqdm(total=total_samples, desc="Processing")

    # LOOP UTAMA
    for speaker in sorted(os.listdir(root_dir)):
        spk_dir = os.path.join(root_dir, speaker)
        if not os.path.isdir(spk_dir):
            continue

        for chapter in sorted(os.listdir(spk_dir)):
            chap_dir = os.path.join(spk_dir, chapter)

            # file transkripsi
            trans_file = os.path.join(chap_dir, f"{speaker}-{chapter}.trans.txt")
            if not os.path.exists(trans_file):
                continue

            with open(trans_file, "r", encoding="utf8") as f:
                lines = f.readlines()

            # PROSES SETIAP UTTERANCE
            for line in lines:
                pbar.update(1)

                parts = line.strip().split()
                if len(parts) < 2:
                    continue

                utt_id = parts[0]
                ref_text = " ".join(parts[1:])
                audio_path = os.path.join(chap_dir, f"{utt_id}.flac")

                if not os.path.exists(audio_path):
                    print(f"Audio missing: {audio_path}")
                    continue

                # load audio
                audio, sr = librosa.load(audio_path, sr=None)
                if sr != 16000:
                    audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

                # transkripsi Whisper
                result = whisper_model.transcribe(audio, fp16=False)
                hyp_text = result.get("text", "").strip()

                # simpan setelah normalisasi
                refs.append(normalize(ref_text))
                hyps.append(normalize(hyp_text))

    pbar.close()

    if len(refs) == 0:
        print("Tidak ada sample valid ditemukan.")
        return None

    # =========================================================
    # HITUNG METRIK
    # =========================================================
    wer_val = wer(refs, hyps)
    cer_val = cer(refs, hyps)

    print("\n=========================")
    print("         FINAL METRICS")
    print("=========================")
    print(f"WER: {wer_val:.4f}  ({wer_val * 100:.2f}%)")
    print(f"CER: {cer_val:.4f}  ({cer_val * 100:.2f}%)")
    print(f"Total sampel dievaluasi: {len(refs)}")

    return {
        "wer": wer_val,
        "wer_percent": wer_val * 100,
        "cer": cer_val,
        "cer_percent": cer_val * 100,
        "num_samples": len(refs)
    }


In [19]:
metrics = evaluate_dataset_full(DATASET_DIR, whisper_model)
metrics


Evaluasi full dataset lokal: ../../data/openlsr

Total sampel ditemukan: 2620



Processing:   1%|          | 15/2620 [00:12<40:21,  1.08it/s]

KeyboardInterrupt: 

## 9. Menyimpan Hasil Akhir ke JSON

In [None]:
output = {
    'file_path': FILE_PATH,
    'model_size': MODEL_SIZE,
    'device': DEVICE,
    'segments': [
        {
            'start_sec': seg['start'],
            'end_sec': seg['end'],
            'start_time': format_time(seg['start']),
            'end_time': format_time(seg['end']),
            'speaker': seg['speaker'],
            'text': seg['text']
        }
        for seg in final_segments
    ],
    'wer_cer_metrics': metrics
}

os.makedirs('outputs', exist_ok=True)
json_path = os.path.join('outputs', 'asr_diarization_output.json')

with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

print(f'JSON disimpan di: {json_path}')
json_path


JSON disimpan di: outputs\asr_diarization_output.json


'outputs\\asr_diarization_output.json'