# ASR + Speaker Diarization + WER/CER + JSON Output

Pipeline ini melakukan:
1. Konversi video ke WAV (16kHz mono)
2. ASR dengan Whisper
3. Speaker diarization dengan `pyannote/speaker-diarization-3.1`
4. Penggabungan kata + speaker jadi kalimat lengkap dengan timestamp
5. Evaluasi WER & CER menggunakan dataset dari Hugging Face
6. Menyimpan output akhir dalam format JSON

## 1. Import Library

In [2]:
import os
import time
import json
import subprocess

import whisper
import torch
import pandas as pd
from pyannote.audio import Pipeline
from jiwer import wer, cer
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


## 2. Konfigurasi Utama

In [3]:
# Path file utama (video / audio)
FILE_PATH = 'data/interview_question_5.webm'  # ganti sesuai kebutuhan
WAV_PATH = FILE_PATH.rsplit('.', 1)[0] + '.wav'

# Model Whisper dan device
MODEL_SIZE = 'base.en'  # misal: tiny, base, small, medium
DEVICE = 'cpu'          # atau 'cuda' kalau ada GPU

# Konfigurasi evaluasi WER/CER (Hugging Face dataset)
HF_DATASET_NAME = 'rakshya34/filtered_english_female_voice_v1'  # contoh
HF_SPLIT = 'train'  # ganti jika perlu (train/validation/test)
MAX_EVAL_SAMPLES = 100  # batasi jumlah sampel untuk demo


## 3. Fungsi Utility

In [4]:
def format_time(seconds: float) -> str:
    """Konversi detik ke format H:M:S,ms (00:00:00,000)."""
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds - int(seconds)) * 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

def ensure_wav_16k_mono(input_path: str) -> str:
    """Pastikan ada file WAV 16kHz mono. Jika belum ada, konversi dengan ffmpeg."""
    wav_path = input_path.rsplit('.', 1)[0] + '.wav'
    if not os.path.exists(wav_path):
        print(f'Mengonversi {input_path} ke {wav_path} (16kHz, mono)...')
        subprocess.run([
            'ffmpeg', '-y', '-i', input_path,
            '-ar', '16000', '-ac', '1', wav_path
        ], check=True)
    else:
        print(f'File WAV sudah ada: {wav_path}')
    return wav_path


## 4. Load Model Whisper & Diarization

In [5]:
def load_diarization_pipeline(device: str = 'cpu'):
    """Load diarization pipeline PyAnnote 3.x."""
    print("Memuat pipeline diarisasi 'pyannote/speaker-diarization-3.1'...")

    # karena kamu sudah huggingface-cli login, tidak perlu token di sini
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1"
    )

    pipeline.to(torch.device(device))
    print("Pipeline diarisasi berhasil dimuat.")
    return pipeline


def load_whisper_model(model_size: str = "base.en", device: str = "cpu"):
    """Load Whisper model."""
    print(f"Memuat model Whisper '{model_size}' di device '{device}'...")

    model = whisper.load_model(model_size, device=device)

    print("Model Whisper berhasil dimuat.")
    return model


# Eksekusi
diarization_pipeline = load_diarization_pipeline(DEVICE)
whisper_model = load_whisper_model(MODEL_SIZE, DEVICE)


Memuat pipeline diarisasi 'pyannote/speaker-diarization-3.1'...


  from speechbrain.pretrained import (


Pipeline diarisasi berhasil dimuat.
Memuat model Whisper 'base.en' di device 'cpu'...
Model Whisper berhasil dimuat.


## 5. Proses Diarization

In [6]:
wav_path = ensure_wav_16k_mono(FILE_PATH)

print('\nMenjalankan diarization...')
start_diar = time.time()
diarization_result = diarization_pipeline(wav_path)
end_diar = time.time()
print(f'Diarization selesai dalam {end_diar - start_diar:.2f} detik.')

speaker_turns = []
for turn, _, speaker in diarization_result.itertracks(yield_label=True):
    speaker_turns.append({
        'start': turn.start,
        'end': turn.end,
        'speaker': speaker
    })

speaker_df = pd.DataFrame(speaker_turns)
speaker_df.head()


File WAV sudah ada: data/interview_question_5.wav

Menjalankan diarization...


  std = sequences.std(dim=-1, correction=1)


Diarization selesai dalam 58.58 detik.


Unnamed: 0,start,end,speaker
0,0.739719,5.363469,SPEAKER_00
1,6.139719,7.337844,SPEAKER_00
2,7.793469,10.054719,SPEAKER_00
3,10.274094,10.611594,SPEAKER_00
4,11.084094,13.345344,SPEAKER_00


## 6. Proses ASR dengan Whisper (Word Timestamps)

In [7]:
print('\nMenjalankan transkripsi Whisper...')
start_asr = time.time()
asr_result = whisper_model.transcribe(
    FILE_PATH,
    language='en',
    word_timestamps=True
)
end_asr = time.time()
print(f'Transkripsi selesai dalam {end_asr - start_asr:.2f} detik.')

asr_result.keys()



Menjalankan transkripsi Whisper...




Transkripsi selesai dalam 17.45 detik.


dict_keys(['text', 'segments', 'language'])

## 7. Menggabungkan Kata + Speaker menjadi Kalimat Bertimestamp

In [8]:
all_words = []
for seg in asr_result.get('segments', []):
    for w in seg.get('words', []):
        all_words.append(w)

print(f'Total kata: {len(all_words)}')

word_speaker_mapping = []
for word in all_words:
    w_start = word['start']
    match = speaker_df[(speaker_df['start'] <= w_start) & (speaker_df['end'] >= w_start)]
    if not match.empty:
        spk = match.iloc[0]['speaker']
    else:
        spk = 'UNKNOWN'
    word_speaker_mapping.append({
        'start': word['start'],
        'end': word['end'],
        'word': word['word'],
        'speaker': spk
    })

final_segments = []
current = None

for w in word_speaker_mapping:
    if current is None:
        current = {
            'start': w['start'],
            'end': w['end'],
            'speaker': w['speaker'],
            'text': w['word']
        }
    else:
        if w['speaker'] == current['speaker']:
            if not current['text'].endswith(' '):
                current['text'] += ' '
            current['text'] += w['word']
            current['end'] = w['end']
        else:
            final_segments.append(current)
            current = {
                'start': w['start'],
                'end': w['end'],
                'speaker': w['speaker'],
                'text': w['word']
            }

if current is not None:
    final_segments.append(current)

print(f'Total segmen kalimat: {len(final_segments)}')

for seg in final_segments[:5]:
    print(f"[{format_time(seg['start'])} --> {format_time(seg['end'])}] {seg['speaker']}: {seg['text']}")


Total kata: 222
Total segmen kalimat: 17
[00:00:00,000 --> 00:00:00,920] UNKNOWN:  Let's
[00:00:00,920 --> 00:00:05,680] SPEAKER_00:  try  the  process  of  building  more  controls.  Let's
[00:00:05,680 --> 00:00:06,440] UNKNOWN:  fix
[00:00:06,440 --> 00:00:18,500] SPEAKER_00:  it.  See  and  everyone.  So,  at  the  first  time,  of  course,  we  need  to  make  sure  there  are  split,
[00:00:19,140 --> 00:00:20,820] UNKNOWN:  the


## 8. Evaluasi WER & CER dengan Dataset Hugging Face

In [9]:
import numpy as np
import librosa
from datasets import load_dataset
from jiwer import wer, cer

def evaluate_on_hf_dataset(dataset_name: str, split: str = 'test', max_samples: int = 10):
    """Hitung WER & CER rata-rata pada subset dataset Hugging Face.

    Asumsi kolom:
      - 'audio' berisi info audio (dengan field 'array' dan 'sampling_rate')
      - 'text' berisi transkrip ground truth
    """
    print(f'\nMemuat dataset: {dataset_name} ({split})...')
    ds = load_dataset(dataset_name, split=split)

    if max_samples is not None:
        ds = ds.select(range(min(len(ds), max_samples)))

    refs = []
    hyps = []

    for i, sample in enumerate(ds):
        ref_text = sample.get('text', '').strip()
        if not ref_text:
            continue

        audio_info = sample['audio']

        # Ambil waveform & sampling rate dari dataset
        if isinstance(audio_info, dict) and 'array' in audio_info:
            audio_array = np.array(audio_info['array'], dtype=np.float32)
            sr = audio_info.get('sampling_rate', ds.features['audio'].sampling_rate)
        else:
            print(f'Sample {i}: audio tidak punya array yang jelas, dilewati.')
            continue

        # Resample ke 16kHz (standar Whisper)
        target_sr = 16000
        if sr != target_sr:
            audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=target_sr)

        # Transkripsi langsung dari waveform, tanpa ffmpeg
        result = whisper_model.transcribe(
            audio_array,
            language='en',
            fp16=False  # kamu di CPU, jadi pakai FP32
        )
        hyp_text = result.get('text', '').strip()

        if not hyp_text:
            continue

        refs.append(ref_text)
        hyps.append(hyp_text)

        print(f'Sample {i}')
        print('REF:', ref_text)
        print('HYP:', hyp_text)
        print('---')

    if not refs:
        print('Tidak ada pasangan REF-HYP yang valid.')
        return None

    wer_score = wer(refs, hyps)
    cer_score = cer(refs, hyps)

    print(f'\nWER rata-rata: {wer_score:.4f}')
    print(f'CER rata-rata: {cer_score:.4f}')

    return {
        'wer': wer_score,
        'cer': cer_score,
        'num_samples': len(refs)
    }

# Panggil seperti biasa:
metrics = evaluate_on_hf_dataset(HF_DATASET_NAME, HF_SPLIT, MAX_EVAL_SAMPLES)
metrics



Memuat dataset: rakshya34/filtered_english_female_voice_v1 (train)...


Downloading readme: 100%|██████████| 533/533 [00:00<?, ?B/s] 
Downloading data: 100%|██████████| 18/18 [1:04:01<00:00, 213.42s/files]
Generating train split: 100%|██████████| 216130/216130 [00:42<00:00, 5040.63 examples/s]


Sample 0
REF: "The Chronicle" is supported by the student activity fee and advertising.
HYP: The clinical is supported by the student activity few and advertising.
---
Sample 1
REF: He denied all allegations, although he later reached a settlement with Evangelista.
HYP: He denied all allegations of all he later reached a certain way to me, a venge on this start.
---
Sample 2
REF: Rock has one son, Alexander John, with his wife Liza.
HYP: No Kazwan-san Alexander-John with his wife Lisa.
---
Sample 3
REF: The park features a large pond, skate park and several football pitches.
HYP: The park features a large pond, skate park and several football beaches.
---
Sample 4
REF: The Greater Glenmont Civic Association won several awards and grants for its activities.
HYP: The Greater Glenmont Civic Association won several awards and grants for its activities.
---
Sample 5
REF: As a part of this initiative Noir repainted several of his original works.
HYP: As a part of this initiative, Noa repaint

{'wer': 0.20173745173745175, 'cer': 0.07359098228663447, 'num_samples': 100}

## 9. Menyimpan Hasil Akhir ke JSON

In [10]:
output = {
    'file_path': FILE_PATH,
    'model_size': MODEL_SIZE,
    'device': DEVICE,
    'segments': [
        {
            'start_sec': seg['start'],
            'end_sec': seg['end'],
            'start_time': format_time(seg['start']),
            'end_time': format_time(seg['end']),
            'speaker': seg['speaker'],
            'text': seg['text']
        }
        for seg in final_segments
    ],
    'wer_cer_metrics': metrics
}

os.makedirs('outputs', exist_ok=True)
json_path = os.path.join('outputs', 'asr_diarization_output.json')

with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

print(f'JSON disimpan di: {json_path}')
json_path


JSON disimpan di: outputs\asr_diarization_output.json


'outputs\\asr_diarization_output.json'