In [1]:
import os
import re
import string
import difflib
from glob import glob
from collections import Counter

import jiwer

In [2]:
# --- Configuration ---
# Folders for reference (.txt) and ASR output (.srt)
reference_folder = "transcripts/original_transcripts" 
fw_asr_folder = "transcripts/faster_whisper_transcripts"
wb_asr_folder = "transcripts/whisper_base_transcripts"
wt_asr_folder = "transcripts/whisper_tiny_transcripts"



In [3]:
# Find all the reference text files
reference_files = glob(os.path.join(reference_folder, "*.txt"))


In [4]:
def normalize(text):
    """Lowercase, remove punctuation, and collapse whitespace."""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.replace('\n', ' ').replace('\t', ' ')
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [5]:
def extract_text_from_srt(srt_path):
    """Extracts only the dialogue text from an SRT subtitle file."""
    with open(srt_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    
    text_lines = []
    # Regex to identify timestamp lines like "00:00:01,234 --> 00:00:05,678"
    timestamp_re = re.compile(r'\d{2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}')
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if re.match(r'^\d+$', line):  # Skip subtitle number lines
            continue
        if timestamp_re.match(line): # Skip timestamp lines
            continue
        text_lines.append(line)
        
    return " ".join(text_lines)

In [6]:

# --- Accumulators for error analysis ---
wer_scores = []
sub_counts = Counter()   # (reference_word, hypothesis_word) -> count
ins_counts = Counter()   # inserted_word -> count
del_counts = Counter()   # deleted_word -> count

### Whisper Base

In [7]:
for reference_file in reference_files:
    filename = os.path.splitext(os.path.basename(reference_file))[0]
    asr_file = os.path.join(wb_asr_folder, filename + ".srt")
    
    if not os.path.exists(asr_file):
        print(f"SKIPPING: ASR file missing for {filename}")
        continue

    # --- Get Reference and Hypothesis Text ---
    
    # 1. Reference text from the clean .txt file
    with open(reference_file, "r", encoding="utf-8") as f:
        reference_text = f.read()
    
    # NEW: Remove bracketed text like [laughs] or [unintelligible]
    reference_text = re.sub(r'\[.*?\]', '', reference_text)
    
    reference = normalize(reference_text)

    # 2. Hypothesis text from the ASR's .srt file
    hypothesis_text = extract_text_from_srt(asr_file)
    hypothesis = normalize(hypothesis_text)

    # --- Calculate WER ---
    wer = jiwer.wer(reference, hypothesis)
    wer_scores.append((filename, wer))
    print(f"{filename}: WER = {wer:.3f}")

    # --- Detailed Error Analysis using difflib ---
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
    
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "replace":
            for ref_word, hyp_word in zip(ref_words[i1:i2], hyp_words[j1:j2]):
                sub_counts[(ref_word, hyp_word)] += 1
        elif tag == "delete":
            for ref_word in ref_words[i1:i2]:
                del_counts[ref_word] += 1
        elif tag == "insert":
            for hyp_word in hyp_words[j1:j2]:
                ins_counts[hyp_word] += 1

lawson-steve-20111111-stereo: WER = 0.146
lawson-steve-20111114-stereo: WER = 0.254
hedberg-ken-20110929-stereo: WER = 0.123
roth-jean-starker-20071031: WER = 0.404
lawson-steve-20110919-stereo: WER = 0.220
lawson-steve-20110826-stereo: WER = 0.395
white-charlie-20110518: WER = 0.342
bella-david-20140709: WER = 0.152
block-john-20140805: WER = 0.165
parr-al-20140618: WER = 0.279
robbins-bill-20120327-stereo: WER = 0.208
hedberg-ken-20110920-stereo: WER = 0.181
coleman-ralph-20140708: WER = 0.279
roth-jean-starker-20071113: WER = 0.477
mathews-chris-20110902-stereo-final: WER = 0.152
strauss-steve-20170307: WER = 0.205
hedberg-ken-20110909-stereo: WER = 0.169
hedberg-ken-20111020-stereo: WER = 0.201


In [8]:
# --- Summary and Results ---
if wer_scores:
    avg_wer = sum(score for _, score in wer_scores) / len(wer_scores)
    print(f"\nAverage WER across all files: {avg_wer:.3f}")

print("\nWhispers used: whisper Base") # You can change this manually

print("\nTop 10 Substitution Errors:")
for (ref, hyp), count in sub_counts.most_common(10):
    print(f"  '{ref}' → '{hyp}' ({count} times)")

print("\nTop 10 Insertion Errors:")
for word, count in ins_counts.most_common(10):
    print(f"  '{word}' ({count} times)")

print("\nTop 10 Deletion Errors:")
for word, count in del_counts.most_common(10):
    print(f"  '{word}' ({count} times)")


Average WER across all files: 0.242

Whispers used: whisper Base

Top 10 Substitution Errors:
  'in' → 'and' (233 times)
  'a' → 'the' (140 times)
  'the' → 'a' (112 times)
  'and' → 'in' (103 times)
  'and' → 'and' (75 times)
  'was' → 'is' (69 times)
  'the' → 'the' (64 times)
  'pauling' → 'pauline' (51 times)
  'i' → 'and' (51 times)
  'it' → 'that' (47 times)

Top 10 Insertion Errors:
  'and' (2147 times)
  'you' (700 times)
  'the' (607 times)
  'i' (570 times)
  'know' (570 times)
  'so' (554 times)
  'that' (534 times)
  'a' (435 times)
  'was' (400 times)
  'of' (369 times)

Top 10 Deletion Errors:
  'and' (476 times)
  'of' (237 times)
  'the' (220 times)
  'that' (181 times)
  'a' (176 times)
  'it' (133 times)
  'in' (93 times)
  'i' (92 times)
  'to' (82 times)
  'you' (81 times)


### Whisper tiny

In [9]:
for reference_file in reference_files:
    filename = os.path.splitext(os.path.basename(reference_file))[0]
    asr_file = os.path.join(wt_asr_folder, filename + ".srt")
    
    if not os.path.exists(asr_file):
        print(f"SKIPPING: ASR file missing for {filename}")
        continue

    # --- Get Reference and Hypothesis Text ---
    
    # 1. Reference text from the clean .txt file
    with open(reference_file, "r", encoding="utf-8") as f:
        reference_text = f.read()
    
    # NEW: Remove bracketed text like [laughs] or [unintelligible]
    reference_text = re.sub(r'\[.*?\]', '', reference_text)
    
    reference = normalize(reference_text)

    # 2. Hypothesis text from the ASR's .srt file
    hypothesis_text = extract_text_from_srt(asr_file)
    hypothesis = normalize(hypothesis_text)

    # --- Calculate WER ---
    wer = jiwer.wer(reference, hypothesis)
    wer_scores.append((filename, wer))
    print(f"{filename}: WER = {wer:.3f}")

    # --- Detailed Error Analysis using difflib ---
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
    
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "replace":
            for ref_word, hyp_word in zip(ref_words[i1:i2], hyp_words[j1:j2]):
                sub_counts[(ref_word, hyp_word)] += 1
        elif tag == "delete":
            for ref_word in ref_words[i1:i2]:
                del_counts[ref_word] += 1
        elif tag == "insert":
            for hyp_word in hyp_words[j1:j2]:
                ins_counts[hyp_word] += 1

lawson-steve-20111111-stereo: WER = 0.168
lawson-steve-20111114-stereo: WER = 0.286
hedberg-ken-20110929-stereo: WER = 0.143
roth-jean-starker-20071031: WER = 0.462
lawson-steve-20110919-stereo: WER = 0.229
lawson-steve-20110826-stereo: WER = 0.415
white-charlie-20110518: WER = 0.408
bella-david-20140709: WER = 0.164
block-john-20140805: WER = 0.187
parr-al-20140618: WER = 0.286
robbins-bill-20120327-stereo: WER = 0.231
hedberg-ken-20110920-stereo: WER = 0.221
coleman-ralph-20140708: WER = 0.323
roth-jean-starker-20071113: WER = 0.507
mathews-chris-20110902-stereo-final: WER = 0.178
strauss-steve-20170307: WER = 0.240
hedberg-ken-20110909-stereo: WER = 0.193
hedberg-ken-20111020-stereo: WER = 0.213


In [10]:
# --- Summary and Results ---
if wer_scores:
    avg_wer = sum(score for _, score in wer_scores) / len(wer_scores)
    print(f"\nAverage WER across all files: {avg_wer:.3f}")

print("\nWhispers used: whisper Tiny") # You can change this manually

print("\nTop 10 Substitution Errors:")
for (ref, hyp), count in sub_counts.most_common(10):
    print(f"  '{ref}' → '{hyp}' ({count} times)")

print("\nTop 10 Insertion Errors:")
for word, count in ins_counts.most_common(10):
    print(f"  '{word}' ({count} times)")

print("\nTop 10 Deletion Errors:")
for word, count in del_counts.most_common(10):
    print(f"  '{word}' ({count} times)")


Average WER across all files: 0.256

Whispers used: whisper Tiny

Top 10 Substitution Errors:
  'in' → 'and' (532 times)
  'a' → 'the' (288 times)
  'and' → 'in' (264 times)
  'the' → 'a' (253 times)
  'and' → 'and' (180 times)
  'was' → 'is' (152 times)
  'the' → 'the' (144 times)
  'i' → 'and' (102 times)
  'so' → 'and' (102 times)
  'was' → 'was' (100 times)

Top 10 Insertion Errors:
  'and' (4171 times)
  'you' (1393 times)
  'the' (1162 times)
  'know' (1135 times)
  'i' (1121 times)
  'so' (1085 times)
  'that' (1060 times)
  'a' (869 times)
  'of' (748 times)
  'was' (739 times)

Top 10 Deletion Errors:
  'and' (921 times)
  'the' (455 times)
  'of' (450 times)
  'a' (382 times)
  'that' (371 times)
  'it' (273 times)
  'in' (199 times)
  'i' (182 times)
  'you' (169 times)
  'to' (166 times)


### Faster Whisper Base

In [11]:
for reference_file in reference_files:
    filename = os.path.splitext(os.path.basename(reference_file))[0]
    asr_file = os.path.join(fw_asr_folder, filename + ".srt")
    
    if not os.path.exists(asr_file):
        print(f"SKIPPING: ASR file missing for {filename}")
        continue

    # --- Get Reference and Hypothesis Text ---
    
    # 1. Reference text from the clean .txt file
    with open(reference_file, "r", encoding="utf-8") as f:
        reference_text = f.read()
    
    # NEW: Remove bracketed text like [laughs] or [unintelligible]
    reference_text = re.sub(r'\[.*?\]', '', reference_text)
    
    reference = normalize(reference_text)

    # 2. Hypothesis text from the ASR's .srt file
    hypothesis_text = extract_text_from_srt(asr_file)
    hypothesis = normalize(hypothesis_text)

    # --- Calculate WER ---
    wer = jiwer.wer(reference, hypothesis)
    wer_scores.append((filename, wer))
    print(f"{filename}: WER = {wer:.3f}")

    # --- Detailed Error Analysis using difflib ---
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
    
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "replace":
            for ref_word, hyp_word in zip(ref_words[i1:i2], hyp_words[j1:j2]):
                sub_counts[(ref_word, hyp_word)] += 1
        elif tag == "delete":
            for ref_word in ref_words[i1:i2]:
                del_counts[ref_word] += 1
        elif tag == "insert":
            for hyp_word in hyp_words[j1:j2]:
                ins_counts[hyp_word] += 1

lawson-steve-20111111-stereo: WER = 0.140
lawson-steve-20111114-stereo: WER = 0.250
hedberg-ken-20110929-stereo: WER = 0.117
roth-jean-starker-20071031: WER = 0.397
lawson-steve-20110919-stereo: WER = 0.234
lawson-steve-20110826-stereo: WER = 0.392
white-charlie-20110518: WER = 0.332
bella-david-20140709: WER = 0.126
block-john-20140805: WER = 0.148
parr-al-20140618: WER = 0.245
robbins-bill-20120327-stereo: WER = 0.189
hedberg-ken-20110920-stereo: WER = 0.167
coleman-ralph-20140708: WER = 0.235
roth-jean-starker-20071113: WER = 0.436
mathews-chris-20110902-stereo-final: WER = 0.133
strauss-steve-20170307: WER = 0.217
hedberg-ken-20110909-stereo: WER = 0.160
hedberg-ken-20111020-stereo: WER = 0.181


In [12]:
# --- Summary and Results ---
if wer_scores:
    avg_wer = sum(score for _, score in wer_scores) / len(wer_scores)
    print(f"\nAverage WER across all files: {avg_wer:.3f}")

print("\nWhispers used: Faster Whisper Base") # You can change this manually

print("\nTop 10 Substitution Errors:")
for (ref, hyp), count in sub_counts.most_common(10):
    print(f"  '{ref}' → '{hyp}' ({count} times)")

print("\nTop 10 Insertion Errors:")
for word, count in ins_counts.most_common(10):
    print(f"  '{word}' ({count} times)")

print("\nTop 10 Deletion Errors:")
for word, count in del_counts.most_common(10):
    print(f"  '{word}' ({count} times)")


Average WER across all files: 0.246

Whispers used: Faster Whisper Base

Top 10 Substitution Errors:
  'in' → 'and' (749 times)
  'a' → 'the' (426 times)
  'and' → 'in' (369 times)
  'the' → 'a' (362 times)
  'and' → 'and' (253 times)
  'was' → 'is' (211 times)
  'the' → 'the' (200 times)
  'it' → 'that' (147 times)
  'so' → 'and' (147 times)
  'in' → 'on' (147 times)

Top 10 Insertion Errors:
  'and' (6279 times)
  'you' (2084 times)
  'the' (1729 times)
  'know' (1708 times)
  'i' (1658 times)
  'so' (1623 times)
  'that' (1587 times)
  'a' (1287 times)
  'of' (1118 times)
  'was' (1118 times)

Top 10 Deletion Errors:
  'and' (1373 times)
  'of' (696 times)
  'the' (696 times)
  'that' (572 times)
  'a' (566 times)
  'it' (403 times)
  'in' (303 times)
  'i' (282 times)
  'you' (263 times)
  'to' (249 times)
