In [23]:
import os
import re
import string
import difflib
from glob import glob
from collections import Counter
import contractions

import jiwer
import nltk 
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nishithreddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
# --- Configuration ---
# Folders for reference (.txt) and ASR output (.srt)
reference_folder = "transcripts/original_transcripts" 
fw_asr_folder = "transcripts/faster_whisper_transcripts"
wb_asr_folder = "transcripts/whisper_base_transcripts"
wt_asr_folder = "transcripts/whisper_tiny_transcripts"



In [25]:
# Find all the reference text files
reference_files = glob(os.path.join(reference_folder, "*.txt"))


In [26]:
FILLER_WORDS = set([
    'uh', 'um', 'oh', 'ah', 'er', 'mm', 'hm', 'mmhmm','hmm', 'mhm', 'mmhm', 'uhhuh',
    'yeah', 'yes', 'no', 'ok', 'okay', 'well', 'like', 'so', 'right',
    'you know', 'i mean', '–', 'know', 'think', 'really', 'kind', 'mean', 'sort', 'said'
])

NUMBER_MAP = {
    'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5',
    'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10',
    'eleven': '11', 'twelve': '12', 'thirteen': '13', 'fourteen': '14',
    'fifteen': '15', 'sixteen': '16', 'seventeen': '17', 'eighteen': '18',
    'nineteen': '19', 'twenty': '20', 'thirty': '30', 'forty': '40',
    'fifty': '50', 'sixty': '60', 'seventy': '70', 'eighty': '80', 'ninety': '90',
    'hundred': '100', 'thousand': '1000'
}

def normalize(text):
    """Lowercase, remove punctuation, and collapse whitespace."""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    try:
        text = contractions.fix(text)
    except Exception:
        pass # Ignore errors on weird inputs
    
    try:
        words = text.split()
        new_words = []
        for word in words:
            # If word is in our map, replace it. Otherwise, keep it.
            new_words.append(NUMBER_MAP.get(word, word))
        text = " ".join(new_words)
    except Exception:
        pass # Ignore errors    # NEW: Remove filler words
    # This is a simple way; a more robust way would use regex word boundaries
    words = text.split()
    non_filler_words = [word for word in words if word not in FILLER_WORDS]
    text = " ".join(non_filler_words)
    
    text = text.replace('\n', ' ').replace('\t', ' ')
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [27]:
def extract_text_from_srt(srt_path):
    """Extracts only the dialogue text from an SRT subtitle file."""
    with open(srt_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    
    text_lines = []
    # Regex to identify timestamp lines like "00:00:01,234 --> 00:00:05,678"
    timestamp_re = re.compile(r'\d{2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}')
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if re.match(r'^\d+$', line):  # Skip subtitle number lines
            continue
        if timestamp_re.match(line): # Skip timestamp lines
            continue
        text_lines.append(line)
        
    return " ".join(text_lines)

In [28]:

# --- Accumulators for error analysis ---
wer_scores = []
sub_counts = Counter()   # (reference_word, hypothesis_word) -> count
ins_counts = Counter()   # inserted_word -> count
del_counts = Counter()   # deleted_word -> count

# --- Counters for "meaningful" errors ---
# We'll use these to store errors that DON'T involve a stopword
meaningful_sub_counts = Counter()
meaningful_ins_counts = Counter()
meaningful_del_counts = Counter()

stop_words = set(nltk.corpus.stopwords.words('english'))

### Whisper Base

In [29]:
for reference_file in reference_files:
    filename = os.path.splitext(os.path.basename(reference_file))[0]
    asr_file = os.path.join(wb_asr_folder, filename + ".srt")
    
    if not os.path.exists(asr_file):
        print(f"SKIPPING: ASR file missing for {filename}")
        continue

    # --- Get Reference and Hypothesis Text ---
    
    # 1. Reference text from the clean .txt file
    with open(reference_file, "r", encoding="utf-8") as f:
        reference_text = f.read()
    
    # NEW: Remove bracketed text like [laughs] or [unintelligible]
    reference_text = re.sub(r'\[.*?\]', '', reference_text)
    
    reference = normalize(reference_text)

    # 2. Hypothesis text from the ASR's .srt file
    hypothesis_text = extract_text_from_srt(asr_file)
    hypothesis = normalize(hypothesis_text)

    # --- Calculate WER ---
    wer = jiwer.wer(reference, hypothesis)
    wer_scores.append((filename, wer))
    print(f"{filename}: WER = {wer:.3f}")

    # --- Detailed Error Analysis using difflib ---
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
    
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "replace":
            for ref_word, hyp_word in zip(ref_words[i1:i2], hyp_words[j1:j2]):
                # 1. Update total counts
                sub_counts[(ref_word, hyp_word)] += 1
                
                # 2. NEW: Update meaningful counts
                # Only count if at least one of the words is NOT a stopword
                if ref_word not in stop_words or hyp_word not in stop_words:
                    meaningful_sub_counts[(ref_word, hyp_word)] += 1
                    
        elif tag == "delete":
            for ref_word in ref_words[i1:i2]:
                # 1. Update total counts
                del_counts[ref_word] += 1
                
                # 2. NEW: Update meaningful counts
                # Only count if the deleted word is NOT a stopword
                if ref_word not in stop_words:
                    meaningful_del_counts[ref_word] += 1
                    
        elif tag == "insert":
            for hyp_word in hyp_words[j1:j2]:
                # 1. Update total counts
                ins_counts[hyp_word] += 1
                
                # 2. NEW: Update meaningful counts
                # Only count if the inserted word is NOT a stopword
                if hyp_word not in stop_words:
                    meaningful_ins_counts[hyp_word] += 1

lawson-steve-20111111-stereo: WER = 0.138
lawson-steve-20111114-stereo: WER = 0.242
hedberg-ken-20110929-stereo: WER = 0.117
roth-jean-starker-20071031: WER = 0.378
lawson-steve-20110919-stereo: WER = 0.215
lawson-steve-20110826-stereo: WER = 0.383
white-charlie-20110518: WER = 0.297
bella-david-20140709: WER = 0.143
block-john-20140805: WER = 0.162
parr-al-20140618: WER = 0.265
robbins-bill-20120327-stereo: WER = 0.194
hedberg-ken-20110920-stereo: WER = 0.177
coleman-ralph-20140708: WER = 0.250
roth-jean-starker-20071113: WER = 0.446
mathews-chris-20110902-stereo-final: WER = 0.147
strauss-steve-20170307: WER = 0.176
hedberg-ken-20110909-stereo: WER = 0.162
hedberg-ken-20111020-stereo: WER = 0.176


In [30]:
# --- Summary and Results ---
if wer_scores:
    avg_wer = sum(score for _, score in wer_scores) / len(wer_scores)
    print(f"\nAverage WER across all files: {avg_wer:.3f}")

print("\nWhispers used: whisper Base") # You can change this manually

print("\nTop 10 Meaningful Word Substitution Errors:")
for (ref, hyp), count in meaningful_sub_counts.most_common(10):
    print(f"  '{ref}' → '{hyp}' ({count} times)")

print("\nTop 10 Meaningful Word Insertion Errors:")
for word, count in meaningful_ins_counts.most_common(10):
    print(f"  '{word}' ({count} times)")

print("\nTop 10 Meaningful Word Deletion Errors:")
for word, count in meaningful_del_counts.most_common(10):
    print(f"  '{word}' ({count} times)")


Average WER across all files: 0.226

Whispers used: whisper Base

Top 10 Meaningful Word Substitution Errors:
  'pauling' → 'pauline' (50 times)
  'id' → 'i' (38 times)
  'pauling' → 'pong' (33 times)
  'wed' → 'we' (31 times)
  'lise' → 'lisa' (29 times)
  'rath' → 'rat' (21 times)
  'can' → 'could' (21 times)
  'i' → 'id' (20 times)
  'pauling' → 'plain' (19 times)
  'could' → 'can' (19 times)

Top 10 Meaningful Word Insertion Errors:
  'would' (74 times)
  'time' (64 times)
  'good' (48 times)
  '1' (36 times)
  'course' (32 times)
  'say' (32 times)
  'also' (30 times)
  'could' (30 times)
  'lot' (29 times)
  'little' (28 times)

Top 10 Meaningful Word Deletion Errors:
  'would' (52 times)
  'liked' (14 times)
  'time' (12 times)
  'could' (9 times)
  'says' (8 times)
  'little' (8 times)
  'us' (8 times)
  '1' (7 times)
  'interesting' (6 times)
  'went' (6 times)


### Whisper tiny

In [31]:
# --- Accumulators for error analysis ---
wer_scores = []
sub_counts = Counter()   # (reference_word, hypothesis_word) -> count
ins_counts = Counter()   # inserted_word -> count
del_counts = Counter()   # deleted_word -> count

# --- Counters for "meaningful" errors ---
# We'll use these to store errors that DON'T involve a stopword
meaningful_sub_counts = Counter()
meaningful_ins_counts = Counter()
meaningful_del_counts = Counter()

In [32]:
for reference_file in reference_files:
    filename = os.path.splitext(os.path.basename(reference_file))[0]
    asr_file = os.path.join(wt_asr_folder, filename + ".srt")
    
    if not os.path.exists(asr_file):
        print(f"SKIPPING: ASR file missing for {filename}")
        continue

    # --- Get Reference and Hypothesis Text ---
    
    # 1. Reference text from the clean .txt file
    with open(reference_file, "r", encoding="utf-8") as f:
        reference_text = f.read()
    
    # NEW: Remove bracketed text like [laughs] or [unintelligible]
    reference_text = re.sub(r'\[.*?\]', '', reference_text)
    
    reference = normalize(reference_text)

    # 2. Hypothesis text from the ASR's .srt file
    hypothesis_text = extract_text_from_srt(asr_file)
    hypothesis = normalize(hypothesis_text)

    # --- Calculate WER ---
    wer = jiwer.wer(reference, hypothesis)
    wer_scores.append((filename, wer))
    print(f"{filename}: WER = {wer:.3f}")

   # --- Detailed Error Analysis using difflib ---
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
    
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "replace":
            for ref_word, hyp_word in zip(ref_words[i1:i2], hyp_words[j1:j2]):
                # 1. Update total counts
                sub_counts[(ref_word, hyp_word)] += 1
                
                # 2. NEW: Update meaningful counts
                # Only count if at least one of the words is NOT a stopword
                if ref_word not in stop_words or hyp_word not in stop_words:
                    meaningful_sub_counts[(ref_word, hyp_word)] += 1
                    
        elif tag == "delete":
            for ref_word in ref_words[i1:i2]:
                # 1. Update total counts
                del_counts[ref_word] += 1
                
                # 2. NEW: Update meaningful counts
                # Only count if the deleted word is NOT a stopword
                if ref_word not in stop_words:
                    meaningful_del_counts[ref_word] += 1
                    
        elif tag == "insert":
            for hyp_word in hyp_words[j1:j2]:
                # 1. Update total counts
                ins_counts[hyp_word] += 1
                
                # 2. NEW: Update meaningful counts
                # Only count if the inserted word is NOT a stopword
                if hyp_word not in stop_words:
                    meaningful_ins_counts[hyp_word] += 1

lawson-steve-20111111-stereo: WER = 0.160
lawson-steve-20111114-stereo: WER = 0.274
hedberg-ken-20110929-stereo: WER = 0.137
roth-jean-starker-20071031: WER = 0.413
lawson-steve-20110919-stereo: WER = 0.225
lawson-steve-20110826-stereo: WER = 0.404
white-charlie-20110518: WER = 0.358
bella-david-20140709: WER = 0.157
block-john-20140805: WER = 0.185
parr-al-20140618: WER = 0.281
robbins-bill-20120327-stereo: WER = 0.216
hedberg-ken-20110920-stereo: WER = 0.218
coleman-ralph-20140708: WER = 0.288
roth-jean-starker-20071113: WER = 0.459
mathews-chris-20110902-stereo-final: WER = 0.174
strauss-steve-20170307: WER = 0.211
hedberg-ken-20110909-stereo: WER = 0.186
hedberg-ken-20111020-stereo: WER = 0.203


In [33]:
# --- Summary and Results ---
if wer_scores:
    avg_wer = sum(score for _, score in wer_scores) / len(wer_scores)
    print(f"\nAverage WER across all files: {avg_wer:.3f}")

print("\nWhispers used: whisper Tiny") # You can change this manually

print("\nTop 10 Meaningful Word Substitution Errors:")
for (ref, hyp), count in meaningful_sub_counts.most_common(10):
    print(f"  '{ref}' → '{hyp}' ({count} times)")

print("\nTop 10 Meaningful Word Insertion Errors:")
for word, count in meaningful_ins_counts.most_common(10):
    print(f"  '{word}' ({count} times)")

print("\nTop 10 Meaningful Word Deletion Errors:")
for word, count in meaningful_del_counts.most_common(10):
    print(f"  '{word}' ({count} times)")


Average WER across all files: 0.253

Whispers used: whisper Tiny

Top 10 Meaningful Word Substitution Errors:
  'id' → 'i' (46 times)
  'wed' → 'we' (35 times)
  'pauling' → 'pong' (31 times)
  'pauling' → 'poly' (27 times)
  'i' → 'id' (24 times)
  'can' → 'could' (24 times)
  'lise' → 'lisa' (23 times)
  'pauling' → 'is' (22 times)
  'he' → 'hes' (20 times)
  'had' → 'would' (19 times)

Top 10 Meaningful Word Insertion Errors:
  'would' (61 times)
  'time' (60 times)
  'good' (44 times)
  '1' (38 times)
  'course' (36 times)
  'could' (30 times)
  'also' (29 times)
  'say' (27 times)
  'little' (26 times)
  'probably' (24 times)

Top 10 Meaningful Word Deletion Errors:
  'would' (50 times)
  'liked' (17 times)
  '1' (10 times)
  'time' (10 times)
  'little' (9 times)
  'go' (9 times)
  'interesting' (7 times)
  'could' (7 times)
  'say' (7 times)
  'talk' (6 times)


### Faster Whisper Base

In [34]:
# --- Accumulators for error analysis ---
wer_scores = []
sub_counts = Counter()   # (reference_word, hypothesis_word) -> count
ins_counts = Counter()   # inserted_word -> count
del_counts = Counter()   # deleted_word -> count

# --- Counters for "meaningful" errors ---
# We'll use these to store errors that DON'T involve a stopword
meaningful_sub_counts = Counter()
meaningful_ins_counts = Counter()
meaningful_del_counts = Counter()

In [35]:
for reference_file in reference_files:
    filename = os.path.splitext(os.path.basename(reference_file))[0]
    asr_file = os.path.join(fw_asr_folder, filename + ".srt")
    
    if not os.path.exists(asr_file):
        print(f"SKIPPING: ASR file missing for {filename}")
        continue

    # --- Get Reference and Hypothesis Text ---
    
    # 1. Reference text from the clean .txt file
    with open(reference_file, "r", encoding="utf-8") as f:
        reference_text = f.read()
    
    # NEW: Remove bracketed text like [laughs] or [unintelligible]
    reference_text = re.sub(r'\[.*?\]', '', reference_text)
    
    reference = normalize(reference_text)

    # 2. Hypothesis text from the ASR's .srt file
    hypothesis_text = extract_text_from_srt(asr_file)
    hypothesis = normalize(hypothesis_text)

    # --- Calculate WER ---
    wer = jiwer.wer(reference, hypothesis)
    wer_scores.append((filename, wer))
    print(f"{filename}: WER = {wer:.3f}")

 # --- Detailed Error Analysis using difflib ---
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
    
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "replace":
            for ref_word, hyp_word in zip(ref_words[i1:i2], hyp_words[j1:j2]):
                # 1. Update total counts
                sub_counts[(ref_word, hyp_word)] += 1
                
                # 2. NEW: Update meaningful counts
                # Only count if at least one of the words is NOT a stopword
                if ref_word not in stop_words or hyp_word not in stop_words:
                    meaningful_sub_counts[(ref_word, hyp_word)] += 1
                    
        elif tag == "delete":
            for ref_word in ref_words[i1:i2]:
                # 1. Update total counts
                del_counts[ref_word] += 1
                
                # 2. NEW: Update meaningful counts
                # Only count if the deleted word is NOT a stopword
                if ref_word not in stop_words:
                    meaningful_del_counts[ref_word] += 1
                    
        elif tag == "insert":
            for hyp_word in hyp_words[j1:j2]:
                # 1. Update total counts
                ins_counts[hyp_word] += 1
                
                # 2. NEW: Update meaningful counts
                # Only count if the inserted word is NOT a stopword
                if hyp_word not in stop_words:
                    meaningful_ins_counts[hyp_word] += 1

lawson-steve-20111111-stereo: WER = 0.131
lawson-steve-20111114-stereo: WER = 0.238
hedberg-ken-20110929-stereo: WER = 0.111
roth-jean-starker-20071031: WER = 0.369
lawson-steve-20110919-stereo: WER = 0.230
lawson-steve-20110826-stereo: WER = 0.380
white-charlie-20110518: WER = 0.284
bella-david-20140709: WER = 0.117
block-john-20140805: WER = 0.145
parr-al-20140618: WER = 0.239
robbins-bill-20120327-stereo: WER = 0.176
hedberg-ken-20110920-stereo: WER = 0.162
coleman-ralph-20140708: WER = 0.216
roth-jean-starker-20071113: WER = 0.410
mathews-chris-20110902-stereo-final: WER = 0.129
strauss-steve-20170307: WER = 0.184
hedberg-ken-20110909-stereo: WER = 0.153
hedberg-ken-20111020-stereo: WER = 0.175


In [36]:
# --- Summary and Results ---
if wer_scores:
    avg_wer = sum(score for _, score in wer_scores) / len(wer_scores)
    print(f"\nAverage WER across all files: {avg_wer:.3f}")

print("\nWhispers used: Faster Whisper Base") # You can change this manually

print("\nTop 10 Meaningful Word Substitution Errors:")
for (ref, hyp), count in meaningful_sub_counts.most_common(10):
    print(f"  '{ref}' → '{hyp}' ({count} times)")

print("\nTop 10 Meaningful Word Insertion Errors:")
for word, count in meaningful_ins_counts.most_common(10):
    print(f"  '{word}' ({count} times)")

print("\nTop 10 Meaningful Word Deletion Errors:")
for word, count in meaningful_del_counts.most_common(10):
    print(f"  '{word}' ({count} times)")


Average WER across all files: 0.214

Whispers used: Faster Whisper Base

Top 10 Meaningful Word Substitution Errors:
  'pauling' → 'pauline' (82 times)
  'id' → 'i' (42 times)
  'lise' → 'lisa' (27 times)
  'i' → 'id' (25 times)
  'pauling' → 'pond' (24 times)
  'wed' → 'we' (23 times)
  'paulings' → 'paulines' (22 times)
  'could' → 'can' (21 times)
  'rath' → 'rat' (21 times)
  'pauling' → 'pong' (18 times)

Top 10 Meaningful Word Insertion Errors:
  'would' (67 times)
  'time' (63 times)
  'good' (49 times)
  'course' (34 times)
  '1' (33 times)
  'little' (31 times)
  'also' (28 times)
  'could' (28 times)
  'got' (28 times)
  'sure' (27 times)

Top 10 Meaningful Word Deletion Errors:
  'would' (51 times)
  'little' (14 times)
  '1' (11 times)
  'says' (10 times)
  'work' (9 times)
  'time' (9 times)
  'liked' (8 times)
  'sure' (8 times)
  'course' (7 times)
  'could' (7 times)
