In [None]:
import whisper
import torch
import jiwer
import string
from scipy.io import wavfile

# --- 1. Define the list of 30 ground truth sentences ---
# Reference transcripts taken from the NOIZEUS corpus documentation.
# The list is 0-indexed: entry i is the sentence spoken in the recordings
# whose filename starts with sp<i+1> (tagged on each line below).
GROUND_TRUTH_SENTENCES = [
    "THE BIRCH CANOE SLID ON THE SMOOTH PLANKS",  # sp01
    "HE KNEW THE SKILL OF THE GREAT YOUNG ACTRESS",  # sp02
    "HER PURSE WAS FULL OF USELESS TRASH",  # sp03
    "READ VERSE OUT LOUD FOR PLEASURE",  # sp04
    "WIPE THE GREASE OFF HIS DIRTY FACE",  # sp05
    "MEN STRIVE BUT SELDOM GET RICH",  # sp06
    "WE FIND JOY IN THE SIMPLEST THINGS",  # sp07
    "HEDGE APPLES MAY STAIN YOUR HANDS GREEN",  # sp08
    "HURDLE THE PIT WITH THE AID OF A LONG POLE",  # sp09
    "THE SKY THAT MORNING WAS CLEAR AND BRIGHT BLUE",  # sp10
    "HE WROTE DOWN A LONG LIST OF ITEMS",  # sp11
    "THE DRIP OF THE RAIN MADE A PLEASANT SOUND",  # sp12
    "SMOKE POURED OUT OF EVERY CRACK",  # sp13
    "HATS ARE WORN TO TEA AND NOT TO DINNER",  # sp14
    "THE CLOTHES DRIED ON A THIN WOODEN RACK",  # sp15
    "THE STRAY CAT GAVE BIRTH TO KITTENS",  # sp16
    "THE LAZY COW LAY IN THE COOL GRASS",  # sp17
    "THE FRIENDLY GANG LEFT THE DRUG STORE",  # sp18
    "WE TALKED OF THE SIDESHOW IN THE CIRCUS",  # sp19
    "THE SET OF CHINA HIT THE FLOOR WITH A CRASH",  # sp20
    "CLAMS ARE SMALL, ROUND, SOFT AND TASTY",  # sp21
    "THE LINE WHERE THE EDGES JOIN WAS CLEAN",  # sp22
    "STOP WHISTLING AND WATCH THE BOYS MARCH",  # sp23
    "A CRUISE IN WARM WATERS IN A SLEEK YACHT IS FUN",  # sp24
    "A GOOD BOOK INFORMS OF WHAT WE OUGHT TO KNOW",  # sp25
    "SHE HAS A SMART WAY OF WEARING CLOTHES",  # sp26
    "BRING YOUR BEST COMPASS TO THE THIRD CLASS",  # sp27
    "THE CLUB RENTED THE RINK FOR THE FIFTH NIGHT",  # sp28
    "THE FLINT SPUTTERED AND LIT A PINE TORCH",  # sp29
    "LET'S ALL JOIN AS WE SING THE LAST CHORUS",  # sp30
]

# --- 2. Load one of your noisy audio files ---
# CONFIGURATION - edit these three values for each test run.

# Name of the Whisper checkpoint to load.
MODEL_SIZE = "base"

# Audio clip to transcribe. The path is relative and starts with `..`, so it
# is resolved against the notebook's working directory and expects `data/`
# to sit one level above it.
AUDIO_FILE_PATH = "../data/cleaned/sp01_car_sn5_cleaned.wav"

# 0-based position of the matching reference sentence. The `spNN` prefix of
# the filename identifies the sentence, so `sp01...` pairs with index 0.
# Keep this in sync with AUDIO_FILE_PATH when you change the file.
SENTENCE_INDEX = 0

# --- 3. Transcribe it using your "Hello Whisper" logic ---
# Prefer a CUDA GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

print(f"Loading Whisper model ('{MODEL_SIZE}')...")
model = whisper.load_model(MODEL_SIZE, device=device)
print("Model loaded successfully.")

print(f"\nStarting transcription of: {AUDIO_FILE_PATH}")
# The reference sentences are English, so pin the language rather than letting
# Whisper guess it from a short, noisy clip (a wrong guess would produce a
# transcript in another language and a meaningless WER).
# Half precision is only requested on the GPU: Whisper's default fp16=True is
# not supported on CPU and only produces a warning before falling back to FP32.
result = model.transcribe(
    AUDIO_FILE_PATH,
    language="en",
    fp16=(device == "cuda"),
)
# The decoded text can carry leading/trailing whitespace; trim it.
whisper_output_raw = result["text"].strip()
print("Transcription complete.")

# --- 4. Use jiwer to calculate the WER ---
# Get the correct ground truth sentence.
# Validate the hand-edited index first: a negative value would otherwise wrap
# around to the end of the list and silently score against the wrong sentence.
if not 0 <= SENTENCE_INDEX < len(GROUND_TRUTH_SENTENCES):
    raise IndexError(
        f"SENTENCE_INDEX={SENTENCE_INDEX} is out of range; expected a value "
        f"from 0 to {len(GROUND_TRUTH_SENTENCES) - 1}"
    )
ground_truth_raw = GROUND_TRUTH_SENTENCES[SENTENCE_INDEX]

# Normalize both strings for an accurate comparison
# (Convert to uppercase and remove all punctuation)
def normalize_text(text):
    """Return ``text`` upper-cased, without punctuation, whitespace collapsed.

    Puts a reference sentence and a transcript into the same canonical form
    so that casing and punctuation differences are not counted as word errors.

    Args:
        text: The raw sentence or transcript.

    Returns:
        The upper-cased text with every ``string.punctuation`` character and
        every Unicode punctuation character (category ``P*``, e.g. curly
        quotes, ellipsis, dashes) deleted, and with runs of whitespace
        reduced to single spaces and no leading or trailing whitespace.
    """
    # Function-scope import keeps this block self-contained.
    import unicodedata

    # One pass removes the ASCII punctuation/symbol set.
    ascii_clean = text.upper().translate(str.maketrans('', '', string.punctuation))
    # string.punctuation is ASCII-only, so typographic marks would otherwise
    # stay glued to words and be scored as substitutions.
    no_punct = "".join(
        ch for ch in ascii_clean if not unicodedata.category(ch).startswith("P")
    )
    # Deleting a free-standing mark ("A , B") leaves a double space; collapse it.
    return " ".join(no_punct.split())

# Bring the reference and the transcript into the same canonical form
# (reference first, then hypothesis) before scoring.
ground_truth, whisper_output = (
    normalize_text(raw) for raw in (ground_truth_raw, whisper_output_raw)
)

# Show both normalized strings side by side so mismatches are easy to spot.
print(
    "\n--- PERFORMANCE ANALYSIS ---",
    f"Ground Truth: {ground_truth}",
    f"Whisper Output: {whisper_output}",
    sep="\n",
)

# Score the transcript: the reference is passed first, the hypothesis second.
# The result is scaled by 100 below so it prints as a percentage.
error = jiwer.wer(ground_truth, whisper_output)
print(f"\nWord Error Rate (WER): {error * 100:.2f}%")

Using device: cuda
Loading Whisper model ('base')...
Model loaded successfully.

Starting transcription of: ../data/raw/car/5dB/sp01_car_sn5.wav
Transcription complete.

--- PERFORMANCE ANALYSIS ---
Ground Truth: THE BIRCH CANOE SLID ON THE SMOOTH PLANKS
Whisper Output: THE BIRCH COMMUNITY HAS LED TO A NEW POINT

Word Error Rate (WER): 87.50%
