<a href="https://colab.research.google.com/github/NebaFatima/Speech-to-Text-Transcripter/blob/main/STT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# 1. Install dependencies

!pip install -q noisereduce git+https://github.com/openai/whisper.git soundfile librosa
!pip install jiwer


import os
import numpy as np
import soundfile as sf
import librosa
import noisereduce as nr
import whisper
from pathlib import Path
from IPython.display import Audio, display

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.1


In [3]:
# 2. Setting paths

from google.colab import drive
drive.mount('/content/drive')
DATA_DIR = "/content/drive/MyDrive/projects"
clean_dir = Path(DATA_DIR) / "clean"
noisy_dir = Path(DATA_DIR) / "babble_10dB" / "10dB"
output_dir = Path("/content/denoised_outputs")
output_dir.mkdir(exist_ok=True, parents=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# 3. Loading Whisper model

model = whisper.load_model("small")


100%|███████████████████████████████████████| 461M/461M [00:13<00:00, 34.8MiB/s]


In [5]:
# 4. Denoise function

def denoise_audio(y, sr):
    """Denoise with spectral gating (noisereduce)."""
    y = librosa.to_mono(y) if y.ndim > 1 else y
    y = y.astype(np.float32)

    # Estimate noise from first 0.5s
    n_samples = min(len(y), int(0.5 * sr))
    noise_clip = y[:n_samples] if n_samples > 0 else None
    den = nr.reduce_noise(y=y, sr=sr, y_noise=noise_clip)
    return den.astype(np.float32)

In [6]:
# 5. Process and test one file

noisy_path = noisy_dir / "sp01_babble_sn10.wav"
denoised_path = output_dir / "sp01.wav"

# Load noisy audio
noisy_audio, sr = librosa.load(noisy_path, sr=16000)

# Denoise
denoised_audio = denoise_audio(noisy_audio, sr)

# Save denoised audio
sf.write(denoised_path, denoised_audio, sr)

# Listen to noisy vs denoised
print("Noisy File:")
display(Audio(noisy_audio, rate=sr))

print("Denoised File:")
display(Audio(denoised_audio, rate=sr))

Noisy File:


Denoised File:


In [18]:
# 6. Transcribe with Whisper

print("Transcription of denoised audio:")
result = model.transcribe(str(denoised_path),fp16=False)
print(result["text"])

Transcription of denoised audio:
 The birch canoes live on the smooth plains.


In [19]:
# # 7. Batch process all files

# import re

# def natural_key(text):
#     return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', text)]

# def batch_process():

#     files = sorted(noisy_dir.glob("*.wav"), key=lambda x: natural_key(x.name))

#     for file in files:
#         fname = file.name.replace("_babble_sn10", "")
#         out_file = output_dir / fname

#         noisy, sr = librosa.load(file, sr=16000)
#         den = denoise_audio(noisy, sr)
#         sf.write(out_file, den, sr)

#         result = model.transcribe(str(out_file), fp16=False)
#         print(f"{fname} → {result['text']}")

# batch_process()


In [21]:
# # Evaluation: SNR + WER

# from jiwer import wer

# def compute_snr(clean, test):
#     """
#     Compute Signal-to-Noise Ratio (SNR) in dB.
#     clean: reference clean waveform
#     test: noisy or denoised waveform
#     """
#     # Align length
#     min_len = min(len(clean), len(test))
#     clean, test = clean[:min_len], test[:min_len]

#     noise = test - clean
#     snr = 10 * np.log10(np.sum(clean**2) / (np.sum(noise**2) + 1e-10))
#     return snr

# # Example paths (change to match one test case)
# clean_file = "/content/drive/MyDrive/projects/clean/sp23.wav"
# noisy_file = "/content/drive/MyDrive/projects/babble_10dB/10dB/sp23_babble_sn10.wav"
# denoised_file = "/content/denoised_outputs/sp23.wav"

# # Load audios
# clean_audio, sr = librosa.load(clean_file, sr=16000)
# noisy_audio, _ = librosa.load(noisy_file, sr=16000)
# denoised_audio, _ = librosa.load(denoised_file, sr=16000)

# # --- SNR ---
# print("Noisy SNR:", compute_snr(clean_audio, noisy_audio))
# print("Denoised SNR:", compute_snr(clean_audio, denoised_audio))

# # --- WER ---
# # Ground truth (if available for clean speech)
# ground_truth = ground_truth_map.get(fname, "")


# # Get STT for noisy & denoised
# noisy_trans = model.transcribe(noisy_file, fp16=False)["text"].lower()
# denoised_trans = model.transcribe(denoised_file, fp16=False)["text"].lower()

# print("\nGround Truth:", ground_truth)
# print("Noisy Transcription:", noisy_trans)
# print("Denoised Transcription:", denoised_trans)

# print("\nWER (Noisy):", wer(ground_truth, noisy_trans))
# print("WER (Denoised):", wer(ground_truth, denoised_trans))


In [20]:
ground_truth_map = {
    "sp01": "the birch canoes live on the smooth plains",
    "sp02": "he knew the skill of the great young actress",
    "sp03": "the purse is full of useful scratch",
    "sp04": "rebirth out loud for pleasure",
    "sp05": "wipe the grease off the dirty face",
    "sp06": "men strive, but seldom get rich",
    "sp07": "we find joy in the simplest things",
    "sp08": "hedge apples may stain your hands green",
    "sp09": "turtle the pit with the aid of a long pole",
    "sp10": "the sky that morning was clear and bright blue",
    "sp11": "he wrote down a long list of items",
    "sp12": "the drip of the rain made a pleasant sound",
    "sp13": "smoke poured out of every crack",
    "sp14": "pass a warm tea and not to dinner",
    "sp15": "the closed stride on a thin wooden rack",
    "sp16": "the stray cat came first",
    "sp17": "the lazy cow laying the full grass",
    "sp18": "the friendly gang left the drug store",
    "sp19": "we often decide to in a circle",
    "sp20": "instead of china hit the floor with a crash",
    "sp21": "plants are small, round, soft, and thick",
    "sp22": "the line where the edges join with the edge",
    "sp23": "stop whittling and watch the boys march",
    "sp24": "accrued in warm waters in a sleek yacht, fun",
    "sp25": "a good book informs us what we ought to know",
    "sp26": "she has a smart way of wearing clothes",
    "sp27": "bring your best compass to the third class",
    "sp28": "the club rented the ring for the fifth night",
    "sp29": "the slints buttered and lit a pine torch",
    "sp30": "so let's all join as we sing the last chord"
}


In [17]:
import pandas as pd
import re
from jiwer import wer

def natural_key(text):
    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', text)]

def compute_snr(clean, test):
    min_len = min(len(clean), len(test))
    clean, test = clean[:min_len], test[:min_len]
    noise = test - clean
    return 10 * np.log10(np.sum(clean**2) / (np.sum(noise**2) + 1e-10))

results = []

# loop over files in natural order
files = sorted(noisy_dir.glob("*.wav"), key=lambda x: natural_key(x.name))

for file in files:
    # extract ID (e.g., sp23)
    fname = file.name.replace("_babble_sn10.wav", "")
    clean_file = clean_dir / f"{fname}.wav"
    denoised_file = output_dir / f"{fname}.wav"

    # load audio
    clean_audio, sr = librosa.load(clean_file, sr=16000)
    noisy_audio, _ = librosa.load(file, sr=16000)
    denoised_audio, _ = librosa.load(denoised_file, sr=16000)

    # SNR
    noisy_snr = compute_snr(clean_audio, noisy_audio)
    denoised_snr = compute_snr(clean_audio, denoised_audio)

    # Ground truth
    ground_truth = ground_truth_map.get(fname, "")

    # Transcribe
    noisy_trans = model.transcribe(str(file), fp16=False)["text"].lower()
    denoised_trans = model.transcribe(str(denoised_file), fp16=False)["text"].lower()

    # WER
    noisy_wer = wer(ground_truth, noisy_trans)
    denoised_wer = wer(ground_truth, denoised_trans)

    # Save to results
    results.append({
        "File": fname,
        "Noisy_SNR": noisy_snr,
        "Denoised_SNR": denoised_snr,
        "WER_Noisy": noisy_wer,
        "WER_Denoised": denoised_wer,
        "GT": ground_truth,
        "Noisy_Trans": noisy_trans,
        "Denoised_Trans": denoised_trans
    })

# make dataframe + save
df = pd.DataFrame(results)
df.to_csv("evaluation_results.csv", index=False)
df.to_markdown("evaluation_results.md", index=False)

print(df.head())


   File  Noisy_SNR  Denoised_SNR  WER_Noisy  WER_Denoised  \
0  sp01   9.330283      4.265732   0.250000      0.125000   
1  sp02   9.333097      4.834664   0.111111      0.111111   
2  sp03   9.450253      5.605457   0.142857      0.142857   
3  sp04   9.539553      5.607015   0.600000      0.200000   
4  sp05   9.321819      5.829581   0.428571      0.142857   

                                             GT  \
0    the birch canoes live on the smooth plains   
1  he knew the skill of the great young actress   
2           the purse is full of useful scratch   
3                 rebirth out loud for pleasure   
4            wipe the grease off the dirty face   

                                      Noisy_Trans  \
0      the birch canoes slid on the smooth flank.   
1   he knew the skill of the great young actress.   
2            the purse is full of useful scratch.   
3              read first, out loud for pleasure.   
4            wipes the grease off his dirty face.   

       