In [1]:
# Twi ASR with N-best Rescoring using a 3-gram Kneser–Ney LM
#

!pip -q uninstall -y transformers
!pip -q install transformers accelerate librosa pandas numpy nltk
!apt-get -y install ffmpeg

import os
import re
import math
import pickle
from typing import List, Tuple

import numpy as np
import pandas as pd
import torch
import librosa
from google.colab import drive

from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq


# data path
drive.mount("/content/drive")
AUDIO_DIR = "/content/drive/MyDrive/twi audio"
MANIFEST_PATH = "/content/drive/MyDrive/twi_audios_manifest.csv"
LM_PATH = "/content/drive/MyDrive/twi_kneser_ney_3gram.pkl"

OUT_PATH = "/content/drive/MyDrive/twi_asr_results_10_nbest.csv"

MODEL_ID = "zirri23/whisper-akan-finetuned"

AUDIO_EXTS = (".wav", ".mp3", ".m4a", ".flac", ".ogg")

# N-best settings
N_BEST = 5
TEMPERATURE = 0.7
TOP_P = 0.9
BEAM_WIDTH = 10


# Loading LM
with open(LM_PATH, "rb") as f:
    lm = pickle.load(f)

print("LM loaded:", type(lm))
print("Order:", lm.order)
print("Vocab size:", len(lm.vocab))


# Loading the Whisper Model
device = "cuda" if torch.cuda.is_available() else "cpu"

from transformers import (
    WhisperProcessor,
    AutoModelForSpeechSeq2Seq
)

processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID).to(device)
model.eval()


print(f"Loaded Whisper model: {MODEL_ID} on {device}")


# Text normalization
def normalize_text(text: str) -> str:
    text = text.lower()
    text = text.replace("’", "'").replace("“", '"').replace("”", '"')
    text = re.sub(r"[^a-z0-9ɛɔ'\s-]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize(text: str) -> List[str]:
    text = normalize_text(text)
    toks = []
    for tok in text.split():
        toks.extend(tok.split("-"))
    return [t for t in toks if t]


# Metrics
def edit_distance(a, b):
    dp = [[0]*(len(b)+1) for _ in range(len(a)+1)]
    for i in range(len(a)+1): dp[i][0] = i
    for j in range(len(b)+1): dp[0][j] = j
    for i in range(1, len(a)+1):
        for j in range(1, len(b)+1):
            cost = 0 if a[i-1] == b[j-1] else 1
            dp[i][j] = min(
                dp[i-1][j] + 1,
                dp[i][j-1] + 1,
                dp[i-1][j-1] + cost
            )
    return dp[-1][-1]

def wer(ref: str, hyp: str) -> float:
    r = tokenize(ref)
    h = tokenize(hyp)
    return edit_distance(r, h) / max(1, len(r))

def cer(ref: str, hyp: str) -> float:
    r = normalize_text(ref).replace(" ", "")
    h = normalize_text(hyp).replace(" ", "")
    return edit_distance(list(r), list(h)) / max(1, len(r))


# LM helper functions
def vocab_lookup(token: str) -> str:
    return lm.vocab.lookup([token])[0]

def twi_variants(word: str) -> List[str]:
    w = word.lower()
    variants = {w}
    swaps = [("e","ɛ"), ("ɛ","e"), ("o","ɔ"), ("ɔ","o")]

    for a,b in swaps:
        if a in w:
            variants.add(w.replace(a,b))

    expanded = set(variants)
    for v in list(variants):
        for a,b in swaps:
            if a in v:
                expanded.add(v.replace(a,b))

    out = []
    for v in expanded:
        if v == w or v in lm.vocab:
            out.append(v)
    return list(dict.fromkeys(out))

def correct_transcript_beam(text: str, beam_width: int = 10) -> Tuple[str, float]:
    tokens = tokenize(text)
    if not tokens:
        return "", 0.0

    order = lm.order
    beams = [(0.0, [], ["<s>"]*(order-1))]

    for w in tokens:
        new_beams = []
        for logp, out, ctx in beams:
            context = tuple(ctx[-(order-1):])
            for cand in twi_variants(w):
                cand_lm = vocab_lookup(cand)
                p = lm.score(cand_lm, context)
                add = -50.0 if p <= 0 else math.log(p)
                new_beams.append(
                    (logp+add, out+[cand], (ctx+[cand_lm])[-(order-1):])
                )
        new_beams.sort(key=lambda x: x[0], reverse=True)
        beams = new_beams[:beam_width]

    best_logp, best_out, best_ctx = beams[0]
    end_p = lm.score("</s>", tuple(best_ctx))
    best_logp += (-50.0 if end_p <= 0 else math.log(end_p))
    return " ".join(best_out), best_logp


# N-best transcription
def transcribe_twi_nbest(audio_path: str, n_best: int = 5):
    y, sr = librosa.load(audio_path, sr=16000)
    inputs = processor(y, sampling_rate=16000, return_tensors="pt")
    feats = inputs.input_features.to(device)

    with torch.no_grad():
        ids = model.generate(
            feats,
            task="transcribe",
            do_sample=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            num_return_sequences=n_best
        )

    texts = processor.batch_decode(ids, skip_special_tokens=True)
    return list(dict.fromkeys([t.strip() for t in texts if t.strip()]))

def pick_best_with_lm(candidates):
    scored = []
    for c in candidates:
        fixed, score = correct_transcript_beam(c, BEAM_WIDTH)
        scored.append((c, fixed, score))
    scored.sort(key=lambda x: x[2], reverse=True)
    return scored[0], scored


# data loading
audio_files = sorted([
    os.path.join(AUDIO_DIR, f)
    for f in os.listdir(AUDIO_DIR)
    if f.lower().endswith(AUDIO_EXTS)
])

manifest = pd.read_csv(MANIFEST_PATH)
manifest["audio_name"] = manifest["audio_path"].apply(os.path.basename)
ref_map = dict(zip(manifest["audio_name"], manifest["sentence"]))


# Evaluation
rows = []

for i, path in enumerate(audio_files, 1):
    name = os.path.basename(path)
    if name not in ref_map:
        continue

    ref = ref_map[name]
    print(f"[{i}/{len(audio_files)}] {name}")

    candidates = transcribe_twi_nbest(path, N_BEST)
    (best_raw, best_fixed, best_score), scored = pick_best_with_lm(candidates)

    rows.append({
        "audio": name,
        "ref": ref,
        "best_raw": best_raw,
        "best_fixed": best_fixed,
        "wer_raw": wer(ref, best_raw),
        "wer_fixed": wer(ref, best_fixed),
        "cer_raw": cer(ref, best_raw),
        "cer_fixed": cer(ref, best_fixed),
        "lm_logscore": best_score
    })

df = pd.DataFrame(rows)
display(df)

print("\nAverage WER raw  :", df["wer_raw"].mean())
print("Average WER fixed:", df["wer_fixed"].mean())
print("ΔWER:", df["wer_raw"].mean() - df["wer_fixed"].mean())

print("\nAverage CER raw  :", df["cer_raw"].mean())
print("Average CER fixed:", df["cer_fixed"].mean())

df.to_csv(OUT_PATH, index=False)
print("\nSaved:", OUT_PATH)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Mounted at /content/drive
LM loaded: <class 'nltk.lm.models.KneserNeyInterpolated'>
Order: 3
Vocab size: 2986


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/315 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]



config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/479 [00:00<?, ?it/s]

generation_config.json: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/104M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/384 [00:00<?, ?it/s]

Loaded Whisper model: zirri23/whisper-akan-finetuned on cuda
[1/10] common_voice_tw_34745954.mp3


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> to see related `.generate()` flags.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensA

[2/10] common_voice_tw_34997393.mp3
[3/10] common_voice_tw_34997394.mp3
[4/10] common_voice_tw_34997398.mp3
[5/10] common_voice_tw_34997400.mp3
[6/10] common_voice_tw_34997402.mp3
[7/10] common_voice_tw_35280404.mp3
[8/10] common_voice_tw_35280405.mp3
[9/10] common_voice_tw_35280406.mp3
[10/10] common_voice_tw_35280407.mp3


Unnamed: 0,audio,ref,best_raw,best_fixed,wer_raw,wer_fixed,cer_raw,cer_fixed,lm_logscore
0,common_voice_tw_34745954.mp3,"Dabi, ɛnte saa",dɛɛbi nte saa,dɛɛbi nte saa,0.666667,0.666667,0.272727,0.272727,-61.332504
1,common_voice_tw_34997393.mp3,• Ma wo yere nhu sɛ wopene nufuma so.,maowere ho sɛ ɔpne ne fam so,maowere ho sɛ ɔpne ne fam so,0.875,0.875,0.37037,0.37037,-129.372895
2,common_voice_tw_34997394.mp3,Dɛn na ɛno bɛkyerɛ?,den na ɛno bɛkyerɛ,dɛn na ɛno bɛkyerɛ,0.25,0.0,0.066667,0.0,-17.925918
3,common_voice_tw_34997398.mp3,Ɛkaa abɔde ho nsɛm pii a ɛma yenya Yehowa dɔ h...,ɛka abɔde ho sɛn pii a mma yɛnyɛ yi ho adɔ ho ...,ɛka abɔde ho sen pii a mma yɛnyɛ yi ho adɔ hɔ ...,0.692308,0.769231,0.23913,0.282609,-130.497312
4,common_voice_tw_34997400.mp3,Mekae saa asɛmfua yi.,mikaisa asemfuo yi,mikaisa asemfuo yi,0.75,0.75,0.294118,0.294118,-109.392121
5,common_voice_tw_34997402.mp3,Dɛn na wopɛ sɛ wunya?,dan na ɔpɛ sɛ wonyɛ,dan na ɔpɛ sɛ wɔnyɛ,0.6,0.6,0.3125,0.3125,-40.380903
6,common_voice_tw_35280404.mp3,Ɛyɛɛ no nwonwa ma obisaa me sɛ,ɛyɛ no wɔn wɔn ma obisa ameɛ,ɛyɛ no wɔn wɔn ma obisa ameɛ,0.857143,0.857143,0.25,0.25,-146.199786
7,common_voice_tw_35280405.mp3,Sɛnea Asamoah gye di sɛ biribiara wɔ ne bere n...,sɛ nea asɛm agyi de sɛ beebiara wɔ ne berɛ ne bɛɛ,sɛ nea asɛm agyi dɛ sɛ beebiara wɔ ne bere ne bɛɛ,0.727273,0.636364,0.302326,0.27907,-169.596883
8,common_voice_tw_35280406.mp3,Mekyerɛkyerɛɛ mu sɛ Yehowa Adansefo nso hyɛ ey...,mkykyerkyer mu sɛ hɔ wɔ adansafoɔ nso hyɛ yi h...,mkykyerkyer mu se ho wɔ adansafoɔ nso hyɛ yi h...,0.6,0.7,0.3125,0.291667,-211.78505
9,common_voice_tw_35280407.mp3,"Bere a midii awia aduan wiei no, mesan kɔɔ adw...",brɔ a ɛdi ɛwiaduan wien ɛsan kɔ adwumayɛ bea hɔ,bro a edi ɛwiaduan wien ɛsan kɔ adwumayɛ bea hɔ,0.818182,0.818182,0.26087,0.26087,-170.382155



Average WER raw  : 0.6836571761571761
Average WER fixed: 0.6672585747585748
ΔWER: 0.016398601398601387

Average CER raw  : 0.2681207538218482
Average CER fixed: 0.2613929985134559

Saved: /content/drive/MyDrive/twi_asr_results_10_nbest.csv
