<a href="https://colab.research.google.com/github/MarkKMensah/ASR-app/blob/main/whisper_small_Akan_standardspeech_spec_and_audio_augmentTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# === STEP 0: Install Required Packages ===
!pip install -q jiwer transformers torchaudio librosa tqdm pandas openpyxl huggingface_hub

# === STEP 0.5: Authenticate with Hugging Face ===
from huggingface_hub import login
login()  # 🔑 Paste your token when prompted


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
# === STEP 1: Mount Google Drive ===
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# === STEP 2: Extract ZIP from Google Drive ===
import zipfile, os

# Update path if needed
drive_zip_path = "/content/drive/MyDrive/audios.zip"  # Make sure this is correct
extract_dir = "/content/asr_test_data"

# Extract ZIP
with zipfile.ZipFile(drive_zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Confirm contents
audio_dir = os.path.join(extract_dir, "audios")
metadata_path = os.path.join(audio_dir, "metadata.xlsx")
print("✅ Extracted files:", os.listdir(audio_dir))


✅ Extracted files: ['ak_gh_image_0803_u123_1_1688939334131_00974.mp3', 'ak_gh_image_0072_u133_1_1688920839618_12823.mp3', 'metadata.xlsx', '~$selected transcribed audios (1).xlsx', 'ak_gh_image_0024_u133_1_1688938029258_03229.mp3', 'ak_gh_image_0018_u119_1_1688933121338_15873.mp3', 'ak_gh_image_0093_u133_1_1688921086009_06529.mp3', '~$metadata.xlsx']


In [11]:
# === STEP 3: Load and Clean Metadata ===
import pandas as pd

metadata_df = pd.read_excel(metadata_path)
metadata_df['file_name'] = metadata_df['AUDIO_PATH'].apply(
    lambda x: os.path.basename(str(x).replace("\\", "/")).strip()
)
metadata_df['Transcriptions'] = metadata_df['Transcriptions'].astype(str).str.strip()


In [12]:
# === STEP 4: Load Whisper-Small Akan Model and Standard Processor ===
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch

processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained(
    "cdli/whisper-small_Akan_standardspeech_spec_and_audio_augment"
)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

In [13]:
# === STEP 5: Transcribe and Evaluate WER ===
import librosa
from jiwer import wer
from tqdm import tqdm

test_files = [f for f in os.listdir(audio_dir) if f.endswith(".mp3")][:5]
results = []

for file_name in tqdm(test_files, desc="🔁 Processing audio files"):
    audio_path = os.path.join(audio_dir, file_name)

    waveform, sample_rate = librosa.load(audio_path, sr=16000)
    waveform = torch.tensor(waveform).unsqueeze(0)

    inputs = processor(waveform.squeeze(), sampling_rate=16000, return_tensors="pt").to(device)
    with torch.no_grad():
        predicted_ids = model.generate(**inputs)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()

    ground_row = metadata_df[metadata_df['file_name'] == file_name]
    if ground_row.empty:
        print(f"⚠️ Ground truth not found for {file_name}")
        continue
    ground_truth = ground_row['Transcriptions'].values[0].strip()

    error = wer(ground_truth.lower(), transcription.lower())

    results.append({
        "Audio File": file_name,
        "Ground Truth": ground_truth,
        "Model Transcript": transcription,
        "WER": round(error, 3)
    })

# Display results
results_df = pd.DataFrame(results)
from IPython.display import display
display(results_df)

# Save results
csv_output = "/content/asr_small_model_results.csv"
results_df.to_csv(csv_output, index=False)
print(f"✅ Results saved to {csv_output}")


🔁 Processing audio files:   0%|          | 0/5 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
🔁 Processing audio files: 100%|██████████| 5/5 [01:54<00:00, 22.87s/it]


Unnamed: 0,Audio File,Ground Truth,Model Transcript,WER
0,ak_gh_image_0803_u123_1_1688939334131_00974.mp3,Abranteɛ bi gyina dwam. Na ɔhyɛ ataade soro. N...,"Aberanteɛ bi gyina dwam, na ɔhyɛ atade soro, n...",0.2
1,ak_gh_image_0072_u133_1_1688920839618_12823.mp3,"Nkorɔfoɔ bi gyina po ano, wɔn mu dodoɔ no ara ...",Nkurɔfoɔ bi gyina po ano. Wɔn mu dodoɔ no ara ...,0.243
2,ak_gh_image_0024_u133_1_1688938029258_03229.mp3,Buroni akwadaa bi redi agorɔ. Ɔde label ahyehy...,Buroni akwadaa bi redi agorɔ. Ɔde label ahyehy...,0.061
3,ak_gh_image_0018_u119_1_1688933121338_15873.mp3,"Akwanhyia bi ɛsi wɔ akwantinfi na ɛfiri ɛbien,...","Akwanhyia bi asi wɔ akwantinfi, na afiri abien...",0.371
4,ak_gh_image_0093_u133_1_1688921086009_06529.mp3,Aha yɛ kwan ho. Nhyɛn ahodoɔ redi akɔneaba. Kw...,Aha yɛ kwan ho. Nhyɛn ahodoɔ redi akɔneaba. Kw...,0.162


✅ Results saved to /content/asr_small_model_results.csv


In [14]:
# Save to MyDrive root (adjust if needed)
!cp /content/asr_small_model_results.csv /content/drive/MyDrive/
