In [3]:
%pip install pandas
%pip install pydub
%pip install soundfile
%pip install jiwer
%pip install dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import pandas as pd

def build_file_table(folder_path):
  filepaths = []
  filenames = []

  if not os.path.isdir(folder_path):
    print(f"Error: Folder not found at '{folder_path}'")
    return pd.DataFrame({'filepath': [], 'filename': []})

  for item in os.listdir(folder_path):
    item_path = os.path.join(folder_path, item)
    if os.path.isfile(item_path) and item.lower().endswith(".wav"):
      filepaths.append(item_path)
      filenames.append(os.path.splitext(item)[0])  # Remove the .wav extension

  df = pd.DataFrame({'filepath': filepaths, 'filename': filenames})
  return df
  

In [5]:
folder_path = "dataset" 
df = build_file_table(folder_path)

if df.empty:
    print("No .wav files found in the specified folder.")

In [6]:
from dotenv import load_dotenv
import os
load_dotenv()

MESOLITICA_TOKEN = os.environ.get("MESOLITICA_TOKEN")
MESOLITICA_API_URL = "https://api.mesolitica.com/audio/transcriptions";

if not MESOLITICA_TOKEN:
    print("WARNING: MESOLITICA_TOKEN environment variable not set.")

In [None]:
import requests

def transcribe_audio(path: str):
    with open(path, "rb") as f:
        file = {"file": f}
        headers = {
            'Authorization': f"Bearer {MESOLITICA_TOKEN}"
        }
        data = {
            'model': 'base',
            'response_format': 'text',
            'language': 'ms'
        }

        res = requests.post(MESOLITICA_API_URL, files=file, data=data, headers=headers)
        res.raise_for_status() 
        return res.json()


In [8]:
output_texts = []

for path, name in zip(df["filepath"], df["filename"]):
    text = transcribe_audio(path);
    output_texts.append(text)

df_output = pd.DataFrame({'filepath': df["filepath"], 'text': df["filename"], 'output_text': output_texts})
df_output.head()

Unnamed: 0,filepath,text,output_text
0,dataset/badan bukan kerajaan ngo.wav,badan bukan kerajaan ngo,Badan ni bukan kerajaan NGO
1,dataset/air pada masa sekarang.wav,air pada masa sekarang,Air pada masa sekarang
2,dataset/atas permukaan daun jpg.wav,atas permukaan daun jpg,Atas permukaan daun JPG
3,dataset/air pasang beliau hilang.wav,air pasang beliau hilang,Air pasang. Beliau hilang.
4,dataset/algeria perdebatan mengenai sama.wav,algeria perdebatan mengenai sama,"Algeria, perdapatan mengenai sama"


In [9]:
df_output

Unnamed: 0,filepath,text,output_text
0,dataset/badan bukan kerajaan ngo.wav,badan bukan kerajaan ngo,Badan ni bukan kerajaan NGO
1,dataset/air pada masa sekarang.wav,air pada masa sekarang,Air pada masa sekarang
2,dataset/atas permukaan daun jpg.wav,atas permukaan daun jpg,Atas permukaan daun JPG
3,dataset/air pasang beliau hilang.wav,air pasang beliau hilang,Air pasang. Beliau hilang.
4,dataset/algeria perdebatan mengenai sama.wav,algeria perdebatan mengenai sama,"Algeria, perdapatan mengenai sama"
...,...,...,...
195,dataset/bahasa hindi bahasa urdu.wav,bahasa hindi bahasa urdu,"Bahasa Hindi, bahasa Uduk"
196,dataset/aitken lembangan lembangan hentaman.wav,aitken lembangan lembangan hentaman,Aikin Lembangan-Lembangan Hentaman
197,dataset/atau perbezaan sebutan bahasa.wav,atau perbezaan sebutan bahasa,Atau perbezaan sebutan bahasa
198,dataset/aitken membolehkan british meningkatka...,aitken membolehkan british meningkatkan,Ed King membolehkan British meningkatkan


In [None]:
import jiwer

transforms = jiwer.Compose(
    [
        jiwer.RemoveEmptyStrings(),
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.RemovePunctuation(),
        jiwer.ReduceToListOfListOfWords(),
    ]
)

In [30]:
def evaluation(df):
    wer_scores = []
    mer_scores = []
    cer_scores = []
    wil_scores = []
    wip_scores = []

    for index, row in df.iterrows():
        reference = (row['text'])
        hypothesis = (row['output_text']).replace("-", " ")
        if reference is not None and hypothesis is not None:
            reference_words = reference
            hypothesis_words = hypothesis
            if reference_words:
                minimum_len = min(len(reference_words), len(hypothesis_words))
                wer = jiwer.wer(reference_words, hypothesis_words, reference_transform=transforms, hypothesis_transform=transforms)
                mer = jiwer.mer(reference_words, hypothesis_words, reference_transform=transforms, hypothesis_transform=transforms)
                cer = jiwer.cer(reference_words, hypothesis_words, reference_transform=transforms, hypothesis_transform=transforms)
                wil = jiwer.wil(reference_words, hypothesis_words, reference_transform=transforms, hypothesis_transform=transforms)
                wip = jiwer.wip(reference_words, hypothesis_words, reference_transform=transforms, hypothesis_transform=transforms)
                wer_scores.append(wer)
                mer_scores.append(mer)
                cer_scores.append(cer)
                wil_scores.append(wil)
                wip_scores.append(wip)
            else:
                wer_scores.append(None)
        else:
            wer_scores.append(None) 

    df['wer'] = wer_scores
    df['mer'] = mer_scores
    df['cer'] = cer_scores
    df['wil'] = wil_scores
    df['wip'] = wip_scores
    return df

In [31]:
df_eval = evaluation(df_output.copy()) 
df_eval

Unnamed: 0,filepath,text,output_text,wer,mer,cer,wil,wip
0,dataset/badan bukan kerajaan ngo.wav,badan bukan kerajaan ngo,Badan ni bukan kerajaan NGO,0.25,0.20,0.25,0.2000,0.8000
1,dataset/air pada masa sekarang.wav,air pada masa sekarang,Air pada masa sekarang,0.00,0.00,0.00,0.0000,1.0000
2,dataset/atas permukaan daun jpg.wav,atas permukaan daun jpg,Atas permukaan daun JPG,0.00,0.00,0.00,0.0000,1.0000
3,dataset/air pasang beliau hilang.wav,air pasang beliau hilang,Air pasang. Beliau hilang.,0.00,0.00,0.00,0.0000,1.0000
4,dataset/algeria perdebatan mengenai sama.wav,algeria perdebatan mengenai sama,"Algeria, perdapatan mengenai sama",0.25,0.25,0.25,0.4375,0.5625
...,...,...,...,...,...,...,...,...
195,dataset/bahasa hindi bahasa urdu.wav,bahasa hindi bahasa urdu,"Bahasa Hindi, bahasa Uduk",0.25,0.25,0.25,0.4375,0.5625
196,dataset/aitken lembangan lembangan hentaman.wav,aitken lembangan lembangan hentaman,Aikin Lembangan-Lembangan Hentaman,0.25,0.25,0.25,0.4375,0.5625
197,dataset/atau perbezaan sebutan bahasa.wav,atau perbezaan sebutan bahasa,Atau perbezaan sebutan bahasa,0.00,0.00,0.00,0.0000,1.0000
198,dataset/aitken membolehkan british meningkatka...,aitken membolehkan british meningkatkan,Ed King membolehkan British meningkatkan,0.50,0.40,0.50,0.5500,0.4500


In [32]:
avg_wer_rate = sum(df_eval['wer']) / len(df_eval['wer'])
avg_mer_rate = sum(df_eval['mer']) / len(df_eval['mer'])
avg_cer_rate = sum(df_eval['cer']) / len(df_eval['cer'])
avg_wil_rate = sum(df_eval['wil']) / len(df_eval['wil'])
avg_wip_rate = sum(df_eval['wip']) / len(df_eval['wip'])
print(f"Average WER (Word Error Rate): {avg_wer_rate}")
print(f"Average MER (Match Error Rate): {avg_mer_rate}")
print(f"Average CER (Character Error Rate): {avg_cer_rate}")
print(f"Average WIL (Word Information Loss): {avg_wil_rate}")
print(f"Average WIP (Word Information Preserved): {avg_wip_rate}")

Average WER (Word Error Rate): 0.11625
Average MER (Match Error Rate): 0.10625
Average CER (Character Error Rate): 0.11625
Average WIL (Word Information Loss): 0.15602083333333333
Average WIP (Word Information Preserved): 0.8439791666666666
