In [37]:
import os, shutil
import pandas as pd
import openai
from dotenv import load_dotenv

In [38]:
# Global variables
# Dataset root, relative to the notebook; the cells below read validated.tsv
# and the clips/ folder from underneath it.
main_dataset_path = r'../data/cv-corpus-21.0-delta-2025-03-14/en'
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
# OpenAI API key taken from the environment; os.getenv returns None when the
# variable is not set.
api_key = os.getenv('OPENAI_API_KEY')


In [39]:
def load_dataset_metadata(
    dataset_path:str = r'../data/cv-corpus-21.0-delta-2025-03-14/en',
    ) -> pd.DataFrame:
    """Read the ``validated.tsv`` metadata table of a dataset folder.

    Args:
        dataset_path: Directory that contains ``validated.tsv``.

    Returns:
        The parsed table as a pandas DataFrame.
    """
    # The file is tab-separated, so the separator has to be passed explicitly;
    # read_csv would otherwise split on commas.
    return pd.read_csv(os.path.join(dataset_path, 'validated.tsv'), sep='\t')

In [40]:
print("Dispaying the first few rows of the dataset metadata...")
# Load from the configured dataset root instead of the function's built-in
# default, so the metadata and the clips copied later from main_dataset_path
# always come from the same release.
df = load_dataset_metadata(main_dataset_path)
df.head()

Dispaying the first few rows of the dataset metadata...


Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,116398939d6be70fc5fb532924a130c0adf286ac283499...,common_voice_en_41923025.mp3,f5a2a431746c5229ab696ba0e1a518fe7b26e208ff3b84...,"He was born at Wichenford, in Worcestershire, ...",,2,0,thirties,,United States English,,en,
1,24a4da2e8f053a45a0715849c222a40a4b0da9872efb2e...,common_voice_en_42356358.mp3,f6f009587d8812c147af1cc05079e1fcd8120c8a98cdf8...,The Portuguese division was overrun and withdr...,,2,0,teens,,United States English,,en,
2,30849595699bc853c3810a78448acede46888b4e2d0809...,common_voice_en_42165090.mp3,f69afa5e77812e8be0085c874d2a9767323c78ffb43ba6...,Her health by this stage was also poor.,,2,0,,,,,en,
3,42d53f34c1bc50f7a7c4ed1765a8d1ffeaf5cd441513cc...,common_voice_en_41921729.mp3,f5739acbefdbd3aac990792966fac4d40dcb39eb8dfa21...,His sporting interests outside of cricket incl...,,2,0,nineties,,England English,,en,
4,436b9e1f9da710d74eb01209f8f269bee70e93cadf2053...,common_voice_en_42528393.mp3,f7d35c60d76f025c45a9495757d1ee0e2b7c206317a288...,The following year he was elected to be part o...,,2,0,teens,,United States English,,en,


In [41]:
# Show all unique accent classes (NaN marks rows without an accent label)
print(df['accents'].unique())

# Or, to see counts for each class (value_counts leaves NaN out by default)
print(df['accents'].value_counts())

['United States English' nan 'England English'
 'Australian English,Canadian English' 'Scottish English'
 'India and South Asia (India, Pakistan, Sri Lanka)' 'Russian' 'Filipino'
 'Malaysian English'
 'United States English,Southern United States English,Lightly Southern'
 'L2' 'United States English,Filipino' 'Lithuanian,Non-native'
 'British English / Received Pronunciation (RP)' 'Canadian English'
 'nigerian accent']
accents
nigerian accent                                                          95
United States English                                                    81
Canadian English                                                         31
British English / Received Pronunciation (RP)                             6
United States English,Filipino                                            5
Lithuanian,Non-native                                                     5
England English                                                           4
Scottish English                    

In [42]:
# Accent labels to keep. They are matched with isin() against the 'accents'
# column, so each string must equal the dataset value exactly.
accents=[
        "United States English",
        "British English / Received Pronunciation (RP)",
        "nigerian accent",
        "Canadian English",
        "India and South Asia (India, Pakistan, Sri Lanka)"
         ]
# NOTE(review): `limit` is not referenced by any cell visible here; presumably
# a per-accent sample cap. Confirm it is used elsewhere or remove it.
limit=100

In [43]:
# Keep only rows whose accent is one of the selected classes and that have
# both an audio file name and a reference sentence.
df = df[df["accents"].isin(accents) & df["path"].notna() & df["sentence"].notna()]
# Preview with a fresh 0..n-1 index. The reset applies to the preview only;
# `df` itself keeps its original index labels.
df.reset_index(drop=True).head()

Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,116398939d6be70fc5fb532924a130c0adf286ac283499...,common_voice_en_41923025.mp3,f5a2a431746c5229ab696ba0e1a518fe7b26e208ff3b84...,"He was born at Wichenford, in Worcestershire, ...",,2,0,thirties,,United States English,,en,
1,24a4da2e8f053a45a0715849c222a40a4b0da9872efb2e...,common_voice_en_42356358.mp3,f6f009587d8812c147af1cc05079e1fcd8120c8a98cdf8...,The Portuguese division was overrun and withdr...,,2,0,teens,,United States English,,en,
2,436b9e1f9da710d74eb01209f8f269bee70e93cadf2053...,common_voice_en_42528393.mp3,f7d35c60d76f025c45a9495757d1ee0e2b7c206317a288...,The following year he was elected to be part o...,,2,0,teens,,United States English,,en,
3,55158ab09dacdb9f29a4a9dbe649970d7d5e3c4f634bb9...,common_voice_en_42555516.mp3,f7adddb67a702c8dcb6804f28e1491ec76efbad6cb0c3b...,Safronov is the nearest rural locality.,,4,0,twenties,,United States English,,en,
4,63249207b46877a627fa51558278f322c17c459c56789e...,common_voice_en_42446785.mp3,f73d6aeb53eacc74834b9392149a751408bbf45d599196...,Bucknell tied for third in the Colonial League.,,2,0,fifties,,United States English,,en,


In [44]:
# Sanity check: only the selected accent labels should remain after filtering.
df['accents'].unique()

array(['United States English',
       'India and South Asia (India, Pakistan, Sri Lanka)',
       'British English / Received Pronunciation (RP)',
       'Canadian English', 'nigerian accent'], dtype=object)

In [45]:
# Number of rows left after the accent / path / sentence filter.
len(df)

217

In [46]:
# Create the folder the clip-copy loop writes into. The path has to match the
# copy destination ("../data/audio"); shutil.copy does not create missing
# directories and would fail with FileNotFoundError otherwise.
os.makedirs("../data/audio", exist_ok=True)

In [53]:
samples = []

# Copy every selected clip that exists on disk into the working audio folder
# and record its metadata; rows whose clip file is missing are skipped.
for index, row in df.iterrows():
    src = os.path.join(main_dataset_path, "clips", row['path'])
    dst = os.path.join("../data/audio", row['path'])
    if not os.path.exists(src):
        continue
    shutil.copy(src, dst)
    sample = {
        "path": row['path'],
        "sentence": row['sentence'],
        "accent": row['accents'],
        # .get() falls back to "unknown" only when the column itself is absent;
        # a missing value in an existing column comes through unchanged (NaN).
        "age": row.get("age", "unknown"),
        "gender": row.get("gender", "unknown")
    }
    samples.append(sample)

# Build the table, keep only samples with a non-missing gender, renumber the
# rows from zero, then persist the result for the transcription step.
metadata = (
    pd.DataFrame(samples)
    .loc[lambda frame: frame['gender'].notna()]
    .reset_index(drop=True)
)
metadata.to_csv("../data/metadata.csv", index=False)

In [54]:
# Preview the sample table that was written to ../data/metadata.csv.
metadata.head()

Unnamed: 0,path,sentence,accent,age,gender
0,common_voice_en_42594358.mp3,Political positions inside and outside the par...,United States English,thirties,female_feminine
1,common_voice_en_42216083.mp3,One area where training specific behavior has ...,United States English,fourties,male_masculine
2,common_voice_en_42466107.mp3,"However, the series was launched without this ...",United States English,teens,female_feminine
3,common_voice_en_42511436.mp3,He later commented that he did not support the...,United States English,sixties,female_feminine
4,common_voice_en_42511438.mp3,The truth will come out one day as it happens ...,United States English,sixties,female_feminine


In [48]:
# Number of samples kept after dropping rows with a missing gender.
len(metadata)

177

In [49]:
# call_whisper_api.py for openai >=1.0.0

import openai
from openai import OpenAI
from typing import Optional

# Module-level OpenAI client used by transcribe_audio_file_with_api below;
# api_key was read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=api_key)

def transcribe_audio_file_with_api(audio_file_path: str, language: str = "en") -> Optional[str]:
    """
    Send one audio file to the OpenAI transcription endpoint (SDK >=1.0.0).

    Uses the module-level ``client`` with the "whisper-1" model and requests
    the plain-text response format.

    Args:
        audio_file_path: Path of the audio file to upload.
        language: Language code passed to the API (defaults to "en").

    Returns:
        The transcript with surrounding whitespace stripped, or ``None`` when
        anything goes wrong (the error is printed, not raised).
    """
    request_options = {
        "model": "whisper-1",
        "language": language,
        "response_format": "text",
    }

    try:
        with open(audio_file_path, "rb") as audio_stream:
            raw_text = client.audio.transcriptions.create(file=audio_stream, **request_options)
        # Stripping stays inside the try so an unexpected response type is
        # reported through the same best-effort path as an API failure.
        cleaned_text = raw_text.strip()
    except Exception as exc:
        print(f"[ERROR] API transcription failed for {audio_file_path}: {exc}")
        return None

    return cleaned_text


In [50]:
# Transcribing using Whisper

import os
import whisper
import pandas as pd
from jiwer import wer
from tqdm import tqdm # Progress bar for long operations for iterables

def transcribe_and_evaluate(metadata_path: str, 
                            audio_dir_path: str,
                            output_path:str = r'./data/transcriptions.csv') -> pd.DataFrame:
    """Transcribe every clip listed in a metadata CSV and score it with WER.

    Each clip is sent to ``transcribe_audio_file_with_api``; the returned text
    is compared, case-insensitively, with the reference sentence using
    ``jiwer.wer``.

    Args:
        metadata_path: CSV file with at least the columns ``path`` (audio file
            name) and ``sentence`` (reference transcript).
        audio_dir_path: Directory that contains the audio files.
        output_path: Where the per-clip results are written as CSV.

    Returns:
        DataFrame with the columns ``audio_file``, ``ground_truth``,
        ``predicted_text`` and ``wer``. Clips whose file is missing are
        skipped (a message is printed). Clips whose transcription failed are
        kept with ``predicted_text`` set to ``None`` and ``wer`` set to NaN.
    """
    # No local Whisper model is loaded here: transcription goes through the
    # API helper, so loading one would only cost time and memory.

    # Load metadata
    metadata = pd.read_csv(metadata_path)

    results = []

    for _, row in tqdm(metadata.iterrows(), total=len(metadata)):
        audio_file_path = os.path.join(audio_dir_path, row['path'])

        if not os.path.exists(audio_file_path):
            print(f"Audio file not found: {audio_file_path}")
            continue

        ground_truth = row['sentence']
        predicted_text = transcribe_audio_file_with_api(audio_file_path, language='en')

        if predicted_text is None:
            # Transcription failed: record an explicit missing score instead of
            # reusing the previous clip's WER (or hitting an unbound variable
            # when the very first clip fails).
            error = float('nan')
        else:
            error = wer(ground_truth.lower(), predicted_text.lower())

        # Append Results
        results.append({
            'audio_file': audio_file_path,
            'ground_truth': ground_truth,
            'predicted_text': predicted_text,
            'wer': error
        })

    # Convert the results into a dataframe, persist it and return it
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_path, index=False)
    return results_df

In [51]:
# Transcribe every clip listed in metadata.csv (one API request per clip) and
# save the per-clip WER table next to the other data files.
df_new = transcribe_and_evaluate(
    metadata_path= r"../data/metadata.csv",
    audio_dir_path= r"../data/audio",
    output_path= r"../data/transcriptions.csv"
)

100%|██████████| 177/177 [03:07<00:00,  1.06s/it]


In [52]:
# Preview the per-clip results: reference sentence, API transcript and WER.
df_new.head()

Unnamed: 0,audio_file,ground_truth,predicted_text,wer
0,../data/audio\common_voice_en_42594358.mp3,Political positions inside and outside the par...,Political positions inside and outside the par...,0.0
1,../data/audio\common_voice_en_42216083.mp3,One area where training specific behavior has ...,One area where training-specific behavior has ...,0.153846
2,../data/audio\common_voice_en_42466107.mp3,"However, the series was launched without this ...","However, the series was launched without this ...",0.0
3,../data/audio\common_voice_en_42511436.mp3,He later commented that he did not support the...,He later commented that he did not support the...,0.0
4,../data/audio\common_voice_en_42511438.mp3,The truth will come out one day as it happens ...,The truth will come out one day as it happens ...,0.0
