In [None]:
import json
import os
from sklearn.model_selection import train_test_split
import pandas as pd
import nemo.collections.asr as nemo_asr

In [None]:
# Stage-1 CSV read by this notebook, and the stage-2 CSV it writes at the end.
INPUT_CSV = "./1_stage_preprocessed_data.csv"
OUTPUT_CSV = "./2_stage_preprocessed_data.csv"

# Root directory of the audio clips; per-row paths are built underneath it.
AUDIO_FILES_DIR = "/media/real/data/uzbekvoice/clips"



In [None]:
# Load the saved QuartzNet CTC model from disk.
# Note: this checkpoint was trained on the dataset before it was cleaned.
quartznet_saved= nemo_asr.models.EncDecCTCModel.restore_from("./saved_model/quartznet15x5.pt")


In [None]:
def get_inference_result(audio_filepath):
    """Transcribe a single audio file with the loaded model.

    Args:
        audio_filepath: Path of the audio file to transcribe.

    Returns:
        The first element of the model's ``transcribe`` output for this
        file, or ``None`` when ``audio_filepath`` is not an existing file.
    """
    if os.path.isfile(audio_filepath):
        # transcribe() takes a list of paths; a one-element list is passed
        # and its single result is unwrapped.
        transcriptions = quartznet_saved.transcribe(paths2audio_files=[audio_filepath])
        return transcriptions[0]

    # Missing file (or not a regular file): there is nothing to transcribe.
    return None



In [None]:
def levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions that turn one string into the other.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The edit distance as a non-negative integer.
    """
    # Keep the shorter string along the row so each row is as small as possible.
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)

    # previous_row[j] is the distance between the first j characters of
    # `shorter` and the prefix of `longer` processed so far.
    previous_row = list(range(len(shorter) + 1))
    for row_number, longer_char in enumerate(longer, start=1):
        current_row = [row_number]
        for col, shorter_char in enumerate(shorter):
            if shorter_char == longer_char:
                # Matching characters: carry the diagonal value over unchanged.
                cost = previous_row[col]
            else:
                # Cheapest of substitution, deletion and insertion, plus one edit.
                cost = 1 + min(previous_row[col], previous_row[col + 1], current_row[-1])
            current_row.append(cost)
        previous_row = current_row

    return previous_row[-1]

def calculate_error_rates(real_transcription, model_transcription):
    """Compute the character and sentence error rates for one utterance.

    Args:
        real_transcription: Reference (ground-truth) text.
        model_transcription: Text produced by the model. May be ``None`` or
            NaN when no inference result is available.

    Returns:
        A tuple ``(cer, ser)``:

        * ``cer`` is the edit distance between the two texts divided by the
          reference length. For an empty reference the divisor is clamped
          to 1, so the result is the number of characters in the model text
          (0.0 when both are empty) instead of a ``ZeroDivisionError``.
        * ``ser`` is 0 when the texts are identical and 1 otherwise.

        When either input is missing (``None`` or NaN) both values are NaN,
        because no error rate can be measured for that row.
    """
    def _is_missing(value):
        # None, or a float NaN (pandas' marker for an absent value).
        return value is None or (isinstance(value, float) and value != value)

    if _is_missing(real_transcription) or _is_missing(model_transcription):
        return float("nan"), float("nan")

    # Calculate Character Error Rate (CER); guard against an empty reference.
    reference_length = max(len(real_transcription), 1)
    cer = levenshtein_distance(real_transcription, model_transcription) / reference_length

    # Calculate Sentence Error Rate (SER)
    ser = 0 if real_transcription == model_transcription else 1

    return cer, ser

# Example usage: the two strings differ by one substituted character out of
# the 11 reference characters, so this prints a CER of 0.09 and an SER of 1.00.
real_transcription = "hello borld"
model_transcription = "hello world"
cer, ser = calculate_error_rates(real_transcription, model_transcription)
print(f"Character Error Rate (CER): {cer:.2f}")
print(f"Sentence Error Rate (SER): {ser:.2f}")

In [None]:
# Load the stage-1 preprocessed data and print a summary of the frame.
df = pd.read_csv(INPUT_CSV)
df.info()

In [None]:
# Build each clip's path as f"{AUDIO_FILES_DIR}/{client_id}/{original_sentence_id}.wav".

df['audio_file_path'] = df.apply(lambda row: os.path.join(AUDIO_FILES_DIR, row['client_id'], f"{row['original_sentence_id']}.wav"), axis=1)



# Run inference on every audio file (one transcribe call per row) and store the
# result in a new column. get_inference_result returns None for paths that are
# not existing files, so 'inference_result' can contain None.
df['inference_result'] = df['audio_file_path'].apply(get_inference_result)

# Character error rate of each model transcript against the reference transcription.
# NOTE(review): rows whose inference_result is None are passed through as-is;
# confirm calculate_error_rates tolerates a missing hypothesis.
df['cer'] = df.apply(lambda row: calculate_error_rates(row['transcription'], row['inference_result'])[0], axis=1)

In [None]:
# Display the frame to inspect the newly added columns.
df 

In [None]:
# Write the frame to OUTPUT_CSV, omitting the index column.
df.to_csv(OUTPUT_CSV, index=False)