In [None]:
import csv
import os
import shutil

# Path to the CSV file
csv_file_path = "drive/MyDrive/filenames.csv"

# Path to the folder with 5.8k audio files
audio_folder_path = "drive/MyDrive/test"

# Path to the folder where you want to save the selected audio files
output_folder_path = "drive/MyDrive/train_500"

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)

# Read the CSV file and extract the audio file names
with open(csv_file_path, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    audio_files = [row[0] for row in csvreader]

# Iterate through each audio file name and copy it to the output folder if it exists
for audio_file in audio_files:
    src_file_path = os.path.join(audio_folder_path, audio_file)
    if os.path.exists(src_file_path):
        dest_file_path = os.path.join(output_folder_path, audio_file)
        shutil.copy(src_file_path, dest_file_path)
    else:
        print(f"File not found: {audio_file}")


In [11]:
!pip install SpeechRecognition



In [None]:
import os
import pandas as pd
import speech_recognition as sr

recognizer = sr.Recognizer()

folder_path = "drive/MyDrive/train_500"
dfs = []

for filename in os.listdir(folder_path):
    if filename.endswith(".wav"):
        audio_file = os.path.join(folder_path, filename)
        with sr.AudioFile(audio_file) as source:
            audio_data = recognizer.record(source)

        # Use Google Web Speech API for transcription
        try:
            text = recognizer.recognize_google(audio_data)
            df = pd.DataFrame({'filename': [filename], 'transcript': [text]})
            dfs.append(df)
        except sr.UnknownValueError:
            print(f"Could not understand audio: {filename}")
        except sr.RequestError as e:
            print(f"Could not request results for audio {filename}: {e}")

# Concatenate all DataFrames into a single DataFrame
result = pd.concat(dfs, ignore_index=True)



In [13]:
result.rename(columns={'filename': 'file_name'}, inplace=True)

In [15]:
result.head(5)

Unnamed: 0,file_name,transcript
0,1249120_13842059_104469105.wav,I have a painful cramp in my feet
1,1249120_13842059_105045085.wav,the pain feels like it's right below the skin
2,1249120_13842059_11964685.wav,I feel suicidal
3,1249120_13842059_12420758.wav,I feel a sharp pain in my ankle joint when I
4,1249120_13842059_13041979.wav,my shoulder hurts me so much


In [None]:
import pandas as pd

actual_transcripts_df = pd.read_csv("drive/MyDrive/overview-of-recordings.csv")

actual_transcripts_subset = actual_transcripts_df[['phrase', 'file_name']]

result_with_actual_transcripts = pd.merge(result, actual_transcripts_subset, on="file_name", how="left")


In [17]:
result_with_actual_transcripts.rename(columns={'phrase': 'actual_transcript'}, inplace=True)
result_with_actual_transcripts.head(5)

Unnamed: 0,file_name,transcript,actual_transcript
0,1249120_13842059_104469105.wav,I have a painful cramp in my feet,I have a painful cramp in my feet
1,1249120_13842059_105045085.wav,the pain feels like it's right below the skin,The pain feels like it's right below the skin
2,1249120_13842059_11964685.wav,I feel suicidal,I feel suicidal.
3,1249120_13842059_12420758.wav,I feel a sharp pain in my ankle joint when I,I feel a sharp pain in my ankle joint when I s...
4,1249120_13842059_13041979.wav,my shoulder hurts me so much,My shoulder hurts me so much


In [None]:
df = result_with_actual_transcripts

In [18]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.25.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.25.0 (from python-Levenshtein)
  Downloading Levenshtein-0.25.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.1.0 (from Levenshtein==0.25.0->python-Levenshtein)
  Downloading rapidfuzz-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.25.0 python-Levenshtein-0.25.0 rapidfuzz-3.6.1


In [19]:
import Levenshtein

def wer(reference, hypothesis):
    """
    Calculate Word Error Rate (WER) between reference and hypothesis.
    """
    reference_words = reference.split()
    hypothesis_words = hypothesis.split()

    distance = Levenshtein.distance(reference_words, hypothesis_words)
    wer = distance / len(reference_words)
    return wer

def cer(reference, hypothesis):
    """
    Calculate Character Error Rate (CER) between reference and hypothesis.
    """
    distance = Levenshtein.distance(reference, hypothesis)
    cer = distance / len(reference)
    return cer

In [23]:
df = result_with_actual_transcripts.drop(columns=['WER_Score'])
df = result_with_actual_transcripts.drop(columns=['CER_Score'])

In [29]:
df.head(5)

Unnamed: 0,file_name,transcript,actual_transcript,WER_Score
0,1249120_13842059_104469105.wav,I have a painful cramp in my feet,I have a painful cramp in my feet,0.112245
1,1249120_13842059_105045085.wav,the pain feels like it's right below the skin,The pain feels like it's right below the skin,0.112245
2,1249120_13842059_11964685.wav,I feel suicidal,I feel suicidal.,0.112245
3,1249120_13842059_12420758.wav,I feel a sharp pain in my ankle joint when I,I feel a sharp pain in my ankle joint when I s...,0.112245
4,1249120_13842059_13041979.wav,my shoulder hurts me so much,My shoulder hurts me so much,0.112245


In [32]:
wer_scores = []
cer_scores = []
for index, rows in df.iterrows():
    reference_text = str(rows['actual_transcript'])
    hypothesis_text = str(rows['transcript'])

    wer_score = wer(reference_text, hypothesis_text)
    cer_score = cer(reference_text, hypothesis_text)

    wer_scores.append(wer_score)
    cer_scores.append(cer_score)

df['WER_Score'] = wer_scores
df['CER_Score'] = cer_scores
df.head(5)


Unnamed: 0,file_name,transcript,actual_transcript,WER_Score,CER_Score
0,1249120_13842059_104469105.wav,I have a painful cramp in my feet,I have a painful cramp in my feet,0.0,0.0
1,1249120_13842059_105045085.wav,the pain feels like it's right below the skin,The pain feels like it's right below the skin,0.111111,0.022222
2,1249120_13842059_11964685.wav,I feel suicidal,I feel suicidal.,0.333333,0.0625
3,1249120_13842059_12420758.wav,I feel a sharp pain in my ankle joint when I,I feel a sharp pain in my ankle joint when I s...,0.083333,0.137255
4,1249120_13842059_13041979.wav,my shoulder hurts me so much,My shoulder hurts me so much,0.166667,0.035714


In [33]:
 #Calculate the average of 'were_score' and 'car_score'
average_were_score = result_with_actual_transcripts['WER_Score'].mean()
average_car_score = result_with_actual_transcripts['CER_Score'].mean()

print(f"Average were_score: {average_were_score}")
print(f"Average car_score: {average_car_score}")

Average were_score: 0.11224489795918366
Average car_score: 0.07079646017699114
