In [None]:
pip install SpeechRecognition

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.10.3-py2.py3-none-any.whl (32.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.10.3


In [None]:
pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
!pip install --upgrade pydub



In [None]:
import csv
import os
import tempfile

import speech_recognition as sr
from pydub import AudioSegment
from pydub.playback import play

def transcribe_whispering_audio(audio_file):
    """Transcribe one MP3 file with the Google Web Speech API.

    The MP3 is decoded with pydub, exported to a temporary WAV file, loaded
    through ``sr.AudioFile`` and sent to ``Recognizer.recognize_google``.

    Args:
        audio_file: Path to the MP3 file to transcribe.

    Returns:
        A ``(text, success)`` tuple. On success ``text`` is the transcript with
        surrounding whitespace stripped and ``success`` is ``True``. When the
        speech cannot be understood (``sr.UnknownValueError``) or the request
        to the service fails (``sr.RequestError``), a message is printed and
        ``("", False)`` is returned.

    Raises:
        Any exception raised while decoding or reading the audio (for example
        a missing file) propagates to the caller.
    """
    sound = AudioSegment.from_mp3(audio_file)

    recognizer = sr.Recognizer()
    # NOTE(review): energy_threshold looks like it only matters for live
    # microphone listening, not for record() on a file — confirm against the
    # SpeechRecognition docs and drop this line if so.
    recognizer.energy_threshold = 100

    # Build the WAV name from the real extension instead of slicing off four
    # characters, and keep the file in a private temp directory: nothing is
    # written next to the dataset, and the directory is removed even when
    # reading the audio raises.
    stem = os.path.splitext(os.path.basename(audio_file))[0]
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_file = os.path.join(tmp_dir, stem + ".wav")
        # export() returns the open output handle; close it so the data is
        # flushed to disk before the file is read back.
        sound.export(wav_file, format="wav").close()
        with sr.AudioFile(wav_file) as source:
            audio = recognizer.record(source)

    try:
        text = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print(f"Error: Could not understand audio file {audio_file}")
        return "", False
    except sr.RequestError as e:
        print(f"Error: Could not request results from Google Speech Recognition service; {e}")
        return "", False
    return text.strip(), True

def evaluate_accuracy(audio_folder, csv_file, transcribe=None):
    """Transcribe every file listed in a CSV and print/return match statistics.

    The CSV is expected to have a header row followed by rows whose first
    column is an audio file name (relative to ``audio_folder``) and whose
    second column is the reference caption. Captions are compared
    case-insensitively after stripping surrounding whitespace.

    Args:
        audio_folder: Directory containing the audio files.
        csv_file: Path to the CSV with file names and reference captions.
        transcribe: Optional callable ``path -> (text, success)``. Defaults to
            ``transcribe_whispering_audio``.

    Returns:
        A dict with the keys ``total_files``, ``correctly_transcribed``,
        ``failed``, ``mismatched``, ``accuracy``, ``precision``, ``recall``
        and ``f1_score``.

    Notes:
        ``total_files`` (and therefore ``accuracy``) only counts files for
        which the transcriber reported success. "Precision" is
        correct / (correct + failed transcriptions) and "recall" is
        correct / (correct + mismatched transcripts).
        NOTE(review): these are not the usual precision/recall definitions —
        confirm that this is the intended metric before reporting it.
    """
    if transcribe is None:
        transcribe = transcribe_whispering_audio

    total_files = 0
    correctly_transcribed = 0
    failed = 0       # transcriber returned success=False
    mismatched = 0   # transcribed, but text differs from the reference

    # newline='' is what the csv module asks for so quoted fields containing
    # line breaks are parsed correctly.
    with open(csv_file, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header; an empty file yields no rows
        for row in reader:
            if not row:
                continue  # blank line
            if len(row) < 2:
                print(f"Skipping malformed row: {row}")
                continue
            audio_file = os.path.join(audio_folder, row[0])
            ground_truth_caption = row[1]
            generated_caption, success = transcribe(audio_file)
            if not success:
                failed += 1
                print(f"Failed to transcribe audio file: {row[0]}")
                continue
            total_files += 1
            if ground_truth_caption.strip().lower() == generated_caption.strip().lower():
                correctly_transcribed += 1
            else:
                mismatched += 1
                print(f"Audio file: {row[0]}, Ground Truth: {ground_truth_caption}, Generated: {generated_caption}")

    accuracy = (correctly_transcribed / total_files) * 100 if total_files > 0 else 0
    precision = correctly_transcribed / (correctly_transcribed + failed) if (correctly_transcribed + failed) > 0 else 0
    recall = correctly_transcribed / (correctly_transcribed + mismatched) if (correctly_transcribed + mismatched) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Total files: {total_files}, Correctly transcribed: {correctly_transcribed}")
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1_score:.2f}")

    return {
        "total_files": total_files,
        "correctly_transcribed": correctly_transcribed,
        "failed": failed,
        "mismatched": mismatched,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
    }

def main(dataset_folder="/content/drive/MyDrive/speech-to-text-dataset", split="cv-other-train"):
    """Run the transcription evaluation for one split of the dataset.

    Args:
        dataset_folder: Root folder of the dataset. It must contain a
            ``<split>`` sub-folder with the audio files and a ``<split>.csv``
            file with the reference captions.
        split: Name shared by the audio sub-folder and the CSV file.
    """
    audio_folder = os.path.join(dataset_folder, split)
    csv_file = os.path.join(dataset_folder, split + ".csv")
    evaluate_accuracy(audio_folder, csv_file)

main()

Audio file: sample-000000.mp3, Ground Truth: he had to spit some tobacco out of his mouth, Generated: you have to spend some tobacco out of his mouth
Audio file: sample-000001.mp3, Ground Truth: it took her a while to get used to it, Generated: Dakota oil and get you stick
Error: Could not understand audio file /content/drive/MyDrive/speech-to-text-dataset/cv-other-train/sample-000003.mp3
Failed to transcribe audio file: sample-000003.mp3
Audio file: sample-000005.mp3, Ground Truth: we're a couple of financial wizards, Generated: where a couple of financial Wizards
Audio file: sample-000006.mp3, Ground Truth: better hurry and shave, Generated: Veterinary and shave
Audio file: sample-000008.mp3, Ground Truth: you called that number not ten minutes ago, Generated: you called that number not 10 minutes ago
Audio file: sample-000010.mp3, Ground Truth: those women are all card sharks don't be fooled, Generated: those women are all card sharks don't be foe
Audio file: sample-000011.mp3, Grou