In [12]:
import difflib
import tkinter as tk
from collections import Counter
from tkinter import simpledialog

import editdistance
import numpy as np
import scipy.io.wavfile as wav
import sounddevice as sd
import speech_recognition as sr
from deepmultilingualpunctuation import PunctuationModel
from sklearn.metrics import precision_score, recall_score, f1_score


In [13]:
def get_input_text():
    """Ask the user, via a modal dialog, for the text they are about to speak.

    A hidden Tk root window is created only to host the dialog and is
    destroyed afterwards so that repeated calls do not accumulate Tk
    instances.

    Returns:
        The string entered by the user, or None if the dialog was cancelled.
    """
    root = tk.Tk()
    root.withdraw()  # Hide the main window; only the dialog should be visible
    try:
        return simpledialog.askstring(
            title="Input",
            prompt="Please enter the text you will speak:",
            parent=root,
        )
    finally:
        # Always release the hidden root, even if the dialog raises.
        root.destroy()


In [14]:
def record_audio(duration=10, fs=44100):
    """Capture audio from the default input device.

    Args:
        duration: Length of the recording in seconds.
        fs: Sampling rate in Hz.

    Returns:
        A ``(recording, fs)`` tuple: the two-channel int16 sample buffer
        produced by ``sd.rec`` and the sampling rate it was captured at.
    """
    print("Recording...")
    frame_count = int(duration * fs)
    recording = sd.rec(frame_count, samplerate=fs, channels=2, dtype='int16')
    # sd.rec returns immediately; block here until the buffer is filled.
    sd.wait()
    print("Recording finished")
    return recording, fs


In [15]:
def recognize_speech_from_audio(audio_file):
    """Transcribe an audio file with ``Recognizer.recognize_google``.

    Args:
        audio_file: Path (or file-like object) accepted by ``sr.AudioFile``.

    Returns:
        The recognized text, or an empty string when the audio could not be
        understood or the recognition request failed (a message is printed
        in either case).
    """
    engine = sr.Recognizer()
    transcript = ""
    with sr.AudioFile(audio_file) as wav_source:
        captured = engine.record(wav_source)
        try:
            transcript = engine.recognize_google(captured)
        except sr.UnknownValueError:
            print("Google Speech Recognition could not understand audio")
        except sr.RequestError as err:
            print("Could not request results; {0}".format(err))
        else:
            print("Raw Recognized Text:", transcript)
    return transcript


In [16]:
def _capitalize_sentence_starts(text):
    """Upper-case the first character of every sentence, leaving the rest untouched."""
    sentence_enders = ('.', '!', '?')
    pieces = []
    at_sentence_start = True
    previous = None
    for ch in text:
        if ch.isspace():
            # Whitespace right after a terminator marks a sentence boundary;
            # a bare '.' inside a token such as "3.14" does not.
            if previous in sentence_enders:
                at_sentence_start = True
            pieces.append(ch)
        elif at_sentence_start:
            pieces.append(ch.upper())
            at_sentence_start = False
        else:
            pieces.append(ch)
        previous = ch
    return ''.join(pieces)


def add_punctuation_and_capitalization(text):
    """Restore punctuation in `text` and capitalize the start of each sentence.

    Unlike ``str.capitalize`` (which lower-cases everything after the first
    character), only the first character of each sentence is changed, so
    proper nouns, "I" and acronyms inside a sentence keep their case.
    Sentences are considered to end at '.', '!' or '?' followed by whitespace.

    Args:
        text: Raw, unpunctuated transcript.

    Returns:
        The punctuated, sentence-capitalized text. An empty or
        whitespace-only (or None) input yields "" without loading the model.
    """
    # Nothing to punctuate, e.g. when speech recognition failed and returned "".
    if not text or not text.strip():
        return ""

    model = PunctuationModel()
    punctuated_text = model.restore_punctuation(text)

    return _capitalize_sentence_starts(punctuated_text)

In [17]:
def calculate_wer(reference, hypothesis):
    """Compute the word error rate of `hypothesis` against `reference`.

    Both texts are split on whitespace and compared as word sequences
    (case- and punctuation-sensitive).

    Returns:
        The word-level edit distance divided by the number of reference
        words, as a percentage; 0 when the reference has no words.
    """
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()

    if not reference_tokens:
        return 0

    # Minimum number of word insertions, deletions and substitutions.
    word_edits = editdistance.eval(reference_tokens, hypothesis_tokens)
    return (word_edits / len(reference_tokens)) * 100

def calculate_cer(reference, hypothesis):
    """Compute the character error rate of `hypothesis` against `reference`.

    Every character (including spaces and punctuation) counts.

    Returns:
        The character-level edit distance divided by the reference length,
        as a percentage; 0 when the reference is empty.
    """
    reference_chars = [*reference]
    hypothesis_chars = [*hypothesis]

    if not reference_chars:
        return 0

    # Minimum number of character insertions, deletions and substitutions.
    char_edits = editdistance.eval(reference_chars, hypothesis_chars)
    return (char_edits / len(reference_chars)) * 100


def calculate_precision_recall_f1(reference, hypothesis):
    """Compute word-level precision, recall and F1 of `hypothesis` vs `reference`.

    The texts are split on whitespace and treated as bags of words: a
    hypothesis word counts as a true positive once for every matching
    occurrence still available in the reference. This works for texts of
    different lengths, whereas scikit-learn's classification metrics expect
    two equal-length label arrays and raise ``ValueError`` otherwise.

    Args:
        reference: The expected text.
        hypothesis: The recognized text.

    Returns:
        A ``(precision, recall, f1)`` tuple of percentages. Any metric whose
        denominator is zero (empty text, or no overlap for F1) is 0.0.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()

    # Multiset intersection keeps min(count_in_ref, count_in_hyp) per word.
    overlap = Counter(ref_words) & Counter(hyp_words)
    tp = sum(overlap.values())

    precision = tp / len(hyp_words) if hyp_words else 0.0
    recall = tp / len(ref_words) if ref_words else 0.0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return precision * 100, recall * 100, f1 * 100

In [18]:
def compare_texts(input_text, recognized_text):
    """Produce a word-level diff between the two texts.

    Each text is split on whitespace and the word lists are compared with
    ``difflib.Differ``.

    Returns:
        A list of strings, each a word prefixed with the two-character
        Differ code ('  ' common, '- ' only in input, '+ ' only in
        recognized, '? ' hint line).
    """
    differ = difflib.Differ()
    return [*differ.compare(input_text.split(), recognized_text.split())]



In [19]:
def calculate_index_based_accuracy(input_text, recognized_text):
    """Percentage of input words matched at the same position, ignoring case.

    Words are compared pairwise by index; positions beyond the shorter text
    count as misses because the denominator is the input word count.

    Returns:
        The accuracy as a percentage, or 0 when the input text has no words.
    """
    expected = input_text.split()
    heard = recognized_text.split()

    if not expected:
        return 0

    hits = sum(
        1 for want, got in zip(expected, heard) if want.lower() == got.lower()
    )
    return (hits / len(expected)) * 100

In [20]:
def calculate_accuracy(input_text, recognized_text):
    """Percentage of distinct input words that appear anywhere in the recognized text.

    Comparison is case-insensitive and ignores word order and repetition.

    Returns:
        The accuracy as a percentage, or 0 when the input text has no words.
    """
    expected_vocab = set(input_text.lower().split())
    heard_vocab = set(recognized_text.lower().split())

    if not expected_vocab:
        return 0

    shared = expected_vocab & heard_vocab
    return (len(shared) / len(expected_vocab)) * 100

In [21]:
def word_by_word_comparison(input_text, recognized_text):
    """Pair up the words of both texts by position.

    Returns:
        A list of ``(input_word, recognized_word, is_match)`` tuples, one
        per position up to the longer text. Where one text has run out, its
        slot is None and ``is_match`` is False. Matching is case-sensitive.
    """
    expected = input_text.split()
    heard = recognized_text.split()

    # Positions present in both texts.
    rows = [(want, got, want == got) for want, got in zip(expected, heard)]

    # At most one of these tails is non-empty.
    shared_len = len(rows)
    rows.extend((want, None, False) for want in expected[shared_len:])
    rows.extend((None, got, False) for got in heard[shared_len:])
    return rows

In [23]:
# Main flow: ask for the reference text, record the speaker, transcribe the
# recording, then report how closely the transcript matches the reference.
input_text = get_input_text()
if input_text is None:
    # The dialog was cancelled. Fail now with a clear message instead of
    # recording audio and then crashing inside the metric functions.
    raise ValueError("No input text was provided; nothing to compare the recording against.")
print("Input Text:", input_text)

audio, fs = record_audio()
wav.write("output.wav", fs, audio)

recognized_text = recognize_speech_from_audio("output.wav")
punctuated_text = add_punctuation_and_capitalization(recognized_text)

print("-----------------------------------------------------------------------")
print("FINAL SPEECH TO TEXT: ", punctuated_text)

print("-----------------------------------------------------------------------")
accuracy = calculate_accuracy(input_text, punctuated_text)
print(f"Overall Accuracy: {accuracy:.2f}%")

# NOTE: calculate_abosulte_accuracy is defined in a later cell, so it cannot
# be called here when the notebook is run top to bottom on a fresh kernel.

wer = calculate_wer(input_text, punctuated_text)
print(f"Word Error Rate: {wer:.2f}%")

cer = calculate_cer(input_text, punctuated_text)
precision, recall, f1 = calculate_precision_recall_f1(input_text, punctuated_text)
index_based_accuracy = calculate_index_based_accuracy(input_text, punctuated_text)


print(f"Character Error Rate: {cer:.2f}%")
print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"F1 Score: {f1:.2f}%")
print(f"Index-based Word Accuracy: {index_based_accuracy:.2f}%")
print("-----------------------------------------------------------------------")

# Word-level diff (difflib codes) followed by a positional word-by-word table.
comparison_result = compare_texts(input_text, punctuated_text)
print("Comparison Result:", comparison_result)

word_comparison = word_by_word_comparison(input_text, punctuated_text)
for original, recognized, match in word_comparison:
    print(f"Original: {original}, Recognized: {recognized}, Match: {match}")
print("-----------------------------------------------------------------------")

Input Text: Hello, how are you?
Recording...
Recording finished
Raw Recognized Text: hello how are you




-----------------------------------------------------------------------
FINAL SPEECH TO TEXT:  Hello, how are you?
-----------------------------------------------------------------------
Overall Accuracy: 100.00%
Word Error Rate: 0.00%
Character Error Rate: 0.00%
Precision: 100.00%
Recall: 100.00%
F1 Score: 100.00%
Index-based Word Accuracy: 100.00%
-----------------------------------------------------------------------
Comparison Result: ['  Hello,', '  how', '  are', '  you?']
Original: Hello,, Recognized: Hello,, Match: True
Original: how, Recognized: how, Match: True
Original: are, Recognized: are, Match: True
Original: you?, Recognized: you?, Match: True
-----------------------------------------------------------------------


In [33]:
def calculate_abosulte_accuracy(input_text, recognized_text):
    """Percentage of input words matched exactly at the same position.

    Unlike a case-insensitive comparison, a word only counts when it is
    identical (including case and punctuation) to the recognized word at
    the same index.

    Returns:
        The accuracy as a percentage, or 0 when the input text has no words.
    """
    expected = input_text.split()
    heard = recognized_text.split()

    if not expected:
        return 0

    hits = 0
    for want, got in zip(expected, heard):
        if want == got:
            hits += 1
    return (hits / len(expected)) * 100