<a href="https://colab.research.google.com/github/MK316/Spring2024/blob/main/Engpro/SpeechFeedback0604.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio>=2.9.4
!pip install SpeechRecognition>=3.8.1
!pip install python-Levenshtein>=0.12.2
!pip install SoundFile>=0.10.3.post1
# pandas>=1.3.5
# numpy>=1.21.6

In [None]:
import gradio as gr
import speech_recognition as sr
from Levenshtein import ratio
import tempfile
import numpy as np
import soundfile as sf
import pandas as pd

# Practice sentences, listed from easiest to hardest. The single "Sentences"
# column is what the sentence picker in the UI reads its choices from.
data = dict(
    Sentences=[
        "A stitch in time saves nine.",
        "To be or not to be, that is the question.",
        "Five cats were living in safe caves.",
        "Hives give shelter to bees in large caves.",
        "His decision to plant a rose was amazing.",
        "She sells sea shells by the sea shore.",
        "The colorful parrot likes rolling berries.",
        "Time flies like an arrow; fruit flies like a banana.",
        "Good things come to those who wait.",
        "All human beings are born free and equal in dignity and rights.",
    ],
)
df = pd.DataFrame(data)

def transcribe_audio(file_info):
    """Transcribe a Gradio audio recording with ``recognize_google``.

    Args:
        file_info: The value delivered by ``gr.Audio(type="numpy")``: a
            ``(sample_rate, samples)`` tuple, or ``None`` when no audio has
            been recorded or uploaded.

    Returns:
        str: The recognized text, or a human-readable message when there is
        no audio, the speech could not be understood, or the recognition
        request failed.
    """
    if file_info is None:
        return "No audio provided"

    sample_rate, samples = file_info
    r = sr.Recognizer()
    with tempfile.NamedTemporaryFile(delete=True, suffix=".wav") as tmpfile:
        # Write the WAV with the recording's own sample rate. Forcing 44100 Hz
        # makes audio captured at any other rate play back at the wrong
        # speed and pitch, which degrades recognition.
        sf.write(tmpfile.name, data=samples, samplerate=int(sample_rate), format='WAV')
        with sr.AudioFile(tmpfile.name) as source:
            audio_data = r.record(source)
    try:
        return r.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError as e:
        return f"Could not request results; {e}"

def _normalize_for_scoring(text):
    """Lower-case ``text`` and drop punctuation so only the words are compared."""
    kept = "".join(ch for ch in text.lower() if ch.isalnum() or ch.isspace())
    return " ".join(kept.split())


def pronunciation_correction(expected_text, file_info):
    """Score a recording against the sentence the speaker was asked to read.

    Args:
        expected_text: The target sentence (may be ``None`` or empty when the
            user has not chosen one yet).
        file_info: ``(sample_rate, samples)`` tuple from
            ``gr.Audio(type="numpy")``, or ``None`` when no audio was given.

    Returns:
        tuple[str, str]: ``(feedback, score)`` where ``score`` is the
        Levenshtein similarity ratio formatted to two decimals. When the
        inputs are missing or transcription fails, ``feedback`` carries the
        explanation and ``score`` is ``"0.00"``.
    """
    if not expected_text or not expected_text.strip():
        return "Please select or enter the expected text first.", "0.00"
    if file_info is None:
        return "Please record or upload audio first.", "0.00"

    user_spoken_text = transcribe_audio(file_info)

    # transcribe_audio reports failures as plain strings; surface them as-is
    # instead of scoring the error message as if the speaker had said it.
    if (user_spoken_text == "Could not understand audio"
            or user_spoken_text.startswith("Could not request results;")):
        return user_spoken_text, "0.00"

    # Compare words only: punctuation in the expected sentence would otherwise
    # lower the score even for a perfect reading.
    similarity = ratio(
        _normalize_for_scoring(expected_text),
        _normalize_for_scoring(user_spoken_text),
    )
    description = f"{similarity:.2f}"

    if similarity >= 0.9:
        feedback = "Excellent pronunciation!"
    elif similarity >= 0.7:
        feedback = "Good pronunciation!"
    elif similarity >= 0.5:
        feedback = "Needs improvement."
    else:
        feedback = "Poor pronunciation, try to focus more on clarity."

    return feedback, description

# Gradio UI: pick a practice sentence, supply audio, and get feedback plus a
# similarity score. Components appear in the order they are created here.
with gr.Blocks() as app:
    with gr.Row():
        # Sentence picker and a read-only box echoing the current choice.
        sentence_dropdown = gr.Dropdown(choices=df['Sentences'].tolist(), label="Select a Sentence")
        selected_sentence_output = gr.Textbox(label="Selected Text", interactive=False)
    # type="numpy" means the handler receives the audio as a tuple; the code
    # in this notebook reads the samples from index 1.
    audio_input = gr.Audio(label="Upload Audio File", type="numpy")
    check_pronunciation_button = gr.Button("Check Pronunciation")
    pronunciation_feedback = gr.Textbox(label="Pronunciation Feedback")
    pronunciation_score = gr.Number(label="Pronunciation Accuracy Score: 0 (No Match) ~ 1 (Perfect)")

    # Mirror the selected sentence into the read-only textbox.
    sentence_dropdown.change(lambda x: x, inputs=sentence_dropdown, outputs=selected_sentence_output)
    # Run the scoring function on the selected sentence and the audio; its two
    # return values fill the feedback box and the score field, in that order.
    check_pronunciation_button.click(
        pronunciation_correction,
        inputs=[sentence_dropdown, audio_input],
        outputs=[pronunciation_feedback, pronunciation_score]
    )

# Start the app with debug mode enabled.
app.launch(debug=True)

In [None]:
import gradio as gr
import speech_recognition as sr
from Levenshtein import ratio
import tempfile
import soundfile as sf

def transcribe_audio(file_info):
    """Transcribe a Gradio audio recording with the Google Web Speech API.

    Args:
        file_info: The value delivered by ``gr.Audio(type="numpy")``: a
            ``(sample_rate, samples)`` tuple, or ``None`` when no audio has
            been recorded or uploaded.

    Returns:
        str: The recognized text, or a human-readable message when there is
        no audio, the speech could not be understood, or the recognition
        request failed.
    """
    if file_info is None:
        return "No audio provided"

    sample_rate, samples = file_info
    r = sr.Recognizer()
    with tempfile.NamedTemporaryFile(delete=True, suffix=".wav") as tmpfile:
        # Write the sound file to the temporary file using the recording's own
        # sample rate. Forcing 44100 Hz makes audio captured at any other rate
        # play back at the wrong speed and pitch, which degrades recognition.
        sf.write(tmpfile.name, data=samples, samplerate=int(sample_rate), format='WAV')
        with sr.AudioFile(tmpfile.name) as source:
            audio_data = r.record(source)  # Read the entire audio file
    try:
        # Using Google Web Speech API to transcribe the audio
        return r.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError as e:
        return f"Could not request results; {e}"

def _normalize_for_scoring(text):
    """Lower-case ``text`` and drop punctuation so only the words are compared."""
    kept = "".join(ch for ch in text.lower() if ch.isalnum() or ch.isspace())
    return " ".join(kept.split())


def pronunciation_correction(expected_text, file_info):
    """Score a recording against the text the speaker was asked to read.

    Args:
        expected_text: The target text typed or pasted by the user (may be
            ``None`` or empty).
        file_info: ``(sample_rate, samples)`` tuple from
            ``gr.Audio(type="numpy")``, or ``None`` when no audio was given.

    Returns:
        tuple[str, str]: ``(feedback, score)`` where ``score`` is the
        Levenshtein similarity ratio formatted to two decimals. When the
        inputs are missing or transcription fails, ``feedback`` carries the
        explanation and ``score`` is ``"0.00"``.
    """
    if not expected_text or not expected_text.strip():
        return "Please select or enter the expected text first.", "0.00"
    if file_info is None:
        return "Please record or upload audio first.", "0.00"

    user_spoken_text = transcribe_audio(file_info)

    # transcribe_audio reports failures as plain strings; surface them as-is
    # instead of scoring the error message as if the speaker had said it.
    if (user_spoken_text == "Could not understand audio"
            or user_spoken_text.startswith("Could not request results;")):
        return user_spoken_text, "0.00"

    # Calculate the Levenshtein ratio on words only: punctuation in the
    # expected text would otherwise lower the score even for a perfect reading.
    similarity = ratio(
        _normalize_for_scoring(expected_text),
        _normalize_for_scoring(user_spoken_text),
    )
    description = f"{similarity:.2f}"  # Format similarity score to 2 decimal places

    if similarity >= 0.9:
        feedback = "Excellent pronunciation!"
    elif similarity >= 0.7:
        feedback = "Good pronunciation!"
    elif similarity >= 0.5:
        feedback = "Needs improvement."
    else:
        feedback = "Poor pronunciation, try to focus more on clarity."

    return feedback, description

# Gradio UI: free-text variant. The user types or pastes the expected text,
# supplies audio, and gets feedback plus a similarity score. Components appear
# in the order they are created here.
with gr.Blocks() as app:
    with gr.Row():
        text_input = gr.Textbox(label="Enter or paste your text here")
    # type="numpy" means the handler receives the audio as a tuple; the code
    # in this notebook reads the samples from index 1.
    audio_input = gr.Audio(label="Upload Audio File", type="numpy")
    check_pronunciation_button = gr.Button("Check Pronunciation")
    pronunciation_feedback = gr.Textbox(label="Pronunciation Feedback")
    pronunciation_score = gr.Number(label="Pronunciation Accuracy Score: 0 (No Match) ~ 1 (Perfect)")

    # Run the scoring function on the entered text and the audio; its two
    # return values fill the feedback box and the score field, in that order.
    check_pronunciation_button.click(
        pronunciation_correction,
        inputs=[text_input, audio_input],
        outputs=[pronunciation_feedback, pronunciation_score]
    )

# Start the app with debug mode enabled.
app.launch(debug=True)


# Pronunciation Feedback: WER, Fluency, WPM

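Fluency checking: the app in this section reports word error rate (WER), the number of pauses, and speaking rate in words per minute (WPM).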

In [None]:
# librosa is imported by the speech-analysis cell below, which uses it to load
# the audio, measure its duration, and split it on silence.
!pip install librosa

+ Sample text: the Rainbow Passage
+ Native speaker samples:

|Speaker|WER|Fluency|WPM|
|--|--|--|--|
|Female| WER: 0.11| Fluency: 70 pauses| 174 WPM|
|Male | WER: 0.12| Fluency: 73 pauses| 168 WPM|

In [None]:
#@markdown Language Application: WER, Fluency (in N of pauses), WPM (Words per minute)
import gradio as gr
import speech_recognition as sr
from Levenshtein import distance as lev_distance, ratio
import tempfile
import soundfile as sf
import librosa

def analyze_speech(file_info):
    """Transcribe a recording and measure its pauses, duration and word count.

    Args:
        file_info: ``(sample_rate, samples)`` tuple as delivered by
            ``gr.Audio(type="numpy")``.

    Returns:
        tuple: ``(text, num_pauses, duration, word_count)`` where ``text`` is
        the transcript from ``recognize_google``, ``num_pauses`` is the number
        of silent gaps longer than 0.5 s between stretches of sound,
        ``duration`` is the clip length in seconds as reported by librosa, and
        ``word_count`` is the number of whitespace-separated words in ``text``.

    Raises:
        ValueError: If ``file_info`` is ``None`` (no audio was provided).
        sr.UnknownValueError, sr.RequestError: Propagated from
            ``recognize_google`` when the speech cannot be understood or the
            request fails.
    """
    if file_info is None:
        raise ValueError("No audio provided")

    sample_rate, samples = file_info
    r = sr.Recognizer()
    with tempfile.NamedTemporaryFile(delete=True, suffix=".wav") as tmpfile:
        # Write the sound file to the temporary file using the recording's own
        # sample rate. Forcing 44100 Hz would distort the duration (and hence
        # WPM and pause lengths) of audio captured at any other rate.
        sf.write(tmpfile.name, data=samples, samplerate=int(sample_rate), format='WAV')

        # Load audio for pause analysis and speech rate
        y, sr_lib = librosa.load(tmpfile.name, sr=None)  # Load the file with the original sampling rate
        duration = librosa.get_duration(y=y, sr=sr_lib)

        # Detect pauses. librosa.effects.split returns the NON-silent
        # intervals (in samples), so a pause is the gap between the end of one
        # interval and the start of the next. Counting the intervals
        # themselves would count stretches of speech, not pauses.
        voiced_intervals = librosa.effects.split(y, top_db=32)
        num_pauses = sum(
            1
            for (_, prev_end), (next_start, _) in zip(voiced_intervals[:-1], voiced_intervals[1:])
            if (next_start - prev_end) / sr_lib > 0.5
        )

        with sr.AudioFile(tmpfile.name) as source:
            audio_data = r.record(source)
        text = r.recognize_google(audio_data)

        return text, num_pauses, duration, len(text.split())

def calculate_wer(reference, hypothesis):
    """Compute the word error rate (WER) of ``hypothesis`` against ``reference``.

    Both strings are split on whitespace; WER is the word-level edit distance
    (insertions, deletions and substitutions each cost 1) divided by the
    number of reference words.

    Args:
        reference: The expected text.
        hypothesis: The transcribed text.

    Returns:
        float: The word error rate, or ``float('inf')`` when ``reference``
        contains no words.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    if not ref_words:
        return float('inf')  # Avoid division by zero

    # Word-level Levenshtein distance computed one DP row at a time. This is
    # done in plain Python because Levenshtein.distance only accepts lists of
    # words in newer releases; older ones raise TypeError for non-string input.
    previous_row = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        current_row = [i]
        for j, hyp_word in enumerate(hyp_words, start=1):
            substitution = previous_row[j - 1] + (ref_word != hyp_word)
            deletion = previous_row[j] + 1
            insertion = current_row[j - 1] + 1
            current_row.append(min(substitution, deletion, insertion))
        previous_row = current_row

    return previous_row[-1] / len(ref_words)

def _normalize_for_scoring(text):
    """Lower-case ``text`` and drop punctuation so only the words are compared."""
    kept = "".join(ch for ch in text.lower() if ch.isalnum() or ch.isspace())
    return " ".join(kept.split())


def pronunciation_correction(expected_text, file_info):
    """Produce pronunciation feedback plus WER, pause count and WPM.

    Args:
        expected_text: The target text typed or pasted by the user (may be
            ``None`` or empty).
        file_info: ``(sample_rate, samples)`` tuple from
            ``gr.Audio(type="numpy")``, or ``None`` when no audio was given.

    Returns:
        tuple[str, str]: ``(feedback, description)``. ``description`` holds
        the formatted metrics; it is empty when the inputs are missing or the
        recording could not be transcribed, in which case ``feedback``
        explains why.
    """
    if not expected_text or not expected_text.strip():
        return "Please enter the expected text first.", ""
    if file_info is None:
        return "Please record or upload audio first.", ""

    # analyze_speech lets recognition errors propagate; turn them into a
    # readable message instead of failing the whole request.
    try:
        user_spoken_text, num_pauses, duration, total_words = analyze_speech(file_info)
    except sr.UnknownValueError:
        return "Could not understand audio", ""
    except sr.RequestError as e:
        return f"Could not request results; {e}", ""

    # Compare words only: punctuation in the expected text would otherwise be
    # counted as word errors and lower the similarity.
    expected_norm = _normalize_for_scoring(expected_text)
    spoken_norm = _normalize_for_scoring(user_spoken_text)

    wer = calculate_wer(expected_norm, spoken_norm)
    wpm = total_words / (duration / 60) if duration > 0 else 0
    similarity = ratio(expected_norm, spoken_norm)

    feedback = "Excellent pronunciation!" if similarity >= 0.9 else \
               "Good pronunciation!" if similarity >= 0.7 else \
               "Needs improvement." if similarity >= 0.5 else \
               "Poor pronunciation, try to focus more on clarity."

    description = f"WER: {wer:.2f}, Fluency: {num_pauses} pauses, {wpm:.0f} WPM"

    return feedback, description

# Gradio UI: the user types or pastes the expected text, supplies audio, and
# gets qualitative feedback plus a line of detailed metrics. Components appear
# in the order they are created here.
with gr.Blocks() as app:
    with gr.Row():
        text_input = gr.Textbox(label="Enter or paste your text here")
    # type="numpy" means the handler receives the audio as a tuple; the code
    # in this notebook reads the samples from index 1.
    audio_input = gr.Audio(label="Upload Audio File", type="numpy")
    check_pronunciation_button = gr.Button("Check Pronunciation")
    pronunciation_feedback = gr.Textbox(label="Pronunciation Feedback")
    pronunciation_details = gr.Textbox(label="Detailed Metrics")

    # Run the scoring function on the entered text and the audio; its two
    # return values fill the feedback box and the metrics box, in that order.
    check_pronunciation_button.click(
        pronunciation_correction,
        inputs=[text_input, audio_input],
        outputs=[pronunciation_feedback, pronunciation_details]
    )

# Start the app with debug mode enabled.
app.launch(debug=True)
