In [None]:
!pip install SpeechRecognition


Collecting SpeechRecognition
  Downloading SpeechRecognition-3.13.0-py3-none-any.whl.metadata (30 kB)
Downloading SpeechRecognition-3.13.0-py3-none-any.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.13.0


In [None]:
import os
import wave
import nltk
from nltk.corpus import cmudict
from difflib import SequenceMatcher
from pydub import AudioSegment
import speech_recognition as sr  # Import for speech recognition

# Download and load CMU Pronouncing Dictionary
# The download has to run before the corpus is read. The resulting mapping is
# kept in the module-level name `cmu_dict`, which the lookup and search
# functions in this cell read as a global.
nltk.download('cmudict')
cmu_dict = cmudict.dict()

# Look up the ARPABET pronunciation(s) recorded for a word
def get_pronunciation(word):
    """Return the `cmu_dict` entry for `word`, or None when it has no entry.

    The lookup is case-insensitive: the word is lower-cased before it is used
    as the dictionary key.

    Args:
        word: The word to look up (a string).

    Returns:
        Whatever `cmu_dict` stores under the lower-cased word (callers index
        it as a list of pronunciations), or None if the key is absent.
    """
    lookup_key = word.lower()
    return cmu_dict.get(lookup_key)

# Score how alike two phoneme sequences are
def phoneme_similarity(seq1, seq2):
    """Return the difflib similarity ratio between two sequences.

    Args:
        seq1: First sequence (used as SequenceMatcher's `a`).
        seq2: Second sequence (used as SequenceMatcher's `b`).

    Returns:
        The float produced by `SequenceMatcher.ratio()`; 1.0 for identical
        sequences and 0.0 when nothing matches.
    """
    matcher = SequenceMatcher(isjunk=None, a=seq1, b=seq2)
    return matcher.ratio()

# Function to find the closest word based on phoneme similarity
def find_closest_word(mispronounced_word):
    """Find the dictionary word whose pronunciation best matches the input word.

    The first pronunciation recorded for `mispronounced_word` is compared with
    every pronunciation of every entry in `cmu_dict` using
    `phoneme_similarity`, and the best-scoring entry is returned.

    When several entries tie for the best score, the word that was looked up
    wins the tie. Without this rule a homophone that merely appears earlier in
    the dictionary would be reported as the "correction" of a word that is
    already right (e.g. "true" being reported as "treu").

    Args:
        mispronounced_word: The word to match (looked up case-insensitively).

    Returns:
        A `(best_match, similarity)` tuple, or `(None, None)` when the input
        word has no pronunciation in the dictionary.
    """
    mispronounced_pron = get_pronunciation(mispronounced_word)
    if not mispronounced_pron:
        return None, None  # No pronunciation found in CMUdict

    target_word = mispronounced_word.lower()
    mispronounced_phonemes = mispronounced_pron[0]
    best_match = None
    highest_similarity = 0

    for word, pronunciations in cmu_dict.items():
        for pronunciation in pronunciations:
            similarity = phoneme_similarity(mispronounced_phonemes, pronunciation)
            strictly_better = similarity > highest_similarity
            # Equal score, but this entry is the very word that was asked for:
            # prefer it over whichever homophone was encountered first.
            preferred_tie = (
                similarity == highest_similarity
                and word == target_word
                and best_match != target_word
            )
            if strictly_better or preferred_tie:
                highest_similarity = similarity
                best_match = word

        # A perfect score on the looked-up word cannot be beaten or displaced,
        # so the rest of the dictionary does not need to be scanned.
        if best_match == target_word and highest_similarity >= 1.0:
            break

    return best_match, highest_similarity

# Build a correction report for a single (possibly mispronounced) word
def correct_mispronunciation(mispronounced_word):
    """Return a small report describing the closest dictionary match for a word.

    Args:
        mispronounced_word: The word to look up.

    Returns:
        A dict with the keys "input" (the word as given), "corrected" (the
        match returned by `find_closest_word`, or None when there is none) and
        "similarity" (the match score, or 0 when there is no match).
    """
    closest, score = find_closest_word(mispronounced_word)
    if not closest:
        # No usable match: report an empty correction with a zero score.
        closest, score = None, 0
    return {
        "input": mispronounced_word,
        "corrected": closest,
        "similarity": score,
    }

# Re-encode an audio file as 16 kHz mono WAV
def convert_to_supported_format(input_file, output_file):
    """Convert `input_file` to a 16 kHz, single-channel WAV written to `output_file`.

    Any exception raised while loading, resampling or exporting is caught,
    reported on stdout, and turned into a None return value.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path the WAV file is written to.

    Returns:
        `output_file` on success, or None if the conversion failed.
    """
    try:
        source_audio = AudioSegment.from_file(input_file)
        # Resample first, then down-mix to one channel.
        resampled = source_audio.set_frame_rate(16000)
        mono_audio = resampled.set_channels(1)
        mono_audio.export(output_file, format="wav")
        print(f"Converted audio saved as: {output_file}")
    except Exception as e:
        print(f"Error converting audio file: {e}")
        return None
    else:
        return output_file

# Transcribe the speech contained in a .wav file
def capture_voice_from_wav(wav_file_path):
    """Transcribe a WAV file with the recognizer's `recognize_google` method.

    Progress and error messages are printed to stdout.

    Args:
        wav_file_path: Path of the audio file to transcribe.

    Returns:
        The recognized text, or None when the file does not exist, the audio
        could not be understood (`sr.UnknownValueError`), or the recognition
        request failed (`sr.RequestError`).
    """
    recognizer = sr.Recognizer()

    if os.path.exists(wav_file_path):
        try:
            with sr.AudioFile(wav_file_path) as wav_source:
                recorded_audio = recognizer.record(wav_source)
                print("Processing the audio file...")
                transcript = recognizer.recognize_google(recorded_audio)
                print(f"Recognized Text: {transcript}")
                return transcript
        except sr.UnknownValueError:
            print("Sorry, I could not understand the audio.")
        except sr.RequestError as err:
            print(f"There was an issue connecting to the speech recognition service: {err}")
    else:
        print("Error: File does not exist.")

    # Every failure path ends up here.
    return None

# Main function
# Driver for the pipeline defined above: convert the recording to WAV,
# transcribe it, then run every recognized word through the pronunciation
# matcher and print one report line per word.
if __name__ == "__main__":
    # Specify the path to your original audio file
    input_audio_file = "/content/Recording (18).m4a"
    # Specify the output path for the converted WAV file
    converted_audio_file = "/content/converted_audio.wav"

    print("Converting the audio file to a supported format...")
    converted_file = convert_to_supported_format(input_audio_file, converted_audio_file)

    # convert_to_supported_format returns None on failure (it has already
    # printed the error), in which case nothing further is attempted.
    if converted_file:
        print("Processing voice input from the converted audio file...")
        voice_input = capture_voice_from_wav(converted_file)

        # capture_voice_from_wav returns None when no text could be recognized.
        if voice_input:
            # Split the transcript on whitespace and check each word on its own.
            words = voice_input.split()
            for word in words:
                result = correct_mispronunciation(word)
                print(f"Input: {result['input']}, Corrected: {result['corrected']}, Similarity: {result['similarity']:.2f}")


[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


Converting the audio file to a supported format...
Converted audio saved as: /content/converted_audio.wav
Processing voice input from the converted audio file...
Processing the audio file...
Recognized Text: true
Input: true, Corrected: treu, Similarity: 1.00


In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa

# Load the pre-trained processor and CTC model from a single checkpoint id,
# so the two can never be pointed at different checkpoints by accident.
WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(WAV2VEC2_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(WAV2VEC2_CHECKPOINT)

# Function to transcribe audio using Wav2Vec2
def transcribe_audio_wav2vec2(audio_path):
    """Transcribe an audio file with the module-level Wav2Vec2 `processor` and `model`.

    The audio is loaded with librosa at 16 kHz, run through the model, and the
    most likely token at each step is decoded (greedy decoding via argmax).

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcription for the (single) input, as returned by
        `processor.decode`.
    """
    audio, rate = librosa.load(audio_path, sr=16000)  # Load and resample audio
    input_values = processor(audio, sampling_rate=rate, return_tensors="pt").input_values

    # Pure inference: disable autograd so no computation graph is recorded for
    # the forward pass (saves memory and time; the outputs are unchanged).
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    # Only one audio clip was passed in, so decode the first row.
    transcription = processor.decode(predicted_ids[0])
    return transcription

# Example usage
# Transcribes the WAV file at the path the conversion step in the earlier
# cell writes to, so that cell has to have produced the file first.
audio_file = "/content/converted_audio.wav"
print("Transcription:", transcribe_audio_wav2vec2(audio_file))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Transcription: DO


In [None]:
from difflib import SequenceMatcher

# Score how closely a transcription matches the expected text
def detect_mispronunciations(transcribed_text, ground_truth):
    """Return the difflib similarity ratio between a transcription and its reference.

    The two arguments are compared element by element with `SequenceMatcher`;
    when plain strings are passed, the elements are individual characters, so
    the score reflects spelling overlap rather than phonemes.

    Args:
        transcribed_text: The sequence produced by the recognizer.
        ground_truth: The reference sequence it should match.

    Returns:
        The float produced by `SequenceMatcher.ratio()` (1.0 means identical).
    """
    return SequenceMatcher(None, transcribed_text, ground_truth).ratio()

# Example usage
# Both inputs are ordinary text strings, so the printed value is a
# character-level similarity between the two sentences.
ground_truth = "example phoneme sequence"
transcription = "transcribed phoneme sequence"
print("Phoneme similarity:", detect_mispronunciations(transcription, ground_truth))


Phoneme similarity: 0.7307692307692307
