In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import torchaudio
import os

: 

In [None]:

# Single source of truth for the checkpoint so the processor and the model
# are always loaded from the same pretrained weights.
WHISPER_CHECKPOINT = "openai/whisper-large-v2"

processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT)
model = WhisperForConditionalGeneration.from_pretrained(WHISPER_CHECKPOINT)


In [None]:

# Locate one TIMIT utterance. The path is assembled with os.path.join so the
# same relative location resolves on any OS (on Windows it yields the same
# backslash-separated string as before).
timit_path = os.path.join("timit", "data", "TRAIN", "DR1", "FCJF0")  # Update this path as necessary
audio_file = os.path.join(timit_path, "SA1.WAV")

# Sampling rate handed to the processor below; audio is resampled to match it.
TARGET_SAMPLE_RATE = 16000

waveform, sample_rate = torchaudio.load(audio_file)
# Collapse the channel axis by averaging. For a single-channel file this gives
# the same 1-D signal squeeze() did; for multi-channel files it down-mixes to
# mono instead of leaving a 2-D tensor.
# assumes torchaudio's default channels-first layout — TODO confirm if load() args change
waveform = waveform.mean(dim=0)

if sample_rate != TARGET_SAMPLE_RATE:
    waveform = torchaudio.transforms.Resample(sample_rate, TARGET_SAMPLE_RATE)(waveform)

# The feature extractor is documented for NumPy arrays / lists of floats, so
# pass a NumPy view. `waveform` itself stays a torch tensor for later cells.
inputs = processor(waveform.numpy(), sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt")

# Inference only: no gradients are needed for generation.
with torch.no_grad():
    predicted_ids = model.generate(inputs["input_features"], max_length=50)

# batch_decode returns one string per sequence; only one clip was submitted.
predicted_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print("Predicted Text:", predicted_text)

# Reference sentence the prediction is scored against in the following cells.
ground_truth_text = "She had your dark suit in greasy wash water all year"


In [None]:
from jiwer import wer, mer, wil
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import string

# Score on normalised text. The reference is bare words, whereas a transcript
# that carries punctuation (e.g. a trailing "year.") would otherwise have each
# punctuated token counted as a word error.
_punctuation_table = str.maketrans("", "", string.punctuation)
reference_normalized = " ".join(ground_truth_text.lower().translate(_punctuation_table).split())
hypothesis_normalized = " ".join(predicted_text.lower().translate(_punctuation_table).split())

# jiwer takes the reference first and the hypothesis second.
error_rate = wer(reference_normalized, hypothesis_normalized)
print("Word Error Rate:", error_rate)

match_error_rate = mer(reference_normalized, hypothesis_normalized)
wil_rate = wil(reference_normalized, hypothesis_normalized)
print("Match Error Rate:", match_error_rate)
print("Word Information Lost Rate:", wil_rate)

def visualize_pronunciation(ground_truth, prediction):
    """Draw the reference sentence with each word coloured by correctness.

    A reference word is green when the predicted word at the same position is
    equal to it, ignoring case and leading/trailing punctuation; otherwise
    (including when the prediction has no word at that position) it is red.
    The comparison is purely positional, so one inserted or dropped word
    shifts every later comparison.

    Args:
        ground_truth: Reference sentence; split on whitespace.
        prediction: Predicted sentence; split on whitespace.
    """
    import string

    def _key(word):
        # "year." and "Year" should count as the same word.
        return word.strip(string.punctuation).lower()

    ground_truth_words = ground_truth.split()
    predicted_words = prediction.split()
    n_words = len(ground_truth_words)

    # Widen the figure with the sentence so neighbouring words do not collide.
    base_width, base_height = plt.rcParams["figure.figsize"]
    fig, ax = plt.subplots(figsize=(max(base_width, 0.9 * n_words), base_height))
    for idx, word in enumerate(ground_truth_words):
        is_match = idx < len(predicted_words) and _key(word) == _key(predicted_words[idx])
        color = 'green' if is_match else 'red'
        # Centre each word in its own equal-width slot of the axes. The previous
        # fixed 0.1 step in data coordinates put word 0 on the left edge and
        # words past index 10 beyond the right edge.
        ax.text((idx + 0.5) / n_words, 0.5, word, color=color, fontsize=12,
                ha='center', transform=ax.transAxes)
    ax.axis('off')
    plt.show()

visualize_pronunciation(ground_truth_text, predicted_text)

In [None]:

def plot_performance_metrics(accuracy, precision, recall, f1):
    """Show the four scores as a bar chart on a fixed 0-1 y-axis.

    Args:
        accuracy: Value plotted under "Accuracy".
        precision: Value plotted under "Precision".
        recall: Value plotted under "Recall".
        f1: Value plotted under "F1 Score".
    """
    metric_names = ["Accuracy", "Precision", "Recall", "F1 Score"]
    metric_values = [accuracy, precision, recall, f1]

    plt.figure(figsize=(8, 5))
    # barplot draws on the current axes and hands them back for labelling.
    axis = sns.barplot(x=metric_names, y=metric_values, palette="Blues_d")
    axis.set_ylim(0, 1)
    axis.set_title("Pronunciation Prediction Model Performance Metrics")
    axis.set_ylabel("Score")
    axis.set_xlabel("Metric")
    plt.show()

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_labels(ground_truth, prediction):
    """Return one 0/1 correctness flag per word position.

    Both sentences are split on whitespace and compared position by position,
    ignoring case and leading/trailing punctuation. Positions that exist in
    only one of the two sentences (a dropped or an extra word) are labelled 0
    rather than being silently discarded, so the result has as many entries as
    the longer sentence.

    Args:
        ground_truth: Reference sentence.
        prediction: Predicted sentence.

    Returns:
        list[int]: 1 where the aligned words match, 0 otherwise.
    """
    import string
    from itertools import zip_longest

    def _key(word):
        # "year." and "Year" should count as the same word.
        return word.strip(string.punctuation).lower()

    ground_truth_words = ground_truth.split()
    predicted_words = prediction.split()
    return [
        int(ref_word is not None and hyp_word is not None and _key(ref_word) == _key(hyp_word))
        for ref_word, hyp_word in zip_longest(ground_truth_words, predicted_words)
    ]

# Per-position correctness flags for the reference/prediction pair.
labels = get_labels(ground_truth_text, predicted_text)

# Score the flags against an "every word correct" reference of equal length.
# NOTE(review): because that reference contains no zeros there can be no false
# positives, so precision cannot fall below 1 once any word matches, and recall
# equals accuracy. Treat these as a summary of the flags, not classifier metrics.
all_correct = [1] * len(labels)
accuracy, precision, recall, f1 = (
    scorer(all_correct, labels)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

plot_performance_metrics(accuracy, precision, recall, f1)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_labels(ground_truth, prediction):
    """Return one 0/1 correctness flag per word position.

    Both sentences are split on whitespace and compared position by position,
    ignoring case and leading/trailing punctuation. Positions that exist in
    only one of the two sentences (a dropped or an extra word) are labelled 0
    rather than being silently discarded, so the result has as many entries as
    the longer sentence.

    Args:
        ground_truth: Reference sentence.
        prediction: Predicted sentence.

    Returns:
        list[int]: 1 where the aligned words match, 0 otherwise.
    """
    import string
    from itertools import zip_longest

    def _key(word):
        # "year." and "Year" should count as the same word.
        return word.strip(string.punctuation).lower()

    ground_truth_words = ground_truth.split()
    predicted_words = prediction.split()
    return [
        int(ref_word is not None and hyp_word is not None and _key(ref_word) == _key(hyp_word))
        for ref_word, hyp_word in zip_longest(ground_truth_words, predicted_words)
    ]

# Per-position correctness flags for the reference/prediction pair.
labels = get_labels(ground_truth_text, predicted_text)

# Every position is expected to be correct, so the reference is all ones.
expected_flags = [1] * len(labels)
accuracy = accuracy_score(expected_flags, labels)
precision = precision_score(expected_flags, labels)
recall = recall_score(expected_flags, labels)
f1 = f1_score(expected_flags, labels)

# Report each score with two decimals.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.2f}")


In [None]:
import matplotlib.pyplot as plt
import torchaudio

def visualize_audio_with_text_overlay(waveform, ground_truth, prediction, sample_rate=16000):
    """Plot the waveform with the reference words written above it.

    The clip duration is divided into one equal-length segment per reference
    word and each word is centred over its segment. This is an even split of
    the time axis, not an alignment derived from the audio. A word is green
    when the predicted word at the same index equals it (ignoring case and
    leading/trailing punctuation) and red otherwise.

    Args:
        waveform: 1-D torch tensor of samples (its first dimension is used as
            the sample count and it is converted with ``.numpy()``).
        ground_truth: Reference sentence; split on whitespace.
        prediction: Predicted sentence; split on whitespace.
        sample_rate: Samples per second used to convert sample count to time.
    """
    import string

    def _key(word):
        # "year." and "Year" should count as the same word.
        return word.strip(string.punctuation).lower()

    ground_truth_words = ground_truth.split()
    predicted_words = prediction.split()

    samples = waveform.numpy()
    # Plain float for the text anchor instead of a 0-d tensor.
    peak_amplitude = float(samples.max())

    total_time = waveform.size(0) / sample_rate
    word_times = np.linspace(0, total_time, len(ground_truth_words) + 1)
    # Midpoint of each segment. Anchoring at the segment start centred the
    # first word on t=0, leaving half of it hanging off the left of the plot.
    word_centers = (word_times[:-1] + word_times[1:]) / 2

    fig, ax = plt.subplots()
    ax.plot(np.linspace(0, total_time, waveform.size(0)), samples, label="Audio Signal")

    for idx, word in enumerate(ground_truth_words):
        is_match = idx < len(predicted_words) and _key(word) == _key(predicted_words[idx])
        color = 'green' if is_match else 'red'
        ax.text(word_centers[idx], peak_amplitude, word, color=color, fontsize=9,
                ha='center', va='bottom')

    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Amplitude")
    ax.legend()
    plt.show()

visualize_audio_with_text_overlay(waveform, ground_truth_text, predicted_text)


In [None]:
import whisper
import numpy as np
from jiwer import wer, cer
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt

# Load Whisper model
# NOTE(review): this rebinds the notebook-level name `model`, which an earlier
# cell bound to the Hugging Face WhisperForConditionalGeneration instance.
# Re-running that earlier `model.generate(...)` cell after this one will pick
# up this object instead.
WHISPER_MODEL_NAME = "base"
model = whisper.load_model(WHISPER_MODEL_NAME)

def transcribe_audio(audio_path):
    """Transcribe an audio file with the notebook-level Whisper ``model``.

    When ``audio_path`` is a string, two pre-flight checks run before the model
    is called so that a failure is reported with an actionable message instead
    of a bare "[WinError 2] The system cannot find the file specified":
    the audio file must exist, and an ``ffmpeg`` executable must be on PATH
    (openai-whisper decodes audio files by launching ffmpeg).

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The ``"text"`` entry of the result returned by ``model.transcribe``.

    Raises:
        FileNotFoundError: If the audio file or the ffmpeg executable is missing.
    """
    import os
    import shutil

    if isinstance(audio_path, str):
        if not os.path.isfile(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")
        if shutil.which("ffmpeg") is None:
            raise FileNotFoundError(
                "ffmpeg executable not found on PATH; install ffmpeg so Whisper can decode audio files"
            )

    # Transcribe audio file using Whisper
    result = model.transcribe(audio_path)
    return result["text"]

def evaluate_pronunciation(pred_transcription, expected_transcription):
    """Score a predicted transcription against the expected one.

    Both texts are lower-cased, stripped of punctuation and whitespace-
    normalised before any metric is computed, so casing and punctuation in the
    prediction are not counted as errors.

    Precision, recall and F1 are computed from the bag-of-words overlap (each
    word counted as many times as it appears in both texts). This works for
    transcriptions of different lengths; the previous approach built two label
    lists of different lengths, which scikit-learn rejects.

    Args:
        pred_transcription: Text produced by the recogniser.
        expected_transcription: Reference text.

    Returns:
        dict with keys "WER", "CER", "Precision", "Recall" and "F1 Score".
        Precision/recall fall back to 1.0 when their denominator is empty,
        matching the ``zero_division=1`` convention used previously.
    """
    import string
    from collections import Counter

    punctuation_table = str.maketrans("", "", string.punctuation)

    # Split transcription into words for comparison
    pred_words = pred_transcription.lower().translate(punctuation_table).split()
    expected_words = expected_transcription.lower().translate(punctuation_table).split()
    pred_normalized = " ".join(pred_words)
    expected_normalized = " ".join(expected_words)

    # Calculate WER and CER (reference first, hypothesis second)
    wer_score = wer(expected_normalized, pred_normalized)
    cer_score = cer(expected_normalized, pred_normalized)

    # Calculate Precision, Recall, and F1 from the multiset intersection
    overlap = sum((Counter(pred_words) & Counter(expected_words)).values())
    precision = overlap / len(pred_words) if pred_words else 1.0
    recall = overlap / len(expected_words) if expected_words else 1.0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return {
        "WER": wer_score,
        "CER": cer_score,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

def highlight_transcription(pred_transcription, expected_transcription):
    """Return the predicted words wrapped in ANSI colour codes.

    Both texts are lower-cased and split on whitespace. Each predicted word is
    green when the expected word at the same position equals it (ignoring
    leading/trailing punctuation) and red otherwise. Every predicted word is
    emitted: words beyond the end of the expected text are shown in red
    instead of being dropped from the output.

    Args:
        pred_transcription: Text produced by the recogniser.
        expected_transcription: Reference text.

    Returns:
        str: The coloured predicted words joined by single spaces.
    """
    import string

    green, red, reset = "\033[92m", "\033[91m", "\033[0m"

    def _key(word):
        # "year." should count as "year".
        return word.strip(string.punctuation)

    # Highlight correct and incorrect words
    pred_words = pred_transcription.lower().split()
    expected_words = expected_transcription.lower().split()

    highlighted_text = []
    for idx, pred_word in enumerate(pred_words):
        is_match = idx < len(expected_words) and _key(pred_word) == _key(expected_words[idx])
        colour = green if is_match else red
        highlighted_text.append(f"{colour}{pred_word}{reset}")
    return " ".join(highlighted_text)

def plot_metrics(metrics_dict):
    """Draw a bar chart of the metrics with each value printed above its bar.

    Args:
        metrics_dict: Mapping of metric name to numeric score.
    """
    # Create bar plot for evaluation metrics
    labels = list(metrics_dict.keys())
    values = list(metrics_dict.values())

    # Error rates are not capped at 1 (insertions can push WER above it), so
    # size the axis from the data, never below the 0-1 range, and leave
    # headroom for the value label drawn 0.02 above each bar.
    y_upper = max([1.0, *values]) * 1.1

    plt.figure(figsize=(10, 6))
    plt.bar(labels, values, color=['blue', 'orange', 'green', 'red', 'purple'])
    plt.title("Pronunciation Evaluation Metrics")
    plt.xlabel("Metrics")
    plt.ylabel("Score")
    plt.ylim(0, y_upper)
    for i, v in enumerate(values):
        plt.text(i, v + 0.02, f"{v:.2f}", ha='center', fontweight='bold')
    plt.show()

# Define paths and expected transcription
# The clip is addressed relative to the working directory, the same way the
# Hugging Face cell locates it, rather than through an absolute path that only
# exists on one machine's D: drive.
audio_path = os.path.join("timit", "data", "TRAIN", "DR1", "FCJF0", "SA1.WAV")
expected_transcription = "She had your dark suit in greasy wash water all year"

# Transcribe and evaluate
pred_transcription = transcribe_audio(audio_path)
metrics = evaluate_pronunciation(pred_transcription, expected_transcription)

# Display highlighted transcription with correct/incorrect words
highlighted_text = highlight_transcription(pred_transcription, expected_transcription)
print("Highlighted Transcription:", highlighted_text)

# Print evaluation metrics
print("Evaluation Metrics:", metrics)

# Plot metrics
plot_metrics(metrics)


  checkpoint = torch.load(fp, map_location=device)


FileNotFoundError: [WinError 2] The system cannot find the file specified