In [1]:
import torch
import torchaudio
import os

In [2]:
# Path to the TIMIT dataset audio file
audio_file = os.path.join("timit", "data", "TRAIN", "DR1", "FCJF0", "SA1.WAV")

# Load the audio file
waveform, sample_rate = torchaudio.load(audio_file)
waveform = waveform.squeeze()

# Resample to 16kHz if necessary
if sample_rate != 16000:
    waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)

In [None]:
from transformers import Wav2Vec2Processor
from transformers import QuartzNetForCTC  # Hypothetical import as QuartzNet is usually available in NVIDIA's toolkit

# Replace with the correct model ID if QuartzNet is on Hugging Face; otherwise, use NVIDIA NeMo framework.
quartz_processor = Wav2Vec2Processor.from_pretrained("nvidia/quartznet15x5")
quartz_model = QuartzNetForCTC.from_pretrained("nvidia/quartznet15x5")

# Process and tokenize the waveform
inputs = quartz_processor(waveform, sampling_rate=16000, return_tensors="pt")

# Get the predicted text from QuartzNet
with torch.no_grad():
    logits = quartz_model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

# Decode the predicted ids to text
ground_truth_text = "She had your dark suit in greasy wash water all year"


predicted_text = quartz_processor.batch_decode(predicted_ids)[0]
print("QuartzNet Predicted Text:", predicted_text)


  torch.utils._pytree._register_pytree_node(


ImportError: cannot import name 'QuartzNetForCTC' from 'transformers' (d:\Anaconda\Lib\site-packages\transformers\__init__.py)

In [None]:
from jiwer import wer, mer, wil
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

error_rate = wer(ground_truth_text.lower(), predicted_text.lower())
print("Word Error Rate:", error_rate)

match_error_rate = mer(ground_truth_text.lower(), predicted_text.lower())
wil_rate = wil(ground_truth_text.lower(), predicted_text.lower())
print("Match Error Rate:", match_error_rate)
print("Word Information Lost Rate:", wil_rate)

def visualize_pronunciation(ground_truth, prediction):
    ground_truth_words = ground_truth.split()
    predicted_words = prediction.split()
    
    fig, ax = plt.subplots()
    for idx, word in enumerate(ground_truth_words):
        color = 'green' if idx < len(predicted_words) and word.lower() == predicted_words[idx].lower() else 'red'
        ax.text(idx * 0.1, 0.5, word, color=color, fontsize=12, ha='center')
    ax.axis('off')
    plt.show()

visualize_pronunciation(ground_truth_text, predicted_text)

In [None]:

def plot_performance_metrics(accuracy, precision, recall, f1):
    metrics = {"Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1}
    
    plt.figure(figsize=(8, 5))
    sns.barplot(x=list(metrics.keys()), y=list(metrics.values()), palette="Blues_d")
    plt.ylim(0, 1)
    plt.title("Pronunciation Prediction Model Performance Metrics")
    plt.ylabel("Score")
    plt.xlabel("Metric")
    plt.show()

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_labels(ground_truth, prediction):
    ground_truth_words = ground_truth.split()
    predicted_words = prediction.split()
    labels = [1 if gt.lower() == pd.lower() else 0 for gt, pd in zip(ground_truth_words, predicted_words)]
    return labels

labels = get_labels(ground_truth_text, predicted_text)

accuracy = accuracy_score([1]*len(labels), labels)
precision = precision_score([1]*len(labels), labels)
recall = recall_score([1]*len(labels), labels)
f1 = f1_score([1]*len(labels), labels)

plot_performance_metrics(accuracy, precision, recall, f1)
