# **Weighted Error Rate:**

In [1]:
# Ground truth and ASR output
ground_truth = "patient prescribed insulin 10 units daily"
predicted_text = "patient prescribed insulin 100 units daily"

# Define weights
weights = {
    "medication": 3.0,
    "dosage": 5.0,
    "normal": 1.0
}


def token_weight(token, token_weights):
    """Return the error weight of a single token.

    Purely numeric tokens are weighted as dosages, "insulin" as a medication,
    and every other token gets the "normal" weight.
    """
    # NOTE(review): str.isdigit() is False for decimals such as "2.5", so those
    # tokens fall through to the "normal" weight -- confirm this is intended.
    if token.isdigit():
        return token_weights["dosage"]
    if token.lower() in ["insulin"]:
        return token_weights["medication"]
    return token_weights["normal"]


def weighted_error_counts(ref_tokens, hyp_tokens, token_weights):
    """Align two token lists and return ``(weighted_errors, total_weight)``.

    ``weighted_errors`` is the minimum total weight of the substitutions,
    deletions and insertions that turn ``ref_tokens`` into ``hyp_tokens``:

    * deleting a reference token costs that token's weight,
    * inserting a hypothesis token costs that token's weight,
    * substituting costs the larger of the two tokens' weights, so a critical
      token cannot be hidden behind a cheap neighbour.

    ``total_weight`` is the summed weight of the reference tokens. Because
    insertions are counted, ``weighted_errors`` may exceed ``total_weight``.

    A position-by-position ``zip`` comparison would stop at the shorter
    sequence and mis-score everything after the first dropped or inserted
    word; the alignment below avoids both problems.
    """
    ref_weights = [token_weight(tok, token_weights) for tok in ref_tokens]
    hyp_weights = [token_weight(tok, token_weights) for tok in hyp_tokens]

    # row[j] = cheapest cost of turning the reference prefix processed so far
    # into hyp_tokens[:j]. With an empty prefix that is pure insertion.
    row = [0.0]
    for hyp_w in hyp_weights:
        row.append(row[-1] + hyp_w)

    for ref_tok, ref_w in zip(ref_tokens, ref_weights):
        next_row = [row[0] + ref_w]  # empty hypothesis prefix: pure deletion
        for j, (hyp_tok, hyp_w) in enumerate(zip(hyp_tokens, hyp_weights), start=1):
            if ref_tok == hyp_tok:
                diagonal = row[j - 1]
            else:
                diagonal = row[j - 1] + max(ref_w, hyp_w)
            next_row.append(min(diagonal, row[j] + ref_w, next_row[j - 1] + hyp_w))
        row = next_row

    return row[-1], sum(ref_weights)


# Tokenize
gt_tokens = ground_truth.split()
pred_tokens = predicted_text.split()

weighted_errors, total_weight = weighted_error_counts(gt_tokens, pred_tokens, weights)

# An empty reference carries no weight, so the rate would be a division by zero.
if total_weight == 0:
    raise ValueError("ground_truth contains no tokens; weighted error rate is undefined")

weighted_error_rate = weighted_errors / total_weight

print("Weighted Error Rate:", weighted_error_rate)


Weighted Error Rate: 0.4166666666666667


# **Weighted Error:**

In [2]:
# Reference and predicted text
reference = "patient prescribed insulin 10 mg daily"
prediction = "patient prescribed insulin 100 mg daily"

# Error category weights
# NOTE: "diagnosis", "negation" and "laterality" have weights here, but
# classify_token() below never assigns those categories.
CATEGORY_WEIGHTS = {
    "medication": 10.0,
    "dosage": 10.0,
    "diagnosis": 6.0,
    "negation": 8.0,
    "laterality": 8.0,
    "article": 0.1,
    "normal": 1.0
}

# Simple token classification
MEDICATIONS = {"insulin"}
ARTICLES = {"a", "an", "the"}


def classify_token(token):
    """Return the error category name for a single token.

    Purely numeric tokens are "dosage"; members of MEDICATIONS and ARTICLES
    (compared in lower case) are "medication" and "article"; anything else is
    "normal".
    """
    # NOTE(review): str.isdigit() is False for decimals such as "2.5", so those
    # tokens are classified as "normal" -- confirm this is intended.
    if token.isdigit():
        return "dosage"
    lowered = token.lower()
    if lowered in MEDICATIONS:
        return "medication"
    if lowered in ARTICLES:
        return "article"
    return "normal"


def category_weighted_error_counts(ref_tokens, hyp_tokens, category_weights):
    """Align two token lists and return ``(weighted_errors, total_weight)``.

    ``weighted_errors`` is the minimum total weight of the substitutions,
    deletions and insertions that turn ``ref_tokens`` into ``hyp_tokens``:

    * deleting a reference token costs that token's category weight,
    * inserting a hypothesis token costs that token's category weight,
    * substituting costs the larger of the two tokens' category weights.

    ``total_weight`` is the summed category weight of the reference tokens.
    Because insertions are counted, ``weighted_errors`` may exceed
    ``total_weight``.

    A position-by-position ``zip`` comparison would stop at the shorter
    sequence and mis-score everything after the first dropped or inserted
    word; the alignment below avoids both problems.
    """
    ref_weights = [category_weights[classify_token(tok)] for tok in ref_tokens]
    hyp_weights = [category_weights[classify_token(tok)] for tok in hyp_tokens]

    # row[j] = cheapest cost of turning the reference prefix processed so far
    # into hyp_tokens[:j]. With an empty prefix that is pure insertion.
    row = [0.0]
    for hyp_w in hyp_weights:
        row.append(row[-1] + hyp_w)

    for ref_tok, ref_w in zip(ref_tokens, ref_weights):
        next_row = [row[0] + ref_w]  # empty hypothesis prefix: pure deletion
        for j, (hyp_tok, hyp_w) in enumerate(zip(hyp_tokens, hyp_weights), start=1):
            if ref_tok == hyp_tok:
                diagonal = row[j - 1]
            else:
                diagonal = row[j - 1] + max(ref_w, hyp_w)
            next_row.append(min(diagonal, row[j] + ref_w, next_row[j - 1] + hyp_w))
        row = next_row

    return row[-1], sum(ref_weights)


ref_tokens = reference.split()
pred_tokens = prediction.split()

weighted_errors, total_weight = category_weighted_error_counts(
    ref_tokens, pred_tokens, CATEGORY_WEIGHTS
)

# An empty reference carries no weight, so the rate would be a division by zero.
if total_weight == 0:
    raise ValueError("reference contains no tokens; weighted error rate is undefined")

weighted_error_rate = weighted_errors / total_weight
print("Weighted Error Rate:", weighted_error_rate)


Weighted Error Rate: 0.4166666666666667


# **Error Severity:**

In [3]:
import numpy as np

# Five clinicians each rate one error on a 1 (low) to 5 (critical) scale.
# The error being rated in this example: "Medication dosage error"
expert_scores = {
    "clinician_1": 5,
    "clinician_2": 4,
    "clinician_3": 5,
    "clinician_4": 4,
    "clinician_5": 5
}

scores = np.array([*expert_scores.values()])

# Central tendency and spread of the ratings (population standard deviation).
mean_score = scores.mean()
median_score = np.median(scores)
std_dev = scores.std()

for caption, value in (
    ("Mean severity score:", mean_score),
    ("Median severity score:", median_score),
    ("Agreement (std deviation):", std_dev),
):
    print(caption, value)

# Final severity label: the first band whose lower bound the mean reaches,
# checked from most to least severe; anything below every bound is "Low".
severity_bands = ((4.5, "Critical"), (3.5, "High"), (2.5, "Medium"))
severity = next(
    (label for lower_bound, label in severity_bands if mean_score >= lower_bound),
    "Low",
)

print("Final consensus severity:", severity)


Mean severity score: 4.6
Median severity score: 5.0
Agreement (std deviation): 0.48989794855663565
Final consensus severity: Critical


# **Validation:**

In [4]:
import numpy as np

# Each reviewer role scores the output from 1 (incorrect) to 5 (highly accurate).
clinical_feedback = {
    "doctor": 4,
    "nurse": 5,
    "pharmacist": 4,
    "resident": 5
}

scores = np.array([*clinical_feedback.values()])

# Average rating, and the spread between reviewers (population standard deviation).
mean_score = scores.mean()
agreement = scores.std()

print("Mean clinical validation score:", mean_score)
print("Inter-reviewer agreement (std dev):", agreement)

# The first band whose lower bound the mean reaches decides the status;
# a mean below every bound means the output needs rework.
status_bands = (
    (4.5, "Validated for clinical use"),
    (3.5, "Conditionally acceptable"),
)
status = next(
    (label for lower_bound, label in status_bands if mean_score >= lower_bound),
    "Requires revision",
)

print("Validation status:", status)


Mean clinical validation score: 4.5
Inter-reviewer agreement (std dev): 0.5
Validation status: Validated for clinical use


# **Cohen’s Kappa:**

In [5]:
from sklearn.metrics import cohen_kappa_score

# Two annotators judge the same six words: 1 marks a word correct, 0 incorrect.
annotator_1 = [1, 1, 0, 1, 1, 0]
annotator_2 = [1, 1, 1, 1, 0, 0]

# Score how well the two label sequences agree.
kappa = cohen_kappa_score(y1=annotator_1, y2=annotator_2)
print("Cohen's Kappa:", kappa)


Cohen's Kappa: 0.25


# **Stratified Metric:**

In [6]:
import pandas as pd

# Toy evaluation table: one WER value per (specialty, audio quality) combination
data = {
    "specialty": ["Cardiology", "Cardiology", "Surgery", "Surgery"],
    "audio_quality": ["Clean", "Noisy", "Clean", "Noisy"],
    "wer": [0.12, 0.25, 0.15, 0.35]
}

df = pd.DataFrame(data)

# Stratified evaluation: average WER inside each specialty / audio-quality group
strata = ["specialty", "audio_quality"]
wer_by_stratum = df.groupby(strata)["wer"]
stratified_results = wer_by_stratum.mean()
print(stratified_results)


specialty   audio_quality
Cardiology  Clean            0.12
            Noisy            0.25
Surgery     Clean            0.15
            Noisy            0.35
Name: wer, dtype: float64
