In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import os

BASE_DIR = "/content/drive/MyDrive/dentist_project"
GT_DIR = os.path.join(BASE_DIR, "conversations")
TR_DIR = os.path.join(BASE_DIR, "transcripts")


In [10]:
import re, os

def normalize(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

for file in ["Conversation1.txt", "Conversation2.txt"]:
    path = os.path.join(GT_DIR, file)
    text = open(path, encoding="utf-8").read()
    open(path, "w", encoding="utf-8").write(normalize(text))

print("Gold-standard references normalized")


Gold-standard references normalized


**Cohen’s Kappa**

In [8]:
from sklearn.metrics import cohen_kappa_score
import os

BASE_DIR = "/content/drive/MyDrive/dentist_project/conversations"

a1 = open(
    os.path.join(BASE_DIR, "Conversation1.txt"),
    encoding="utf-8"
).read().split()

a2 = open(
    os.path.join(BASE_DIR, "Conversation2.txt"),
    encoding="utf-8"
).read().split()

# Align lengths
min_len = min(len(a1), len(a2))

kappa = cohen_kappa_score(a1[:min_len], a2[:min_len])

print("Inter-Annotator Agreement:", round(kappa, 2))


Inter-Annotator Agreement: 0.03


**Stratified WER**

In [13]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.3


In [14]:
from jiwer import wer

pairs = {
    "Conversation1": ("Conversation1.txt", "transcript_conversation1.txt"),
    "Conversation2": ("Conversation2.txt", "transcript_conversation2.txt")
}

for label, (gt_f, tr_f) in pairs.items():
    gt = open(os.path.join(GT_DIR, gt_f), encoding="utf-8").read()
    tr = open(os.path.join(TR_DIR, tr_f), encoding="utf-8").read()
    print(label, "WER:", round(wer(gt, tr), 3))


Conversation1 WER: 0.428
Conversation2 WER: 0.366


**Paired t-test on WER values**

In [15]:
from scipy.stats import ttest_rel

# Example WER values from Conversation 1 & 2
whisper_wer = [0.275, 0.279]
medasr_wer  = [0.894, 0.913]

t_stat, p_val = ttest_rel(whisper_wer, medasr_wer)

print("T-statistic:", round(t_stat, 3))
print("P-value:", round(p_val, 4))


T-statistic: -83.533
P-value: 0.0076


**Continuous evaluation logger**

In [16]:
import csv, time

log_path = os.path.join(BASE_DIR, "metrics_log.csv")

with open(log_path, "a", newline="") as f:
    writer = csv.writer(f)
    writer.writerow([
        time.strftime("%Y-%m-%d %H:%M"),
        "Conversation1",
        "Whisper",
        0.275
    ])

print("Evaluation logged")


Evaluation logged
