# **Confidence Scoring and Flagging:**

In [1]:

# Segments whose confidence is strictly below this value are sent for review.
THRESHOLD = 0.75

# Simulated ASR output: one record per transcribed segment, pairing the
# recognised text with the recogniser's confidence in it.
asr_output = [
    dict(text=phrase, confidence=score)
    for phrase, score in (
        ("Patient has hypertension", 0.94),
        ("and is prescribed", 0.81),
        ("metoprolol", 0.62),  # low confidence
        ("once daily", 0.90),
    )
]

# Keep only the segments that fall under the review threshold.
flagged_segments = [item for item in asr_output if item["confidence"] < THRESHOLD]

print("Flagged segments for review:")
for flagged in flagged_segments:
    print(flagged)


Flagged segments for review:
{'text': 'metoprolol', 'confidence': 0.62}


# **Optimal Threshold:**

In [2]:
import numpy as np
from sklearn.metrics import f1_score

# Toy evaluation set: every confidence score is paired with a label saying
# whether that transcription was correct (1) or incorrect (0).
confidence_scores = np.array([0.95, 0.88, 0.72, 0.65, 0.55, 0.40])
ground_truth = np.array([1, 1, 1, 0, 0, 0])

# Candidate cut-offs, starting at 0.4 and advancing in steps of 0.05.
thresholds = np.arange(0.4, 0.95, 0.05)

# Best candidate seen so far. A candidate only replaces it when its F1 is
# strictly higher, so among ties the earliest (lowest) threshold is kept.
best_threshold, best_f1 = 0, 0

for candidate in thresholds:
    # Accept (1) every score at or above the candidate, flag (0) the rest.
    accepted = np.where(confidence_scores >= candidate, 1, 0)
    candidate_f1 = f1_score(ground_truth, accepted)
    if candidate_f1 > best_f1:
        best_threshold, best_f1 = candidate, candidate_f1

print("Best Threshold:", best_threshold)
print("Best F1 Score:", best_f1)


Best Threshold: 0.7
Best F1 Score: 1.0


# **Confidence Calibration:**

In [3]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import calibration_curve

# Ground truth for the scores below: 1 = correct, 0 = incorrect
labels = np.array([1, 1, 1, 0, 0, 0])

# Raw confidence scores from the ASR/NLP system, arranged as a single-feature
# column of shape (6, 1) for the estimator.
raw_confidence = np.array([0.95, 0.88, 0.72, 0.65, 0.55, 0.40])[:, np.newaxis]

# Platt scaling: fit a one-feature logistic regression from raw score to label.
# NOTE(review): LogisticRegression() applies L2 regularisation by default and is
# fitted and evaluated on the same six points here; that is presumably why the
# recorded calibrated values all sit close to 0.5 — confirm this is intended.
platt = LogisticRegression().fit(raw_confidence, labels)

# Column 1 of predict_proba holds the probability of the second class (label 1).
calibrated_confidence = platt.predict_proba(raw_confidence)[:, 1]

print("Raw confidence:", raw_confidence.ravel())
print("Calibrated confidence:", calibrated_confidence)


Raw confidence: [0.95 0.88 0.72 0.65 0.55 0.4 ]
Calibrated confidence: [0.52911517 0.52123685 0.50319585 0.49529752 0.48401924 0.46713587]


# **Evaluating the FP/FN Trade-off Across Thresholds:**

In [4]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Reference labels for the scores below: 1 = correct, 0 = incorrect
ground_truth = np.asarray([1, 1, 1, 0, 0, 0])

# Confidence reported by the ASR/NLP system for each transcription
confidence_scores = np.asarray([0.95, 0.88, 0.72, 0.65, 0.55, 0.40])

def evaluate_threshold(threshold):
    """Count confusion-matrix outcomes for one acceptance threshold.

    Reads the module-level ``confidence_scores`` and ``ground_truth`` arrays.
    A transcription is predicted "accept" (1) when its confidence is at or
    above ``threshold`` and "flag" (0) otherwise.

    Args:
        threshold: Minimum confidence required to accept a transcription.

    Returns:
        dict with the keys "threshold", "FP", "FN", "TP" and "TN"; the four
        counts are taken from scikit-learn's confusion matrix.
    """
    # Predict "accept" (1) if confidence >= threshold, else "flag" (0)
    predictions = (confidence_scores >= threshold).astype(int)
    # Pin the label order so the matrix is always 2x2. Without it, data in
    # which only one class occurs yields a 1x1 matrix and the four-way
    # unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(ground_truth, predictions, labels=[0, 1]).ravel()
    return {"threshold": threshold, "FP": fp, "FN": fn, "TP": tp, "TN": tn}

# Evaluate a low, a middle and a high cut-off and report each outcome.
thresholds = [0.5, 0.7, 0.85]
results = list(map(evaluate_threshold, thresholds))

for outcome in results:
    print(outcome)


{'threshold': 0.5, 'FP': np.int64(2), 'FN': np.int64(0), 'TP': np.int64(3), 'TN': np.int64(1)}
{'threshold': 0.7, 'FP': np.int64(0), 'FN': np.int64(0), 'TP': np.int64(3), 'TN': np.int64(3)}
{'threshold': 0.85, 'FP': np.int64(0), 'FN': np.int64(1), 'TP': np.int64(2), 'TN': np.int64(3)}


# **Real-Time vs. Batch Turnaround Time (TAT):**

In [5]:
import time

def transcribe(audio_length_seconds, mode="batch"):
    """Simulate a transcription job and return its measured turnaround time.

    No audio is processed: the function sleeps for a fraction of the audio
    length and reports how long that took.

    Args:
        audio_length_seconds: Length of the simulated audio, in seconds.
        mode: "realtime" sleeps for 10% of the audio length; any other value
            (including the default "batch") sleeps for 50% of it.

    Returns:
        Elapsed time in seconds, as a float.

    NOTE(review): a later cell defines another ``transcribe`` with a different
    signature; once that cell has run it shadows this function.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock and can jump if the system
    # time is adjusted while the job is running.
    start = time.perf_counter()

    if mode == "realtime":
        time.sleep(audio_length_seconds * 0.1)  # simulate streaming delay
    else:
        time.sleep(audio_length_seconds * 0.5)  # simulate batch processing

    return time.perf_counter() - start

# Run the same 30-second clip through both modes and report each turnaround time.
for label, mode in (("Real-time TAT:", "realtime"), ("Batch TAT:", "batch")):
    print(label, transcribe(30, mode), "seconds")


Real-time TAT: 3.0035552978515625 seconds
Batch TAT: 15.000099182128906 seconds


# **Measuring Latency:**

In [6]:
import time

def simulate_transcription(latency_seconds):
    """Sleep for ``latency_seconds`` and return the measured elapsed time.

    Args:
        latency_seconds: How long the simulated transcription should take,
            in seconds.

    Returns:
        Elapsed time in seconds, as a float.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    time.sleep(latency_seconds)
    return time.perf_counter() - start

# Compare a 1-second and a 10-second simulated job.
for label, seconds in (
    ("Emergency use-case latency:", 1),
    ("Batch documentation latency:", 10),
):
    print(label, simulate_transcription(seconds), "seconds")


Emergency use-case latency: 1.0000929832458496 seconds
Batch documentation latency: 10.00217342376709 seconds


# **Accuracy-speed trade-off curve:**

In [7]:
import time
import random

def transcribe(mode="fast"):
    """Simulate one transcription run and return ``(latency, accuracy)``.

    Args:
        mode: "fast" sleeps for 1 second and draws an accuracy uniformly from
            [0.85, 0.90]; any other value (e.g. "accurate") sleeps for
            3 seconds and draws from [0.93, 0.97].

    Returns:
        Tuple ``(latency, accuracy)``: measured elapsed seconds and the
        randomly drawn accuracy, both floats.

    NOTE(review): this reuses the name of the ``transcribe`` defined in an
    earlier cell with a different signature and shadows it once this cell runs.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()

    if mode == "fast":
        time.sleep(1)  # fast processing
        accuracy = random.uniform(0.85, 0.90)
    else:
        time.sleep(3)  # slower, more accurate
        accuracy = random.uniform(0.93, 0.97)

    latency = time.perf_counter() - start
    return latency, accuracy

# Seed the global RNG so the simulated accuracies are the same on every
# Restart & Run All; unseeded, each run prints different accuracy values.
random.seed(42)

fast_latency, fast_accuracy = transcribe("fast")
slow_latency, slow_accuracy = transcribe("accurate")

print("Fast mode → Latency:", fast_latency, "Accuracy:", fast_accuracy)
print("Accurate mode → Latency:", slow_latency, "Accuracy:", slow_accuracy)


Fast mode → Latency: 1.0001413822174072 Accuracy: 0.8742763420123676
Accurate mode → Latency: 3.0001702308654785 Accuracy: 0.9427962670556507
