In [None]:
import time
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# Measure all models using CPU, as quantized models run best on CPU
# NOTE(review): the measurement cells visible in this notebook pass device="cpu"
# as a literal rather than this variable — confirm whether `device` is used elsewhere.
device = torch.device("cpu")

# Measure inference time per example over several batches of examples
def measure_inference_time(model, tokenizer, texts, batch_size=8, device="cpu", warmup=3):
    """Time forward passes of ``model`` over ``texts`` and report per-sample latency.

    The texts are tokenized up front in chunks of ``batch_size`` (padded to the
    longest item of each chunk, truncated to 512 tokens), so tokenization is not
    part of the measured time. Each batch is then timed individually and its
    duration divided by the number of rows in that batch.

    Args:
        model: Module to benchmark; it is moved to ``device`` and put in eval mode.
        tokenizer: Callable that turns a list of texts into a mapping of tensors
            accepted by ``model(**batch)`` and containing an ``"input_ids"`` entry.
        texts: Sized, sliceable collection of input strings.
        batch_size: Number of texts per forward pass (must be >= 1).
        device: Device the model and the encoded batches are moved to.
        warmup: Number of untimed full passes over *all* batches run before timing.

    Returns:
        ``(mean_time, stderr)`` in seconds: the unweighted mean of the per-batch
        per-sample times and the standard error of that mean. ``stderr`` is NaN
        when only one batch was timed, because a sample standard deviation is
        undefined for a single measurement.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``texts`` is empty.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(texts) == 0:
        raise ValueError("texts must contain at least one example")

    model.to(device)
    model.eval()

    # Tokenize everything once and move the tensors to the target device so that
    # neither tokenization nor host-to-device copies are included in the timing.
    encoded_batches = []
    for i in range(0, len(texts), batch_size):
        encoded = tokenizer(texts[i:i+batch_size], padding=True, truncation=True, max_length=512, return_tensors="pt")
        encoded_batches.append({k: v.to(device) for k, v in encoded.items()})

    # Warm-up to stabilize timing
    with torch.no_grad():
        for _ in range(warmup):
            for batch in encoded_batches:
                _ = model(**batch)

    # Timed inference with per-batch tracking
    batch_times = []
    with torch.no_grad():
        for batch in encoded_batches:
            # perf_counter is monotonic and high-resolution; time.time() can jump
            # when the system clock is adjusted and is too coarse for short runs.
            start = time.perf_counter()
            _ = model(**batch)
            elapsed = time.perf_counter() - start
            batch_times.append(elapsed / batch["input_ids"].shape[0])  # time per sample in this batch

    per_sample_times = np.array(batch_times)
    mean_time = per_sample_times.mean()
    if len(per_sample_times) > 1:
        std_time = per_sample_times.std(ddof=1)  # sample std dev
        stderr = std_time / np.sqrt(len(per_sample_times))  # standard error of the mean
    else:
        # A single measurement has no sample std dev; report NaN explicitly
        # instead of triggering NumPy's degrees-of-freedom warning.
        stderr = np.float64("nan")

    print(f"✅ Processed {len(texts)} samples")
    print(f"⏱️ Avg inference time per sample: {mean_time*1000:.2f} ms ± {stderr*1000:.2f} ms (SE)")
    return mean_time, stderr

In [None]:
# Mount Google Drive at /content/drive; the cells below read the dataset and the
# model checkpoints from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from datasets import Dataset

# Read the cleaned notes table from Drive and report its (rows, columns) shape
notes_csv_path = "/content/drive/MyDrive/Colab Notebooks/Postdoc/cleaned_stripped_mimic_notes.csv"
df = pd.read_csv(notes_csv_path)
print(df.shape)

# Only the note text and its label are carried over into the Dataset
dataset = Dataset.from_pandas(df[["clean_relevant_note_truncate", "label"]])

# First cut: 70% train, 30% held out (fixed seed so the split is repeatable)
temp_split = dataset.train_test_split(test_size=0.3, seed=42)
train_dataset, temp_dataset = temp_split["train"], temp_split["test"]

# Second cut: halve the held-out part into validation and test
val_test_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
val_dataset, test_dataset = val_test_split["train"], val_test_split["test"]

(51695, 4)


In [None]:
# First 80 test notes, i.e. 10 batches at the default batch size of 8
texts = [row['clean_relevant_note_truncate'] for row in test_dataset.select(range(80))]

# Teacher: fine-tuned Bio_ClinicalBERT weights from Drive, tokenizer from the hub id
teacher_checkpoint = "/content/drive/MyDrive/Colab Notebooks/Postdoc/Bio_ClinicalBERT-fine_tuned/v3-final"
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_checkpoint)
teacher_tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

print("📏 Measuring Teacher (Bio_ClinicalBERT)...")
measure_inference_time(teacher_model, teacher_tokenizer, texts, device="cpu")

📏 Measuring Teacher (Bio_ClinicalBERT)...
✅ Processed 80 samples
⏱️ Avg inference time per sample: 1795.76 ms ± 9.40 ms (SE)


(np.float64(1.7957602798938752), np.float64(0.009400869063318789))

In [None]:
# First 800 test notes, i.e. 100 batches at the default batch size of 8
texts = [row['clean_relevant_note_truncate'] for row in test_dataset.select(range(800))]

# Student: distilled BERT-tiny checkpoint from Drive, tokenizer from the hub id
student_checkpoint = "/content/drive/MyDrive/Colab Notebooks/Postdoc/results/student/tiny-v2/checkpoint-epoch4"
student_model = AutoModelForSequenceClassification.from_pretrained(student_checkpoint)
student_tokenizer = AutoTokenizer.from_pretrained("gaunernst/bert-tiny-uncased")

print("📏 Measuring Student (BERT-tiny)...")
measure_inference_time(student_model, student_tokenizer, texts, device="cpu")

tokenizer_config.json:   0%|          | 0.00/32.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

📏 Measuring Student (BERT-tiny)...
✅ Processed 800 samples
⏱️ Avg inference time per sample: 18.00 ms ± 0.62 ms (SE)


(np.float64(0.017996739745140076), np.float64(0.0006233525929240135))

In [None]:
# Baseline: BERT-Tiny trained from scratch (no teacher).
# `texts` is not rebuilt in this cell, so it carries over from an earlier cell;
# `student_model` is rebound to this checkpoint.
no_teacher_checkpoint = "/content/drive/MyDrive/Colab Notebooks/Postdoc/BERT-tiny-student-model-no-teacher/v1"
student_model = AutoModelForSequenceClassification.from_pretrained(no_teacher_checkpoint)
student_tokenizer = AutoTokenizer.from_pretrained("gaunernst/bert-tiny-uncased")

print("📏 Measuring Student No Teacher (BERT-tiny)...")
measure_inference_time(student_model, student_tokenizer, texts, device="cpu")

📏 Measuring Student No Teacher (BERT-tiny)...
✅ Processed 800 samples
⏱️ Avg inference time per sample: 17.57 ms ± 0.38 ms (SE)


(np.float64(0.01756890833377838), np.float64(0.00037502835523347394))

In [None]:
# Quantized BERT-Tiny (which is saved differently)
# The quantized weights are restored from a state-dict file, so the float model is
# rebuilt and dynamically quantized first to obtain matching parameter names.
# `student_tokenizer` and `texts` are not created here; they come from earlier cells.

# Initialize same architecture
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Colab Notebooks/Postdoc/results/student/tiny-v2/checkpoint-epoch4")

# Apply quantization again
model_quantized = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8)

# Load weights
# map_location="cpu" keeps every restored tensor on the CPU regardless of the
# device the checkpoint was saved from, matching the CPU-only measurement.
# NOTE(review): the architecture comes from checkpoint-epoch4 while the weights
# come from checkpoint-epoch6-quantized — confirm this pairing is intended.
quantized_state = torch.load(
    "/content/drive/MyDrive/Colab Notebooks/Postdoc/results/student/tiny-v2/checkpoint-epoch6-quantized/quantized_student_model.pt",
    map_location="cpu",
)
model_quantized.load_state_dict(quantized_state)
model_quantized.eval()

# Quantized Student
print("📏 Measuring Quantized Student...")
measure_inference_time(model_quantized, student_tokenizer, texts, device="cpu")

  device=storage.device,


📏 Measuring Quantized Student...
✅ Processed 800 samples
⏱️ Avg inference time per sample: 17.67 ms ± 0.34 ms (SE)


(np.float64(0.017669521272182465), np.float64(0.0003446459803976306))

In [None]:
# Pruned BERT-Tiny checkpoint from Drive; `texts` carries over from an earlier cell
pruned_checkpoint = "/content/drive/MyDrive/Colab Notebooks/Postdoc/BERT-tiny-student-model-pruned/v1"
pruned_model = AutoModelForSequenceClassification.from_pretrained(pruned_checkpoint)
student_tokenizer = AutoTokenizer.from_pretrained("gaunernst/bert-tiny-uncased")

print("📏 Measuring Pruned (BERT-tiny)...")
measure_inference_time(pruned_model, student_tokenizer, texts, device="cpu")

📏 Measuring Pruned (BERT-tiny)...
✅ Processed 800 samples
⏱️ Avg inference time per sample: 13.04 ms ± 0.30 ms (SE)


(np.float64(0.013039761781692505), np.float64(0.0003019465856384267))

In [None]:
# Quantized pruned BERT-Tiny
# The quantized weights are restored from a state-dict file, so the pruned float
# model is rebuilt and dynamically quantized first to obtain matching parameter names.
# `student_tokenizer` and `texts` are not created here; they come from earlier cells.

# Initialize same architecture
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Colab Notebooks/Postdoc/BERT-tiny-student-model-pruned/v1")

# Apply quantization again
model_quantized = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8)

# Load weights
# map_location="cpu" keeps every restored tensor on the CPU regardless of the
# device the checkpoint was saved from, matching the CPU-only measurement.
quantized_state = torch.load(
    "/content/drive/MyDrive/Colab Notebooks/Postdoc/BERT-tiny-student-model-pruned/quantized-v1/quantized-pruned_model.pt",
    map_location="cpu",
)
model_quantized.load_state_dict(quantized_state)
model_quantized.eval()

# Quantized Pruned
# The label names the pruned variant so this run's output is distinguishable from
# the plain quantized student measured earlier.
print("📏 Measuring Quantized Pruned (BERT-tiny)...")
measure_inference_time(model_quantized, student_tokenizer, texts, device="cpu")

📏 Measuring Quantized Student...
✅ Processed 800 samples
⏱️ Avg inference time per sample: 11.77 ms ± 0.24 ms (SE)


(np.float64(0.011773796379566192), np.float64(0.00023675995658556815))