# 1. Quantize the distilled student model

In [None]:
from google.colab import drive
# Mount Google Drive; the model checkpoints and the CSV read below live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Everything in this notebook runs on CPU.
device = torch.device("cpu")

# Restore the distilled student checkpoint and put it in inference mode before
# quantizing (eval() returns the module itself, so the call can be chained).
student_model_path = "/content/drive/MyDrive/Colab Notebooks/Postdoc/results/student/tiny-v2/checkpoint-epoch6"
student_model_fp32 = AutoModelForSequenceClassification.from_pretrained(student_model_path).eval()

In [None]:
import pandas as pd
from datasets import Dataset

# Prepare dataset: read the cleaned notes and keep only the text and label columns.

df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Postdoc/cleaned_stripped_mimic_notes.csv")
print(df.shape)

dataset = Dataset.from_pandas(df[["clean_relevant_note_truncate", "label"]])

# First cut: 70% train, 30% held out (fixed seed for a reproducible split).
temp_split = dataset.train_test_split(test_size=0.3, seed=42)
train_dataset, temp_dataset = temp_split["train"], temp_split["test"]

# Second cut: halve the held-out portion into validation and test.
val_test_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
val_dataset, test_dataset = val_test_split["train"], val_test_split["test"]

(51695, 4)


In [None]:
# Dynamic post-training quantization: only nn.Linear modules are targeted and
# their weights are converted to int8; the fp32 model is passed in as the source.
student_model_quantized = torch.quantization.quantize_dynamic(
    model=student_model_fp32,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

print(student_model_quantized)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): DynamicQuantizedLinear(in_features=128, out_features=128, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (key): DynamicQuantizedLinear(in_features=128, out_features=128, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (value): DynamicQuantizedLinear(in_features=128, out_features=128, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
# Save quantized model
import os

# Only the state_dict is written. NOTE(review): reloading it presumably requires
# rebuilding the same quantized architecture first (load the fp32 checkpoint and
# apply quantize_dynamic again) before calling load_state_dict -- confirm.
quantized_model_path = student_model_path + "-quantized/"

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing into it.
os.makedirs(quantized_model_path, exist_ok=True)
torch.save(
    student_model_quantized.state_dict(),
    os.path.join(quantized_model_path, "quantized_student_model.pt"),
)

In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Function to evaluate model on test set

def evaluate_model_on_test_set_raw(
    model,
    test_dataset,
    tokenizer_name="gaunernst/bert-tiny-uncased",
    batch_size=8,
    text_column="clean_relevant_note_truncate",
    label_column="label",
    max_length=512,
):
    """Evaluate a binary sequence classifier on an untokenized test set.

    Texts are tokenized batch by batch inside the DataLoader's collate
    function, the model is run under ``torch.no_grad()``, and the metrics are
    computed with scikit-learn from the argmax predictions and the class-1
    softmax probabilities. Results are printed and also returned.

    Args:
        model: Model whose forward pass accepts the tokenizer output as keyword
            arguments and returns an object exposing ``.logits``.
        test_dataset: Indexable dataset whose items are mappings containing
            ``text_column`` and ``label_column``.
        tokenizer_name: Name or path given to ``AutoTokenizer.from_pretrained``.
        batch_size: Number of examples per evaluation batch.
        text_column: Key of the raw text field in each example.
        label_column: Key of the label field in each example.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``,
        ``"f1"``, ``"auroc"`` and ``"confusion_matrix"``.

    Notes:
        * The model and every batch are moved to the module-level ``device``
          variable, which must be defined before this function is called.
        * Precision/recall/F1 are called with scikit-learn's defaults and AUROC
          uses column 1 of the softmax output, so labels are assumed to be 0/1.
    """
    # Imported locally so that defining/calling this function does not depend on
    # a later notebook cell having been executed first.
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Create DataLoader that tokenizes on-the-fly
    def collate_fn(batch):
        texts = [x[text_column] for x in batch]
        labels = torch.tensor([x[label_column] for x in batch])
        # padding=True pads each batch to its own longest sequence.
        encodings = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        return encodings, labels

    # shuffle=False keeps predictions aligned with the dataset order.
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # Move model to device
    model.eval()
    model.to(device)

    all_logits = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            inputs, labels = batch
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)

            outputs = model(**inputs)
            logits = outputs.logits

            # Collect on CPU so the final concatenation/metrics run there.
            all_logits.append(logits.cpu())
            all_labels.append(labels.cpu())

    # Concatenate results
    logits = torch.cat(all_logits, dim=0)
    labels = torch.cat(all_labels, dim=0)
    probs = torch.nn.functional.softmax(logits, dim=1)
    preds = torch.argmax(probs, dim=1).numpy()

    # Compute metrics
    labels_np = labels.numpy()
    acc = accuracy_score(labels_np, preds)
    prec = precision_score(labels_np, preds)
    rec = recall_score(labels_np, preds)
    f1 = f1_score(labels_np, preds)
    # AUROC is threshold-free, so it is fed the class-1 probability, not the hard prediction.
    auc = roc_auc_score(labels_np, probs[:, 1].numpy())
    cm = confusion_matrix(labels_np, preds)

    # Display results
    print("🔍 Test Set Evaluation:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"AUROC:     {auc:.4f}")
    print("\nConfusion Matrix:")
    print(cm)

    print("\nDetailed Classification Report:")
    print(classification_report(labels_np, preds, digits=4))

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "auroc": auc,
        "confusion_matrix": cm
    }


In [None]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

# Tokenizer for the checkpoint name that is also handed to the evaluation helper below
tokenizer = AutoTokenizer.from_pretrained("gaunernst/bert-tiny-uncased")

# Put the in-memory quantized model in inference mode and keep it on CPU
# (both calls return the module, so they can be chained).
student_model_quantized.eval().to("cpu")

# Score the quantized model on the held-out test split
results = evaluate_model_on_test_set_raw(
    model=student_model_quantized,
    test_dataset=test_dataset,
    tokenizer_name="gaunernst/bert-tiny-uncased",
)

print("📊 Quantized model performance:")
for metric_name, metric_value in results.items():
    # Scalar metrics get four decimals; anything else (the confusion matrix) is printed as-is.
    if isinstance(metric_value, float):
        print(f"{metric_name}: {metric_value:.4f}")
    else:
        print(f"{metric_name}: {metric_value}")

Evaluating: 100%|██████████| 970/970 [01:32<00:00, 10.47it/s]

🔍 Test Set Evaluation:
Accuracy:  0.7812
Precision: 0.6243
Recall:    0.7280
F1 Score:  0.6722
AUROC:     0.8402

Confusion Matrix:
[[4318 1047]
 [ 650 1740]]

Detailed Classification Report:
              precision    recall  f1-score   support

           0     0.8692    0.8048    0.8358      5365
           1     0.6243    0.7280    0.6722      2390

    accuracy                         0.7812      7755
   macro avg     0.7467    0.7664    0.7540      7755
weighted avg     0.7937    0.7812    0.7854      7755

📊 Quantized model performance:
accuracy: 0.7812
precision: 0.6243
recall: 0.7280
f1: 0.6722
auroc: 0.8402
confusion_matrix: [[4318 1047]
 [ 650 1740]]





# 2. Quantize the pruned student model

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Everything in this notebook runs on CPU.
device = torch.device("cpu")

# Restore the pruned student checkpoint and put it in inference mode before
# quantizing (eval() returns the module itself, so the call can be chained).
student_model_path = "/content/drive/MyDrive/Colab Notebooks/Postdoc/BERT-tiny-student-model-pruned/v1"
student_model_fp32 = AutoModelForSequenceClassification.from_pretrained(student_model_path).eval()

In [None]:
# Dynamic post-training quantization of the pruned model: only nn.Linear modules
# are targeted and their weights are converted to int8.
student_model_quantized = torch.quantization.quantize_dynamic(
    model=student_model_fp32,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

print(student_model_quantized)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): DynamicQuantizedLinear(in_features=128, out_features=64, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (key): DynamicQuantizedLinear(in_features=128, out_features=64, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (value): DynamicQuantizedLinear(in_features=128, out_features=64, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (dropout): Dropout(p=0.1, inplace=False)
            )

In [None]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

# Tokenizer for the checkpoint name that is also handed to the evaluation helper below
tokenizer = AutoTokenizer.from_pretrained("gaunernst/bert-tiny-uncased")

# Put the in-memory quantized pruned model in inference mode and keep it on CPU
# (both calls return the module, so they can be chained).
student_model_quantized.eval().to("cpu")

# Score the quantized pruned model on the held-out test split
results = evaluate_model_on_test_set_raw(
    model=student_model_quantized,
    test_dataset=test_dataset,
    tokenizer_name="gaunernst/bert-tiny-uncased",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/32.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Evaluating: 100%|██████████| 970/970 [01:31<00:00, 10.64it/s]


🔍 Test Set Evaluation:
Accuracy:  0.7683
Precision: 0.6216
Recall:    0.6343
F1 Score:  0.6279
AUROC:     0.8108

Confusion Matrix:
[[4442  923]
 [ 874 1516]]

Detailed Classification Report:
              precision    recall  f1-score   support

           0     0.8356    0.8280    0.8318      5365
           1     0.6216    0.6343    0.6279      2390

    accuracy                         0.7683      7755
   macro avg     0.7286    0.7311    0.7298      7755
weighted avg     0.7696    0.7683    0.7689      7755



In [None]:
# Save quantized pruned model
import os

quantized_pruned_model_file = "/content/drive/MyDrive/Colab Notebooks/Postdoc/BERT-tiny-student-model-pruned/quantized-v1/quantized-pruned_model.pt"

# torch.save does not create missing parent directories, so make sure the
# "quantized-v1" folder exists before writing into it.
os.makedirs(os.path.dirname(quantized_pruned_model_file), exist_ok=True)
torch.save(student_model_quantized.state_dict(), quantized_pruned_model_file)