# Quantized vs. FP16 Accuracy Check

This streamlined notebook compares a standard FP16 model against an 8-bit quantized variant on a small IMDB sentiment subset.


In [1]:
import torch
from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from sklearn.metrics import accuracy_score, classification_report
from typing import Dict

torch.set_grad_enabled(False)


  from .autonotebook import tqdm as notebook_tqdm


torch.autograd.grad_mode.set_grad_enabled(mode=False)

In [2]:
# Load a manageable IMDB subset for quick experimentation
sample_fraction = 0.02
raw_datasets = load_dataset("imdb")
imdb_small = DatasetDict({
    split: ds.shuffle(seed=42).select(list(range(max(1, int(len(ds) * sample_fraction)))))
    for split, ds in raw_datasets.items()
})

for split, ds in imdb_small.items():
    print(f"{split}: {len(ds)} examples")


train: 500 examples
test: 500 examples
unsupervised: 1000 examples


In [3]:
def get_device() -> torch.device:
    return torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def get_gpu_memory_mb() -> float:
    if not torch.cuda.is_available():
        return 0.0
    torch.cuda.empty_cache()
    return torch.cuda.memory_allocated() / 1024 ** 2

def evaluate_model(model: AutoModelForSequenceClassification, dataset, tokenizer, description: str, batch_size: int = 32) -> Dict[str, float]:
    model.eval()
    device = get_device()

    all_preds = []
    all_labels = []

    for start in range(0, len(dataset), batch_size):
        batch = dataset[start:start + batch_size]
        inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True)
        labels = torch.tensor(batch["label"], dtype=torch.long)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = labels.to(device)

        with torch.inference_mode():
            logits = model(**inputs).logits
        preds = logits.argmax(dim=-1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, digits=4)

    print(f"\n{description} accuracy: {accuracy:.4f}")
    print(report)
    return {
        "accuracy": accuracy,
        "report": report,
        "predictions": all_preds,
        "labels": all_labels,
    }


In [4]:
device = get_device()
model_name = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"Using device: {device}")
base_before = get_gpu_memory_mb()
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
base_model.to(device)
base_after = get_gpu_memory_mb()
print(f"Base model memory footprint: {base_after - base_before:.2f} MB")

quant_model = None
quant_before = quant_after = 0.0
if device.type == "cuda":
    try:
        quant_config = BitsAndBytesConfig(load_in_8bit=True)
        quant_before = get_gpu_memory_mb()
        quant_model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,
            quantization_config=quant_config,
            device_map="auto",
        )
        quant_after = get_gpu_memory_mb()
        print(f"Quantized model memory footprint: {quant_after - quant_before:.2f} MB")
    except Exception as exc:
        quant_model = None
        print(f"Quantized loading failed: {exc}")
else:
    print("Quantized loading skipped (requires CUDA + bitsandbytes).")


`torch_dtype` is deprecated! Use `dtype` instead!


Using device: cuda


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model memory footprint: 295.21 MB


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Quantized model memory footprint: 180.44 MB


In [5]:
test_dataset = imdb_small["test"]
base_results = evaluate_model(base_model, test_dataset, tokenizer, "FP16 base model")

if quant_model is not None:
    quant_results = evaluate_model(quant_model, test_dataset, tokenizer, "8-bit quantized model")
    delta = quant_results["accuracy"] - base_results["accuracy"]
    print(f"Accuracy delta (quantized - base): {delta:+.4f}")
else:
    print("Quantized model was not created, so only the FP16 baseline is reported.")



FP16 base model accuracy: 0.5140
              precision    recall  f1-score   support

           0     0.5112    0.9882    0.6738       254
           1     0.6667    0.0244    0.0471       246

    accuracy                         0.5140       500
   macro avg     0.5889    0.5063    0.3604       500
weighted avg     0.5877    0.5140    0.3655       500


8-bit quantized model accuracy: 0.4720
              precision    recall  f1-score   support

           0     0.4038    0.0827    0.1373       254
           1     0.4799    0.8740    0.6196       246

    accuracy                         0.4720       500
   macro avg     0.4419    0.4783    0.3784       500
weighted avg     0.4413    0.4720    0.3746       500

Accuracy delta (quantized - base): -0.0420
