# Quantized vs. FP16 Accuracy Check

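This streamlined notebook fine-tunes ModernBERT on a small IMDB sentiment subset, then compares the fine-tuned half-precision model (BF16 when supported, otherwise FP16) against an 8-bit quantized variant of the same checkpoint.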


In [None]:
import torch
from datasets import DatasetDict, load_dataset
from pathlib import Path
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import accuracy_score, classification_report
from typing import Dict



In [None]:
# Keep only a small, reproducibly shuffled slice of every IMDB split so the
# rest of the notebook runs quickly.
sample_fraction = 0.02
raw_datasets = load_dataset("imdb")

subsampled = {}
for split, ds in raw_datasets.items():
    # Always keep at least one example, even for a tiny split.
    keep = max(1, int(len(ds) * sample_fraction))
    subsampled[split] = ds.shuffle(seed=42).select(range(keep))
imdb_small = DatasetDict(subsampled)

for split, ds in imdb_small.items():
    print(f"{split}: {len(ds)} examples")


In [None]:
# Hold out 20% of the sampled training data for validation; the fixed seed
# keeps the split reproducible across runs.
train_split = imdb_small['train'].train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = train_split['train'], train_split['test']

print(f"Train subset: {len(train_dataset)} examples")
print(f"Validation subset: {len(eval_dataset)} examples")


In [None]:
def get_device() -> torch.device:
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(backend)

def get_gpu_memory_mb() -> float:
    """Return the value of ``torch.cuda.memory_allocated()`` converted to MiB.

    Returns 0.0 when CUDA is unavailable. When CUDA is available, the CUDA
    cache is emptied before the reading is taken.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        allocated_bytes = torch.cuda.memory_allocated()
        return allocated_bytes / 1024 ** 2
    return 0.0

def evaluate_model(model: AutoModelForSequenceClassification, dataset, tokenizer, description: str, batch_size: int = 32) -> Dict[str, float]:
    """Run batched inference over ``dataset`` and print/return classification quality.

    Args:
        model: Sequence-classification model to evaluate; it is put in eval mode.
        dataset: Sliceable dataset whose slices expose ``"text"`` and ``"label"``
            columns.
        tokenizer: Tokenizer used to encode each text batch (padded, truncated,
            returned as PyTorch tensors).
        description: Label printed in front of the accuracy line.
        batch_size: Number of examples per forward pass.

    Returns:
        A dict with ``"accuracy"`` (float), ``"report"`` (the text produced by
        ``classification_report``), and ``"predictions"`` / ``"labels"`` (lists
        of ints). NOTE: the return annotation only matches the ``"accuracy"``
        entry; it is kept as-is so the signature stays unchanged.
    """
    model.eval()

    # Feed inputs to the device the model's weights actually live on instead of
    # guessing from global CUDA availability; this avoids a device mismatch when
    # a CPU-resident model is evaluated on a CUDA-capable host. Fall back to the
    # global choice when the model exposes no parameters or they are placeholders
    # on the "meta" device.
    first_param = next(model.parameters(), None)
    if first_param is None or first_param.device.type == "meta":
        device = get_device()
    else:
        device = first_param.device

    all_preds = []
    all_labels = []

    for start in range(0, len(dataset), batch_size):
        batch = dataset[start:start + batch_size]
        inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.inference_mode():
            logits = model(**inputs).logits
        preds = logits.argmax(dim=-1)

        all_preds.extend(preds.cpu().tolist())
        # Labels are never fed to the model, so they stay on the host.
        all_labels.extend(int(label) for label in batch["label"])

    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, digits=4)

    print(f"\n{description} accuracy: {accuracy:.4f}")
    print(report)
    return {
        "accuracy": accuracy,
        "report": report,
        "predictions": all_preds,
        "labels": all_labels,
    }


In [None]:
# Pick the checkpoint, resolve the compute device, and fetch the tokenizer.
model_name = "answerdotai/ModernBERT-base"
device = get_device()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# `is_bf16_supported` is looked up defensively and only called when CUDA is
# available; a missing attribute counts as "not supported".
bf16_probe = getattr(torch.cuda, 'is_bf16_supported', lambda: False)
supports_bf16 = bool(torch.cuda.is_available() and bf16_probe())
print(f"Using device: {device}")
print(f"bfloat16 support: {supports_bf16}")

# Read allocated GPU memory before loading and after moving the model to the
# device; the difference is reported as the model's footprint.
base_before = get_gpu_memory_mb()
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
base_model.to(device)
base_after = get_gpu_memory_mb()
print(f"Base model memory footprint: {base_after - base_before:.2f} MB")


In [None]:
def tokenize_batch(examples):
    """Tokenize the ``text`` column of a batch with truncation enabled.

    No padding is requested here; uses the module-level ``tokenizer``.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)

# Tokenize both subsets and drop the raw text column; padding is left to the
# data collator so each batch is padded on its own.
tokenized_train = train_dataset.map(
    tokenize_batch,
    batched=True,
    remove_columns=['text'],
)
tokenized_eval = eval_dataset.map(
    tokenize_batch,
    batched=True,
    remove_columns=['text'],
)

# Expose only the model inputs and the label, as torch tensors.
columns = ['input_ids', 'attention_mask', 'label']
tokenized_train.set_format(type='torch', columns=columns)
tokenized_eval.set_format(type='torch', columns=columns)

# On CUDA, pad sequence lengths up to a multiple of 8; no rounding on CPU.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=8 if device.type == 'cuda' else None,
)

# Mixed precision: prefer bf16 when the GPU supports it, otherwise fp16 on any
# CUDA device, otherwise full precision.
supports_bf16 = bool(torch.cuda.is_available() and getattr(torch.cuda, 'is_bf16_supported', lambda: False)())
training_args = TrainingArguments(
    output_dir='modernbert-imdb-training',
    # `evaluation_strategy` was renamed to `eval_strategy`; the old keyword is
    # deprecated and rejected by newer transformers releases, while every
    # release that ships ModernBERT accepts the new name.
    eval_strategy='epoch',
    save_strategy='epoch',  # must match the eval strategy for load_best_model_at_end
    logging_steps=10,
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    load_best_model_at_end=True,
    bf16=supports_bf16,
    fp16=bool(torch.cuda.is_available() and not supports_bf16),
    report_to=[],  # no external experiment trackers
)

def compute_metrics(eval_pred):
    """Return ``{'accuracy': ...}`` for a ``(predictions, labels)`` pair.

    The predicted class is the argmax of ``predictions`` over its last axis.
    """
    scores, references = eval_pred
    predicted_classes = scores.argmax(axis=-1)
    return {'accuracy': accuracy_score(references, predicted_classes)}

# Assemble the Trainer. The tokenizer is passed as `processing_class`: the
# older `tokenizer=` keyword is deprecated in favour of it, and every
# transformers release that supports ModernBERT accepts the new name.
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune, then score the validation subset once more after training.
train_result = trainer.train()
print(f'Training metrics: {train_result.metrics}')
eval_metrics = trainer.evaluate()
print(f'Validation metrics: {eval_metrics}')

# Persist model and tokenizer together so later cells can reload the
# checkpoint from this directory alone.
finetuned_dir = Path('modernbert-imdb-finetuned')
trainer.save_model(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)
print(f'Saved fine-tuned model to {finetuned_dir.resolve()}')


In [None]:
import gc

# Drop the training-time objects so their memory can be reclaimed before the
# comparison models are loaded. The names are popped from the module namespace
# rather than removed with `del`, so re-running this cell after they are
# already gone does not raise NameError.
for _name in ('train_result', 'trainer', 'tokenized_train', 'tokenized_eval', 'base_model'):
    globals().pop(_name, None)
gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f'Cleared GPU memory. Current allocation: {get_gpu_memory_mb():.2f} MB')
else:
    print('No GPU available; nothing to clear.')


In [None]:
# Reload the fine-tuned checkpoint from disk so both variants are built from
# the same saved weights.
finetuned_dir = Path('modernbert-imdb-finetuned')
device = get_device()
tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)

supports_bf16 = bool(torch.cuda.is_available() and getattr(torch.cuda, 'is_bf16_supported', lambda: False)())
# Precision of the floating-point baseline: bf16 when supported, otherwise
# fp16 on CUDA, otherwise fp32.
if supports_bf16:
    ft_dtype = torch.bfloat16
elif device.type == 'cuda':
    ft_dtype = torch.float16
else:
    ft_dtype = torch.float32

# Floating-point baseline, with allocated GPU memory read before and after.
ft_before = get_gpu_memory_mb()
ft_model = AutoModelForSequenceClassification.from_pretrained(
    finetuned_dir, torch_dtype=ft_dtype, num_labels=2
)
ft_model.to(device)
ft_after = get_gpu_memory_mb()
print(f'Fine-tuned FP model memory footprint: {ft_after - ft_before:.2f} MB')

# 8-bit variant: attempted only on CUDA, and treated as best-effort so that a
# loading failure leaves `quant_model` as None instead of aborting the cell.
quant_model = None
quant_before = quant_after = 0.0
if device.type != 'cuda':
    print('Quantized loading skipped (requires CUDA + bitsandbytes).')
else:
    try:
        quant_config = BitsAndBytesConfig(load_in_8bit=True)
        quant_before = get_gpu_memory_mb()
        quant_model = AutoModelForSequenceClassification.from_pretrained(
            finetuned_dir,
            quantization_config=quant_config,
            device_map='auto',
        )
        quant_after = get_gpu_memory_mb()
        print(f'Quantized model memory footprint: {quant_after - quant_before:.2f} MB')
    except Exception as exc:
        quant_model = None
        print(f'Quantized loading failed: {exc}')


In [None]:
# Score the floating-point baseline on the sampled test split, then the
# quantized model (if one was created), and report the accuracy difference.
test_dataset = imdb_small['test']
fp_results = evaluate_model(ft_model, test_dataset, tokenizer, 'Fine-tuned FP model')

if quant_model is None:
    print('Quantized model was not created, so only the fine-tuned FP baseline is reported.')
else:
    quant_results = evaluate_model(quant_model, test_dataset, tokenizer, 'Fine-tuned 8-bit quantized model')
    delta = quant_results['accuracy'] - fp_results['accuracy']
    print(f'Accuracy delta (quantized - fine-tuned FP): {delta:+.4f}')
