In [5]:
import torch

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [14]:
from transformers import BitsAndBytesConfig, AutoModelForCausalLM

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
)



In [7]:
from optibits.loader import load_model_and_tokenizer
# Test with different models
model, tokenizer = load_model_and_tokenizer("gpt2", device)

Using pad_token, but it is not set yet.


In [12]:
import bitsandbytes as bnb

def apply_quantization(model, quant_type="4bit"):
    """
    Applies quantization to a model **after** loading.
    
    Args:
        model (torch.nn.Module): The preloaded model to quantize.
        quant_type (str): "8bit", "4bit", or "none".
    
    Returns:
        Quantized model
    """
    if quant_type == "8bit":
        print("[INFO] Applying 8-bit quantization...")
        model = bnb.nn.Linear8bitLt.convert(model)
    elif quant_type == "4bit":
        print("[INFO] Applying 4-bit quantization...")
        model = bnb.nn.Linear4bit.convert(model)
    else:
        print("[INFO] No quantization applied.")
    return model



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: Required library version not found: libsbitsandbytes_cpu.so. Maybe you need to compile it from source?
CUDA SETUP: Defaulting to libbitsandbytes_cpu.so...
dlopen(/Users/stevengong/mambaforge/envs/etched/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so, 0x0006): tried: '/Users/stevengong/mambaforge/envs/etched/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so' (not a mach-o file), '/System/Volumes/Preboot/Cryptexes/OS/Users/stevengong/mambaforge/envs/etched/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so' (no such file), '/Users/stevengong/mambaforge/envs/etched/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so' (not a mach-o file)
CUDA SETUP: Required library version not found: libsbitsandbytes_cpu.so. Maybe you need to compile it from source?
CUDA SETUP: Defaulting to libbitsandbytes_

  warn("The installed version of bitsandbytes was compiled without GPU support. "


## Benchmark

In [13]:
apply_quantization(model, "4bit")

[INFO] Applying 4-bit quantization...


In [8]:
from optibits.benchmark import benchmark_latency
benchmark_latency(model, tokenizer, device=device, fp16=False)  # FP32 baseline
# benchmark_latency(model, tokenizer, device=device, fp16=True)  # FP16 optimization

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: gpt2 | Batch Size: 8 | FP16: False | First Inference Time: 1.7614s | Average Inference Time: 1.6601s


### Eval

In [9]:
from datasets import load_dataset

# Load Wikitext-2 dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
texts = dataset["text"]  # Extract text samples

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
from optibits.eval import calculate_perplexity, evaluate_multiple_choice_dataset, format_prompt_mmlu
# Run the evaluation
perplexity = calculate_perplexity(model, tokenizer, texts[:100])  # Sample 100 texts
print(f"GPT-2 Perplexity: {perplexity:.2f}")

GPT-2 Perplexity: 2941.86


In [15]:
evaluate_multiple_choice_dataset(
    model, tokenizer,
    dataset_name="cais/mmlu",
    subject="all",
    split="test",
    num_samples=100,
    format_prompt=format_prompt_mmlu
)

Evaluating cais/mmlu:   0%|          | 0/7 [01:22<?, ?it/s]
