This notebook shows how to quantize models with bitsandbytes, AWQ, GPTQ, and AutoRound.

All of these quantization methods run on consumer hardware: none of them requires a GPU with more than 24 GB of VRAM.


# AutoRound

In [None]:
!pip install --upgrade transformers auto-round

In [None]:
# Quantize the model with AutoRound and write the result to disk.
import torch
from auto_round import AutoRound
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Load the tokenizer and the model (weights requested in float16).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# Quantization settings passed to AutoRound below.
bits = 4
group_size = 128
sym = True

autoround = AutoRound(
    model,
    tokenizer,
    bits=bits,
    group_size=group_size,
    batch_size=2,
    seqlen=512,
    sym=sym,
    gradient_accumulate_steps=4,
    device='cuda',
)

# Run the quantization, then save the quantized model.
autoround.quantize()

output_dir = "./AutoRound/GPTQ-sym/"
autoround.save_quantized(output_dir)

In [None]:
import os

# Read the Hugging Face access token from the environment instead of pasting
# it into the notebook. If HF_TOKEN is unset, None is passed to push_to_hub.
hub_token = os.environ.get("HF_TOKEN")
repo_id = "Meta-Llama-3.1-8B-Instruct-autoround-4bit-sym"

# NOTE(review): this uploads the in-memory `model` object, not the files
# written by `autoround.save_quantized(...)` — confirm this is the checkpoint
# you intend to publish.
model.push_to_hub(repo_id, token=hub_token)
tokenizer.push_to_hub(repo_id, token=hub_token)

# GPTQ

In [None]:

!pip install --upgrade auto-gptq accelerate datasets optimum
!pip install --upgrade transformers


In [None]:
# Quantize the model with GPTQ (through Optimum) and save it locally.
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.gptq import GPTQQuantizer
import torch

model_path = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
w = 4 #quantization to 4-bit. Change to 2, 3, or 8 to quantize with another precision

quant_path = 'Meta-Llama-3.1-8B-Instruct-gptq-'+str(w)+'bit'
# Single output directory shared by the model and the tokenizer.
save_dir = "./GPTQ/" + quant_path

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")

# Quantize using the "c4" dataset setting and a model sequence length of 2048.
quantizer = GPTQQuantizer(bits=w, dataset="c4", model_seqlen = 2048)
quantized_model = quantizer.quantize_model(model, tokenizer)

# `safe_serialization` is the save_pretrained argument that selects safetensors.
quantized_model.save_pretrained(save_dir, safe_serialization=True)
tokenizer.save_pretrained(save_dir)

# bitsandbytes



In [None]:
!pip install -U bitsandbytes


In [None]:
# Load the model with a 4-bit bitsandbytes configuration and save the result.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Use bfloat16 as the compute dtype when the GPU reports support; else float16.
if torch.cuda.is_bf16_supported():
  compute_dtype = torch.bfloat16
else:
  compute_dtype = torch.float16

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
quant_path = 'Meta-Llama-3.1-8B-Instruct-bnb-4bit'
# Single output directory shared by the model and the tokenizer.
save_dir = "./BnB/" + quant_path

tokenizer = AutoTokenizer.from_pretrained(model_name)
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)

# The quantization config is applied while the checkpoint is being loaded.
model = AutoModelForCausalLM.from_pretrained(
          model_name, quantization_config=bnb_config
)

# `safe_serialization` is the save_pretrained argument that selects safetensors.
model.save_pretrained(save_dir, safe_serialization=True)
tokenizer.save_pretrained(save_dir)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

('./BnB/Meta-Llama-3.1-8B-Instruct-bnb-4bit/tokenizer_config.json',
 './BnB/Meta-Llama-3.1-8B-Instruct-bnb-4bit/special_tokens_map.json',
 './BnB/Meta-Llama-3.1-8B-Instruct-bnb-4bit/tokenizer.json')

# AWQ

In [None]:
!pip install --upgrade autoawq optimum accelerate torch
!pip install --upgrade transformers

In [None]:
# Quantize the model with AutoAWQ and save it locally.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
quant_path = 'Meta-Llama-3.1-8B-Instruct-awq-4bit'
save_dir = "./AWQ/" + quant_path

# Settings handed to model.quantize() below.
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}

# Load the tokenizer and the model.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
model = AutoAWQForCausalLM.from_pretrained(
    model_path,
    safetensors=True,
    device_map='cuda',
)

# Run the quantization.
model.quantize(tokenizer, quant_config=quant_config)

# Write the quantized model (safetensors) and the tokenizer to the same directory.
model.save_quantized(save_dir, safetensors=True)
tokenizer.save_pretrained(save_dir)

