## TinyLlama Multi-bit GPTQ Quantization - Jupyter Notebook
### This notebook will save the original model locally and create 8-bit, 4-bit, and 2-bit quantized versions

In [15]:
# Notebook setup: standard-library imports, environment flags, warning filter,
# then the third-party libraries used by the cells below.
import os
import torch
import warnings

# Set these BEFORE importing GPTQModel
# NOTE(review): `torch` is already imported above, i.e. before these flags are
# written. If any of them has to be visible when torch itself is imported,
# the first `import torch` should move below this block - TODO confirm.
os.environ['DISABLE_TRITON'] = '1'
os.environ['TORCHDYNAMO_DISABLE'] = '1'
os.environ['TORCH_COMPILE_DISABLE'] = '1'
os.environ['PYTORCH_DISABLE_DYNAMO'] = '1'

# Suppress warnings
# NOTE(review): this silences every Python warning category for the whole kernel.
warnings.filterwarnings('ignore')

# `torch` and `os` are imported a second time here; re-importing an already
# loaded module only rebinds the same name.
import torch
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
# NOTE(review): logging, json, datetime and shutil are not referenced by the
# cells visible in this notebook - verify before removing.
import logging
import os
import json
from datetime import datetime
import shutil

In [12]:
# Environment report: PyTorch build, CUDA visibility and, when CUDA is
# available, the name of device 0.
print("PyTorch version: {}".format(torch.__version__))
print("CUDA available: {}".format(torch.cuda.is_available()))
if torch.cuda.is_available():
    print("GPU: {}".format(torch.cuda.get_device_name(0)))

PyTorch version: 2.7.1+cu128
CUDA available: True
GPU: Quadro RTX 3000





### Model configuration


In [16]:
# --- Locations ---------------------------------------------------------------
# Source checkpoint identifier and the local folder layout: one sub-folder for
# the untouched model plus one per quantized variant.
model_id = "TinyLlama/TinyLlama_v1.1"
base_output_dir = "TinyLlama-models"
original_model_dir = f"{base_output_dir}/original"
output_dir_8 = f"{base_output_dir}/TinyLlama-gptqmodel-8bit"
output_dir_4 = f"{base_output_dir}/TinyLlama-gptqmodel-4bit"
output_dir_2 = f"{base_output_dir}/TinyLlama-gptqmodel-2bit"

# --- Quantization settings ---------------------------------------------------
# The 8-bit and 4-bit variants share group_size=128.
quantize_config_8 = QuantizeConfig(bits=8, group_size=128)
quantize_config_4 = QuantizeConfig(bits=4, group_size=128)
# The 2-bit variant uses a smaller group (64) than the other two.
quantize_config_2 = QuantizeConfig(bits=2, group_size=64)

# --- Echo the resolved configuration -----------------------------------------
print("Configuration:")
print("Source model:", model_id)
print("Base output directory:", base_output_dir)
print("Q8 output directory:", output_dir_8)
print("Q4 output directory:", output_dir_4)
print("Q2 output directory:", output_dir_2)

# --- Create the base folder and each quantized-output folder -----------------
# original_model_dir is not created here.
for _out_dir in (base_output_dir, output_dir_8, output_dir_4, output_dir_2):
    os.makedirs(_out_dir, exist_ok=True)


Configuration:
Source model: TinyLlama/TinyLlama_v1.1
Base output directory: TinyLlama-models
Q8 output directory: TinyLlama-models/TinyLlama-gptqmodel-8bit
Q4 output directory: TinyLlama-models/TinyLlama-gptqmodel-4bit
Q2 output directory: TinyLlama-models/TinyLlama-gptqmodel-2bit


In [17]:
# Tokenizer for the source checkpoint; when it defines no padding token,
# reuse the end-of-sequence token for padding.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Source checkpoint, requested as float16 with automatic device placement.
original_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# Write model weights (safetensors, shards capped at 3.5GB) and the tokenizer
# into the same local folder.
print(f"Saving original model to: {original_model_dir}")
original_model.save_pretrained(
    original_model_dir,
    safe_serialization=True,
    max_shard_size="3.5GB",
)
tokenizer.save_pretrained(original_model_dir)

# On-disk footprint: sum of the regular files directly inside the folder
# (sub-directories are not descended into).
original_size = sum(
    entry.stat().st_size
    for entry in os.scandir(original_model_dir)
    if entry.is_file()
)

print("Original model saved!")
print("Original model size: {:.2f} GB".format(original_size / (1024**3)))

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Saving original model to: TinyLlama-models/original
Original model saved!
Original model size: 2.05 GB


In [18]:
def QuantizeModelandSaveP(quantization_config, output_dir, calibration_texts=None):
    """Quantize the locally saved original model with GPTQ and save the result.

    The model is loaded from the notebook-level ``original_model_dir`` with the
    given quantization config, calibrated on the tokenized ``calibration_texts``
    and written to ``output_dir``.

    Args:
        quantization_config: Config object handed to ``GPTQModel.load``.
        output_dir: Folder passed to ``model.save``.
        calibration_texts: Optional calibration corpus, either a single string
            or an iterable of strings. When omitted, the original single demo
            sentence is used, so existing two-argument calls behave as before.
            The library logs captured in this notebook warn that the
            calibration set should hold more than 256 samples with an average
            length above 256 tokens, so pass a real corpus for usable quality.

    Returns:
        The tokenizer loaded from ``original_model_dir``.

    Raises:
        ValueError: If ``calibration_texts`` is given but contains no texts.
    """
    if calibration_texts is None:
        # Backward-compatible default: the single sentence used originally.
        calibration_texts = [
            "gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
        ]
    elif isinstance(calibration_texts, str):
        # A lone string is one sample, not an iterable of characters.
        calibration_texts = [calibration_texts]
    else:
        calibration_texts = list(calibration_texts)

    # Fail before the (expensive) model load rather than inside quantize().
    if not calibration_texts:
        raise ValueError("calibration_texts must contain at least one text")

    model = GPTQModel.load(original_model_dir, quantization_config)
    tokenizer = AutoTokenizer.from_pretrained(original_model_dir, use_fast=True)

    # Each sample is the tokenizer output for one text.
    calibration_dataset = [tokenizer(text) for text in calibration_texts]

    model.quantize(calibration_dataset)
    model.save(output_dir)

    return tokenizer

In [9]:
# Build the 8-bit GPTQ checkpoint and write it to output_dir_8.
QuantizeModelandSaveP(quantization_config=quantize_config_8, output_dir=output_dir_8)

[32mINFO[0m  Estimated Quantization BPW (bits per weight): 8.31875 bpw, based on [bits: 8, group_size: 128]
[32mINFO[0m  Loader: Auto dtype (native float16): `torch.float16`                     


INFO:tokenicer.tokenicer:Tokenicer: Auto fixed pad_token_id=0 (token='<unk>').


[32mINFO[0m  Model: Loaded `generation_config`: GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "max_length": 2048,
  "pad_token_id": 0
}

[32mINFO[0m  Kernel: loaded -> `[]`                                                   
[32mINFO[0m  Packing Kernel: Auto-selection: adding candidate `TorchQuantLinear`      
[33mWARN[0m  Calibration dataset size should be more than 256. Current: 1.            
[33mWARN[0m  The average length of input_ids of calibration_dataset should be greater than 256: actual avg: 31.0.
[32mINFO[0m  Process: progress logs for `gptq` will be streamed to file: `gptq_log_ungrieved_time_07_28_2025_11h_57m_26s.log`
[32mINFO[0m  --------------------------------------------------------------------------------------------------------------------------
[32mINFO[0m  | process     | layer     | module               | loss           | samples     | damp        | time      | fwd_time     |
[32mINFO[0m  ------------------------------------------

In [19]:
# Build the 4-bit GPTQ checkpoint and write it to output_dir_4.
QuantizeModelandSaveP(quantization_config=quantize_config_4, output_dir=output_dir_4)

[32mINFO[0m  Estimated Quantization BPW (bits per weight): 4.2875 bpw, based on [bits: 4, group_size: 128]


[32mINFO[0m  Loader: Auto dtype (native float16): `torch.float16`                     6%
Quantizing mlp.gate_proj in layer     [2 of 21] | 12:20:41 / 3 days, 18:31:40 [3/22] 13.6%

INFO:tokenicer.tokenicer:Tokenicer: Auto fixed pad_token_id=0 (token='<unk>').


[32mINFO[0m  Model: Loaded `generation_config`: GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "max_length": 2048,
  "pad_token_id": 0
}

[32mINFO[0m  Kernel: loaded -> `[]`                                                   6%
[32mINFO[0m  Packing Kernel: Auto-selection: adding candidate `TorchQuantLinear`      6%
[33mWARN[0m  Calibration dataset size should be more than 256. Current: 1.            6%
[33mWARN[0m  The average length of input_ids of calibration_dataset should be greater than 256: actual avg: 31.0.
[32mINFO[0m  Process: progress logs for `gptq` will be streamed to file: `gptq_log_cystocolostomy_time_07_29_2025_09h_47m_02s.log`
[32mINFO[0m  --------------------------------------------------------------------------------------------------------------------------
[32mINFO[0m  | process     | layer     | module               | loss           | samples     | damp        | time      | fwd_time     |
[32mINFO[0m  -------------------------------

LlamaTokenizerFast(name_or_path='TinyLlama-models/original', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [11]:
# Build the 2-bit GPTQ checkpoint and write it to output_dir_2.
QuantizeModelandSaveP(quantization_config=quantize_config_2, output_dir=output_dir_2)

[32mINFO[0m  Estimated Quantization BPW (bits per weight): 2.44375 bpw, based on [bits: 2, group_size: 64]
[32mINFO[0m  Loader: Auto dtype (native float16): `torch.float16`                     


INFO:tokenicer.tokenicer:Tokenicer: Auto fixed pad_token_id=0 (token='<unk>').


[32mINFO[0m  Model: Loaded `generation_config`: GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "max_length": 2048,
  "pad_token_id": 0
}

[32mINFO[0m  Kernel: loaded -> `[]`                                                   
[32mINFO[0m  Packing Kernel: Auto-selection: adding candidate `TorchQuantLinear`      
[33mWARN[0m  Calibration dataset size should be more than 256. Current: 1.            
[33mWARN[0m  The average length of input_ids of calibration_dataset should be greater than 256: actual avg: 31.0.
[32mINFO[0m  Process: progress logs for `gptq` will be streamed to file: `gptq_log_ultrainvolved_time_07_28_2025_12h_12m_38s.log`
[32mINFO[0m  --------------------------------------------------------------------------------------------------------------------------
[32mINFO[0m  | process     | layer     | module               | loss           | samples     | damp        | time      | fwd_time     |
[32mINFO[0m  --------------------------------------

In [14]:
from gptqmodel import get_best_device

# Smoke test of the 4-bit checkpoint: load it on whichever device
# `get_best_device()` returns and generate a short continuation of a prompt.
SMOKE_TEST_MAX_NEW_TOKENS = 64

device = get_best_device()
model = GPTQModel.load(output_dir_4, device=device)
tokenizer = AutoTokenizer.from_pretrained(original_model_dir, use_fast=True)

# Tokenize the prompt and move the tensors to the device the model lives on.
prompt_inputs = tokenizer("gptqmodel is", return_tensors="pt").to(model.device)

# Bound the generation explicitly. Without a limit the only cap is the
# generation_config's max_length (2048 in the logs captured by this notebook),
# which makes a quick sanity check run far longer than needed.
generated_ids = model.generate(**prompt_inputs, max_new_tokens=SMOKE_TEST_MAX_NEW_TOKENS)
print(tokenizer.decode(generated_ids[0]))

from_quantized: adapter: None
[32mINFO[0m  Loader: Auto dtype (native float16): `torch.float16`                     
[32mINFO[0m  Estimated Quantization BPW (bits per weight): 4.2875 bpw, based on [bits: 4, group_size: 128]
Quantizing mlp.gate_proj in layer     [2 of 21] | 0:09:10 / 1:07:13 [3/22] 13.6%

[32mINFO[0m   Kernel: Auto-selection: adding candidate `TorchQuantLinear`             
[32mINFO[0m  Kernel: candidates -> `[TorchQuantLinear]`                               
[32mINFO[0m  Kernel: selected -> `TorchQuantLinear`.                                  
[32mINFO[0m  Format: Converting `checkpoint_format` from `gptq` to internal `gptq_v2`.
[32mINFO[0m  Format: Conversion complete: 0.015610933303833008s                       
[32mINFO[0m   Kernel: Auto-selection: adding candidate `TorchQuantLinear`             
Quantizing mlp.gate_proj in layer     [2 of 21] | 0:09:13 / 1:07:35 [3/22] 13.6%

INFO:tokenicer.tokenicer:Tokenicer: Auto fixed pad_token_id=0 (token='<unk>').


[32mINFO[0m  Model: Loaded `generation_config`: GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

[32mINFO[0m  Model: Auto-fixed `generation_config` mismatch between model and `generation_config.json`.
[32mINFO[0m  Model: Updated `generation_config`: GenerationConfig {3/22] 13.6%
  "bos_token_id": 1,
  "eos_token_id": 2,
  "max_length": 2048,
  "pad_token_id": 0
}

[32mINFO[0m  Kernel: loaded -> `[TorchQuantLinear]`                                   
Quantizing mlp.gate_proj in layer     [2 of 21] | 0:09:13 / 1:07:35 [3/22] 13.6%

KeyboardInterrupt: 