## Phi-1.5 Multi-bit GPTQ Quantization - Jupyter Notebook
### This notebook will save the original model locally and create 8-bit, 4-bit, and 2-bit quantized versions

In [1]:
import torch
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
import logging
import os
import json
from datetime import datetime
import shutil

  from .autonotebook import tqdm as notebook_tqdm



[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.          


In [4]:
# Quantisation environment set up
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.7.1+cu128
CUDA available: True
GPU: Quadro RTX 3000





### Model configuration


In [3]:
model_id = "microsoft/phi-1_5"
base_output_dir = "phi-1_5-models"
original_model_dir = f"{base_output_dir}/original"
output_dir_8= f"{base_output_dir}/phi-1_5-gptqmodel-8bit"
output_dir_4= f"{base_output_dir}/phi-1_5-gptqmodel-4bit"
output_dir_2= f"{base_output_dir}/phi-1_5-gptqmodel-2bit"

# Quantization configurations
quantize_config_8 = QuantizeConfig(
        bits=8,  # quantize model to 8-bit
        group_size=128,  # it is recommended to set the value to 128
    )

quantize_config_4 = QuantizeConfig(
        bits=4,  # quantize model to 4-bit
        group_size=128,  # it is recommended to set the value to 128
    )

quantize_config_2 = QuantizeConfig(
        bits=2,  # quantize model to 2-bit
        group_size=64,  # it is recommended to set the value to 128
    )

print(f"Configuration:")
print(f"Source model: {model_id}")
print(f"Base output directory: {base_output_dir}")
print(f"Q8 output directory: {output_dir_8}")
print(f"Q4 output directory: {output_dir_4}")
print(f"Q2 output directory: {output_dir_2}")

# Create output directories
os.makedirs(base_output_dir, exist_ok=True)
os.makedirs(output_dir_8, exist_ok=True)
os.makedirs(output_dir_4, exist_ok=True)
os.makedirs(output_dir_2, exist_ok=True)


Configuration:
Source model: microsoft/phi-1_5
Base output directory: phi-1_5-models
Q8 output directory: phi-1_5-models/phi-1_5-gptqmodel-8bit
Q4 output directory: phi-1_5-models/phi-1_5-gptqmodel-4bit
Q2 output directory: phi-1_5-models/phi-1_5-gptqmodel-2bit


In [None]:
#Load un-quantized model, by default, the model will always be loaded into CPU memory
model = GPTQModel.load(original_model_dir, quantize_config_8)


[32mINFO[0m  Estimated Quantization BPW (bits per weight): 8.31875 bpw, based on [bits: 8, group_size: 128]
[32mINFO[0m  Loader: Auto dtype (native float16): `torch.float16`                     


INFO:tokenicer.tokenicer:Tokenicer: Auto fixed pad_token_id=50256 (token='<|endoftext|>').


[32mINFO[0m  Model: Loaded `generation_config`: GenerationConfig {}
                  
[32mINFO[0m  Kernel: loaded -> `[]`                                                   


In [14]:
# Get original model size
original_size = sum(os.path.getsize(os.path.join(original_model_dir, f)) 
                   for f in os.listdir(original_model_dir) 
                   if os.path.isfile(os.path.join(original_model_dir, f)))

print(f"Original model size: {original_size / (1024**3):.2f} GB")

Original model size: 2.65 GB


In [16]:
    tokenizer = AutoTokenizer.from_pretrained(original_model_dir, use_fast=True)
    calibration_dataset = [
        tokenizer(
            "gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
        )
    ]

In [None]:

    # quantize model, the calibration_dataset should be list of dict whose keys can only be "input_ids" and "attention_mask"
    model.quantize(calibration_dataset)


[32mINFO[0m  Packing Kernel: Auto-selection: adding candidate `TorchQuantLinear`      
[33mWARN[0m  Calibration dataset size should be more than 256. Current: 1.            
[33mWARN[0m  The average length of input_ids of calibration_dataset should be greater than 256: actual avg: 29.0.


  return t.to(


[32mINFO[0m  Process: progress logs for `gptq` will be streamed to file: `gptq_log_garapato_time_07_27_2025_17h_04m_08s.log`
[32mINFO[0m  --------------------------------------------------------------------------------------------------------------------------
[32mINFO[0m  | process     | layer     | module               | loss           | samples     | damp        | time      | fwd_time     |
[32mINFO[0m  --------------------------------------------------------------------------------------------------------------------------
[32mINFO[0m  | gptq        | 0         | self_attn.q_proj     | [92m0.01595473[0m | 1           | 0.01000     | 2.191     | 0.522        | 
[32mINFO[0m  --------------------------------------------------------------------------------------------------------------------------------
[32mINFO[0m  | gptq        | 0         | self_attn.k_proj     | [92m0.01693969[0m | 1           | 0.01000     | 1.723     | 0.005        | 
[32mINFO[0m  ------------

NameError: name 'quantized_model_id' is not defined

In [18]:

    # save quantized model
    model.save(output_dir_8)

[32mINFO[0m  Format: Converting GPTQ v2 to v1                                         
[32mINFO[0m  Saved Quantize Config: 
{
  "bits": 8,
  "group_size": 128,
  "desc_act": true,
  "sym": true,
  "lm_head": false,
  "quant_method": "gptq",
  "checkpoint_format": "gptq",
  "pack_dtype": "int32",
  "meta": {
    "quantizer": [
      "gptqmodel:2.2.0"
    ],
    "uri": "https://github.com/modelcloud/gptqmodel",
    "damp_percent": 0.01,
    "damp_auto_increment": 0.0025,
    "static_groups": false,
    "true_sequential": true,
    "mse": 0.0
  }
}
Files in directory:
config.json
generation_config.json
quantize_config.json
quant_log.csv
Content of saved `generation_config.json`:
{
    "_from_model_config": true,
    "transformers_version": "4.53.3"
}
Content of saved `config.json`:
{
    "architectures": [
        "PhiForCausalLM"
    ],
    "attention_dropout": 0.0,
    "bos_token_id": null,
    "embd_pdrop": 0.0,
    "eos_token_id": null,
    "hidden_act": "gelu_new",
    "hidden_si

In [None]:
def QuantizeModelandSaveP(quantization_config, output_dir):
    model = GPTQModel.load(original_model_dir, quantization_config)
    tokenizer = AutoTokenizer.from_pretrained(original_model_dir, use_fast=True)
    calibration_dataset = [
        tokenizer(
            "gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
        )
    ]

    model.quantize(calibration_dataset)
    model.save(output_dir)

In [20]:
QuantizeModelandSaveP(quantize_config_4, output_dir_4)

[32mINFO[0m  Estimated Quantization BPW (bits per weight): 4.2875 bpw, based on [bits: 4, group_size: 128]
[32mINFO[0m  Loader: Auto dtype (native float16): `torch.float16`                     


INFO:tokenicer.tokenicer:Tokenicer: Auto fixed pad_token_id=50256 (token='<|endoftext|>').


[32mINFO[0m  Model: Loaded `generation_config`: GenerationConfig {}
                  
[32mINFO[0m  Kernel: loaded -> `[]`                                                   
[32mINFO[0m  Packing Kernel: Auto-selection: adding candidate `TorchQuantLinear`      
[33mWARN[0m  Calibration dataset size should be more than 256. Current: 1.            
[33mWARN[0m  The average length of input_ids of calibration_dataset should be greater than 256: actual avg: 29.0.
[32mINFO[0m  Process: progress logs for `gptq` will be streamed to file: `gptq_log_titulus_time_07_27_2025_17h_39m_21s.log`
[32mINFO[0m  --------------------------------------------------------------------------------------------------------------------------
[32mINFO[0m  | process     | layer     | module               | loss           | samples     | damp        | time      | fwd_time     |
[32mINFO[0m  ---------------------------------------------------------------------------------------------------------------

In [21]:
QuantizeModelandSaveP(quantize_config_2, output_dir_2)

[32mINFO[0m  Estimated Quantization BPW (bits per weight): 2.44375 bpw, based on [bits: 2, group_size: 64]
[32mINFO[0m  Loader: Auto dtype (native float16): `torch.float16`                     


INFO:tokenicer.tokenicer:Tokenicer: Auto fixed pad_token_id=50256 (token='<|endoftext|>').


[32mINFO[0m  Model: Loaded `generation_config`: GenerationConfig {}
                  
[32mINFO[0m  Kernel: loaded -> `[]`                                                   
[32mINFO[0m  Packing Kernel: Auto-selection: adding candidate `TorchQuantLinear`      
[33mWARN[0m  Calibration dataset size should be more than 256. Current: 1.            
[33mWARN[0m  The average length of input_ids of calibration_dataset should be greater than 256: actual avg: 29.0.
[32mINFO[0m  Process: progress logs for `gptq` will be streamed to file: `gptq_log_undegraded_time_07_27_2025_17h_49m_06s.log`
[32mINFO[0m  ----------------------------------------------------------------------------------------------------------------------------
[32mINFO[0m  | process     | layer     | module               | loss             | samples     | damp        | time      | fwd_time     |
[32mINFO[0m  --------------------------------------------------------------------------------------------------------

In [20]:
from gptqmodel import get_best_device

# load quantized model to the first GPU
device = get_best_device()
model = GPTQModel.load(quantized_model_id, device=device)
print(tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0]))

from_quantized: adapter: None
[32mINFO[0m  Loader: Auto dtype (native float16): `torch.float16`                     
[32mINFO[0m  Estimated Quantization BPW (bits per weight): 2.44375 bpw, based on [bits: 2, group_size: 64]
[32mINFO[0m   Kernel: Auto-selection: adding candidate `TorchQuantLinear`             
[32mINFO[0m  Kernel: candidates -> `[TorchQuantLinear]`                               
[32mINFO[0m  Kernel: selected -> `TorchQuantLinear`.                                  
[32mINFO[0m  Format: Converting `checkpoint_format` from `gptq` to internal `gptq_v2`.
[32mINFO[0m  Format: Conversion complete: 0.011798620223999023s                       
[32mINFO[0m   Kernel: Auto-selection: adding candidate `TorchQuantLinear`             


INFO:tokenicer.tokenicer:Tokenicer: Auto fixed pad_token_id=50256 (token='<|endoftext|>').


[32mINFO[0m  Model: Loaded `generation_config`: GenerationConfig {}
                  
[32mINFO[0m  Kernel: loaded -> `[TorchQuantLinear]`                                   


TritonMissing: Cannot find a working triton installation. Either the package is not installed or it is too old. More information on installing Triton can be found at: https://github.com/triton-lang/triton

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


In [23]:
import os
import torch

# Set debug flags
os.environ['TORCHDYNAMO_VERBOSE'] = '1'
os.environ['TORCH_LOGS'] = '+dynamo'

# Force CPU usage
os.environ['CUDA_VISIBLE_DEVICES'] = ''
torch.cuda.is_available = lambda: False  # Force PyTorch to think no GPU is available

# Load model and ensure it's on CPU
quantized_model_id = output_dir_2
model = GPTQModel.load(quantized_model_id)
model = model.cpu()  # Explicitly move model to CPU

# Prepare inputs on CPU
inputs = tokenizer("gptqmodel is", return_tensors="pt")
# Ensure inputs are on CPU (they should be by default now)

# Generate on CPU
with torch.no_grad():
    output = model.generate(**inputs, max_length=50, do_sample=False)
    
print(tokenizer.decode(output[0], skip_special_tokens=True))

from_quantized: adapter: None
[32mINFO[0m  Loader: Auto dtype (native float16): `torch.float16`                     
[32mINFO[0m  Estimated Quantization BPW (bits per weight): 2.44375 bpw, based on [bits: 2, group_size: 64]


[32mINFO[0m   Kernel: Auto-selection: adding candidate `TorchQuantLinear`             
[32mINFO[0m  Kernel: candidates -> `[TorchQuantLinear]`                               
[32mINFO[0m  Kernel: selected -> `TorchQuantLinear`.                                  
[32mINFO[0m  Format: Converting `checkpoint_format` from `gptq` to internal `gptq_v2`.
[32mINFO[0m  Format: Conversion complete: 0.011531829833984375s                       
[32mINFO[0m   Kernel: Auto-selection: adding candidate `TorchQuantLinear`             


INFO:tokenicer.tokenicer:Tokenicer: Auto fixed pad_token_id=50256 (token='<|endoftext|>').


[32mINFO[0m  Model: Loaded `generation_config`: GenerationConfig {}
                  
[32mINFO[0m  Kernel: loaded -> `[TorchQuantLinear]`                                   


TorchRuntimeError: Dynamo failed to run FX node with fake tensors: call_function <built-in method bitwise_right_shift of type object at 0x00007FF823D0C450>(*(FakeTensor(..., size=(32, 128, 16), dtype=torch.int32), FakeTensor(..., device='cuda:0', size=(1, 1, 16), dtype=torch.int32)), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.bitwise_right_shift.Tensor, found two different devices cpu, cuda:0')

from user code:
   File "c:\Users\marwa\miniconda3\Lib\site-packages\gptqmodel\nn_modules\qlinear\__init__.py", line 441, in dequantize_weight
    zeros = t.bitwise_right_shift(

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
