In [1]:
import gc
import os
import time

import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# --- Configuration ---------------------------------------------------------
# Parent directory; each quantized variant is written to its own sub-directory.
save_dir = "/mnt/2T/Codes/models/quantized_model"
os.makedirs(save_dir, exist_ok=True)

# Source checkpoint and base quantization settings. The "w_bit" entry is
# overwritten in place for every value in `w_bits` below.
model_path = 'meta-llama/Llama-3.1-8B-Instruct'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Bit-widths to produce: one load -> quantize -> save pass per entry.
w_bits = [8]

# --- Quantization loop -----------------------------------------------------
for w_bit in w_bits:
    # Drop the model from the previous pass (or from an earlier run of this
    # cell) BEFORE loading the next one. Otherwise the old model stays
    # referenced until `from_pretrained` returns and two full models are
    # resident at the same time.
    model = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    quant_config["w_bit"] = w_bit

    # Load a fresh, unquantized copy of the checkpoint for this pass.
    # `low_cpu_mem_usage` and `use_cache` are deliberately not passed
    # (library defaults apply).
    model = AutoAWQForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        device_map="auto",
    )

    # Time only the quantization step (loading and saving are excluded).
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments during a long run.
    # `export_compatible` is deliberately not passed (library default applies).
    start_time = time.perf_counter()
    model.quantize(tokenizer, quant_config=quant_config)
    end_time = time.perf_counter()

    # Save the quantized weights (safetensors format disabled) together with
    # the tokenizer files in the same directory.
    quantized_model_dir = f"{save_dir}/Llama-3.1-8B-Instruct-AWQ-{w_bit}bit"
    model.save_quantized(quantized_model_dir, safetensors=False)
    tokenizer.save_pretrained(quantized_model_dir)

    print(f'Model with w_bit={w_bit} is quantized and saved at "{quantized_model_dir}", time: {end_time - start_time:.2f} seconds')

# NOTE: the last quantized `model` (and `w_bit`) intentionally stay defined
# after the loop so they can be inspected or re-exported afterwards.


  from .autonotebook import tqdm as notebook_tqdm
Fetching 16 files: 100%|██████████| 16/16 [00:00<00:00, 27719.48it/s]
Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.43s/it]
Repo card metadata block was not found. Setting CardData to empty.
AWQ:   0%|          | 0/32 [00:00<?, ?it/s]The attention layers in this model are transitioning from computing the RoPE embeddings internally through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed `position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be removed and `position_embeddings` will be mandatory.
AWQ: 100%|██████████| 32/32 [17:03<00:00, 31.98s/it]
Note that `shard_checkpoint` is deprecated and will be removed in v4.44. We recommend you using split_torch_state_dict_into_shards from huggingface_hub library


Model with w_bit=8 is quantized and saved at "/mnt/2T/Codes/models/quantized_model/Llama-3.1-8B-Instruct-AWQ-8bit", time: 1036.83 seconds


In [2]:
# Second export of the quantized model that is still in memory, this time in
# safetensors format, into a sibling directory with a "-tmp" suffix.
# This cell depends on `save_dir`, `w_bit` and `model` being defined by the
# quantization cell; run that cell first.
quantized_model_dir = f"{save_dir}/Llama-3.1-8B-Instruct-AWQ-{w_bit}bit-tmp"
model.save_quantized(
    quantized_model_dir,
    safetensors=True,
)
# The tokenizer files are not written to this directory (call left disabled):
# tokenizer.save_pretrained(quantized_model_dir)


Note that `shard_checkpoint` is deprecated and will be removed in v4.44. We recommend you using split_torch_state_dict_into_shards from huggingface_hub library
