In [None]:
!pip install transformers==4.40.1 accelerate==0.30.0 bitsandbytes==0.43.1 auto-gptq==0.7.1 autoawq==0.2.5 optimum==1.19.1 -qqq

# Reducing model size with quantization

## bitsandbytes

In [2]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Both variants quantize the same base checkpoint, so name it once.
BNB_MODEL_ID = "facebook/opt-350m"

# 8-bit quantization model
# low_cpu_mem_usage=True is passed explicitly, matching the 4-bit load below:
# leaving it unset makes transformers log
# "`low_cpu_mem_usage` was None, now set to True since model is quantized."
bnb_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
model_8bit = AutoModelForCausalLM.from_pretrained(
    BNB_MODEL_ID,
    quantization_config=bnb_config_8bit,
    low_cpu_mem_usage=True,
)

# 4-bit quantization model using the "nf4" quantization type
bnb_config_4bit = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
model_4bit = AutoModelForCausalLM.from_pretrained(
    BNB_MODEL_ID,
    quantization_config=bnb_config_4bit,
    low_cpu_mem_usage=True,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


## GPTQ

**GPTQ quantization code**

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# Base (unquantized) checkpoint that will be quantized while it is loaded.
model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit GPTQ settings; the "c4" dataset and the tokenizer are handed to
# GPTQConfig so the quantizer can use them.
quantization_config = GPTQConfig(
    bits=4,
    dataset="c4",
    tokenizer=tokenizer,
)

# Passing quantization_config to from_pretrained runs the quantization as part
# of the load (progress bars are printed per decoder block).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)

2024-11-18 05:06:29.287394: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-18 05:06:29.377696: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-18 05:06:29.381533: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-11-18 05:06:29.381543: I tensorflow/stream_executor/cuda

Quantizing model.decoder.layers blocks :   0%|          | 0/12 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]



**Load quantized GPTQ model**

In [7]:
from transformers import AutoModelForCausalLM

# Load a checkpoint that was already quantized with GPTQ and published on the Hub.
# No quantization_config is passed here; only the repo id, branch and placement.
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/zephyr-7B-beta-GPTQ",
    revision="main",
    trust_remote_code=False,
    device_map="auto",
)

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

## AWQ

**Load quantized AWQ model**

In [7]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

In [9]:
# Pre-quantized AWQ checkpoint; the tokenizer and the weights are fetched from the same repo.
model_name_or_path = "TheBloke/zephyr-7B-beta-AWQ"

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=False,
)
model = AutoAWQForCausalLM.from_quantized(
    model_name_or_path,
    safetensors=True,
    fuse_layers=True,
    trust_remote_code=False,
)

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

quant_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

train_results.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/32.8k [00:00<?, ?B/s]

all_results.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

eval_results.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.15G [00:00<?, ?B/s]


Replacing layers...:   0%|                                                                                                                                                                                            | 0/32 [00:00<?, ?it/s][A
Replacing layers...:   3%|█████▋                                                                                                                                                                              | 1/32 [00:00<00:08,  3.66it/s][A
Replacing layers...:   6%|███████████▎                                                                                                                                                                        | 2/32 [00:00<00:07,  4.10it/s][A
Replacing layers...:   9%|████████████████▉                                                                                                                                                                   | 3/32 [00:00<00:07,  3.99it/s][A
Replacing layers...:  12%|█████████