# Requirements

In [None]:
# Print the installed CUDA compiler (nvcc) version; the auto-gptq wheel to
# install further down depends on whether this reports CUDA 12.x or 11.x.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


**Requirements**

Requires: Transformers 4.33.0 or later, Optimum 1.12.0 or later, and AutoGPTQ 0.4.2 or later.

```shell
pip3 install --upgrade transformers optimum
# If using PyTorch 2.1 + CUDA 12.x:
pip3 install --upgrade auto-gptq
# or, if using PyTorch 2.1 + CUDA 11.x:
pip3 install --upgrade auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
```

If you are using PyTorch 2.0, you will need to install AutoGPTQ from source. Likewise, if you have problems with the pre-built wheels, try building from source:

```shell
pip3 uninstall -y auto-gptq
git clone https://github.com/PanQiWei/AutoGPTQ
cd AutoGPTQ
git checkout v0.5.1
pip3 install .
```


In [None]:
!pip install --upgrade transformers optimum
# If using PyTorch 2.1 + CUDA 12.x:
!pip install --upgrade auto-gptq
# or, if using PyTorch 2.1 + CUDA 11.x:
# !pip install --upgrade auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

Check the latest available versions of AutoGPTQ here: https://github.com/AutoGPTQ/AutoGPTQ/blob/main/docs/INSTALLATION.md

In [None]:
from typing import Any
import random
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import os
def get_c4(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"):
    """Build calibration examples from the first shard of the C4 dataset.

    Documents are drawn at random until one tokenizes to at least ``seqlen``
    tokens; a random window of exactly ``seqlen`` tokens is then cut from it.
    This is repeated ``nsamples`` times.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result exposes an ``input_ids`` tensor of shape
            ``(1, n_tokens)``.
        seqlen: Number of tokens in every returned example.
        nsamples: Number of examples to return.
        split: Either ``"train"`` or ``"validation"``.

    Returns:
        A list of exactly ``nsamples`` dicts with keys ``"input_ids"`` and
        ``"attention_mask"``, each a tensor of shape ``(1, seqlen)``.

    Raises:
        ValueError: If ``split`` is not ``"train"`` or ``"validation"``.
    """
    shard_by_split = {
        "train": "en/c4-train.00000-of-01024.json.gz",
        "validation": "en/c4-validation.00000-of-00008.json.gz",
    }
    # Reject unknown splits up front instead of failing later on an unbound
    # `data` variable.
    if split not in shard_by_split:
        raise ValueError(
            f"split must be one of {sorted(shard_by_split)}, got {split!r}"
        )

    data = load_dataset(
        "allenai/c4",
        split=split,
        data_files={split: shard_by_split[split]},
    )

    dataset = []
    for _ in range(nsamples):
        # Rejection-sample a document that is long enough.
        # NOTE: this loops forever if no document reaches `seqlen` tokens.
        while True:
            doc_idx = random.randint(0, len(data) - 1)
            enc = tokenizer(data[doc_idx]["text"], return_tensors="pt")
            n_tokens = enc.input_ids.shape[1]
            if n_tokens >= seqlen:
                break

        # random.randint is inclusive on both ends, so the last valid window
        # start is n_tokens - seqlen. This also covers n_tokens == seqlen,
        # which must yield a sample rather than being silently dropped.
        start = random.randint(0, n_tokens - seqlen)
        inp = enc.input_ids[:, start:start + seqlen]
        dataset.append({"input_ids": inp, "attention_mask": torch.ones_like(inp)})
    return dataset

In [None]:
#@title Choose a model for quantization
# Model identifier handed to `from_pretrained` in the quantization cell.
pretrained_model_dir = "facebook/opt-125m" #@param str
# Echo the choice with print() rather than `!echo {…}` so the user-supplied
# string is never interpolated into a shell command line.
print(f"proceed with model: {pretrained_model_dir}")

In [None]:
#@title Enter the desired bit precision (n-bit) for quantization (e.g., 2,3,4,8):
# Bit width of the quantized weights; the quantization cell passes this value
# to BaseQuantizeConfig(bits=...) and uses it in the output directory name.
n_bits = 4 #@param int

# Quantization

In [None]:
# Derive the output directory from the model id and bit width,
# e.g. "facebook/opt-125m" at 4 bits -> "opt-125m-4bit".
model_name = pretrained_model_dir.split("/")[-1]
quantized_model_dir = f"{model_name}-{n_bits}bit"

# Build the config first so an unsupported bit width is rejected before the
# calibration data is downloaded and tokenized. `bits` must be an integer, and
# `n_bits` is deliberately not overwritten so this cell can be re-run safely.
quantize_config = BaseQuantizeConfig(
    bits=int(n_bits),
    group_size=128,
    desc_act=False,
)

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = get_c4(tokenizer=tokenizer, seqlen=2048, nsamples=128, split="train")

# load un-quantized model, by default, the model will always be loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)

# quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask"
model.quantize(examples)
model.save_quantized(quantized_model_dir, use_safetensors=True)

# **Note**: By default, the format of the model file base name saved using Auto-GPTQ is: gptq_model-{bits}bit-{group_size}g.
# To support further loading with the automatic transformers class AutoModelForCausalLM,
# rename the weights file to model.safetensors.
target_weights_path = os.path.join(quantized_model_dir, "model.safetensors")
weight_files = sorted(
    name
    for name in os.listdir(quantized_model_dir)
    if name.endswith(".safetensors") and name != "model.safetensors"
)
if len(weight_files) == 1:
    # os.listdir returns bare file names, so prefix the directory.
    # os.replace overwrites a model.safetensors left over from a previous run.
    os.replace(os.path.join(quantized_model_dir, weight_files[0]), target_weights_path)
elif weight_files:
    # More than one candidate: renaming any single one would be a guess.
    print(f"Several .safetensors files found, none renamed: {weight_files}")

# Voilà, now the model can be used for inference
# load quantized model to the first GPU
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")

# inference with model.generate
prompt_inputs = tokenizer("auto_gptq is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**prompt_inputs)[0]))

# or you can also use pipeline
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
print(pipeline("auto-gptq is")[0]["generated_text"])

# Push Quantized Model to Hugging Face Hub

To use `use_auth_token=True`, log in first via `huggingface-cli login`, or pass an explicit token with: `use_auth_token="hf_xxxxxxx"`.

**Copy the following lines into a code cell and run them to enable this feature:**

```python
repo_id = f"YourUserName/{quantized_model_dir}"
commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits} bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
```
**Note**: By default, the format of the model file base name saved using Auto-GPTQ is: `gptq_model-{bits}bit-{group_size}g`. To support further loading with the automatic transformers class `AutoModelForCausalLM`, rename the file to `model.safetensors`, as suggested above.

```python
model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
tokenizer.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
```