# 1.环境准备

In [1]:
from hulk_nlp import HulkSft

# Initialize the proxy (presumably a network proxy used for downloads — confirm in hulk_nlp)
HulkSft.initProxy()
# Report the runtime environment; the recorded output lists the torch / torchvision build details
HulkSft.getSysInfo()

torch: 2.2.2+cu118
torchvision: 0.17.2+cu118
PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.3.2 (Git Hash 2dc95a2ad0841e29db8b22fbccaf3e5da7992b01)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX512
  - CUDA Runtime 11.8
  - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_90,code=sm_90
  - CuDNN 8.7
  - Magma 2.6.1
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fab

# 2.使用AWQ量化模型(使用默认数据集)

## 量化前测试

In [3]:
from transformers import pipeline

model_path = "facebook/opt-125m"

# 使用 GPU 加载原始的 OPT-125m 模型
generator = pipeline('text-generation',
                     model=model_path,
                     device=0,
                     do_sample=True,
                     num_return_sequences=3)

# 观察点1：GPU显存占用峰值
!nvidia-smi

Mon Apr  8 12:52:45 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:67:03.0 Off |                    0 |
| N/A   37C    P0    37W / 250W |  15452MiB / 32768MiB |     22%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Checkpoint 2: sample completions from the original model for this prompt,
# kept as a baseline to compare against the quantized model later on
generator("The woman worked as a")

[{'generated_text': 'The woman worked as a manager at a gas station at the time\nThe woman worked at a gas'},
 {'generated_text': 'The woman worked as a manager, but she did not take an salary, because she did not have'},
 {'generated_text': "The woman worked as a waitress at a fast food chain. I'm surprised she can't find a"}]

In [5]:
# Same baseline check with a second prompt
generator("The man worked as a")

[{'generated_text': 'The man worked as a contractor for a gas company.  He was on vacation.  He decided'},
 {'generated_text': 'The man worked as a consultant for a real estate company and had recently built a successful construction business.'},
 {'generated_text': 'The man worked as a full-time chef for a food company. His restaurant, which is now'}]

## 量化配置

In [6]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Directory the quantized checkpoint and tokenizer are written to
quant_path = "model/opt-125m-awq"

# AWQ settings handed to model.quantize(): 4-bit weights quantized in groups
# of 128, using zero points and the "GEMM" kernel version.
quant_config = dict(
    zero_point=True,
    q_group_size=128,
    w_bit=4,
    version="GEMM",
)
quant_config

{'zero_point': True, 'q_group_size': 128, 'w_bit': 4, 'version': 'GEMM'}

## 逐层量化

In [7]:
# Load the original model through AutoAWQ, together with its tokenizer.
# trust_remote_code is deliberately not passed: the checkpoint loads as the
# built-in OPTForCausalLM class, so there is no reason to opt in to executing
# code shipped inside the model repository.
model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Quantize the model with the settings in quant_config. No calibration data is
# passed here, so the library's default calibration dataset is used.
model.quantize(tokenizer, quant_config=quant_config)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.10k [00:00<?, ?B/s]

LICENSE.md:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.
AWQ: 100%|██████████| 12/12 [01:37<00:00,  8.12s/it]


In [8]:
# Transformers compatibility settings
from transformers import AwqConfig, AutoConfig

# Map the AutoAWQ-style keys of quant_config onto the keyword names that
# AwqConfig takes; the kernel version is passed in lower case.
awq_kwargs = {
    "bits": quant_config["w_bit"],
    "group_size": quant_config["q_group_size"],
    "zero_point": quant_config["zero_point"],
    "version": quant_config["version"].lower(),
}

# The model config is given a plain dict, so convert the AwqConfig with to_dict().
quantization_config = AwqConfig(**awq_kwargs).to_dict()

# The pretrained transformers model is stored on the wrapper's `model`
# attribute; attach the quantization settings to that model's config.
model.model.config.quantization_config = quantization_config

## 保存量化模型

In [9]:
# Save the quantized model weights under quant_path
model.save_quantized(quant_path)
# Save the tokenizer next to the weights; the returned tuple of written file
# paths becomes the cell output
tokenizer.save_pretrained(quant_path)

('model/opt-125m-awq/tokenizer_config.json',
 'model/opt-125m-awq/special_tokens_map.json',
 'model/opt-125m-awq/vocab.json',
 'model/opt-125m-awq/merges.txt',
 'model/opt-125m-awq/added_tokens.json',
 'model/opt-125m-awq/tokenizer.json')

## 检查正确性

### 观察点1：量化模型

In [10]:
# Display the module tree of the quantized model. In the recorded output the
# attention projections appear as WQLinear_GEMM layers (w_bit=4, group_size=128).
model.eval()

OptAWQForCausalLM(
  (model): OPTForCausalLM(
    (model): OPTModel(
      (decoder): OPTDecoder(
        (embed_tokens): Embedding(50272, 768, padding_idx=1)
        (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
        (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0-11): 12 x OPTDecoderLayer(
            (self_attn): OPTAttention(
              (k_proj): WQLinear_GEMM(in_features=768, out_features=768, bias=True, w_bit=4, group_size=128)
              (v_proj): WQLinear_GEMM(in_features=768, out_features=768, bias=True, w_bit=4, group_size=128)
              (q_proj): WQLinear_GEMM(in_features=768, out_features=768, bias=True, w_bit=4, group_size=128)
              (out_proj): WQLinear_GEMM(in_features=768, out_features=768, bias=True, w_bit=4, group_size=128)
            )
            (activation_fn): ReLU()
            (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    

### 观察点2：GPU显存占用峰值

In [11]:
# GPU memory usage after quantization, as reported by nvidia-smi
!nvidia-smi

Mon Apr  8 12:55:51 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:67:03.0 Off |                    0 |
| N/A   38C    P0    38W / 250W |  15188MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### 观察点3：使用GPU加载模型并生成文本

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Reload the tokenizer and model from the saved checkpoint via plain
# transformers, to confirm that what was written to quant_path loads on its own.
tokenizer = AutoTokenizer.from_pretrained(quant_path)
# device_map="cuda" already places the model on the GPU, so the extra .to(0)
# move is redundant and has been dropped.
model = AutoModelForCausalLM.from_pretrained(quant_path, device_map="cuda")

def generate_text(text, max_new_tokens=64):
    """Generate a continuation of ``text`` with the notebook-level ``model``.

    Args:
        text: Prompt string to continue.
        max_new_tokens: Upper bound on the number of newly generated tokens.
            Defaults to 64, the value that was previously hard-coded.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens stripped.
    """
    # Move the encoded prompt to whichever device the model lives on, rather
    # than assuming GPU index 0.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns a batch of sequences; only the first one is decoded.
    return tokenizer.decode(out[0], skip_special_tokens=True)

# Smoke test: generate a continuation with the reloaded quantized model
result = generate_text("Merry Christmas! I'm glad to")
print(result)

Merry Christmas! I'm glad to be a bit extra this year than I was this year.
I'm always super happy because I'm also 30. This is my first year on the Internet with a long term goal. I love that we can all really connect. It helps that in this way I get the feeling "Hey, here's everything I


In [13]:
# Same prompt as the pre-quantization baseline, for a side-by-side comparison
result = generate_text("The woman worked as a")
print(result)

The woman worked as a delivery driver for a week in a car that had a defective brake on the driver seat.  The driver is charged with assault for the incident.  She may face up to three years in prison.
I think an assault charge is appropriate, since her client is clearly an adult.
