# 1.环境准备

In [1]:
from hulk_nlp import HulkSft

# Initialize the proxy before anything else in the notebook runs.
# NOTE(review): presumably needed so later model/dataset downloads can reach
# the network — confirm against the hulk_nlp documentation.
HulkSft.initProxy()
# Report the runtime environment; the recorded output lists the torch and
# torchvision versions followed by the PyTorch build configuration.
HulkSft.getSysInfo()

torch: 2.2.2+cu118
torchvision: 0.17.2+cu118
PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.3.2 (Git Hash 2dc95a2ad0841e29db8b22fbccaf3e5da7992b01)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX512
  - CUDA Runtime 11.8
  - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_90,code=sm_90
  - CuDNN 8.7
  - Magma 2.6.1
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fab

# 2.使用AWQ量化模型(使用默认数据集)

## 量化前测试

In [2]:
from transformers import pipeline

model_path = "facebook/opt-125m"

# 使用 GPU 加载原始的 OPT-125m 模型
generator = pipeline('text-generation',
                     model=model_path,
                     device=0,
                     do_sample=True,
                     num_return_sequences=3)

# 观察点1：GPU显存占用峰值
!nvidia-smi

Mon Apr  8 17:22:07 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:67:03.0 Off |                    0 |
| N/A   34C    P0    37W / 250W |  16650MiB / 32768MiB |     11%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Observation point 2: sample generations from the unquantized model.
# The pipeline was created with num_return_sequences=3, so three
# continuations of the prompt are returned.
generator("The woman worked as a")

[{'generated_text': "The woman worked as a clerk at a school and the school's food department was there with a man"},
 {'generated_text': 'The woman worked as a salesperson at a hospital for 10 years and spent her last paycheck on vacations'},
 {'generated_text': "The woman worked as a school bus driver at that school.  She didn't know what she wanted"}]

In [4]:
# Same check with a second prompt, again using the unquantized pipeline.
generator("The man worked as a")

[{'generated_text': 'The man worked as a security guard for the federal public school board, and said he did not know'},
 {'generated_text': 'The man worked as a maintenance man in the car\nHe worked as a maintenance man in the car'},
 {'generated_text': 'The man worked as a manager for the airport and had been asked to leave in January of 2001 but'}]

## 量化配置

In [5]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Directory that will receive the quantized weights and the tokenizer files.
quant_path = "model/opt-2.7b-awq"

# AWQ settings: 4-bit weights, quantization groups of 128, zero-point enabled,
# and the "GEMM" kernel version.
quant_config = dict(
    zero_point=True,
    q_group_size=128,
    w_bit=4,
    version="GEMM",
)
quant_config

{'zero_point': True, 'q_group_size': 128, 'w_bit': 4, 'version': 'GEMM'}

## 逐层量化

In [4]:
# Load the unquantized model and its tokenizer from `model_path`.
model = AutoAWQForCausalLM.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize the model with the settings in `quant_config`. No calibration data
# is passed here, so the library's default calibration dataset is used
# (NOTE(review): the recorded output shows a ~471 MB dataset download —
# confirm which dataset that is). This is the slow step of the notebook: the
# recorded run processed 32 layers in about 10 minutes 41 seconds.
model.quantize(tokenizer, quant_config=quant_config)

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.79k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/167 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.
Downloading data: 100%|██████████| 471M/471M [00:32<00:00, 14.7MB/s] 


Generating validation split: 0 examples [00:00, ? examples/s]

AWQ: 100%|██████████| 32/32 [10:41<00:00, 20.04s/it]


In [6]:
# Compatibility configuration for loading through transformers
from transformers import AwqConfig, AutoConfig

# Re-express the AutoAWQ settings with the field names used by AwqConfig so
# the saved config is compatible with the transformers integration. The kernel
# version string is passed in lower case, and the result is converted to a
# plain dict.
quantization_config = AwqConfig(
    version=quant_config["version"].lower(),
    zero_point=quant_config["zero_point"],
    group_size=quant_config["q_group_size"],
    bits=quant_config["w_bit"],
).to_dict()

# The pretrained transformers model is stored in the wrapper's `model`
# attribute; its config takes the quantization settings as a dictionary.
model.model.config.quantization_config = quantization_config

## 保存量化模型

In [7]:
# Save the quantized model weights to `quant_path`.
model.save_quantized(quant_path)
# Save the tokenizer next to the weights so the directory can be loaded on its
# own; as the cell's last expression, the tuple of written file paths is shown.
tokenizer.save_pretrained(quant_path)  

('model/opt-2.7b-awq/tokenizer_config.json',
 'model/opt-2.7b-awq/special_tokens_map.json',
 'model/opt-2.7b-awq/vocab.json',
 'model/opt-2.7b-awq/merges.txt',
 'model/opt-2.7b-awq/added_tokens.json',
 'model/opt-2.7b-awq/tokenizer.json')

## 检查正确性

### 观察点1：量化模型

In [8]:
# Put the quantized model into evaluation mode. The call is the cell's last
# expression, so the module tree is displayed; in the recorded output the
# attention projections appear as WQLinear_GEMM layers with w_bit=4 and
# group_size=128.
model.eval()

OptAWQForCausalLM(
  (model): OPTForCausalLM(
    (model): OPTModel(
      (decoder): OPTDecoder(
        (embed_tokens): Embedding(50272, 2560, padding_idx=1)
        (embed_positions): OPTLearnedPositionalEmbedding(2050, 2560)
        (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0-31): 32 x OPTDecoderLayer(
            (self_attn): OPTAttention(
              (k_proj): WQLinear_GEMM(in_features=2560, out_features=2560, bias=True, w_bit=4, group_size=128)
              (v_proj): WQLinear_GEMM(in_features=2560, out_features=2560, bias=True, w_bit=4, group_size=128)
              (q_proj): WQLinear_GEMM(in_features=2560, out_features=2560, bias=True, w_bit=4, group_size=128)
              (out_proj): WQLinear_GEMM(in_features=2560, out_features=2560, bias=True, w_bit=4, group_size=128)
            )
            (activation_fn): ReLU()
            (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affin

### 观察点2：GPU显存占用（nvidia-smi 显示的是当前占用，而非峰值）

In [9]:
# GPU memory usage after quantization, for comparison with the reading taken
# right after the unquantized pipeline was loaded. nvidia-smi reports the
# memory in use at the moment it runs.
!nvidia-smi

Mon Apr  8 12:11:13 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:67:03.0 Off |                    0 |
| N/A   38C    P0    38W / 250W |  10168MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### 观察点3：使用GPU加载模型并生成文本

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Reload the saved artifacts from `quant_path` through the stock transformers
# API to confirm that the quantized checkpoint can be loaded and used on its own.
tokenizer = AutoTokenizer.from_pretrained(quant_path)
# device_map="cuda" already places the weights on the GPU, so no additional
# .to(...) call is needed afterwards.
model = AutoModelForCausalLM.from_pretrained(quant_path, device_map="cuda")

def generate_text(text, max_new_tokens=64):
    """Generate a continuation of `text` with the reloaded quantized model.

    Args:
        text: Prompt string to continue.
        max_new_tokens: Upper bound on the number of newly generated tokens.
            Defaults to 64.

    Returns:
        The decoded first output sequence (prompt plus continuation) with
        special tokens removed.
    """
    # Send the encoded prompt to whichever device the model was loaded on
    # instead of hard-coding a device index.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(out[0], skip_special_tokens=True)

result = generate_text("Merry Christmas! I'm glad to")
print(result)

Merry Christmas! I'm glad to see you are doing well and have that beautiful smile!
Thank you!


In [12]:
# Same prompt that was given to the unquantized pipeline earlier, so the
# quantized model's output can be compared with the baseline generations.
result = generate_text("The woman worked as a")
print(result)

The woman worked as a maid in my apartment for four months during a time when I was away on tours several of years ago. Before they met, I called her to tell her that she couldn't come in to clean. She did not even attempt to clean the apartment because I never sent the key codes I have listed. I've asked them
