# 6 【案例1】语言模型量化

本案例使用 transformers 下载预训练语言模型，并对其权重进行量化。

## 6.1 下载模型

In [None]:
import torch
from transformers import AutoModelForCausalLM,AutoTokenizer,pipeline
import util.quant_tool as quant_tool
import copy

# Download the pretrained checkpoint and its matching tokenizer.
model_id = "Salesforce/codegen-350M-mono"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # request bfloat16 weights
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Record the footprint of the un-quantized model, then show its module tree.
model_memory = model.get_memory_footprint()
print("模型量化前:", model)
# Dividing by 1e6 reports the value in the unit named by the label (MB);
# presumably get_memory_footprint() returns bytes -- confirm against the docs.
print("Footprint of the model in MBs: ", model_memory / 1e+6)

模型量化前: CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-19): 20 x CodeGenBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): CodeGenMLP(
          (fc_in): Linear(in_features=1024, out_features=4096, bias=True)
          (fc_out): Linear(in_features=4096, out_features=1024, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=51200, bias=True)


## 6.2 对称量化

In [None]:
# Symmetric quantization of the model weights.
# A deep copy is quantized so the original `model` stays available for the
# side-by-side comparison (presumably the helper modifies the model it
# receives -- confirm in util.quant_tool).
base_model = copy.deepcopy(model)
q_model = quant_tool.quantize_model_weights(
    model=base_model,
    per_channel=False,
    is_symmetric=True,
    modules_to_exclude=['lm_head'],  # lm_head is left un-quantized
)

In [5]:
# Record the footprint of the symmetrically quantized model, then show its
# module tree.
q_model_memory = q_model.get_memory_footprint()
print("模型量化后:", q_model)
# Dividing by 1e6 reports the value in the unit named by the label (MB).
print("Footprint of the model in MBs: ", q_model_memory / 1e+6)

模型量化后: CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-19): 20 x CodeGenBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): QuantLinear()
          (out_proj): QuantLinear()
        )
        (mlp): CodeGenMLP(
          (fc_in): QuantLinear()
          (fc_out): QuantLinear()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=51200, bias=True)
)
Footprint of the model in MBs:  545.653056


In [6]:
# Wrap the original model and the quantized model in text-generation pipelines.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
q_pipe = pipeline("text-generation", model=q_model, tokenizer=tokenizer)

# Output of the model before quantization (do_sample=False on both runs).
print("量化前模型输出：")
baseline_output = pipe("def hello_word():", max_new_tokens=20, do_sample=False)
print(baseline_output)

# Output of the model after quantization, same prompt and settings.
print("\n量化后模型输出：")
quantized_output = q_pipe("def hello_word():", max_new_tokens=20, do_sample=False)
print(quantized_output)

# Optional experiment (author's note): the gap between the two models is small;
# generating more tokens shows that once the accumulated error grows large
# enough, the outputs start to differ.
# print(pipe("def hello_word():",max_new_tokens=60,do_sample=False))
# print(q_pipe("def hello_word():",max_new_tokens=60,do_sample=False))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


量化前模型输出：


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'def hello_word():\n    print("Hello World!")\n\nhello_word()\n\n# �'}]

量化后模型输出：
[{'generated_text': 'def hello_word():\n    print("Hello World!")\n\nhello_word()\n\n# �'}]


## 6.3 非对称量化
来试试非对称量化

In [14]:
# Start again from a fresh deep copy of the original model.
base_model = copy.deepcopy(model)

# Setting is_symmetric=False switches the helper to asymmetric quantization;
# the other arguments match the symmetric run.
q_asym_model = quant_tool.quantize_model_weights(
    model=base_model,
    per_channel=False,
    is_symmetric=False,
    modules_to_exclude=['lm_head'],
)

# Generate with the same prompt and settings used for the symmetric run.
q_asym_pipe = pipeline("text-generation", model=q_asym_model, tokenizer=tokenizer)
q_asym_output = q_asym_pipe("def hello_word():", max_new_tokens=20, do_sample=False)
print(q_asym_output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'def hello_word():emet alphabeticalsesianwa gramm imports autspeciales\nو Manhattan thanReineReused'}]


In [15]:
# Record the footprint of the asymmetrically quantized model, then show its
# module tree.
q_asym_model_memory = q_asym_model.get_memory_footprint()
print("模型非对称量化后:", q_asym_model)
# Dividing by 1e6 reports the value in the unit named by the label (MB).
print("Footprint of the model in MBs: ", q_asym_model_memory / 1e+6)

模型非对称量化后: CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-19): 20 x CodeGenBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): QuantLinear()
          (out_proj): QuantLinear()
        )
        (mlp): CodeGenMLP(
          (fc_in): QuantLinear()
          (fc_out): QuantLinear()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=51200, bias=True)
)
Footprint of the model in MBs:  545.653056
