In [None]:
# Install dependencies (skip this cell if the environment already provides them).
%pip install -q "transformers>=4.54.0,<=4.57.3" accelerate llmcompressor datasets

# llmcompressor already integrates algorithms such as GPTQ / AWQ, so there is
# usually no need to install libraries such as auto-gptq / awq by hand.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor import oneshot

# Base checkpoint to quantize: the Qwen2.5-1.5B instruction-tuned model.
base_model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# Pick the compute device, preferring the GPU whenever CUDA is usable.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

In [None]:
"""Quantize Qwen2.5-1.5B-Instruct with GPTQ using llmcompressor."""

# Directory the quantized checkpoint is written to (adjust the path as needed).
gptq_out_dir = "models/qwen2.5-1.5b-instruct-gptq-llmc"

# Single-modifier recipe:
#   targets="Linear"  -> only linear layers are quantized
#   scheme="W4A16"    -> 4-bit weights, activations kept at 16 bits
#   ignore=["lm_head"] -> the output head is usually left unquantized
gptq_recipe = [
    GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
]

# Calibrate on the built-in public "open_platypus" dataset (handy for a quick
# demo); only a small number of calibration samples is used to keep it fast.
oneshot(
    model=base_model_id,
    recipe=gptq_recipe,
    dataset="open_platypus",
    num_calibration_samples=128,
    max_seq_length=2048,
    output_dir=gptq_out_dir,
)

gptq_out_dir

In [None]:
# Load the GPTQ-quantized checkpoint back from disk for inference.

gptq_tokenizer = AutoTokenizer.from_pretrained(gptq_out_dir, trust_remote_code=True)
# Fall back to the EOS token when the tokenizer defines no pad token.
if gptq_tokenizer.pad_token_id is None:
    gptq_tokenizer.pad_token = gptq_tokenizer.eos_token

# Load in fp16, let device_map="auto" place the weights, and switch the
# returned module to eval mode right away.
gptq_model = AutoModelForCausalLM.from_pretrained(
    gptq_out_dir,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
).eval()

# Cell output: the pad / EOS tokens and their ids.
(
    gptq_tokenizer.pad_token,
    gptq_tokenizer.eos_token,
    gptq_tokenizer.pad_token_id,
    gptq_tokenizer.eos_token_id,
)

In [None]:
# Sanity-check that GPTQ quantization took effect: look at the module type of
# layer 0's self_attn.q_proj and at the quantization config on the model config.
layer = gptq_model.model.layers[0].self_attn.q_proj
gptq_quant_config = getattr(gptq_model.config, "quantization_config", None)
print("GPTQ q_proj layer type:", type(layer))
print("GPTQ quantization_config:", gptq_quant_config)

In [None]:
"""Chat-inference demo for the GPTQ-quantized model."""

# Demo prompts: one Chinese question and one English question.
GPTQ_TEST_QUERIES = [
    "用两三句话解释一下什么是量子计算？",
    "Give me a brief introduction to large language models in English.",
]

@torch.no_grad()
def gptq_chat(question: str, max_new_tokens: int = 256) -> str:
    """Answer one user question with the GPTQ-quantized model.

    The question is wrapped in a two-message conversation (a fixed system
    prompt plus the user turn), rendered through the tokenizer's chat template
    and passed to ``generate`` with sampling enabled, so repeated calls can
    return different text.

    Reads the module-level ``gptq_tokenizer`` and ``gptq_model``.

    Args:
        question: The user message to answer.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 256).

    Returns:
        The decoded continuation only (prompt tokens sliced off), with special
        tokens skipped and surrounding whitespace stripped.
    """
    msgs = [
        {"role": "system", "content": "你是一名 AI 助手，回答准确、简洁。"},
        {"role": "user", "content": question},
    ]
    # return_dict=True makes the template return the attention mask alongside
    # the input ids, so generate() receives the mask explicitly instead of
    # having to infer it from pad_token_id (ambiguous when pad falls back to EOS).
    inputs = gptq_tokenizer.apply_chat_template(
        msgs,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(gptq_model.device)
    prompt_ids = inputs["input_ids"]

    gen_ids = gptq_model.generate(
        input_ids=prompt_ids,
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        eos_token_id=gptq_tokenizer.eos_token_id,
        pad_token_id=gptq_tokenizer.pad_token_id,
    )
    # generate() returns prompt + continuation; keep only the new tokens.
    out_ids = gen_ids[0, prompt_ids.shape[-1]:]
    return gptq_tokenizer.decode(out_ids, skip_special_tokens=True).strip()

# Run every demo prompt through the GPTQ model and print a Q/A transcript,
# closing each exchange with a 60-character rule.
separator = "-" * 60
for q in GPTQ_TEST_QUERIES:
    ans = gptq_chat(q)
    print(f"[GPTQ] Q: {q}\nA: {ans}\n" + separator)

In [None]:
# Release the GPU memory held by the GPTQ-quantized model.
import gc

# Remove the names with a tolerant pop instead of `del` so the cell can be
# re-run (or run after a skipped cell) without raising NameError. `layer`, set
# by the inspection cell, still references a sub-module of the model, so it is
# dropped as well.
globals().pop("gptq_model", None)
globals().pop("layer", None)

# Collect unreachable objects first so their tensors are actually freed, then
# return the cached CUDA blocks to the driver.
gc.collect()
torch.cuda.empty_cache()

In [None]:
from llmcompressor.modifiers.awq import AWQModifier
# Quantize Qwen2.5-1.5B-Instruct with AWQ using llmcompressor.

# Directory the AWQ-quantized checkpoint is written to.
awq_out_dir = "models/qwen2.5-1.5b-instruct-awq-llmc"

# Single-modifier recipe: W4A16 scheme applied to "Linear" targets, with
# lm_head excluded from quantization.
awq_recipe = [
    AWQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
]

# Calibrate on the "open_platypus" dataset and write the result to awq_out_dir.
oneshot(
    model=base_model_id,
    recipe=awq_recipe,
    dataset="open_platypus",
    num_calibration_samples=128,
    max_seq_length=2048,
    output_dir=awq_out_dir,
)

awq_out_dir

In [None]:
# Load the AWQ-quantized checkpoint back from disk for chat inference.

awq_tokenizer = AutoTokenizer.from_pretrained(awq_out_dir, trust_remote_code=True)
# Fall back to the EOS token when the tokenizer defines no pad token.
if awq_tokenizer.pad_token_id is None:
    awq_tokenizer.pad_token = awq_tokenizer.eos_token

# Load in fp16 and let device_map="auto" place the weights.
awq_model = AutoModelForCausalLM.from_pretrained(
    awq_out_dir,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Switch to eval mode; as the last expression, the returned module is also
# what the cell displays.
awq_model.eval()

In [None]:
# Sanity-check that AWQ quantization took effect: look at the module type of
# layer 0's self_attn.q_proj and at the quantization config on the model config.
layer = awq_model.model.layers[0].self_attn.q_proj
awq_quant_config = getattr(awq_model.config, "quantization_config", None)
print("AWQ q_proj layer type:", type(layer))
print("AWQ quantization_config:", awq_quant_config)

In [None]:
# Demo prompts for the AWQ model: one Chinese question and one English question.
AWQ_TEST_QUERIES = [
    "简单说说大模型量化有什么好处？",
    "Explain in English why activation-aware weight quantization (AWQ) can help LLMs.",
]

@torch.no_grad()
def awq_chat(question: str, max_new_tokens: int = 256) -> str:
    """Answer one user question with the AWQ-quantized model.

    The question is wrapped in a two-message conversation (a fixed system
    prompt plus the user turn), rendered through the tokenizer's chat template
    and passed to ``generate`` with sampling enabled, so repeated calls can
    return different text.

    Reads the module-level ``awq_tokenizer`` and ``awq_model``.

    Args:
        question: The user message to answer.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 256).

    Returns:
        The decoded continuation only (prompt tokens sliced off), with special
        tokens skipped and surrounding whitespace stripped.
    """
    msgs = [
        {"role": "system", "content": "你是一名 AI 助手，回答准确、简洁。"},
        {"role": "user", "content": question},
    ]
    # return_dict=True makes the template return the attention mask alongside
    # the input ids, so generate() receives the mask explicitly instead of
    # having to infer it from pad_token_id (ambiguous when pad falls back to EOS).
    inputs = awq_tokenizer.apply_chat_template(
        msgs,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(awq_model.device)
    prompt_ids = inputs["input_ids"]

    gen_ids = awq_model.generate(
        input_ids=prompt_ids,
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        eos_token_id=awq_tokenizer.eos_token_id,
        pad_token_id=awq_tokenizer.pad_token_id,
    )
    # generate() returns prompt + continuation; keep only the new tokens.
    out_ids = gen_ids[0, prompt_ids.shape[-1]:]
    return awq_tokenizer.decode(out_ids, skip_special_tokens=True).strip()

# Run every demo prompt through the AWQ model and print a Q/A transcript,
# closing each exchange with a 60-character rule.
separator = "-" * 60
for q in AWQ_TEST_QUERIES:
    ans = awq_chat(q)
    print(f"[AWQ] Q: {q}\nA: {ans}\n" + separator)
