In [3]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_name_or_path = "facebook/opt-2.7b"
quant_model_dir = "models/opt-2.7b-awq"

quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM"
}

In [4]:
# 加载模型
model = AutoAWQForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 42313.28it/s]


OSError: Error no file named model.safetensors found in directory /root/.cache/huggingface/hub/models--facebook--opt-2.7b/snapshots/905a4b602cda5c501f1b3a2650a4152680238254.

In [6]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_name_or_path = "facebook/opt-2.7b"
quant_model_dir = "models/opt-2.7b-awq"

quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
    "force_download": True  # 强制重新下载
}

In [8]:
# 加载模型
# local_files_only=True 优先使用本地文件
model = AutoAWQForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, local_files_only=True)

Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 51228.14it/s]


OSError: Error no file named model.safetensors found in directory /root/.cache/huggingface/hub/models--facebook--opt-2.7b/snapshots/905a4b602cda5c501f1b3a2650a4152680238254.

In [1]:
# 改用facebook/opt-125m 轻量模型
from transformers import pipeline

model_path = "facebook/opt-125m"

# 使用 GPU 加载原始的 OPT-125m 模型
generator = pipeline('text-generation',
                     model=model_path,
                     device=0,
                     do_sample=True,
                     num_return_sequences=3)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
generator("The woman worked as a")

[{'generated_text': 'The woman worked as a store front/broker. I was just at the store doing the food'},
 {'generated_text': 'The woman worked as a social worker for a company for 11 years and her family lives alone in rural'},
 {'generated_text': 'The woman worked as a massage therapist for a couple of years; this explains why she was able to'}]

In [3]:
generator("The man worked as a")

[{'generated_text': "The man worked as a chef in New York City's Chinatown and is credited as the architect of the"},
 {'generated_text': 'The man worked as a police officer on a large city’s east side and was assigned to'},
 {'generated_text': "The man worked as a trucker at the local dump. It doesn't matter who he worked for"}]

In [4]:
# 使用AutoAWQ量化模型
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer


quant_path = "models/opt-125m-awq"
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# 加载模型
model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 21.49it/s]


In [5]:
# 量化模型
model.quantize(tokenizer, quant_config=quant_config)

Downloading readme: 100%|██████████| 167/167 [00:00<00:00, 362kB/s]
Repo card metadata block was not found. Setting CardData to empty.
Downloading data: 100%|██████████| 471M/471M [00:37<00:00, 12.4MB/s] 
Generating validation split: 214670 examples [00:05, 38444.73 examples/s]
AWQ: 100%|██████████| 12/12 [01:34<00:00,  7.84s/it]


In [6]:
quant_config

{'zero_point': True, 'q_group_size': 128, 'w_bit': 4, 'version': 'GEMM'}

In [7]:
# 使用 Transformers.AwqConfig 来实例化量化模型配置
from transformers import AwqConfig, AutoConfig

# 修改配置文件以使其与transformers集成兼容
quantization_config = AwqConfig(
    bits=quant_config["w_bit"],
    group_size=quant_config["q_group_size"],
    zero_point=quant_config["zero_point"],
    version=quant_config["version"].lower(),
).to_dict()

# 预训练的transformers模型存储在model属性中，我们需要传递一个字典
model.model.config.quantization_config = quantization_config

In [8]:
# 保存模型权重
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)  # 保存分词器

('models/opt-125m-awq/tokenizer_config.json',
 'models/opt-125m-awq/special_tokens_map.json',
 'models/opt-125m-awq/vocab.json',
 'models/opt-125m-awq/merges.txt',
 'models/opt-125m-awq/added_tokens.json',
 'models/opt-125m-awq/tokenizer.json')

In [9]:
# 使用 GPU 加载量化模型
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(quant_path)
model = AutoModelForCausalLM.from_pretrained(quant_path, device_map="cuda").to(0)

In [10]:
def generate_text(text):
    inputs = tokenizer(text, return_tensors="pt").to(0)

    out = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(out[0], skip_special_tokens=True)

In [11]:
result = generate_text("Merry Christmas! I'm glad to")
print(result)

Merry Christmas! I'm glad to go from this! I'm excited you. I'm from this is my glad you, is happy all.  I'm going to do me...  This is me here a a good you!
I believe this a a I'm so. I'm so, oh I do


In [12]:
result = generate_text("The woman worked as a")
print(result)

The woman worked as a "girl that he is, but he has is is. to her, her man, has his.


And a. is of, does she

, do a,


In [None]:
# 由于资源有限，T4显卡16GB显存，无法使用AWQ 量化 Facebook OPT-6.7B 模型。以上使用的 Facebook OPT-125m 模型。