### Runing LLaMA-3-8B W3A3 quantized model

#### Download the prebuilt quantized model:
We have provide the prebuilt quantized model on Huggingface. In order to download the large weights, we'll have to use git lfs.

In [None]:
! conda install git git-lfs
! git lfs install

# download LLaMA-3-8b-w3a3 quantization
! git clone git clone https://huggingface.co/FRM-PTQ/Llama-3-8b-w3a3-frm-ptq

In [1]:
import os
os.environ["http_proxy"] = "http://localhost:7890"
os.environ["https_proxy"] = "http://localhost:7890"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from accelerate import infer_auto_device_map, dispatch_model
import torch
from datautils import get_loaders, test_ppl

@torch.no_grad()
def evaluate(model, tokenizer):
    '''
    Note: evaluation simply move model to single GPU. 
    Therefor, to evaluate large model such as Llama-2-70B on single A100-80GB,
    please activate '--real_quant'.
    '''
    # import pdb;pdb.set_trace()
    block_class_name = model.model.layers[0].__class__.__name__
    device_map = infer_auto_device_map(model, max_memory={i: '40GB' for i in range(torch.cuda.device_count())}, no_split_module_classes=[block_class_name])
    model = dispatch_model(model, device_map=device_map)
    results = {}

    datasets = ["wikitext2", "c4"]
    ppl_results = test_ppl(model, tokenizer, datasets, 2048)
    for dataset in ppl_results:
        print(f'{dataset} perplexity: {ppl_results[dataset]:.2f}')
    return results

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from quantize.int_linear_real import load_quantized_model
from accelerate import infer_auto_device_map, dispatch_model
import torch

model_path = './Llama-3-8b-w3a3g128-frm-ptq'
wbits = 3
abits = 3
group_size = 128
use_act_quant = True
sensitive_group = [10, 15, 16, 8, 13, 31, 1, 0]
robust_group = [23, 24, 22, 25, 26, 3]
model, tokenizer = load_quantized_model(model_path=model_path, wbits=wbits, abits=abits,group_size=group_size, use_act_quant=use_act_quant, sensitive_group=sensitive_group, robust_group=robust_group)
print(f"memory footprint after loading quantized model: {torch.cuda.max_memory_allocated('cuda') / 1024**3:.2f}GiB")

# Test PPL
evaluate(model, tokenizer)

Loading quantized model from /root/code_z/FRM/output/block_ap_models/Llama-3-8b-w3a3g128


100%|██████████| 32/32 [00:00<00:00, 86.50it/s]


Loading pre-computed quantized weights...
Loading pre-computed quantized weights Successfully
memory footprint after loading quantized model: 4.78GiB
get_wikitext2


100%|██████████| 141/141 [01:01<00:00,  2.29it/s]


wikitext2:10.751235961914062
get_c4


100%|██████████| 256/256 [01:43<00:00,  2.48it/s]

c4:15.501581192016602
wikitext2 perplexity: 10.75
c4 perplexity: 15.50





{}

In [4]:
# Test Zero_shot
import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval.utils import make_table
eval_tasks = 'piqa,arc_easy,arc_challenge,hellaswag,boolq,winogrande,mmlu'
task_list = eval_tasks.split(',')
model = HFLM(pretrained=model, batch_size=8)
task_manager = lm_eval.tasks.TaskManager()
results = lm_eval.simple_evaluate(
        model=model,
        tasks=task_list,
        num_fewshot=0,
        task_manager=task_manager,
        )
print(make_table(results))
total_acc = 0
for task in task_list:
    total_acc += results['results'][task]['acc,none']
print(f'Average Acc: {total_acc/len(task_list)*100:.2f}%')

2025-05-09:17:31:40,092 INFO     [evaluator.py:131] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing th

|                 Tasks                 |Version|Filter|n-shot| Metric |Value |   |Stderr|
|---------------------------------------|-------|------|-----:|--------|-----:|---|-----:|
|mmlu                                   |N/A    |none  |     0|acc     |0.3631|±  |0.0040|
| - humanities                          |N/A    |none  |     0|acc     |0.3403|±  |0.0068|
|  - formal_logic                       |      0|none  |     0|acc     |0.2937|±  |0.0407|
|  - high_school_european_history       |      0|none  |     0|acc     |0.4061|±  |0.0383|
|  - high_school_us_history             |      0|none  |     0|acc     |0.4363|±  |0.0348|
|  - high_school_world_history          |      0|none  |     0|acc     |0.5021|±  |0.0325|
|  - international_law                  |      0|none  |     0|acc     |0.3884|±  |0.0445|
|  - jurisprudence                      |      0|none  |     0|acc     |0.3796|±  |0.0469|
|  - logical_fallacies                  |      0|none  |     0|acc     |0.3374|±  |0.0371|