In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import random 
import numpy as np
def fix_seed(seed: int):
    """Seed every RNG used in this notebook so repeated runs are comparable.

    Args:
        seed: Value handed to Python's ``random``, NumPy's legacy global RNG
            and PyTorch (CPU and all CUDA devices).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Explicitly seed every CUDA device; this call is silently ignored when
    # CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning can pick different kernels from run to run, which would
    # undermine the deterministic flag above.
    torch.backends.cudnn.benchmark = False
# Seed all RNGs once, before any model or data work happens.
fix_seed(0)
# Checkpoint directory (relative path) used when loading the tokenizer and model.
MODEL_DIR = "quantized_models/Llama-3.2-3B-mxfp-w4-a4-RTN-wush" 

In [None]:
# Change MODEL_DIR to your full checkpoint path if needed.
def pick_device():
    """Return the preferred torch device string: "cuda", else "mps", else "cpu"."""
    # Older torch builds may not expose `torch.backends.mps` at all.
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        chosen = "cuda"
    elif mps_backend and mps_backend.is_available():
        chosen = "mps"
    else:
        chosen = "cpu"
    return chosen

# Resolve the target device once; the model below is placed on it.
device = pick_device()
# NOTE(review): `dtype` is computed but never passed to `from_pretrained`
# below — confirm whether the checkpoint should be loaded with it.
dtype = torch.float16 if device == "cuda" else torch.float32

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)

# Model
# Load onto the detected device instead of a hard-coded "cuda", so the cell
# agrees with `pick_device()` on hosts without a CUDA GPU.
# NOTE(review): a checkpoint quantized with custom kernels may still require
# CUDA to load — confirm before relying on the mps/cpu paths.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    device_map=device,
    trust_remote_code=True,  # keep True if the repo/folder has custom modeling code
)
model.eval()

In [None]:
# Inspect the `gate_proj` submodule of the MLP in decoder layer index 2.
mod = model.model.layers[2].mlp.gate_proj

# Parameters registered directly on this module (no recursion into children).
print("Parameters:")
for param_name, param in mod.named_parameters(recurse=False):
    print(" ", param_name, param.shape, param.dtype, param.device)

# Buffers registered directly on this module; each one is also printed in full.
print("\nBuffers:")
for buf_name, buf in mod.named_buffers(recurse=False):
    shape_info = buf.shape if hasattr(buf, "shape") else type(buf)
    dtype_info = buf.dtype if torch.is_tensor(buf) else ""
    print(" ", buf_name, shape_info, dtype_info)
    print("    ", buf)

In [None]:
# Short greedy-decoding smoke test of the loaded model.
prompt = "What is quantization-aware training?Assistant:"
# Move the encoded prompt to the device the model actually lives on, rather
# than branching on the `device` string (the old branches left the inputs on
# the CPU for any non-CUDA accelerator).
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=12,
        # Greedy decoding: the sampling-only options `temperature` and `top_p`
        # are not passed because they have no effect when do_sample=False and
        # only trigger a generation-config warning.
        do_sample=False,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id,
        # Reuse EOS as the padding token for generation.
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode the single returned sequence (prompt plus continuation).
text = tokenizer.decode(out[0], skip_special_tokens=True)
print(text)

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer


def get_wikitext2(tokenizer: AutoTokenizer, sequence_length: int):
    """Tokenize the WikiText-2 (raw, v1) test split and cut it into equal chunks.

    The split's text entries are joined with blank lines and tokenized in one
    call; the resulting ids are sliced along dim 1 into consecutive windows.

    Args:
        tokenizer: Tokenizer called with ``return_tensors="pt"``; its
            ``input_ids`` are used.
        sequence_length: Number of tokens per chunk.

    Returns:
        A list of tensors, each ``sequence_length`` wide along dim 1. Trailing
        tokens that do not fill a whole chunk are dropped.
    """
    raw_test_split = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    joined_text = "\n\n".join(raw_test_split["text"])
    token_ids = tokenizer(joined_text, return_tensors="pt").input_ids
    chunk_count = token_ids.numel() // sequence_length
    return [
        token_ids[:, start : start + sequence_length]
        for start in range(0, chunk_count * sequence_length, sequence_length)
    ]

from tqdm import trange

import torch
import torch.nn.functional as F


@torch.no_grad()
def compute_perplexity(model, data, batch_size: int = 1):
    """Compute token-level perplexity of ``model`` over a list of token chunks.

    Args:
        model: Module whose call returns an object with a ``logits`` attribute;
            inputs are moved to the device of its first parameter.
        data: Sequence of integer id tensors that are concatenated along dim 0
            to form a batch (presumably ``(1, seq_len)`` chunks such as those
            produced by ``get_wikitext2`` — confirm for other callers).
        batch_size: Number of chunks evaluated per forward pass.

    Returns:
        ``exp`` of the mean next-token negative log-likelihood, as a float.

    Raises:
        ValueError: If ``data`` is empty or contains no next-token targets
            (previously this surfaced as an opaque ``AttributeError`` or
            ``ZeroDivisionError``).
    """
    num_samples = len(data)
    if num_samples == 0:
        raise ValueError("compute_perplexity needs at least one sequence in `data`")
    device = next(model.parameters()).device
    # Summed negative log-likelihood over every predicted token so far.
    nll_total = 0.0
    # Number of predicted tokens processed so far.
    tokens_processed = 0
    for i in trange(0, num_samples, batch_size, desc="Computing perplexity", leave=False):
        j = min(i + batch_size, num_samples)
        inputs = torch.cat(data[i:j]).to(device)
        # Forward pass through the model
        lm_logits = model(inputs).logits
        # Position t predicts token t + 1: drop the last logit and the first label.
        # Upcast so the loss is not computed in a half-precision model dtype.
        shift_logits = lm_logits[:, :-1, :].float()
        shift_labels = inputs[:, 1:]
        # Sum (not mean) so batches of different sizes are weighted by token count.
        batch_nll = F.cross_entropy(
            shift_logits.reshape(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1),
            reduction="sum",
        )
        nll_total += batch_nll.item()
        tokens_processed += shift_labels.numel()
    if tokens_processed == 0:
        raise ValueError("compute_perplexity needs sequences with at least two tokens")
    # Perplexity is exp(mean NLL); evaluate the exponential in float64 on the CPU.
    mean_nll = torch.tensor(nll_total / tokens_processed, dtype=torch.float64)
    return mean_nll.exp().item()


# Perplexity on the WikiText-2 test split, evaluated in 2048-token chunks.
eval_data = get_wikitext2(tokenizer, sequence_length=2048)
ppl = compute_perplexity(model, data=eval_data)
print(f"Perplexity: {ppl:.2f}")

In [None]:
from lm_eval.models.huggingface import HFLM
import lm_eval


In [None]:
# Wrap the already-loaded model and tokenizer for lm-eval.
# max_length=4096: the original note attributes this value to the Open LLM
# setup — TODO confirm the source.
lm = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=64, max_length=4096)
# One task manager instance is shared by every evaluation cell below.
task_manager = lm_eval.tasks.TaskManager()
# Per-task "results" dicts are collected here, one entry per evaluation.
results = []

In [None]:
# WinoGrande evaluation; only the "results" section of the output is kept.
task_results = lm_eval.simple_evaluate(
    model=lm,
    tasks="winogrande",
    batch_size=64,
    task_manager=task_manager,
    # confirm_run_unsafe_code=True,
)["results"]
results.append(task_results)

In [None]:
# HellaSwag evaluation with 10 few-shot examples; only "results" is kept.
task_results = lm_eval.simple_evaluate(
    model=lm,
    tasks="hellaswag",
    num_fewshot=10,
    batch_size=64,
    task_manager=task_manager,
)["results"]
results.append(task_results)

In [None]:
# MMLU (chain-of-thought, Llama variant) evaluation: prompts go through the
# chat template and few-shot examples are presented as multi-turn dialogue.
task_results = lm_eval.simple_evaluate(
    model=lm,
    tasks="mmlu_cot_llama",
    batch_size=32,
    apply_chat_template=True,
    fewshot_as_multiturn=True,
    task_manager=task_manager,
)["results"]
results.append(task_results)

In [None]:
# GSM8K (Llama variant) evaluation: prompts go through the chat template and
# few-shot examples are presented as multi-turn dialogue.
task_results = lm_eval.simple_evaluate(
    model=lm,
    tasks="gsm8k_llama",
    batch_size=32,
    apply_chat_template=True,
    fewshot_as_multiturn=True,
    task_manager=task_manager,
)["results"]
# `results` is a list, so it has no `.update`; append like the other
# evaluation cells do.
results.append(task_results)