In [None]:
!pip install accelerate optimum transformers
!pip install autoawq

In [None]:
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM
import torch

import os

from huggingface_hub import login

# Never paste a real token into the notebook: read it from the HF_TOKEN environment
# variable. When the variable is unset, token=None makes login() prompt for it.
login(token=os.environ.get("HF_TOKEN"))


In [None]:
# Hub id of the checkpoint that will be loaded and quantized.
model_path = "meta-llama/Llama-3.2-1B"

# Settings passed to model.quantize(); the same three values are later copied into
# AwqConfig as zero_point / group_size / bits.
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
}


In [None]:
# Load the tokenizer (fast implementation requested) and the AutoAWQ model wrapper
# for the checkpoint named by model_path. The two loads are independent of each other.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=True,
)
model = AutoAWQForCausalLM.from_pretrained(model_path)

In [None]:
# Quantize the loaded AutoAWQ model using the settings in quant_config; the tokenizer
# is handed over as the first positional argument.
model.quantize(tokenizer, quant_config=quant_config)


In [None]:
# Local directory the quantized model and tokenizer are saved into; the upload cell
# also reuses this value as the Hub repository name (MODEL_NAME).
quant_path = "LLama-3.2-awq"


In [None]:
from transformers import AwqConfig, AutoConfig
from huggingface_hub import HfApi

# Translate the AutoAWQ settings into an AwqConfig so the saved config file is
# compatible with the transformers integration.
quantization_config = AwqConfig(
    zero_point=quant_config["zero_point"],
    group_size=quant_config["q_group_size"],
    bits=quant_config["w_bit"],
).to_dict()

# The pretrained transformers model lives in the wrapper's `model` attribute, and its
# config needs a plain dict (hence .to_dict() above).
# An alternative would be to go through AutoConfig and push to the Hub (the approach
# used at llm-awq).
model.model.config.quantization_config = quantization_config

# Persist the quantized weights first, then the tokenizer, into the same directory.
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

In [None]:
# Shell escape that runs the `huggingface-cli login` command.
# NOTE(review): the notebook already calls login(...) in its import cell, so this step
# looks redundant — confirm before keeping both.
! huggingface-cli login

In [None]:
import os

from huggingface_hub import HfApi

username = "your_username"
MODEL_NAME = quant_path
# Build the repo id once so create_repo and upload_folder cannot drift apart.
repo_id = f"{username}/{MODEL_NAME}"

# Do not hardcode the token: take it from HF_TOKEN. With token=None, HfApi falls back
# to the locally saved credentials.
api = HfApi(token=os.environ.get("HF_TOKEN"))

# exist_ok=True keeps this cell re-runnable after the repository has been created.
api.create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
)

# Upload the directory that save_quantized()/save_pretrained() wrote to (quant_path)
# instead of a hardcoded absolute path that only exists on Colab.
api.upload_folder(
    repo_id=repo_id,
    folder_path=quant_path,
)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Must be an f-string so that `username` is interpolated; a plain string would send
# the literal text "{username}/..." as the repository id.
hub_repo_id = f"{username}/{MODEL_NAME}"

# Reload tokenizer and quantized model from the Hub; these rebind the notebook-level
# `tokenizer` and `model` names used by the cells below.
tokenizer = AutoTokenizer.from_pretrained(hub_repo_id)
# .to(0) moves the model to device index 0.
model = AutoModelForCausalLM.from_pretrained(hub_repo_id).to(0)

In [None]:
# Report the value returned by model.get_memory_footprint(), formatted with thousands
# separators and labelled as bytes.
print(f"size: {model.get_memory_footprint():,} bytes")


In [None]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
def generate_text(model, input_text, max_length=50):
    """Sample a continuation of ``input_text`` from ``model`` and return it as text.

    Uses the notebook-level ``tokenizer`` and ``device``.

    Args:
        model: Object exposing ``generate``; it is called with keyword arguments only.
        input_text: Prompt string to encode.
        max_length: Forwarded unchanged to ``generate`` as ``max_length``.

    Returns:
        The first returned sequence, decoded with special tokens skipped.
    """
    prompt_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    generation_kwargs = dict(
        inputs=prompt_ids,
        max_length=max_length,
        do_sample=True,
        top_k=30,
        # The EOS id doubles as the padding id.
        pad_token_id=tokenizer.eos_token_id,
        # All-ones mask with the same shape as the encoded prompt.
        attention_mask=prompt_ids.new_ones(prompt_ids.shape),
    )
    sequences = model.generate(**generation_kwargs)

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def calculate_perplexity(model, text):
    """Return ``exp(loss)`` of ``model`` on ``text``, using the inputs as their own labels.

    Uses the notebook-level ``tokenizer`` and ``device``.

    Args:
        model: Callable as ``model(input_ids, labels=...)`` whose result has a ``loss``
            attribute (presumably the mean token-level cross-entropy of a causal LM —
            confirm for the model in use).
        text: String to score.

    Returns:
        A tensor holding the exponential of the returned loss.
    """
    # Tokenize and move the encoding to the notebook-level device.
    batch = tokenizer(text, return_tensors='pt').to(device)

    # The labels are an independent copy of the input ids.
    token_ids = batch.input_ids
    labels = token_ids.clone()

    # Forward pass only; no gradients are needed for scoring.
    with torch.no_grad():
        loss = model(token_ids, labels=labels).loss

    return torch.exp(loss)

In [None]:

# Generate a continuation of a fixed prompt with the model reloaded from the Hub and
# print it. generate_text samples (do_sample=True), so the text can differ between runs.
text = generate_text(model, "I have a dream")
print(f" model:\n{text}")

In [None]:
# Score the text generated above with the same model; .item() converts the returned
# tensor to a Python number so it can be formatted to two decimals.
perplexity = calculate_perplexity(model, text)
print(f"perplexity (model): {perplexity.item():.2f}")