# Quantization Method Using AWQ

In [2]:
# Mount Google Drive at /content/drive; later cells read and write the
# quantized model under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Use the %pip magic (rather than a !pip subshell) so the packages are installed
# into the environment of the running kernel.
%pip install -q --upgrade transformers accelerate huggingface_hub peft fsspec==2025.3.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m118.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m105.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m85.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Quantization of TinyLlama using AWQ to int4

In [None]:
import torch

# Environment sanity check: report the PyTorch build and whether a CUDA GPU is usable.
print("🧠 PyTorch version:", torch.__version__)
print("⚙️ CUDA available:", torch.cuda.is_available())

if not torch.cuda.is_available():
    print("❌ No CUDA-compatible GPU found or PyTorch not built with CUDA")
else:
    # Device 0 is the only device queried; memory is reported in whole MiB.
    print("🚀 CUDA device name:", torch.cuda.get_device_name(0))
    print("💾 Total GPU memory (MB):", torch.cuda.get_device_properties(0).total_memory // (1024 * 1024))


🧠 PyTorch version: 2.7.1+cu126
⚙️ CUDA available: True
🚀 CUDA device name: Tesla T4
💾 Total GPU memory (MB): 15095


In [3]:
# Use the %pip magic (rather than a !pip subshell) so autoawq is installed into
# the environment of the running kernel.
%pip install -q autoawq

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/74.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for autoawq (setup.py) ... [?25l[?25hdone


In [4]:
import torch

# Same environment check, repeated after the autoawq install: the recorded outputs
# of the two checks show different torch builds (2.7.1+cu126 vs 2.6.0+cu124).
print("🧠 PyTorch version:", torch.__version__)
print("⚙️ CUDA available:", torch.cuda.is_available())

if not torch.cuda.is_available():
    print("❌ No CUDA-compatible GPU found or PyTorch not built with CUDA")
else:
    # Device 0 is the only device queried; memory is reported in whole MiB.
    print("🚀 CUDA device name:", torch.cuda.get_device_name(0))
    print("💾 Total GPU memory (MB):", torch.cuda.get_device_properties(0).total_memory // (1024 * 1024))

🧠 PyTorch version: 2.6.0+cu124
⚙️ CUDA available: True
🚀 CUDA device name: Tesla T4
💾 Total GPU memory (MB): 15095


In [None]:
# Create the output folder on the mounted Drive; -p creates missing parents and
# does not fail if the folder already exists.
!mkdir -p /content/drive/MyDrive/llm_quant_awq

In [6]:
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM

# Model path: source checkpoint id passed to from_pretrained, and the output
# folder on the mounted Drive where the quantized model and tokenizer are written.
model_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
save_dir = "/content/drive/MyDrive/llm_quant_awq/TinyLlama-awq-int4"

# Load the tokenizer and the model that will be quantized
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoAWQForCausalLM.from_pretrained(model_path)

# AWQ quantization config (passed unchanged to model.quantize below)
quant_config = {
    # 4-bit weights, matching the "4-bit" log message printed below
    "w_bit": 4,
    # presumably the number of weights sharing one set of quantization parameters — confirm in AutoAWQ docs
    "q_group_size": 128,
    # presumably enables zero-point (asymmetric) quantization — confirm in AutoAWQ docs
    "zero_point": True,
    # presumably selects the GEMM kernel/packing variant — confirm in AutoAWQ docs
    "version": "GEMM"
}

# Quantize and save
# NOTE: per the recorded cell output, the quantize step took about 19 minutes
# (22 steps) on a Tesla T4, so avoid re-running this cell unnecessarily.
print("🚀 Quantizing to 4-bit (AWQ)...")
model.quantize(tokenizer=tokenizer, quant_config=quant_config)
# The tokenizer is saved next to the weights so the test cell can load both from one folder.
model.save_quantized(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"✅ Quantized model saved to: {save_dir}")

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

🚀 Quantizing to 4-bit (AWQ)...


Repo card metadata block was not found. Setting CardData to empty.
Token indices sequence length is longer than the specified maximum sequence length for this model (8322 > 2048). Running this sequence through the model will result in indexing errors
AWQ: 100%|██████████| 22/22 [19:26<00:00, 53.03s/it]


✅ Quantized model saved to: /content/drive/MyDrive/llm_quant_awq/TinyLlama-awq-int4_1


# Testing the TinyLlama-1.1B AWQ-int4 Model Chatbot

In [1]:
import time
import math
import psutil
import os
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM

# Configuration
# Folder on the mounted Drive that holds the AWQ-quantized checkpoint and its
# tokenizer; read by load_quantized_model() and echoed by chat_loop().
QUANT_DIR = "/content/drive/MyDrive/llm_quant_awq/TinyLlama-awq-int4"

def load_quantized_model():
    """Load the tokenizer and the AWQ-quantized model stored in ``QUANT_DIR``.

    Returns:
        tuple: ``(tokenizer, model, device)``. ``device`` is the CUDA device when
        ``torch.cuda.is_available()`` is true and the CPU otherwise; the model
        itself is placed via ``device_map="auto"`` and switched to eval mode.
    """
    target = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(target)
    print(f"🔧 Using device: {device}")

    # Tokenizer first, then the quantized weights, both from the same folder.
    tokenizer = AutoTokenizer.from_pretrained(QUANT_DIR)
    load_options = {"device_map": "auto", "trust_remote_code": True}
    model = AutoAWQForCausalLM.from_quantized(QUANT_DIR, **load_options)
    model.eval()

    return tokenizer, model, device

def get_cpu_ram_usage():
    """Sample CPU and resident-memory usage of the current process.

    Returns:
        tuple: ``(cpu_percent, rss_mb)``, both rounded to two decimals. The CPU
        figure is measured over a 0.1 s window, so the call blocks for that long.
    """
    bytes_per_mb = 1024 * 1024
    this_process = psutil.Process(os.getpid())

    # RSS is read before the CPU sample, as the sample itself takes 0.1 s.
    rss_mb = this_process.memory_info().rss / bytes_per_mb
    cpu_percent = this_process.cpu_percent(interval=0.1)

    return round(cpu_percent, 2), round(rss_mb, 2)

def get_gpu_ram_usage():
    """Report memory allocated by PyTorch on CUDA and the total memory of device 0.

    Returns:
        tuple: ``(used_mb, total_mb)`` rounded to two decimals, or ``(0, 0)``
        when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return 0, 0

    bytes_per_mb = 1024 ** 2
    used_mb = torch.cuda.memory_allocated() / bytes_per_mb
    total_mb = torch.cuda.get_device_properties(0).total_memory / bytes_per_mb
    return round(used_mb, 2), round(total_mb, 2)

def compute_perplexity(model, tokenizer, prompt, device):
    """Return ``exp(loss)`` of ``model`` evaluated on ``prompt``.

    Args:
        model: Callable accepting the tokenizer output as keyword arguments plus
            ``labels``, and returning an object with a ``loss`` attribute.
        tokenizer: Callable that encodes ``prompt`` (with ``return_tensors="pt"``)
            into a mapping that has ``input_ids`` and supports ``.to(device)``.
        prompt: Text to score.
        device: Device the encoded inputs are moved to.

    Returns:
        float: The exponential of the scalar loss.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        # The prompt's own token ids are passed as labels; the resulting loss is
        # what gets exponentiated.
        result = model(**encoded, labels=encoded["input_ids"])
    return math.exp(result.loss.item())

def compute_entropy_from_scores(scores):
    """Average Shannon entropy (in nats) of the per-step token distributions.

    Args:
        scores: Iterable of logit tensors, one per generation step, with the
            vocabulary on the last dimension (``chat_loop`` passes
            ``outputs.scores`` from ``generate(..., output_scores=True)``).
            Leading batch dimensions, if any, are averaged per step.

    Returns:
        float: Mean entropy over all steps, rounded to 4 decimals; ``0.0`` when
        ``scores`` is empty.
    """
    entropies = []
    for score in scores:
        # Upcast so half-precision logits do not lose precision in the softmax.
        log_probs = F.log_softmax(score.float(), dim=-1)
        probs = log_probs.exp()
        # Entropy uses the convention 0 * log(0) = 0. Logits of -inf give
        # probs == 0 and log_probs == -inf, whose raw product is NaN and would
        # poison the whole average, so those terms are replaced by 0.
        weighted = torch.where(probs > 0, probs * log_probs, torch.zeros_like(probs))
        entropy = -weighted.sum(dim=-1)
        # mean() instead of a bare item(): identical for a single sequence, and
        # it does not raise when more than one sequence is generated at once.
        entropies.append(entropy.mean().item())
    avg_entropy = sum(entropies) / len(entropies) if entropies else 0.0
    return round(avg_entropy, 4)

def chat_loop(tokenizer, model, device, max_new_tokens=150):
    """Run an interactive chat session and print metrics after every reply.

    Each turn is independent: only the current user message is placed in the
    chat template, so no conversation history is sent to the model. Decoding is
    greedy (``do_sample=False``).

    The loop ends when the user types ``exit`` or ``quit`` (case-insensitive),
    or when reading input fails with ``EOFError`` / ``KeyboardInterrupt``
    (closed stdin or an interrupted cell). Blank input is ignored.

    Args:
        tokenizer: Tokenizer providing ``apply_chat_template``, ``decode`` and
            ``eos_token_id``.
        model: Model providing ``generate`` and usable by ``compute_perplexity``.
        device: Device the tokenized inputs are moved to.
        max_new_tokens: Upper bound on the number of generated tokens per reply.

    Returns:
        None. All results are printed.
    """
    print("\n🤖 TinyLlama Chatbot (4-bit AWQ) is ready! Type 'exit' to quit.\n")
    print(f"⚡ Quantized model loaded from: {QUANT_DIR}\n")

    while True:
        try:
            user_input = input("👤 You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input can be read (stdin closed or the cell was
            # interrupted): leave the loop cleanly instead of raising.
            print("\n👋 Exiting chat. Goodbye!")
            break

        if user_input.lower() in {"exit", "quit"}:
            print("👋 Exiting chat. Goodbye!")
            break

        if not user_input:
            # Nothing to answer; do not spend a generation on an empty message.
            continue

        messages = [{"role": "user", "content": user_input}]
        prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # Tokenize and get input length
        inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
        input_token_count = inputs["input_ids"].shape[1]

        # Generation with timing (perf_counter is monotonic, unlike wall-clock time)
        start_time = time.perf_counter()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                output_scores=True,
                return_dict_in_generate=True
            )
        duration = time.perf_counter() - start_time

        # Calculate output metrics: the returned sequence starts with the prompt
        # tokens, so they are sliced off before decoding.
        generated_ids = outputs.sequences[0][input_token_count:]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
        response_token_count = len(generated_ids)
        response_length_chars = len(response)

        # System metrics (sampled after generation has finished)
        cpu_usage, ram_usage = get_cpu_ram_usage()
        gpu_used, gpu_total = get_gpu_ram_usage()
        perplexity = compute_perplexity(model, tokenizer, prompt_text, device)
        # The floor on duration guards against division by zero.
        tokens_per_sec = round(response_token_count / max(duration, 0.001), 2)
        avg_entropy = compute_entropy_from_scores(outputs.scores)

        # Display all metrics
        print(f"\n🤖 Bot: {response}")
        print(f"📏 Input tokens: {input_token_count}")
        print(f"📏 Output tokens: {response_token_count} ({response_length_chars} chars)")
        print(f"⏱ Generation time: {duration:.2f}s ({tokens_per_sec} tokens/sec)")
        print(f"💻 CPU: {cpu_usage}%, RAM: {ram_usage} MB")
        print(f"🎮 GPU: {gpu_used} / {gpu_total} MB (allocated)")
        print(f"📉 Perplexity (prompt): {perplexity:.2f}")
        print(f"🧠 Avg Token Entropy: {avg_entropy}\n")

# Entry point: load the quantized model once, then start the interactive chat.
# The three names are bound at module level, so they stay available to later cells.
if __name__ == "__main__":
    tokenizer, model, device = load_quantized_model()
    chat_loop(tokenizer, model, device)

I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/



🔧 Using device: cuda


Replacing layers...: 100%|██████████| 22/22 [00:21<00:00,  1.01it/s]



🤖 TinyLlama Chatbot (4-bit AWQ) is ready! Type 'exit' to quit.

⚡ Quantized model loaded from: /content/drive/MyDrive/llm_quant_awq/TinyLlama-awq-int4

👤 You: Hi. How are you?

🤖 Bot: I am doing well. How about you?
📏 Input tokens: 23
📏 Output tokens: 10 (31 chars)
⏱ Generation time: 4.81s (2.08 tokens/sec)
💻 CPU: 0.0%, RAM: 2007.03 MB
🎮 GPU: 747.57 / 15095.06 MB (allocated)
📉 Perplexity (prompt): 5.03
🧠 Avg Token Entropy: 0.9351

👤 You: What is your name?

🤖 Bot: My name is Sarah.
📏 Input tokens: 22
📏 Output tokens: 6 (17 chars)
⏱ Generation time: 0.34s (17.63 tokens/sec)
💻 CPU: 0.0%, RAM: 2030.8 MB
🎮 GPU: 747.08 / 15095.06 MB (allocated)
📉 Perplexity (prompt): 4.92
🧠 Avg Token Entropy: 1.413

👤 You: How old are you?

🤖 Bot: I do not have a physical body to answer questions. However, based on the given text, it is implied that the speaker is in their mid-20s or early 30s.
📏 Input tokens: 22
📏 Output tokens: 42 (149 chars)
⏱ Generation time: 2.28s (18.38 tokens/sec)
💻 CPU: 0.0%, RAM: 

This method supports only 4-bit quantization. It also supports only decoder-only models, not encoder-decoder LLMs.