# Quantize Saudi-Judge to AWQ for vLLM

This notebook quantizes your 14B Qwen3 model to AWQ 4-bit format using **llm-compressor** (the official vLLM quantization tool).

**Requirements:**
- RunPod GPU Pod with A100 (40GB+)  
- Hugging Face account and an access token with write permission (used to log in and upload the quantized model)

**Runtime:** ~30-45 minutes

In [None]:
# Optional: upgrade pip if needed
!pip install --upgrade pip -q

In [None]:
# Step 1: Install llm-compressor (official vLLM quantization tool)
# Intended to reuse the system torch - no torch reinstall is requested here.
# NOTE(review): pip may still replace torch if llmcompressor's requirements demand a
# different version - check the versions printed in Step 1b after restarting.
# NOTE(review): none of these packages are version-pinned, so a later re-run may
# resolve to different releases. vLLM (imported in Step 8) is NOT installed by this cell.

%pip install llmcompressor -q
%pip install transformers accelerate huggingface_hub datasets hf_transfer -q

# The restart is requested so the freshly installed packages are the ones imported
# by the following cells.
print("✅ Dependencies installed!")
print("\n⚠️  RESTART THE RUNTIME NOW before continuing!")

In [None]:
# Step 1b: Verify installation (run AFTER restarting runtime)
import importlib.util
import os

# hf_transfer is installed in Step 1 but is only used when this variable is set.
# huggingface_hub reads it at import time, so set it before `transformers`
# (which imports huggingface_hub) is imported below. setdefault keeps any value
# the user has already exported.
if importlib.util.find_spec("hf_transfer") is not None:
    os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")

import torch
import transformers
import llmcompressor

print(f"torch: {torch.__version__}")
print(f"transformers: {transformers.__version__}")
print(f"llmcompressor: {llmcompressor.__version__}")

cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

# Fail with a clear message instead of the low-level error that
# torch.cuda.get_device_name() produces on a machine without a usable GPU.
if not cuda_available:
    raise RuntimeError(
        "No CUDA GPU detected. This notebook needs a GPU pod (A100 40GB+); "
        "check the pod type and the installed torch build before continuing."
    )

print(f"GPU: {torch.cuda.get_device_name(0)}")
print("\n✅ Ready to quantize!")

In [None]:
# Step 2: Login to Hugging Face
import os
from getpass import getpass

from huggingface_hub import login

# Get your token from: https://huggingface.co/settings/tokens
# The token is never written into the notebook: it is taken from the HF_TOKEN
# environment variable when set, otherwise typed into a hidden prompt. This keeps
# the secret out of saved/shared copies of this file.
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
if not HF_TOKEN:
    HF_TOKEN = getpass("Hugging Face token (write access): ").strip()

# An empty token can never authenticate; stop here with an explicit message.
if not HF_TOKEN:
    raise ValueError(
        "No Hugging Face token provided. Set the HF_TOKEN environment variable "
        "or enter a token with write access at the prompt."
    )

login(token=HF_TOKEN)

In [None]:
# Step 3: Build the calibration dataset
from datasets import load_dataset
from transformers import AutoTokenizer

# Checkpoint to quantize, and the folder name reserved for a local copy of the result.
MODEL_ID = "Aljalajil/Saudi-Judge-Merged-16bit"
OUTPUT_DIR = "Saudi-Judge-AWQ"

# Calibration budget: how many conversations to load and the sequence-length cap
# that is passed to the quantizer in Step 5.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512

# The tokenizer provides the chat template used to render each conversation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)


def preprocess(example):
    """Render one record's "messages" with the chat template into a "text" string."""
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered}


# Take the first NUM_CALIBRATION_SAMPLES rows of the SFT split, shuffle them with a
# fixed seed, then add the rendered "text" column.
calibration_split = f"train_sft[:{NUM_CALIBRATION_SAMPLES}]"
ds = (
    load_dataset("HuggingFaceH4/ultrachat_200k", split=calibration_split)
    .shuffle(seed=42)
    .map(preprocess)
)

print(f"✅ Loaded {len(ds)} calibration samples")

In [None]:
# Step 4: Load the full-precision model
from transformers import AutoModelForCausalLM

print(f"Loading model: {MODEL_ID}")
print("This takes 5-10 minutes for a 14B model...")

# Loader options kept in one place: dtype="auto" leaves the precision choice to
# transformers, and trust_remote_code allows custom code shipped with the repo.
load_options = {
    "dtype": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_options)

print("✅ Model loaded!")

In [None]:
# Step 5: Run one-shot AWQ quantization with llm-compressor
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

# A single AWQ modifier: target every Linear layer except lm_head and apply the
# W4A16_ASYM scheme (4-bit weights, 16-bit activations, asymmetric).
awq_settings = dict(
    ignore=["lm_head"],
    scheme="W4A16_ASYM",
    targets=["Linear"],
    duo_scaling="both",
)
recipe = [AWQModifier(**awq_settings)]

print("Starting AWQ quantization...")
print("This takes 20-40 minutes. Please be patient!")

# Calibration limits come from the constants defined in Step 3.
calibration_options = {
    "max_seq_length": MAX_SEQUENCE_LENGTH,
    "num_calibration_samples": NUM_CALIBRATION_SAMPLES,
}
oneshot(model=model, dataset=ds, recipe=recipe, **calibration_options)

print("✅ Quantization complete!")

In [None]:
# Step 6: Save the quantized model locally, then upload it to Hugging Face Hub
from huggingface_hub import HfApi

REPO_NAME = "Aljalajil/Saudi-Judge-AWQ"

# Write the compressed checkpoint to disk BEFORE uploading. If the upload fails
# (network, auth, quota) the 30+ minutes of quantization are not lost: re-run only
# the upload part of this cell. `save_compressed` is an argument of the
# llm-compressor-patched save_pretrained, so it is passed there rather than to
# push_to_hub, which does not document it.
print(f"Saving compressed model to {OUTPUT_DIR}/ ...")
model.save_pretrained(OUTPUT_DIR, save_compressed=True)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Pushing model to {REPO_NAME}...")
hub_api = HfApi()
# exist_ok=True makes the cell safe to re-run; `private` only applies when the
# repository is created by this call.
hub_api.create_repo(REPO_NAME, repo_type="model", private=True, exist_ok=True)
hub_api.upload_folder(
    folder_path=OUTPUT_DIR,
    repo_id=REPO_NAME,
    repo_type="model",
    commit_message="Upload AWQ W4A16 quantized model",
)

print(f"""
========================================
✅ QUANTIZATION & UPLOAD COMPLETE!

Your model is now available at:
https://huggingface.co/{REPO_NAME}

Next steps:
1. Go to RunPod Serverless
2. Edit your vLLM endpoint
3. Set model to: {REPO_NAME}  
4. Save and test!

Don't forget to DELETE this GPU Pod!
========================================
""")

In [None]:
# (Optional) Step 7: Save locally if needed
# Uncomment if you want a local copy:
# model.save_pretrained(OUTPUT_DIR, save_compressed=True)
# tokenizer.save_pretrained(OUTPUT_DIR)
# print(f"Saved to {OUTPUT_DIR}/")

In [None]:
# Step 8: Test the quantized model with vLLM (batch generation, not streaming)
import gc

import torch
from transformers import AutoTokenizer

# vLLM is not installed by Step 1. Fail with an actionable message instead of a
# bare ModuleNotFoundError.
try:
    from vllm import LLM, SamplingParams
except ImportError as exc:
    raise ImportError(
        "vLLM is not installed in this environment. Install it (e.g. `%pip install vllm`), "
        "restart the runtime, then run this cell on its own."
    ) from exc

# Use a dedicated name so the base-model MODEL_ID from Step 3 is not overwritten;
# otherwise re-running Steps 3-4 afterwards would load the AWQ repo as the base model.
QUANTIZED_MODEL_ID = "Aljalajil/Saudi-Judge-AWQ"

# Release cached CUDA memory left over from earlier cells before vLLM claims 90% of
# the GPU. NOTE(review): if the quantization kernel is still alive and holding GPU
# memory, restarting the runtime before this cell is the most reliable option.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load tokenizer for chat template
tokenizer = AutoTokenizer.from_pretrained(QUANTIZED_MODEL_ID, trust_remote_code=True)

# Initialize vLLM engine
print(f"Loading vLLM model: {QUANTIZED_MODEL_ID}")
print("This may take a few minutes...")
llm = LLM(
    model=QUANTIZED_MODEL_ID,
    trust_remote_code=True,
    max_model_len=8192,  # Limit context length to fit in GPU memory
    gpu_memory_utilization=0.90,
)
print("✅ vLLM model loaded!")

# Test prompt (Arabic legal query)
test_prompt = "ما هي عقوبة السرقة في النظام السعودي؟"

messages = [
    {"role": "system", "content": "أنت قاضٍ سعودي متخصص في الأنظمة والقوانين السعودية."},
    {"role": "user", "content": test_prompt}
]

# Apply chat template to get the formatted prompt
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

print(f"\nTesting: {test_prompt}\n")
print("-" * 50)
print("Response: ", end="")

# Sampling settings. The stop strings and stop_token_ids both end generation at the
# chat end-of-turn markers. NOTE(review): the ids are assumed to be the Qwen
# vocabulary ids for <|im_end|> and <|endoftext|> - confirm against the tokenizer.
sampling_params = SamplingParams(
    temperature=0.7,
    max_tokens=512,
    stop=["<|im_end|>", "<|endoftext|>", "<|im_start|>"],
    stop_token_ids=[151645, 151643],
)

# LLM.generate() returns completed outputs for the whole batch (no token streaming),
# so the full response is printed once generation finishes.
outputs = llm.generate([prompt], sampling_params)
print(outputs[0].outputs[0].text)

print("\n✅ Test complete!")