In [9]:
# Imports and process-level CUDA configuration for the whole notebook.
import os

# Set the CUDA-related environment variables before torch is imported so they
# are guaranteed to be in place before anything can initialise CUDA.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig


In [10]:
# Cell 1: Setup with proper GPU usage
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# Identifier handed to from_pretrained() for the full-precision model, and the
# directory the quantized artefacts are written to / read back from.
pretrained_model_dir = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_dir = "tinyllama-4bit-128g"

# Query CUDA once and reuse the answer for both report lines.
cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else 'None'

print(f"CUDA available: {cuda_ok}")
print(f"GPU: {gpu_name}")

CUDA available: True
GPU: Quadro RTX 3000


In [11]:
# Cell 2: Prepare examples and config
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

# Calibration data later passed to model.quantize(): a one-element list holding
# the tokenizer output for a single sentence.
calibration_text = "TinyLlama is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
examples = [tokenizer(calibration_text)]

# Quantization settings; bits/group_size match the "4bit-128g" suffix of the
# output directory name.
gptq_settings = dict(bits=4, group_size=128, desc_act=False)
quantize_config = BaseQuantizeConfig(**gptq_settings)

print("✅ Setup completed")

✅ Setup completed


In [16]:
# Cell 3: Load model (let it use GPU)
print("Loading model on GPU...")

# Build the not-yet-quantized model together with its quantization settings,
# then move it onto the CUDA device.
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
model.cuda()

print("✅ Model loaded successfully")

Loading model on GPU...




✅ Model loaded successfully


In [17]:
# Cell 4: Quantize the model with GPTQ
#
# NOTE: this cell previously assigned CUDA_VISIBLE_DEVICES="" to "force CPU".
# That assignment was removed because it cannot take effect here: CUDA has
# already been initialised in this process (torch.cuda.is_available() and
# model.cuda() ran in earlier cells), and the model itself lives on the GPU.
# All the assignment did was leak into child processes and make the log
# message below untrue. To really hide the GPU, the variable must be set
# before the first CUDA call, i.e. in the very first cell.
print("Starting quantization process...")
print("⏳ This will take several minutes...")

# Run GPTQ calibration/quantization using the tokenized calibration examples.
model.quantize(examples)

print("✅ Quantization completed!")

INFO - Start quantizing layer 1/22
  attn_output = torch.nn.functional.scaled_dot_product_attention(
INFO - Quantizing self_attn.k_proj in layer 1/22...


Starting quantization process (CPU only)...
⏳ This will take several minutes...


INFO - Quantizing self_attn.v_proj in layer 1/22...
INFO - Quantizing self_attn.q_proj in layer 1/22...
INFO - Quantizing self_attn.o_proj in layer 1/22...
INFO - Quantizing mlp.up_proj in layer 1/22...
INFO - Quantizing mlp.gate_proj in layer 1/22...
INFO - Quantizing mlp.down_proj in layer 1/22...
INFO - Start quantizing layer 2/22
INFO - Quantizing self_attn.k_proj in layer 2/22...
INFO - Quantizing self_attn.v_proj in layer 2/22...
INFO - Quantizing self_attn.q_proj in layer 2/22...
INFO - Quantizing self_attn.o_proj in layer 2/22...
INFO - Quantizing mlp.up_proj in layer 2/22...
INFO - Quantizing mlp.gate_proj in layer 2/22...
INFO - Quantizing mlp.down_proj in layer 2/22...
INFO - Start quantizing layer 3/22
INFO - Quantizing self_attn.k_proj in layer 3/22...
INFO - Quantizing self_attn.v_proj in layer 3/22...
INFO - Quantizing self_attn.q_proj in layer 3/22...
INFO - Quantizing self_attn.o_proj in layer 3/22...
INFO - Quantizing mlp.up_proj in layer 3/22...
INFO - Quantizing mlp

✅ Quantization completed!


In [19]:
# Cell 5: Save the quantized model
import os

print("Saving quantized model...")

# Write the quantized weights to disk, requesting the safetensors format.
model.save_quantized(quantized_model_dir, use_safetensors=True)

print(f"✅ Quantized model saved to: {quantized_model_dir}")

# Show what ended up in the output directory (skipped if the path is missing).
if os.path.exists(quantized_model_dir):
    saved_entries = os.listdir(quantized_model_dir)
    print(f"📁 Files in {quantized_model_dir}:")
    for entry in saved_entries:
        print(f"  - {entry}")

Saving quantized model...
✅ Quantized model saved to: tinyllama-4bit-128g
📁 Files in tinyllama-4bit-128g:
  - config.json
  - gptq_model-4bit-128g.safetensors
  - quantize_config.json


In [21]:
# Cell 6: Test the quantized model
print("Loading quantized model for testing...")

# Reload the checkpoint that was just written to disk, replacing the in-memory
# `model`. No device argument is passed, so placement is left to the
# library's default.
# NOTE(review): a "cpu" device was suggested here for Jetson compatibility, but
# none is actually passed — confirm whether device="cpu" is intended.
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, use_safetensors=True)

print("✅ Quantized model loaded for testing")

INFO - The layer lm_head is not quantized.


Loading quantized model for testing...
✅ Quantized model loaded for testing


In [22]:
# Cell 7: Test inference
print("Testing inference...")

# Tokenize a short prompt and move the resulting tensors to the model's device.
prompt = "TinyLlama is"
test_input = tokenizer(prompt, return_tensors="pt").to(model.device)

# Sampling is disabled and the total length is capped at 30 tokens; the EOS
# token id is supplied as the padding id.
generation_kwargs = dict(
    max_length=30,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)
output = model.generate(**test_input, **generation_kwargs)

# Decode the first (only) returned sequence back to text.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("🔍 Generated text:", generated_text)

Testing inference...
🔍 Generated text: TinyLlama is a small, furry, and cuddly lamb that is perfect for kids to hold and c


In [23]:
# Save the tokenizer files into the same directory as the quantized weights;
# the returned tuple of written paths is displayed as the cell output.
saved_tokenizer_files = tokenizer.save_pretrained(quantized_model_dir)
saved_tokenizer_files

('tinyllama-4bit-128g\\tokenizer_config.json',
 'tinyllama-4bit-128g\\special_tokens_map.json',
 'tinyllama-4bit-128g\\tokenizer.model',
 'tinyllama-4bit-128g\\added_tokens.json',
 'tinyllama-4bit-128g\\tokenizer.json')