In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig



In [13]:
# Cell 1: Setup with proper GPU usage
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

pretrained_model_dir = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_dir = "tinyllama-8bit-128g"

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

CUDA available: True
GPU: Quadro RTX 3000


In [14]:
# Cell 2: Prepare examples and config
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

examples = [
    tokenizer("TinyLlama is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."),
    
    # Edge device deployment scenarios (TinyLlama's main use case)
    tokenizer("Translate this to Spanish without internet connection:"),
    tokenizer("Generate a quick response for mobile app:"),
    tokenizer("Process this text on a low-power device:"),
    
    # Speculative decoding assistance (key TinyLlama use case)
    tokenizer("Complete this sequence efficiently:"),
    tokenizer("Predict the next tokens for: The weather today"),
    tokenizer("Draft continuation for: Machine learning is"),
    
    # Chat dialogue generation (validated use case)
    tokenizer("Generate dialog between two people about the weather:"),
    tokenizer("Create a conversation starter for a mobile chat app:"),
    tokenizer("Write dialogue for a simple chatbot response:"),
    
    
    # IoT and embedded systems (TinyLlama target use case)
    tokenizer("Generate status report for smart device:"),
    tokenizer("Create simple voice assistant response:"),
    tokenizer("Process sensor data description:"),
    
    # Educational content (suitable for TinyLlama size)
    tokenizer("Explain basic math concept for students:"),
    tokenizer("Create a simple learning tip:"),
    tokenizer("Write educational content for mobile learning:"),
]

quantize_config = BaseQuantizeConfig(
    
    bits=8,
    group_size=64,        # Smaller groups often better for 8-bit
    desc_act=True,        # Enable activation ordering for better accuracy
    damp_percent=0.01,    # Fine-tune damping (if available)
    true_sequential=True, # Sequential quantization (if supported)
)

print("✅ Setup completed")

✅ Setup completed


In [15]:
# Cell 3: Load model (let it use GPU)
print("Loading model on GPU...")

model = AutoGPTQForCausalLM.from_pretrained(
    pretrained_model_dir, 
    quantize_config

)

model.cuda() # This line allows the model to be loaded fully without having some tensors on the CPU
print("✅ Model loaded successfully")

Loading model on GPU...
✅ Model loaded successfully


In [16]:
# Cell 4: Quantize with full CPU enforcement
print("Starting quantization process (CPU only)...")
print("⏳ This will take several minutes...")

model.quantize(examples)

print("✅ Quantization completed!")

INFO - Start quantizing layer 1/22
INFO - Quantizing self_attn.k_proj in layer 1/22...


Starting quantization process (CPU only)...
⏳ This will take several minutes...


INFO - Quantizing self_attn.v_proj in layer 1/22...
INFO - Quantizing self_attn.q_proj in layer 1/22...
INFO - Quantizing self_attn.o_proj in layer 1/22...
INFO - Quantizing mlp.up_proj in layer 1/22...
INFO - Quantizing mlp.gate_proj in layer 1/22...
INFO - Quantizing mlp.down_proj in layer 1/22...
INFO - Start quantizing layer 2/22
INFO - Quantizing self_attn.k_proj in layer 2/22...
INFO - Quantizing self_attn.v_proj in layer 2/22...
INFO - Quantizing self_attn.q_proj in layer 2/22...
INFO - Quantizing self_attn.o_proj in layer 2/22...
INFO - Quantizing mlp.up_proj in layer 2/22...
INFO - Quantizing mlp.gate_proj in layer 2/22...
INFO - Quantizing mlp.down_proj in layer 2/22...
INFO - Start quantizing layer 3/22
INFO - Quantizing self_attn.k_proj in layer 3/22...
INFO - Quantizing self_attn.v_proj in layer 3/22...
INFO - Quantizing self_attn.q_proj in layer 3/22...
INFO - Quantizing self_attn.o_proj in layer 3/22...
INFO - Quantizing mlp.up_proj in layer 3/22...
INFO - Quantizing mlp

✅ Quantization completed!


In [17]:
# Cell 5: Save the quantized model
print("Saving quantized model...")

# Save quantized model using safetensors (more compatible)
model.save_quantized(quantized_model_dir, use_safetensors=True)

print(f"✅ Quantized model saved to: {quantized_model_dir}")

# Display saved files
import os
if os.path.exists(quantized_model_dir):
    files = os.listdir(quantized_model_dir)
    print(f"📁 Files in {quantized_model_dir}:")
    for file in files:
        print(f"  - {file}")

Saving quantized model...
✅ Quantized model saved to: tinyllama-8bit-128g
📁 Files in tinyllama-8bit-128g:
  - config.json
  - gptq_model-8bit-128g.safetensors
  - gptq_model-8bit-64g.safetensors
  - quantize_config.json
  - special_tokens_map.json
  - tokenizer.json
  - tokenizer.model
  - tokenizer_config.json


In [18]:
# Cell 6: Test the quantized model
print("Loading quantized model for testing...")

# Load quantized model (use "cpu" for Jetson compatibility)
model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir, 
    use_safetensors=True
)

print("✅ Quantized model loaded for testing")

INFO - The layer lm_head is not quantized.


Loading quantized model for testing...
✅ Quantized model loaded for testing


In [19]:
# Cell 7: Test inference
print("Testing inference...")

# Test inference with model.generate
test_input = tokenizer("What is an ice cream ", return_tensors="pt").to(model.device)
output = model.generate(**test_input, max_length=30, do_sample=False, pad_token_id=tokenizer.eos_token_id)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("🔍 Generated text:", generated_text)

Testing inference...
🔍 Generated text: What is an ice cream 


In [20]:
from transformers import  TextGenerationPipeline

pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
print(pipeline("auto-gptq is")[0]["generated_text"])

The model 'LlamaGPTQForCausalLM' is not supported for . Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForC

auto-gptq is


In [21]:
tokenizer.save_pretrained(quantized_model_dir)

('tinyllama-8bit-128g\\tokenizer_config.json',
 'tinyllama-8bit-128g\\special_tokens_map.json',
 'tinyllama-8bit-128g\\tokenizer.model',
 'tinyllama-8bit-128g\\added_tokens.json',
 'tinyllama-8bit-128g\\tokenizer.json')