In [None]:
!pip -q install -U autoawq transformers accelerate

In [None]:
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, AutoConfig
import gc
import os

In [None]:
# Run configuration: the model identifier used by the rest of the notebook,
# plus the TOKENIZERS_PARALLELISM flag set to "false" for this process.
base_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Report the PyTorch / CUDA setup so a failed run can be tied to its environment.
print(
    "Checking PyTorch and CUDA versions...",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)
if torch.cuda.is_available():
    # Device details are only queried when a CUDA device is present.
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name()}")

In [None]:
# Start from a clean slate before loading the model: release PyTorch's cached
# CUDA allocations (only when a CUDA device is present), then run a garbage
# collection pass. As the last expression of the cell, the value returned by
# gc.collect() is what the notebook displays.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()

In [None]:
print("Loading tokenizer...")
tok = AutoTokenizer.from_pretrained(base_model, use_fast=True, trust_remote_code=True)

# Fall back to the end-of-sequence token when the tokenizer defines no
# padding token of its own.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

In [None]:
print("Loading model...")

# Place the whole model on GPU 0 when CUDA is present, otherwise on the CPU.
if torch.cuda.is_available():
    target_devices = {"": 0}
else:
    target_devices = "cpu"

# Load the unquantized checkpoint in fp16 with use_cache disabled.
mdl = AutoAWQForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    device_map=target_devices,
    low_cpu_mem_usage=True,
    use_cache=False,
    trust_remote_code=True,
)

In [None]:
# Put the loaded model into evaluation mode before calibration. This is the
# last expression of the cell, so whatever eval() returns is echoed by the
# notebook.
mdl.eval()

In [None]:
# Settings handed to mdl.quantize(): 4-bit weights, groups of 128, zero-point
# enabled, "GEMM" kernel version.
quant_config = dict(
    zero_point=True,
    q_group_size=128,
    w_bit=4,
    version="GEMM",
)

In [None]:
# Calibration corpus: ten short English sentences, repeated ten times so the
# list holds 100 entries (only 10 of them are distinct).
print("Preparing calibration data...")
calib_texts = 10 * [
    "The quick brown fox jumps over the lazy dog.",
    "Machine learning models process data efficiently.",
    "Natural language understanding is advancing rapidly.",
    "Deep neural networks learn complex patterns.",
    "Artificial intelligence transforms technology.",
    "Computer vision recognizes objects accurately.",
    "Robotics integrates sensors and actuators.",
    "Algorithm optimization improves performance significantly.",
    "Data science extracts meaningful insights.",
    "Software engineering creates reliable systems.",
]

In [None]:
import os
# Sets an environment flag intended to switch off SDPA attention.
# NOTE(review): nothing in this notebook reads PYTORCH_USE_SDPA, and the model
# is already loaded by the time this cell runs; confirm that the installed
# PyTorch/transformers versions honor this variable before relying on it.
os.environ["PYTORCH_USE_SDPA"] = "0"

In [None]:
from transformers import AutoConfig
# Fetch the base model's configuration and set its attn_implementation
# attribute to "eager".
# NOTE(review): `cfg` is not passed to any later call in the visible cells, so
# as written this setting does not affect `mdl`, which was loaded earlier.
cfg = AutoConfig.from_pretrained(base_model, trust_remote_code=True)
cfg.attn_implementation = "eager"   # fallback attention

In [None]:
# Pre-tokenize the first 50 calibration sentences as plain lists of token ids.
#
# AutoAWQ's calibration loader accepts a list of strings or a list of
# token-id lists; a list of (1, seq_len) tensors is rejected. The samples are
# also left unpadded: padding every short sentence to 128 tokens would make
# the calibration activations consist mostly of padding tokens.
calib_tokens = [
    tok(text, truncation=True, max_length=128).input_ids
    for text in calib_texts[:50]
]


In [None]:
# Quantize using the pre-tokenized calibration samples: at most 50 samples of
# up to 128 tokens each, processed one at a time.
calibration_options = {
    "quant_config": quant_config,
    "calib_data": calib_tokens,
    "max_calib_seq_len": 128,
    "max_calib_samples": 50,
    "n_parallel_calib_samples": 1,
}
mdl.quantize(tok, **calibration_options)


In [None]:
import os
# NOTE(review): intended to disable SDPA attention; confirm that the installed
# PyTorch/transformers versions actually read this variable.
os.environ["PYTORCH_USE_SDPA"] = "0"

# Pre-tokenize the first 50 calibration sentences as unpadded lists of token
# ids. AutoAWQ's calibration loader accepts a list of strings or a list of
# token-id lists, not a list of tensors, and padding each sentence to 128
# tokens would fill the calibration set with padding tokens.
calib_tokens = [
    tok(text, truncation=True, max_length=128).input_ids
    for text in calib_texts[:50]
]

# Calibrate and quantize: at most 50 samples of up to 128 tokens, one at a time.
mdl.quantize(
    tok,
    quant_config=quant_config,
    calib_data=calib_tokens,
    max_calib_seq_len=128,
    max_calib_samples=50,
    n_parallel_calib_samples=1
)


In [None]:
# Pre-tokenize the first 50 calibration sentences as plain lists of token ids
# (one list per sentence). The samples are deliberately NOT padded: padding
# each ~10-token sentence to 128 tokens would make the calibration data
# consist mostly of padding tokens, and the weight scales would be tuned to
# those instead of real text.
calib_tokens = [
    tok(text, truncation=True, max_length=128).input_ids
    for text in calib_texts[:50]
]

# Calibrate and quantize: at most 50 samples of up to 128 tokens, one at a time.
mdl.quantize(
    tok,
    quant_config=quant_config,
    calib_data=calib_tokens,   # list of token-id lists
    max_calib_seq_len=128,
    max_calib_samples=50,
    n_parallel_calib_samples=1
)


In [None]:
# Quantize straight from raw text: hand the first 50 sentences to AutoAWQ and
# let it tokenize them with `tok`.
first_fifty_texts = calib_texts[:50]
mdl.quantize(
    tok,
    quant_config=quant_config,
    calib_data=first_fifty_texts,
    max_calib_seq_len=128,
    max_calib_samples=50,
    n_parallel_calib_samples=1,
)

In [None]:
# Save the quantized model and its tokenizer side by side in one directory.
out_dir = "tinyllama-1.1b-awq"
print(f"Saving model to {out_dir}...")

# save_quantized() writes the weights together with the model's config.json,
# which includes the `quantization_config` section describing the AWQ setup.
mdl.save_quantized(out_dir, safetensors=True)
tok.save_pretrained(out_dir)

# The base model's AutoConfig is intentionally NOT re-saved here: doing so
# would overwrite config.json with the unquantized configuration and drop
# `quantization_config`, so loaders could no longer tell the checkpoint is AWQ.

print(f"Model successfully quantized and saved to {out_dir}")

In [None]:
# End-to-end quantization with a fallback path.
#
# Primary path: quantize `mdl` from the raw calibration texts and save the
# result. On any failure: free the model, reload it without a device map,
# retry with smaller calibration settings, and save. If the retry also fails,
# point the user at a pre-quantized checkpoint. GPU cache and garbage are
# cleaned up in every case.
#
# Neither path re-saves the base model's AutoConfig after save_quantized():
# that would overwrite the config.json written by save_quantized() and drop
# its `quantization_config` section.
print("Starting quantization...")
try:
    # Use minimal configuration to avoid batch size issues
    mdl.quantize(
        tok,
        quant_config=quant_config,
        calib_data=calib_texts,
        max_calib_seq_len=128,     # Reduced sequence length
        max_calib_samples=50,      # Reduced sample count
        n_parallel_calib_samples=1 # Keep sequential processing
    )

    print("Quantization completed successfully!")

    # Save the quantized model (weights + config.json) and the tokenizer
    out_dir = "tinyllama-1.1b-awq"
    print(f"Saving model to {out_dir}...")

    mdl.save_quantized(out_dir, safetensors=True)
    tok.save_pretrained(out_dir)

    print(f"Model successfully quantized and saved to {out_dir}")

except Exception as e:
    print(f"Quantization failed with error: {str(e)}")
    print("Trying alternative approach...")
    print("\nTrying alternative approach with different model loading...")

    # Alternative approach: drop the (possibly half-modified) model and free
    # its memory before loading a fresh copy
    del mdl
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    try:
        # Try loading without device_map first
        mdl = AutoAWQForCausalLM.from_pretrained(
            base_model,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            use_cache=False
        )

        # Move to device manually if CUDA is available
        if torch.cuda.is_available():
            mdl = mdl.cuda()

        mdl.eval()

        # Try with even smaller calibration parameters
        print("Attempting quantization with minimal parameters...")
        mdl.quantize(
            tok,
            quant_config=quant_config,
            calib_data=calib_texts[:20],  # Only use first 20 samples
            max_calib_seq_len=64,         # Even smaller sequence length
            max_calib_samples=20,         # Minimal samples
            n_parallel_calib_samples=1
        )

        # Save the quantized model (weights + config.json) and the tokenizer
        out_dir = "tinyllama-1.1b-awq"
        mdl.save_quantized(out_dir, safetensors=True)
        tok.save_pretrained(out_dir)

        print(f"Model successfully quantized with alternative approach and saved to {out_dir}")

    except Exception as e2:
        print(f"Alternative approach also failed: {str(e2)}")
        print("\nConsider using a pre-quantized model instead:")
        print("TheBloke/TinyLlama-1.1B-Chat-v1.0-AWQ")

finally:
    # Cleanup runs whether quantization succeeded, fell back, or failed
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

The script runs an AWQ quantization pipeline with a safe, low-memory fallback path.

Primary flow:

Calls mdl.quantize(...) using your tokenizer, AWQ quant config, and calibration texts (prompts).

Uses conservative settings to avoid OOM: max_calib_seq_len=128, max_calib_samples=50, n_parallel_calib_samples=1.

On success, saves the quantized weights, tokenizer, and model config to out_dir.

If quantization fails:

Frees memory (del, torch.cuda.empty_cache(), gc.collect()).

Reloads the model with low_cpu_mem_usage=True and use_cache=False (and moves to GPU if available).

Retries quantization with smaller calibration settings (shorter seq length, fewer samples).

If it still fails, suggests using a pre-quantized AWQ model.

Role of prompts (calibration texts):

They provide short, representative inputs so AWQ can observe activations and pick good weight scales.

A larger and more realistic set of prompts generally gives better quality after quantization; this notebook uses a small set (10 distinct sentences, repeated) to keep memory use low.

Important parameters:

max_calib_seq_len & max_calib_samples: control calibration length/size (quality vs. memory).

n_parallel_calib_samples: set to 1 for minimal VRAM use.

low_cpu_mem_usage=True, use_cache=False: reduce RAM/GPU memory footprint during load/quant.

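safetensors=True: saves the weights in the safetensors format. trust_remote_code=True: allows custom model code from the model repository to run, so use it only with repositories you trust.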

Gotchas / prerequisites:

Ensure variables & imports exist: mdl, tok, calib_texts, quant_config, base_model, plus torch, gc, AutoConfig, and AutoAWQForCausalLM (from the AWQ library).

Don’t mix GPTQ configs with AWQ; use the right quant config for the backend.

If you hit OOM, reduce max_calib_seq_len/samples further or increase group size.

After quantization:

Load the saved model directory with the AWQ loader (AutoAWQForCausalLM.from_quantized(out_dir)) and run generation as usual.