# GPTQ

In [None]:
# Cell 1: Install Dependencies
# Notebook shell magics: each line runs pip in the kernel's environment.
# NOTE(review): gptqmodel and optimum are presumably the backends needed by
# the GPTQConfig quantization path used below - confirm against the installed
# transformers version.
!pip install gptqmodel
!pip install optimum

In [None]:
# Cell 2: Imports and Setup
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AwqConfig, GPTQConfig
import os
from huggingface_hub import login
from datasets import load_dataset
import subprocess

# Enable CUDA debugging (synchronous kernel launches make a CUDA error point
# at the call that caused it).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# NOTE(review): TORCH_USE_CUDA_DSA is normally a PyTorch build option; confirm
# that setting it at runtime has the intended effect.
os.environ["TORCH_USE_CUDA_DSA"] = "1"

# Hugging Face access token.  It is read from the environment (for example a
# Kaggle secret exported as HF_TOKEN) rather than pasted into the notebook, so
# the credential never ends up in a saved or shared copy of this file.
# None (instead of an empty string) lets later Hub calls that receive
# token=HF_TOKEN fall back to locally cached credentials.
HF_TOKEN = os.environ.get("HF_TOKEN") or None
if HF_TOKEN:
    login(HF_TOKEN)
else:
    # An empty token cannot authenticate, so do not attempt a login with it.
    print("HF_TOKEN is not set; relying on cached Hugging Face credentials.")

# Model and configuration
MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"  # source checkpoint to quantize
OUTPUT_DIR = "/kaggle/working/quantized_model"  # local root for saved models
REPO_ID = "msaadg/Llama-3.2-3B-quantized"  # Hub repo prefix; quantizers append a suffix
DEVICE_MAP = "auto"
TORCH_DTYPE = torch.bfloat16  # Use bfloat16 for non-quantized weights where supported

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    # Print a short hint, then re-raise so the cell still fails visibly.
    print(f"Error loading tokenizer: {e}")
    raise

In [None]:
# Helper function for calibration dataset
def get_calibration_dataset(dataset_name="allenai/c4", split="train", num_samples=100):
    """Return the text of the first ``num_samples`` records of a streamed dataset.

    The dataset is opened in streaming mode with the "en" configuration and
    only the ``"text"`` field of each record is kept.

    Args:
        dataset_name: Hub identifier passed to ``load_dataset``.
        split: Dataset split to stream from.
        num_samples: Maximum number of records to collect.  Zero or a negative
            value yields an empty list; a stream shorter than ``num_samples``
            yields as many records as it has.

    Returns:
        A list of strings, one per collected record, in stream order.

    Raises:
        Exception: Any error raised while opening or reading the dataset is
            printed and then re-raised unchanged.
    """
    from itertools import islice  # local import keeps this cell self-contained

    try:
        dataset = load_dataset(
            dataset_name, "en", split=split, streaming=True, trust_remote_code=True
        )
        # Walk the stream exactly once.  Calling next(iter(dataset)) per sample
        # would restart the stream each time and return the same first record
        # num_samples times, leaving the calibration set with no variety.
        wanted = max(num_samples, 0)  # islice rejects negative counts
        samples = [record["text"] for record in islice(dataset, wanted)]
        return samples
    except Exception as e:
        print(f"Error in dataset preparation: {e}")
        raise

# Cell 5: GPTQ Quantization (4-bit)
def quantize_gptq():
    """Quantize ``MODEL_ID`` to 4-bit with GPTQ, save it and push it to the Hub.

    Reads the notebook globals ``MODEL_ID``, ``REPO_ID``, ``OUTPUT_DIR``,
    ``DEVICE_MAP``, ``TORCH_DTYPE`` and ``tokenizer``.  The model is loaded
    through ``AutoModelForCausalLM.from_pretrained`` with a ``GPTQConfig`` that
    carries 100 calibration texts from ``allenai/c4``.

    Returns:
        The model object returned by ``from_pretrained``.

    Raises:
        Exception: Errors from loading, saving or pushing are printed and then
            re-raised; errors from fetching the calibration data propagate
            from ``get_calibration_dataset``.
    """
    print("Quantizing with GPTQ (4-bit)...")

    # Calibration texts and the GPTQ settings are prepared before the guarded
    # section, exactly as failures here are not reported as GPTQ errors.
    calibration_texts = get_calibration_dataset(dataset_name="allenai/c4", num_samples=100)
    gptq_config = GPTQConfig(bits=4, dataset=calibration_texts, tokenizer=tokenizer)

    try:
        quantized = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            quantization_config=gptq_config,
            device_map=DEVICE_MAP,
            torch_dtype=TORCH_DTYPE,
        )

        # Local copy: model first, then tokenizer, into the same folder.
        local_dir = os.path.join(OUTPUT_DIR, "llama3.2-3b-gptq-4bit")
        for artifact in (quantized, tokenizer):
            artifact.save_pretrained(local_dir)

        # Hub copy: same order, into "<REPO_ID>-gptq-4bit".
        hub_repo = f"{REPO_ID}-gptq-4bit"
        for artifact in (quantized, tokenizer):
            artifact.push_to_hub(hub_repo)

        print(f"GPTQ quantized model saved to {os.path.join(OUTPUT_DIR, 'llama3.2-3b-gptq-4bit')} and pushed to {REPO_ID}-gptq-4bit")
    except Exception as e:
        print(f"Error in GPTQ quantization: {e}")
        raise
    return quantized

In [None]:
# Cell 7: Main Execution
def main():
    """Run the quantizer named by the global ``QUANT_METHOD`` and return its model.

    Returns:
        Whatever the selected ``quantize_*`` function returns.

    Raises:
        ValueError: If ``QUANT_METHOD`` is not "bnb", "gptq" or "awq".
    """
    method = QUANT_METHOD
    if method not in ("bnb", "gptq", "awq"):
        raise ValueError(f"Invalid QUANT_METHOD: {QUANT_METHOD}. Choose from 'bnb', 'awq', 'gptq'.")

    # Each quantizer name is resolved only on the branch that runs, so a
    # quantizer whose cell has not been executed yet does not break the others.
    # To inspect the result's size, callers can print
    # model.get_memory_footprint() / 1e9 (GB) on the returned model.
    if method == "gptq":
        return quantize_gptq()
    if method == "awq":
        return quantize_awq()
    return quantize_bnb()

In [None]:
# main() dispatches on this global, so it must be assigned before the call.
QUANT_METHOD = "gptq"  # Options: "bnb", "gptq", "awq"
# Quantize the checkpoint currently named by MODEL_ID and keep the result.
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_sparse_dsnot_neo2"
REPO_ID = "msaadg/llama_3b_sparse_dsnot_neo2-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "gptq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_sparse_neo"
REPO_ID = "msaadg/llama_3b_sparse_neo-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "gptq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_wanda_neo"
REPO_ID = "msaadg/llama_3b_wanda_neo-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "gptq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_wanda_dsnot_neo"
REPO_ID = "msaadg/llama_3b_wanda_dsnot_neo-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "gptq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_pruned_neo"
REPO_ID = "msaadg/llama_3b_pruned_neo-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "gptq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_sparse_dsnot_mini"
REPO_ID = "msaadg/llama_3b_sparse_dsnot_mini-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "gptq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_sparse_mini"
REPO_ID = "msaadg/llama_3b_sparse_mini-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "gptq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_wanda_mini"
REPO_ID = "msaadg/llama_3b_wanda_mini-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "gptq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_wanda_dsnot_mini"
REPO_ID = "msaadg/llama_3b_wanda_dsnot_mini-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "gptq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_pruned_mini"
REPO_ID = "msaadg/llama_3b_pruned_mini-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "gptq"  # Options: "bnb", "gptq", "awq"
model = main()

# BnB & AWQ

In [None]:
# Install / upgrade the packages used by the BitsAndBytes and AWQ cells below.
# Notebook shell magics: each line runs pip in the kernel's environment.
!pip install optimum
!pip install --upgrade transformers accelerate bitsandbytes
# AutoAWQ is installed straight from its GitHub repository (provides the `awq`
# module imported inside quantize_awq).
!pip install git+https://github.com/casper-hansen/AutoAWQ.git

In [None]:
# Cell 3: BitsAndBytes Quantization (4-bit)
def quantize_bnb():
    """Load ``MODEL_ID`` in 4-bit with BitsAndBytes, save it and push it to the Hub.

    Reads the notebook globals ``MODEL_ID``, ``REPO_ID``, ``OUTPUT_DIR``,
    ``DEVICE_MAP``, ``TORCH_DTYPE`` and ``tokenizer``.

    Returns:
        The model object returned by ``AutoModelForCausalLM.from_pretrained``.

    Raises:
        Exception: Errors from loading, saving or pushing are printed and then
            re-raised unchanged.
    """
    print("Quantizing with BitsAndBytes (4-bit)...")
    torch.cuda.empty_cache()

    bnb_settings = {
        "load_in_4bit": True,
        # Normal Float 4 for better training compatibility
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": TORCH_DTYPE,
        # Nested quantization for extra memory savings
        "bnb_4bit_use_double_quant": True,
    }
    bnb_config = BitsAndBytesConfig(**bnb_settings)

    try:
        quantized = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            quantization_config=bnb_config,
            device_map=DEVICE_MAP,
            torch_dtype=TORCH_DTYPE,
        )

        # Local copy: model first, then tokenizer, into the same folder.
        local_dir = os.path.join(OUTPUT_DIR, "llama3.2-3b-bnb-4bit")
        for artifact in (quantized, tokenizer):
            artifact.save_pretrained(local_dir)

        # Hub copy: same order, into "<REPO_ID>-bnb-4bit".
        hub_repo = f"{REPO_ID}-bnb-4bit"
        for artifact in (quantized, tokenizer):
            artifact.push_to_hub(hub_repo)

        print(f"BNB quantized model saved to {os.path.join(OUTPUT_DIR, 'llama3.2-3b-bnb-4bit')} and pushed to {REPO_ID}-bnb-4bit")
    except Exception as e:
        print(f"Error in BNB quantization: {e}")
        raise
    return quantized

In [None]:
# Cell 4: AWQ Quantization (4-bit)
def quantize_awq():
    """Quantize ``MODEL_ID`` to 4-bit with AutoAWQ, save it and upload it to the Hub.

    Reads the notebook globals ``MODEL_ID``, ``REPO_ID``, ``OUTPUT_DIR``,
    ``DEVICE_MAP``, ``HF_TOKEN`` and ``tokenizer``.  Calibration text comes
    from the train split of a WikiText-2 raw mirror, keeping only non-blank
    lines that split into more than 20 space-separated pieces.

    Returns:
        The AutoAWQ model object after quantization.

    Raises:
        Exception: Errors from data loading, quantization, saving or uploading
            are printed and then re-raised unchanged.  Import errors for the
            AWQ / Hub packages propagate without the extra message.
    """
    print("Quantizing with AWQ (4-bit)...")
    torch.cuda.empty_cache()
    from awq import AutoAWQForCausalLM
    from datasets import load_dataset
    from huggingface_hub import upload_folder, create_repo

    def _usable(line):
        # Skip blank lines and lines too short to be useful for calibration.
        return line.strip() != '' and len(line.split(' ')) > 20

    try:
        # Load calibration dataset
        wikitext = load_dataset("carlosejimenez/wikitext__wikitext-2-raw-v1", split="train")
        calib_data = [line for line in wikitext["text"] if _usable(line)]

        # Quantization configuration for AutoAWQ
        awq_settings = dict(zero_point=True, q_group_size=128, w_bit=4, version="GEMM")

        awq_model = AutoAWQForCausalLM.from_pretrained(
            MODEL_ID,
            safetensors=True,
            device_map=DEVICE_MAP,
            torch_dtype=torch.float16,
        )
        awq_model.quantize(tokenizer, quant_config=awq_settings, calib_data=calib_data)

        # Save locally with sharding; the tokenizer goes into the same folder.
        quant_path = os.path.join(OUTPUT_DIR, "llama3.2-3b-awq-4bit")
        awq_model.save_quantized(quant_path, safetensors=True, shard_size="4GB")
        tokenizer.save_pretrained(quant_path)

        # Create the Hub repository if needed, then upload the saved folder.
        hub_target = dict(repo_id=f"{REPO_ID}-awq-4bit", repo_type="model", token=HF_TOKEN)
        create_repo(**hub_target, exist_ok=True)
        upload_folder(folder_path=quant_path, **hub_target)

        print(f"AWQ quantized model saved to {quant_path} and pushed to {REPO_ID}-awq-4bit")
    except Exception as e:
        print(f"Error in AWQ quantization: {e}")
        raise
    return awq_model

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_sparse_dsnot_neo2"
REPO_ID = "msaadg/llama_3b_sparse_dsnot_neo2-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "bnb"  # Options: "bnb", "gptq", "awq"
model = main()

# The BnB model has already been saved and pushed; release it so the AWQ run
# does not have to share the GPU with it.
model = None
gc.collect()
torch.cuda.empty_cache()

QUANT_METHOD = "awq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_sparse_neo"
REPO_ID = "msaadg/llama_3b_sparse_neo-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "bnb"  # Options: "bnb", "gptq", "awq"
model = main()

# The BnB model has already been saved and pushed; release it so the AWQ run
# does not have to share the GPU with it.
model = None
gc.collect()
torch.cuda.empty_cache()

QUANT_METHOD = "awq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_wanda_neo"
REPO_ID = "msaadg/llama_3b_wanda_neo-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "bnb"  # Options: "bnb", "gptq", "awq"
model = main()

# The BnB model has already been saved and pushed; release it so the AWQ run
# does not have to share the GPU with it.
model = None
gc.collect()
torch.cuda.empty_cache()

QUANT_METHOD = "awq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_wanda_dsnot_neo"
REPO_ID = "msaadg/llama_3b_wanda_dsnot_neo-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "bnb"  # Options: "bnb", "gptq", "awq"
model = main()

# The BnB model has already been saved and pushed; release it so the AWQ run
# does not have to share the GPU with it.
model = None
gc.collect()
torch.cuda.empty_cache()

QUANT_METHOD = "awq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_pruned_neo"
REPO_ID = "msaadg/llama_3b_pruned_neo-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "bnb"  # Options: "bnb", "gptq", "awq"
model = main()

# The BnB model has already been saved and pushed; release it so the AWQ run
# does not have to share the GPU with it.
model = None
gc.collect()
torch.cuda.empty_cache()

QUANT_METHOD = "awq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_sparse_dsnot_mini"
REPO_ID = "msaadg/llama_3b_sparse_dsnot_mini-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "bnb"  # Options: "bnb", "gptq", "awq"
model = main()

# The BnB model has already been saved and pushed; release it so the AWQ run
# does not have to share the GPU with it.
model = None
gc.collect()
torch.cuda.empty_cache()

QUANT_METHOD = "awq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_sparse_mini"
REPO_ID = "msaadg/llama_3b_sparse_mini-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "bnb"  # Options: "bnb", "gptq", "awq"
model = main()

# The BnB model has already been saved and pushed; release it so the AWQ run
# does not have to share the GPU with it.
model = None
gc.collect()
torch.cuda.empty_cache()

QUANT_METHOD = "awq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_wanda_mini"
REPO_ID = "msaadg/llama_3b_wanda_mini-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "bnb"  # Options: "bnb", "gptq", "awq"
model = main()

# The BnB model has already been saved and pushed; release it so the AWQ run
# does not have to share the GPU with it.
model = None
gc.collect()
torch.cuda.empty_cache()

QUANT_METHOD = "awq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_wanda_dsnot_mini"
REPO_ID = "msaadg/llama_3b_wanda_dsnot_mini-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "bnb"  # Options: "bnb", "gptq", "awq"
model = main()

# The BnB model has already been saved and pushed; release it so the AWQ run
# does not have to share the GPU with it.
model = None
gc.collect()
torch.cuda.empty_cache()

QUANT_METHOD = "awq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Clear Memory
# empty_cache() only returns cached blocks that nothing uses any more, so the
# previous model has to be unreferenced and collected first; otherwise it stays
# on the GPU while the next one is loaded.
import gc

model = None
gc.collect()
torch.cuda.empty_cache()

# Model and configuration
MODEL_ID = "musab1blaser/llama_3b_pruned_mini"
REPO_ID = "msaadg/llama_3b_pruned_mini-quantized"

# Load tokenizer (the quantizer functions read this global)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

QUANT_METHOD = "bnb"  # Options: "bnb", "gptq", "awq"
model = main()

# The BnB model has already been saved and pushed; release it so the AWQ run
# does not have to share the GPU with it.
model = None
gc.collect()
torch.cuda.empty_cache()

QUANT_METHOD = "awq"  # Options: "bnb", "gptq", "awq"
model = main()

In [None]:
# Final cleanup: return unused cached GPU memory to the driver.
# NOTE(review): the global `model` assigned by the previous cell is still
# referenced here, so its memory is presumably not released by this call;
# drop that reference first (e.g. `del model`) if the goal is to free it.
torch.cuda.empty_cache()