# Quantize a loaded model to 4-bit
This notebook demonstrates loading a causal LM and converting its weights to a 4-bit quantization format (e.g., NF4/FP4) using bitsandbytes. It is split into multiple cells for clarity.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Gemma3ForConditionalGeneration
import bitsandbytes as bnb
import torch

In [None]:
model_name = "google/gemma-3-4b-it"

# 4-bit NF4 weight storage with bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# -----------------------------
# Load the model
# -----------------------------
# First attempt: let transformers quantize while loading. Any failure is
# reported and the cell falls back to an unquantized bfloat16 load instead.
try:
    model = Gemma3ForConditionalGeneration.from_pretrained(
        model_name,
        dtype=torch.bfloat16,
        device_map="auto",
        quantization_config=quantization_config,
    )
    print(f"Successfully loaded {model_name} with 4-bit NF4 quantization")
except Exception as load_error:
    print(f"Error loading model with 4-bit quantization: {load_error}")
    print("Falling back to loading model without quantization...")
    # NOTE(review): the fallback goes through AutoModelForCausalLM and uses
    # `torch_dtype` rather than `dtype` - confirm both are intended.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    print("Loaded model without quantization - you can apply custom 4-bit quantization manually.")

# -----------------------------
# Load tokenizer
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"Tokenizer for {model_name} loaded successfully.")

In [None]:
def convert_to_4bit(module, dtype="nf4", skip_modules=("lm_head",)):
    """
    Replace ``torch.nn.Linear`` layers inside ``module`` with bitsandbytes 4-bit layers.

    Each eligible Linear layer is swapped, in its parent module, for a
    ``bnb.nn.Linear4bit`` whose weight is a ``bnb.nn.Params4bit`` built from the
    original weight. bitsandbytes packs the weight when the new layer is moved
    to the original weight's device (NOTE(review): older bitsandbytes releases
    only pack on CUDA - confirm for the installed version). All other layers
    are left unchanged.

    Args:
        module: Root ``torch.nn.Module`` to convert; its module tree is
            modified in place.
        dtype: 4-bit quantization format, ``"nf4"`` or ``"fp4"``.
        skip_modules: Full or leaf module names that must stay unquantized.
            Defaults to ``("lm_head",)``, because output heads are often tied
            to the input embedding and swapping them would break the tie.

    Returns:
        ``module`` itself, or the replacement layer when ``module`` is itself
        a convertible Linear layer.

    Raises:
        ValueError: If ``dtype`` is not ``"nf4"`` or ``"fp4"``.
    """
    if dtype not in ("nf4", "fp4"):
        raise ValueError(
            f"Unsupported 4-bit quant type {dtype!r}; expected 'nf4' or 'fp4'"
        )

    skipped = set(skip_modules or ())

    # Snapshot the targets before mutating: layers are swapped during the loop,
    # and the tree should not change underneath a live named_modules() walk.
    linear_layers = [
        (name, child)
        for name, child in module.named_modules()
        if isinstance(child, torch.nn.Linear)
    ]

    root = module
    for name, child in linear_layers:
        parent_name, _, attr_name = name.rpartition(".")
        if name in skipped or attr_name in skipped:
            continue
        try:
            # Linear4bit subclasses nn.Linear, so layers that were already
            # quantized at load time show up here; leave them alone.
            if isinstance(child, bnb.nn.Linear4bit):
                continue

            weight = child.weight
            if weight.device.type == "meta":
                # No real data to quantize (e.g. offloaded weights).
                print(f"Skipping quantization for {name}: weight is on the meta device")
                continue

            q = bnb.nn.Linear4bit(
                child.in_features,
                child.out_features,
                bias=(child.bias is not None),
                compute_dtype=weight.dtype,
                quant_type=dtype,
            )
            q.weight = bnb.nn.Params4bit(
                weight.data, requires_grad=False, quant_type=dtype
            )
            if child.bias is not None:
                # The old layer is discarded, so its bias can be reused as-is.
                q.bias = child.bias
            # Moving the layer is what triggers the 4-bit packing.
            q = q.to(weight.device)

            if name == "":
                # The root itself is a Linear layer: there is no parent to
                # patch, so hand the replacement back to the caller.
                root = q
            else:
                setattr(module.get_submodule(parent_name), attr_name, q)
        except Exception as e:
            # Best effort: report the layer and keep converting the rest.
            print(f"Skipping quantization for {name}: {e}")
    return root

In [None]:
# Run convert_to_4bit over the loaded model, requesting the NF4 format.
# The result is bound to `quantized_model`, which the save cell reads.
quantized_model = convert_to_4bit(model, dtype="nf4")

In [None]:
import os

# -----------------------------
# Save directory (Linux/WSL style)
# -----------------------------
directory = "/mnt/d/Model Folder/modcord_custom_models/gemma-3-4b-it-nf4"
os.makedirs(directory, exist_ok=True)

model_to_save = quantized_model

# -----------------------------
# Save the model and tokenizer
# -----------------------------
# Model first, then tokenizer, both into the same directory.
for artifact in (model_to_save, tokenizer):
    artifact.save_pretrained(directory)

print(f"Model and tokenizer saved to {directory}")

# -----------------------------
# Print model info
# -----------------------------
print("\nModel details:")
print(f"Model name: {model_name}")

# The model object may not expose `dtype`; fall back to a descriptive label.
dtype = getattr(model_to_save, "dtype", None)
dtype_label = 'mixed/4-bit' if dtype is None else dtype
print(f"Model dtype: {dtype_label}")

# A missing attribute and an empty/None value are reported the same way.
quant_cfg = getattr(model_to_save.config, 'quantization_config', None)
if quant_cfg:
    print(f"Quantization config: {quant_cfg}")
else:
    print("No quantization config found in model")


### Alternative: process the checkpoint shard by shard (a completely different approach, but hopefully it works)

In [None]:
import os
import shutil
from safetensors.torch import load_file, save_file
import torch
import gc

# Needed only to rewrite the shard index below.
import json

# -----------------------------
# CONFIGURATION
# -----------------------------
SRC_DIR = r"/mnt/d/Model Folder/huggingface_model_cache/models--openai--gpt-oss-20b/snapshots/6cee5e81ee83917806bbde320786a8fb61efebee"
# NOTE(review): the directory name says "nf4", but this cell only casts
# FP16 tensors to BF16 - confirm the intended output name.
OUT_DIR = r"/mnt/d/Model Folder/modcord_custom_models/gpt-oss-20b-nf4"

# Create output directory if it doesn't exist
os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------------
# Copy config and tokenizer files
# -----------------------------
for fname in ["config.json", "generation_config.json", "tokenizer.json", 
              "tokenizer_config.json", "special_tokens_map.json"]:
    src_path = os.path.join(SRC_DIR, fname)
    dst_path = os.path.join(OUT_DIR, fname)
    if os.path.exists(src_path):
        shutil.copy(src_path, dst_path)

# -----------------------------
# Process each shard
# -----------------------------
# Original shard file name -> converted shard file name, used to keep the
# index consistent with the renamed files.
shard_renames = {}

# Sorted so that the processing order (and the log) is reproducible.
for fname in sorted(os.listdir(SRC_DIR)):
    if not (fname.endswith(".safetensors") and "model" in fname):
        continue

    out_name = fname.replace(".safetensors", "-bf16.safetensors")
    src_shard = os.path.join(SRC_DIR, fname)
    out_shard = os.path.join(OUT_DIR, out_name)
    shard_renames[fname] = out_name

    print(f"Processing shard: {fname} → {os.path.basename(out_shard)}")
    # Load shard to GPU
    state_dict = load_file(src_shard, device="cuda")

    # Convert FP16 → BF16, leave others intact
    new_state_dict = {
        k: (v.to(torch.bfloat16) if v.dtype == torch.float16 else v)
        for k, v in state_dict.items()
    }

    # Save new shard. The "format" metadata marks the file as a PyTorch
    # checkpoint; some transformers versions refuse files without it.
    save_file(new_state_dict, out_shard, metadata={"format": "pt"})

    # Explicitly free memory
    del state_dict
    del new_state_dict
    gc.collect()
    torch.cuda.empty_cache()  # optional if using GPU

# -----------------------------
# Rewrite the shard index
# -----------------------------
# A sharded checkpoint is located through its index file, whose weight_map
# points at shard file names. The shards were renamed above, so the index
# must be copied with the same renames or the output cannot be resolved.
index_name = "model.safetensors.index.json"
src_index = os.path.join(SRC_DIR, index_name)
if os.path.exists(src_index):
    with open(src_index, encoding="utf-8") as index_file:
        index = json.load(index_file)
    index["weight_map"] = {
        tensor_name: shard_renames.get(shard_name, shard_name)
        for tensor_name, shard_name in index.get("weight_map", {}).items()
    }
    with open(os.path.join(OUT_DIR, index_name), "w", encoding="utf-8") as index_file:
        json.dump(index, index_file, indent=2)

print(f"\nAll shards converted. BF16 model ready at: {OUT_DIR}")


In [None]:
import os
from safetensors.torch import load_file
# Directory holding the merged checkpoint to inspect.
OUT_DIR = r"/mnt/d/Model Folder/modcord_custom_models/qwen3-4b-instruct-nf4"
shard_path = os.path.join(OUT_DIR, "model.safetensors")

# Loads every tensor of the file onto the GPU.
state_dict = load_file(shard_path, device="cuda")

# One line per tensor: name, shape, dtype.
for tensor_name in state_dict:
    tensor = state_dict[tensor_name]
    print(tensor_name, tensor.shape, tensor.dtype)


In [None]:
from safetensors.torch import load_file, save_file
import os

model_dir = "/mnt/d/Model Folder/modcord_custom_models/qwen3-4b-instruct-nf4"
merged_name = "model.safetensors"
merged_path = os.path.join(model_dir, merged_name)

# The merged file is written into the same directory as its inputs, so a
# result left over from an earlier run must not be read back as a shard
# (it would be loaded as input and then overwritten).
shard_files = sorted(
    f for f in os.listdir(model_dir)
    if f.endswith(".safetensors") and f != merged_name
)

if not shard_files:
    # Nothing to merge; do not write an empty checkpoint.
    print(f"No shard files found in {model_dir}; nothing to merge.")
else:
    combined = {}
    for shard_file in shard_files:
        shard_path = os.path.join(model_dir, shard_file)
        shard_data = load_file(shard_path)
        # Later shards win on duplicate tensor names.
        combined.update(shard_data)

    # Save as one file. The "format" metadata marks it as a PyTorch
    # checkpoint; some transformers versions refuse files without it.
    save_file(combined, merged_path, metadata={"format": "pt"})


In [1]:
from transformers import AutoProcessor, AutoTokenizer
from pathlib import Path

TARGET_DIR = Path(r"/mnt/d/Model Folder/modcord_custom_models/gemma-3-4b-it-nf4")
TARGET_DIR.mkdir(parents=True, exist_ok=True)

# Both loaders use the same hub id and the same options.
# NOTE(review): trust_remote_code=True allows repository-provided code to
# run at load time - confirm it is actually needed for this model.
HUB_MODEL_ID = "google/gemma-3-4b-it"
load_options = {"trust_remote_code": True, "use_fast": True}

# Processor (multimodal)
proc = AutoProcessor.from_pretrained(HUB_MODEL_ID, **load_options)
proc.save_pretrained(TARGET_DIR)

# Tokenizer (ensure fast tokenizer saved if available)
tok = AutoTokenizer.from_pretrained(HUB_MODEL_ID, **load_options)
tok.save_pretrained(TARGET_DIR)

print("Saved processor and tokenizer (use_fast=True) to:", TARGET_DIR)

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]
Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 709.58it/s]


Saved processor and tokenizer (use_fast=True) to: /mnt/d/Model Folder/modcord_custom_models/gemma-3-4b-it-nf4
