In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Configuration for 4-bit quantization.
# NOTE(review): this object is deliberately NOT passed to `from_pretrained`
# below. The checkpoint name indicates an already-quantized (1.58-bit) model,
# and transformers takes the quantization settings of a pre-quantized
# checkpoint from the checkpoint's own config rather than from the caller
# (depending on the installed version, a caller-supplied config is ignored
# with a warning or rejected as a mismatch) -- confirm against the installed
# transformers version. The name stays defined so that any other cell that
# refers to `bnb_config` still resolves.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4"
)

# Load and save
model_name = "tiiuae/Falcon3-10B-Instruct-1.58bit"
save_directory = "./falcon3_10b_instruct"

# Keyword arguments shared by the hub load and the local reload, so the two
# calls cannot drift apart.
load_kwargs = dict(
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

print(f"Saving to {save_directory}...")
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# The saved directory carries its own config, so the reload needs no
# quantization argument either.
# NOTE(review): `model` is still referenced at this point, so both copies are
# alive at once -- check that the device has room for that.
print("Reloading from local directory...")
local_tokenizer = AutoTokenizer.from_pretrained(save_directory, trust_remote_code=True)
local_model = AutoModelForCausalLM.from_pretrained(save_directory, **load_kwargs)

In [1]:
#!/usr/bin/env python3

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

def main():
    """Load the Falcon3 1.58-bit instruct checkpoint and print one sampled reply.

    Steps: load the tokenizer and model from the hub, swap the model config's
    ``quantization_config`` for a BitsAndBytes one, tokenize a fixed prompt,
    sample up to ``max_new_tokens`` tokens, and print the decoded continuation
    (prompt tokens stripped).

    A ``RuntimeError`` raised during generation is caught and printed instead
    of propagating, so the cell finishes either way.
    """
    # NOTE(review): this only helps if it is set before CUDA is initialised in
    # the process; in a kernel that has already used the GPU it is presumably
    # too late -- confirm.
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # for better error traces

    model_name = "tiiuae/Falcon3-10B-Instruct-1.58bit"

    # Sampling settings live in one place so the progress message below can
    # never disagree with what is actually passed to `generate`.
    max_new_tokens = 64
    temperature = 0.7
    top_p = 1.0

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4"
    )

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Reuse EOS as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    print(f"Loading Falcon model in BF16 base from: {model_name}")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        device_map="auto",
        torch_dtype=torch.bfloat16  # base weights in BF16
    )

    # Remove built-in config if it exists
    if hasattr(model.config, "quantization_config"):
        del model.config.quantization_config

    # Attach our BitsAndBytes config
    # NOTE(review): this happens after the weights are loaded, so it looks like
    # it only rewrites config metadata and does not re-quantize anything --
    # confirm this is intended, especially before any `save_pretrained`.
    model.config.quantization_config = bnb_config

    prompt = """You are Falcon, a helpful AI assistant.
User: List 3 healthy dinner ideas
Falcon:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Sample a short completion; the message reports the real settings.
    print(f"\nGenerating with sampling (temperature={temperature}, top_p={top_p})...")
    try:
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                pad_token_id=tokenizer.eos_token_id
            )
        # Drop the prompt tokens so only the newly generated text is decoded.
        response = tokenizer.decode(
            output_ids[0][prompt_length:],
            skip_special_tokens=True
        )
        print(f"\nSampling Output:\n{response.strip()}\n")

    except RuntimeError as e:
        # Best-effort demo: report the failure rather than abort the cell.
        print("RuntimeError during generation:", e)

# Entry point: run the demo when this code is executed directly (as a script,
# or as a notebook cell where __name__ is "__main__"), but not on import.
if __name__ == "__main__":
    main()


Loading tokenizer...
Loading Falcon model in BF16 base from: tiiuae/Falcon3-10B-Instruct-1.58bit

Generating with sampling (temperature=0.3, top_p=1.0)...

Sampling Output:
Sure, I'd be happy to help! Here are three healthy dinner ideas: 1) Grilled salmon with quinoa and steamed broccoli; 2) Stir-fried tofu with brown rice and snap peas; 3) Baked cod with roasted sweet potatoes and green beans. Please remember to adjust portion sizes and choose ingredients

