In [18]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
model_id = "openai-community/gpt2" #define model id

In [None]:
#loads model with 4 bit and saves bfloat16 for calculations
bnb_config = BitsAndBytesConfig( #tells model how to compress and handle weights
    load_in_4bit=True, #loads weights in 4 bit integer format (instead of 16 or 32 bits) to reduce memory footprint
    bnb_4bit_quant_type="nf4", #use normal 4 bit (n=normal, f=float, 4=4bit)
    bnb_4bit_compute_dtype=torch.bfloat16 #data used to compute after 4 bit weights are loaded and decompressed
)
print(f"--- 1. Starting GPT 2 Model Load \"{model_id}\" in 4-bit ---")

--- 1. Starting GPT Model Load "openai/gpt-oss-20b" in 4-bit ---


In [None]:
try: #goes to hugging face and download llama
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config, #passes 4 bit configuration
        device_map="auto", #manages device placement
        #trust_remote_code=True #trust meta's Llama3.1 and excecute files
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    print("GPT 2 Model and Tokenizer loaded successfully.")

    prompt = "Explain in one sentence why the sky is blue." #simple prompt to test chat
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    print(f"\n--- 2. Generating Simple Response for Prompt: '{prompt}' ---")

    with torch.no_grad():
        output_tokens = model.generate(
            **inputs, #unpacks dictionary
            max_new_tokens=50, #set max length (50 because we are doing fast testing)
            do_sample = False #false is highest probability (more predicatable), true would be based on distribution (more creative)
        )
        output_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

        print("\n--- 3. Generation Complete ---")
        print(f"Generated Output:\n{output_text}")
        print("\n Test complete. GPT 2 loading and generation are working.")
except Exception as e:
    print(f"ERROR: {e}")
    print("Ensure you have accepted the license on Hugging Face and your environment is active.")


ERROR: The model is quantized with Mxfp4Config but you are passing a BitsAndBytesConfig config. Please make sure to pass the same quantization config class to `from_pretrained` with different loading attributes.
Ensure you have accepted the license on Hugging Face and your environment is active.
