In [1]:
import os
# Set the CUDA allocator configuration before torch is imported, so the value is
# already present in the environment by the time torch is loaded.
# NOTE(review): "expandable_segments:True" is presumably intended to reduce
# allocator fragmentation while loading a large model — confirm against the
# PyTorch CUDA memory-management notes for the installed version.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Hub identifier of the checkpoint used for both the model and its tokenizer.
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# bitsandbytes settings: 8-bit weights to save VRAM, with the fp32 CPU-offload
# flag switched on (see the BitsAndBytesConfig docs for its exact semantics).
quant_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)

# Load the quantized model. device_map="auto" leaves layer placement to the
# library rather than supplying a hand-written map.
# Options deliberately left disabled here:
#   attn_implementation="flash_attention_2"
#   torch_dtype=torch.float16
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quant_config,
)

# Tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Release cached, currently-unused CUDA memory before running generation.
torch.cuda.empty_cache()

Loading checkpoint shards:  16%|█▌        | 3/19 [01:21<09:34, 35.94s/it]

In [4]:
# Tokenize the prompt into PyTorch tensors and move the whole encoding to the
# device reported by `model.device` in a single call.
input_text = "Explain quantum entanglement like I'm five."
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Generate without tracking gradients.
# pad_token_id is passed explicitly: without it, generate() falls back to the
# EOS id on its own and prints a "Setting `pad_token_id` to `eos_token_id`"
# warning on every call. The value used is the same as that fallback.
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=11,  # reduce if needed
        pad_token_id=tokenizer.eos_token_id,
    )

# The decoded sequence contains the prompt followed by the newly generated tokens.
print(tokenizer.decode(output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Explain quantum entanglement like I'm five.

Quantum entanglement is a special connection
