In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig
from peft import PeftModel
import gradio as gr
from accelerate import infer_auto_device_map, init_empty_weights

# --- 1. Definujte cesty a konfigurace ---
base_model_name_or_path = "/home/luk-hrd/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe"
lora_adapter_path = "mistral-7b-quantum-instruct-v1/final_adapter" 

# --- 2. Upravená konfigurace kvantizace ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
)

# --- 3. Načtení modelu s vlastní device_map ---
print("Načítám kvantizovaný model a tokenizer...")

# Vytvoříme si vlastní device_map místo "auto"
with init_empty_weights():
    model_empty = AutoModelForCausalLM.from_pretrained(
        base_model_name_or_path, 
        config=AutoConfig.from_pretrained(base_model_name_or_path)
    )

# !!! ZDE JE FINÁLNÍ OPRAVA: Odstraněn argument 'quantization_config' !!!
device_map = infer_auto_device_map(
    model_empty,
    dtype=torch.bfloat16,
    no_split_module_classes=["MistralDecoderLayer"]
)
print("Vygenerovaná device_map:")
print(device_map)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name_or_path,
    quantization_config=bnb_config, # Tento argument zde zůstává, je to správně
    device_map=device_map,
)

# Zbytek kódu je stejný
tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Aplikuji LoRA adaptér...")
model = PeftModel.from_pretrained(model, lora_adapter_path)
model.eval()

print("Model je připraven!")


# --- Funkce pro Gradio a spuštění UI (beze změny) ---
def chat_with_model_gradio(message, history):
    history_prompt = ""
    for user_msg, assistant_msg in history:
        history_prompt += f"[INST] {user_msg} [/INST] {assistant_msg}</s>"

    final_prompt = f"<s>{history_prompt}[INST] {message} [/INST]"

    inputs = tokenizer(final_prompt, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.pad_token_id,
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    clean_response = response.split("[/INST]")[-1].strip()
    
    return clean_response

iface = gr.ChatInterface(
    fn=chat_with_model_gradio,
    title="PhD AI Assistant (Quantum Mechanics)",
    description="Chat with me about quantum mechanics and computing.",
    theme="soft",
     examples=[
        ["Explain the Bell state in detail."],
        ["Write a Qiskit code snippet to create a GHZ state."],
    ]
)

iface.launch()

Načítám kvantizovaný model a tokenizer...


Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.52s/it]


Vygenerovaná device_map:
OrderedDict([('', 'cpu')])


Loading checkpoint shards: 100%|██████████| 2/2 [01:48<00:00, 54.35s/it]


Aplikuji LoRA adaptér...


  self.chatbot = Chatbot(


Model je připraven!
* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.




In [None]:
import torch

if torch.cuda.is_available():
    print("GPU je dostupná!")
    print(f"Název zařízení: {torch.cuda.get_device_name(0)}")
    print(f"Celková paměť: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"Volná paměť: {torch.cuda.mem_get_info(0)[0] / 1e9:.2f} GB")
else:
    print("POZOR: GPU není dostupná. PyTorch používá pouze CPU.")

GPU je dostupná!
Název zařízení: NVIDIA GeForce RTX 4070 Laptop GPU
Celková paměť: 8.32 GB
Volná paměť: 0.57 GB


: 