<a href="https://colab.research.google.com/github/Maximi652/efficient-slm-architectures/blob/main/efficient_slm_architectures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the packages used by this notebook. Each line is an IPython shell
# escape that runs pip in the notebook's environment.
# -q reduces pip's output; -U upgrades the package if it is already installed.
!pip install -q gradio
!pip install torch
!pip install transformers
!pip install -q huggingface_hub
!pip install -U bitsandbytes

In [None]:
# Training data
import json

# JSON interchange text is UTF-8 (RFC 8259); pass the encoding explicitly so
# the read does not depend on the platform's default locale encoding.
with open("/Training_Data/12B_combined_golden.json", "r", encoding="utf-8") as f:
    training_data = json.load(f)

# Uncomment to inspect the first item under the 'questions' key:
# print(json.dumps(training_data['questions'][0], indent=2))

In [None]:
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face Hub id of the checkpoint used for both model and tokenizer
model_name = "Qwen/Qwen2.5-7B-Instruct"

# 4-bit quantization settings (NF4, double quantization, fp16 compute dtype)
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the model with the 4-bit configuration; device_map="auto" leaves the
# device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",
)

# Matching tokenizer for the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Response callback for Gradio
def qwen_chat(prompt, history):
    """Generate one assistant reply and record it in the chat history.

    Args:
        prompt: Text of the new user message.
        history: List of ``(user_message, assistant_message)`` pairs from
            earlier turns, or ``None``. The list is extended in place with
            the new turn.

    Returns:
        A ``("", history)`` tuple: an empty string as the first value and
        the history list including the new ``(prompt, response)`` pair.
    """
    # Only substitute a missing history. `history or []` would also swap an
    # *empty* list for a fresh one, so the list passed in by the caller would
    # never receive the first turn appended below.
    if history is None:
        history = []

    # Message structure for the chat template: system prompt, prior turns,
    # then the new user message.
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": prompt})

    # add_generation_prompt=True ends the prompt with the assistant header so
    # the model writes a reply rather than continuing the user turn.
    # return_dict=True is requested explicitly so the result always carries
    # both input_ids and attention_mask, independent of the library default.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=1024,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens produced after the prompt.
    response = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
    history.append((prompt, response))
    return "", history

# Build the Gradio UI: a heading, the chat display, a hidden state value,
# a text box with a send button, and a button that clears the conversation.
with gr.Blocks() as demo:
    gr.Markdown("## Qwen 2.5 7B Instruct Chatbot")

    chatbot = gr.Chatbot()
    # Starts as an empty list and is passed to qwen_chat as `history`.
    state = gr.State([])

    with gr.Row():
        msg = gr.Textbox(
            show_label=False,
            placeholder="Schreib etwas und klick auf 'Absenden'...",
            lines=2
        )
        send_btn = gr.Button("📤 Absenden")

    # Submit the input: qwen_chat receives (msg, state) and its two return
    # values are written to msg and chatbot.
    # NOTE(review): `state` is an input here but not an output, so it only
    # changes if qwen_chat mutates the list it receives — confirm intended.
    send_btn.click(qwen_chat, [msg, state], [msg, chatbot])

    # Optional: clear the conversation by resetting both chatbot and state
    clear_btn = gr.Button("🧹 Verlauf löschen")
    clear_btn.click(lambda: ([], []), None, [chatbot, state])

# Start the app
demo.launch(debug=True)

In [None]:
# After testing, shut down the server if needed
demo.close()

In [None]:
# Check the GPU by running nvidia-smi in a shell
!nvidia-smi

In [None]:
# Inspect the device map recorded on the loaded model
model.hf_device_map

In [None]:
# from huggingface_hub import snapshot_download

# # Zielverzeichnis
# target_dir = "./localQwen2.5-7B-Instruct"

# # Lade das Modell direkt von Hugging Face in diesen Ordner (spart RAM!)
# snapshot_download(
#     repo_id="Qwen/Qwen2.5-7B-Instruct",
#     local_dir=target_dir,
#     local_dir_use_symlinks=False
# )