<a href="https://colab.research.google.com/github/Maximi652/efficient-slm-architectures/blob/main/efficient_slm_architectures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q gradio
!pip install torch
!pip install transformers
!pip install -q huggingface_hub
!pip install -U bitsandbytes
!pip install hf_xet

In [4]:
# Training data
import json

# Read the JSON file into `training_data`.
# The encoding is given explicitly: JSON text is UTF-8, and without it open()
# falls back to the platform default, which can mis-decode non-ASCII text.
with open("12B_combined_golden.json", "r", encoding="utf-8") as f:
    training_data = json.load(f)

# print(json.dumps(training_data['questions'][0], indent=2))

In [9]:
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hub identifier of the checkpoint used for both the tokenizer and the model
model_name = "Qwen/Qwen3-4B"

# Tokenizer first, then the causal LM; dtype and device placement are both
# left to the library ("auto").
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")

# Response callback for Gradio
def qwen_chat(prompt, history):
    """Generate the assistant's reply to `prompt` and append the turn to `history`.

    Args:
        prompt: Text of the new user message.
        history: List of earlier turns, each unpacked as a
            (user_message, assistant_message) pair, or None if there is no
            conversation yet.

    Returns:
        A 2-tuple ``("", history)``: an empty string (used to clear the input
        textbox) and the history list with the new (prompt, response) pair
        appended. When `prompt` is empty or only whitespace, nothing is
        generated and the history is returned unchanged.
    """
    # Keep the caller's list object even when it is empty, so the turn
    # appended below is visible through it; `history or []` would silently
    # swap an empty list for a fresh one.
    if history is None:
        history = []

    # Nothing to answer: do not spend a generation on an empty message.
    if not prompt or not prompt.strip():
        return "", history

    # Message structure expected by the chat template
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": prompt})

    # add_generation_prompt=True ends the prompt with the opening of an
    # assistant turn, so the model writes a reply instead of continuing the
    # user's message. return_dict=True also yields the attention mask, which
    # is forwarded to generate() together with the input ids.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=1024,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens (everything after the prompt).
    response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    history.append((prompt, response))
    return "", history

# Build the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## Qwen 3 4B Chatbot")

    chatbot = gr.Chatbot()
    state = gr.State([])

    with gr.Row():
        msg = gr.Textbox(
            show_label=False,
            placeholder="Schreib etwas und klick auf 'Absenden'...",
            lines=2
        )
        send_btn = gr.Button("📤 Absenden")

    def _submit(prompt, history):
        """Run qwen_chat and return its history twice: for the chatbot and for the state."""
        cleared, updated_history = qwen_chat(prompt, history)
        return cleared, updated_history, updated_history

    # Submit the input. `state` is listed as an output as well, so the updated
    # conversation is stored and passed back in on the next click; with
    # `state` as an input only, every request started from an empty history.
    send_btn.click(_submit, [msg, state], [msg, chatbot, state])

    # Optional: clear the conversation (both the display and the stored state)
    clear_btn = gr.Button("🧹 Verlauf löschen")
    clear_btn.click(lambda: ([], []), None, [chatbot, state])

# Start the app
demo.launch(debug=True)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  chatbot = gr.Chatbot()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://f335330699ce204587.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://f335330699ce204587.gradio.live




In [11]:
# After testing, close the Gradio server if necessary
demo.close()

Closing server running on port: 7860


In [12]:
# Check GPU: run nvidia-smi to show the GPU model, memory usage and utilisation
!nvidia-smi

Sun May 11 15:43:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   78C    P0             47W /   70W |   14714MiB /  15360MiB |     36%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [13]:
# Check the placement: display which device each model module was assigned to
model.hf_device_map

{'model.embed_tokens': 0,
 'lm_head': 0,
 'model.layers.0': 0,
 'model.layers.1': 0,
 'model.layers.2': 0,
 'model.layers.3': 0,
 'model.layers.4': 0,
 'model.layers.5': 0,
 'model.layers.6': 0,
 'model.layers.7': 0,
 'model.layers.8': 0,
 'model.layers.9': 0,
 'model.layers.10': 0,
 'model.layers.11': 0,
 'model.layers.12': 0,
 'model.layers.13': 0,
 'model.layers.14': 0,
 'model.layers.15': 0,
 'model.layers.16': 0,
 'model.layers.17': 0,
 'model.layers.18': 0,
 'model.layers.19': 0,
 'model.layers.20': 0,
 'model.layers.21': 0,
 'model.layers.22': 0,
 'model.layers.23': 0,
 'model.layers.24': 0,
 'model.layers.25': 'cpu',
 'model.layers.26': 'cpu',
 'model.layers.27': 'cpu',
 'model.layers.28': 'cpu',
 'model.layers.29': 'cpu',
 'model.layers.30': 'cpu',
 'model.layers.31': 'cpu',
 'model.layers.32': 'cpu',
 'model.layers.33': 'cpu',
 'model.layers.34': 'cpu',
 'model.layers.35': 'cpu',
 'model.norm': 'cpu',
 'model.rotary_emb': 'cpu'}

In [None]:
# from huggingface_hub import snapshot_download

# # Target directory
# target_dir = "./localQwen2.5-7B-Instruct"

# # Download the model from Hugging Face directly into this folder (saves RAM!)
# snapshot_download(
#     repo_id="Qwen/Qwen2.5-7B-Instruct",
#     local_dir=target_dir,
#     local_dir_use_symlinks=False
# )