<a href="https://colab.research.google.com/github/Maximi652/efficient-slm-architectures/blob/main/efficient_slm_architectures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q gradio
!pip install torch
!pip install transformers
!pip install -q huggingface_hub
!pip install -U bitsandbytes

In [None]:
# Training data
import json

# Load the training data file into `training_data`.
# The encoding is stated explicitly so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): the path is absolute (filesystem root) — confirm it exists in the runtime.
with open("/Training_Data/12B_combined_golden.json", "r", encoding="utf-8") as f:
    training_data = json.load(f)

# print(json.dumps(training_data['questions'][0], indent=2))

In [None]:
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the model straight from the Hugging Face Hub
model_name = "Qwen/Qwen2.5-7B-Instruct"

# 4-bit quantization settings: NF4 with double quantization, float16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# trust_remote_code is deliberately left at its default (off): the Qwen2
# architecture ships with transformers, so no repository-supplied Python code
# needs to be executed to load the tokenizer or the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model with the 4-bit config; device_map="auto" leaves the placement
# of each module to the loader.
# NOTE(review): inspect model.hf_device_map afterwards — if modules end up on
# 'cpu'/'disk', the GPU did not have enough free memory at load time.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Response callback for Gradio
def qwen_chat(prompt, history):
    """Generate the assistant's reply to ``prompt`` and append it to the history.

    Args:
        prompt: Text the user just entered.
        history: Previous turns as a list of ``(user_msg, bot_msg)`` pairs;
            ``None`` or an empty list starts a new conversation.

    Returns:
        A ``("", history)`` tuple: the empty string (wired to the input box by
        the UI, which clears it) and the history extended by the new
        ``(prompt, response)`` pair. For a blank prompt the history is
        returned unchanged and no generation is run.
    """
    history = history or []

    # Nothing to answer: skip the expensive generation for blank input.
    if not prompt or not prompt.strip():
        return "", history

    # Message structure expected by the chat template
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": prompt})

    # add_generation_prompt=True ends the prompt with the assistant header so
    # the model writes a reply instead of continuing the user's turn.
    # return_dict=True also yields the attention mask, which is forwarded to
    # generate() together with the input ids.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=1024,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens (everything after the prompt).
    response = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
    history.append((prompt, response))
    return "", history

# Build the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## Qwen 2.5 7B Instruct Chatbot")

    chatbot = gr.Chatbot()

    with gr.Row():
        msg = gr.Textbox(
            show_label=False,
            placeholder="Schreib etwas und klick auf 'Absenden'...",
            lines=2
        )
        send_btn = gr.Button("📤 Absenden")

    # Submit the input. The chatbot component is both the history input and
    # the output, so every turn sees the full conversation so far. (A separate
    # State that is only read but never written back would stay empty and the
    # model would lose the context after each message.)
    send_btn.click(qwen_chat, [msg, chatbot], [msg, chatbot])

    # Optional: clear the conversation
    clear_btn = gr.Button("🧹 Verlauf löschen")
    clear_btn.click(lambda: [], None, chatbot)

# Launch the app
demo.launch(debug=True)

In [4]:
# After testing, close the Gradio server if it is still running
demo.close()

Closing server running on port: 7860


In [None]:
# Check GPU: print the nvidia-smi status table (driver, memory usage, utilization)
!nvidia-smi

Thu Apr 17 20:32:31 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   54C    P0             28W /   70W |   13744MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Check the placement: display which device each model module was assigned to.
# NOTE(review): the saved output of this cell lists every module on 'cpu' or
# 'disk' and none on the GPU — presumably the GPU memory was already occupied
# when the model was loaded; confirm after a fresh runtime restart.
model.hf_device_map

{'model.embed_tokens': 'cpu',
 'model.layers.0': 'cpu',
 'model.layers.1': 'cpu',
 'model.layers.2': 'cpu',
 'model.layers.3': 'cpu',
 'model.layers.4': 'cpu',
 'model.layers.5': 'cpu',
 'model.layers.6': 'cpu',
 'model.layers.7': 'cpu',
 'model.layers.8': 'cpu',
 'model.layers.9': 'cpu',
 'model.layers.10': 'cpu',
 'model.layers.11': 'cpu',
 'model.layers.12': 'cpu',
 'model.layers.13': 'cpu',
 'model.layers.14': 'cpu',
 'model.layers.15': 'cpu',
 'model.layers.16': 'cpu',
 'model.layers.17': 'cpu',
 'model.layers.18': 'disk',
 'model.layers.19': 'disk',
 'model.layers.20': 'disk',
 'model.layers.21': 'disk',
 'model.layers.22': 'disk',
 'model.layers.23': 'disk',
 'model.layers.24': 'disk',
 'model.layers.25': 'disk',
 'model.layers.26': 'disk',
 'model.layers.27': 'disk',
 'model.norm': 'disk',
 'model.rotary_emb': 'disk',
 'lm_head': 'disk'}

In [None]:
# Disabled cell: one-off download of the model repository into a local folder.
# from huggingface_hub import snapshot_download

# # Target directory
# target_dir = "./localQwen2.5-7B-Instruct"

# # Download the model directly from Hugging Face into this folder (saves RAM!)
# # NOTE(review): local_dir_use_symlinks looks deprecated in recent
# # huggingface_hub releases — confirm before re-enabling this cell.
# snapshot_download(
#     repo_id="Qwen/Qwen2.5-7B-Instruct",
#     local_dir=target_dir,
#     local_dir_use_symlinks=False
# )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

LICENSE:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

'/content/localQwen2.5-7B-Instruct'