In [None]:
!pip install gradio transformers torch
!pip install gradio_consilium_roundtable

In [None]:
# Modified from https://huggingface.co/spaces/tejasashinde/archRouter_simulator/blob/main/app.py
# Integrates OpenAI API for real task execution instead of simulating completion
import gradio as gr
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
import ast
import requests
from openai import OpenAI
import os
# Removed import of gradio_consilium_roundtable
from google.colab import userdata

# Configuration
# Hugging Face model id of the router that classifies each query.
MODEL_NAME = "katanemo/Arch-Router-1.5B"
# Wait durations kept from the original simulator; not referenced by the
# code visible in this cell.
WAIT_DEPARTMENT = 5
WAIT_SYSTEM = 5

# Routes offered to the router model: each entry pairs a route name with the
# description that is rendered into the routing prompt.
route_config = [
    {"name": route_name, "description": route_description}
    for route_name, route_description in (
        ("code_generation", "Generating code based on prompts"),
        ("creative_writing", "Creative writing or storytelling"),
        ("casual_conversation", "General conversation or chit-chat"),
        ("math_reasoning", "Mathematical problems or logical reasoning"),
        ("other", "Any other request that does not fit a specific category"),
    )
]

# Display metadata (label + emoji) for each route, keyed by route name.
departments = {
    route_key: {"name": label, "emoji": emoji}
    for route_key, label, emoji in (
        ("code_generation", "Code Generation", "💻"),
        ("creative_writing", "Creative Writing", "📝"),
        ("casual_conversation", "Casual Conversation", "💬"),
        ("math_reasoning", "Math Reasoning", "🧮"),
        ("other", "Other", "❓"),
    )
}

# Map routes to OpenAI models
ROUTE_TO_MODEL = dict(
    code_generation="gpt-4o",
    creative_writing="gpt-4o",
    casual_conversation="gpt-4o-mini",
    math_reasoning="gpt-4o",
    other="gpt-4o-mini",
)

# Load model
# The router is loaded once at import time and shared by every request.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision halves GPU memory use; on CPU fall back to float32, where
# float16 kernels are slow or unavailable for some ops.
model_dtype = torch.float16 if device == "cuda" else torch.float32
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Placement is done by the single explicit .to(device) below. The previous
# device_map="auto" was dropped so that two placement mechanisms do not
# compete (moving an Accelerate-dispatched model with .to() is discouraged).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=model_dtype,
    trust_remote_code=True,
)
model.to(device)

def format_prompt(conversation, route_config):
    """Render the routing prompt for the router model.

    Args:
        conversation: Iterable of message dicts, each with a ``"role"`` and a
            ``"content"`` string.
        route_config: Iterable of route dicts, each with a ``"name"`` and a
            ``"description"`` string.

    Returns:
        A single prompt string listing every route as a ``<route>`` element,
        followed by the conversation rendered one ``**Role**: content`` line
        per message.
    """
    route_descriptions = "\n".join(
        f'<route name="{r["name"]}">{r["description"]}</route>' for r in route_config
    )
    messages = [
        f"**{msg['role'].capitalize()}**: {msg['content']}" for msg in conversation
    ]
    # Joined outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    conversation_text = "\n".join(messages)
    # NOTE(review): the first header marker below reads "<|end_header_id>"
    # (no trailing "|"), unlike the one on the last line. It is kept as-is so
    # the prompt text sent to the router is unchanged — confirm whether this
    # is a typo before editing it.
    return f"""<|begin_of_text|><|start_header_id|>system<|end_header_id>

You are a router model. Analyze the conversation below and select the best route from the list. Output only a JSON object like {{"route": "route_name"}}. If no route matches, use "other".

Routes:
{route_descriptions}

Conversation:
{conversation_text}<|eot_id|><|start_header_id|>user<|end_header_id|>
"""

def parse_route(output):
    """Extract the route name from the router model's raw text output.

    The text between the first ``{`` and the last ``}`` is parsed as JSON
    first, then as a Python literal, because the model may emit either
    ``{"route": "x"}`` or the single-quoted ``{'route': 'x'}``.

    Args:
        output: Decoded text generated by the router model.

    Returns:
        The value of the ``"route"`` key when it is a string; otherwise
        ``"other"`` (no braces found, unparsable payload, payload is not a
        dict, or the route is missing or not a string).
    """
    if not isinstance(output, str):
        return "other"

    start = output.find("{")
    end = output.rfind("}")
    if start == -1 or end < start:
        return "other"
    payload = output[start:end + 1]

    try:
        parsed = json.loads(payload)
    except (ValueError, RecursionError):
        # Not strict JSON (e.g. single-quoted keys): fall back to a Python
        # literal, which is safe to evaluate on untrusted text.
        try:
            parsed = ast.literal_eval(payload)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return "other"

    if not isinstance(parsed, dict):
        return "other"
    route = parsed.get("route")
    # Callers use the route as a dict key, so anything but a string is
    # treated as "no usable route".
    return route if isinstance(route, str) else "other"

# init_state and visualization logic removed
# Replacing with a simple processing function
def process_query(input_text):
    """Route a query with the router model, then answer it via OpenAI.

    The query is classified into one of the configured routes, and the
    OpenAI model mapped to that route generates the final answer. Progress
    is printed to stdout along the way.

    Args:
        input_text: The user's query text.

    Returns:
        The OpenAI model's reply; ``"Please enter a query."`` when the input
        is empty or only whitespace; or an error message string when the
        OpenAI request (or API-key lookup) fails.
    """
    # Reject blank input before any model work: it would otherwise cost a
    # router generation plus an OpenAI API call.
    if not input_text or not input_text.strip():
        return "Please enter a query."

    print("🔎 Identifying route, please wait...")

    conversation = [{"role": "user", "content": input_text}]
    prompt = format_prompt(conversation, route_config)
    inputs = tokenizer.apply_chat_template(
        conversation=[{"role": "system", "content": prompt}, {"role": "user", "content": input_text}],
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(device)
    outputs = model.generate(inputs, max_new_tokens=512, do_sample=False)
    # Slice off the prompt tokens so only the newly generated text is decoded.
    raw_output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    print(f"Raw model output: {raw_output}")
    route = parse_route(raw_output)
    # The route is used as a dict key below; anything that is not a string
    # (and so possibly unhashable) falls back to the catch-all route.
    if not isinstance(route, str):
        route = "other"

    department = departments.get(route, departments["other"])
    dept_name = department["name"]
    dept_emoji = department["emoji"]
    print(f"📌 Identified department: **{dept_name}**. Forwarding task...")

    print(f"{dept_emoji} {dept_name} is processing your request...")

    # Call OpenAI API
    try:
        # Retrieve API key from Colab Secrets Manager
        openai_api_key = userdata.get('OPENAI_API_KEY')
        client = OpenAI(api_key=openai_api_key)
        response = client.chat.completions.create(
            model=ROUTE_TO_MODEL.get(route, ROUTE_TO_MODEL["other"]),
            messages=[{"role": "user", "content": input_text}],
            # The API rejected `max_tokens` with an "unsupported_parameter"
            # error and asked for `max_completion_tokens` instead.
            max_completion_tokens=1000,
            temperature=0.7
        )
        llm_output = response.choices[0].message.content
        print(f"✅ {dept_name} completed the task:\n\n{llm_output}")
        return llm_output
    except Exception as e:
        # UI boundary: report the failure in the output box rather than
        # letting the exception escape the handler.
        error_msg = f"❌ Error from {dept_name}: {str(e)}"
        print(error_msg)
        return error_msg

# Gradio UI
# One query box, one read-only output box, a submit button and clickable
# example prompts; both the button and the Enter key run process_query.
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.Markdown("""
    # Arch Router Simulation: Smart Department Dispatcher
    This demo simulates an AI router using the [katanemo/Arch-Router-1.5B](https://huggingface.co/katanemo/Arch-Router-1.5B) model.
    Enter a query to see it routed to the appropriate department (e.g., Code Generation, Creative Writing),
    which then generates a response using an OpenAI model.
    The routing process and the final response will be printed below.
    Try prompts like:
    - "Write Python code for a calculator that handles addition, subtraction, multiplication, and division with error handling."
    - "Write a short poem about the ocean."
    - "What's the weather like today?"
    """)

    input_text = gr.Textbox(label="Your Query", placeholder="Enter your query here...")
    output_text = gr.Textbox(label="LLM Output", interactive=False)
    submit_btn = gr.Button("Submit")
    example_prompts = gr.Examples(
        examples=[
            "Write Python code for a calculator that handles addition, subtraction, multiplication, and division with error handling.",
            "Write a short poem about the ocean.",
            "What's the weather like today?",
            "Solve the equation 2x + 5 = 15."
        ],
        inputs=[input_text]
    )

    # Clicking the button and pressing Enter in the textbox do the same thing.
    submit_btn.click(
        fn=process_query,
        inputs=[input_text],
        outputs=[output_text]
    )
    input_text.submit(
        fn=process_query,
        inputs=[input_text],
        outputs=[output_text]
    )

# `debug` is a boolean flag; the previous string "true" only worked because
# any non-empty string is truthy (so "false" would have enabled it too).
demo.launch(debug=True)

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://27b85daa1d93ff7771.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


🔎 Identifying route, please wait...
Raw model output: {'route': 'creative_writing'}
📌 Identified department: **Creative Writing**. Forwarding task...
📝 Creative Writing is processing your request...
❌ Error from Creative Writing: Error code: 400 - {'error': {'message': "Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7864 <> https://27b85daa1d93ff7771.gradio.live


