In [35]:
import os
from collections import Counter

import litellm
import numpy as np
import pandas as pd
from openai import OpenAI

In [2]:
def majority_vote(solutions, key="score"):
    """Return the most frequent value of ``key`` across ``solutions``.

    Args:
        solutions: Iterable of dicts, each containing ``key``.
        key: Name of the dict field to vote on. Defaults to ``"score"``
            (the original behavior); pass ``"text"`` to vote over the
            ``{"text": ...}`` dicts built by ``solve_with_routing``.

    Returns:
        The value that occurs most often. On a tie, the value that was
        encountered first wins (``Counter.most_common`` ordering).

    Raises:
        ValueError: If ``solutions`` is empty.
        KeyError: If any solution lacks ``key``.
    """
    votes = Counter(sol[key] for sol in solutions)
    if not votes:
        # Fail with a clear message instead of an opaque IndexError on [0].
        raise ValueError("majority_vote() requires at least one solution")
    return votes.most_common(1)[0][0]

In [19]:
# Instruction text stored as "prompt_sfx" in each model's MATH mode settings and appended to
# the user prompt by get_completion; asks for step-by-step reasoning with the final answer
# wrapped in \boxed{}.
prompt_sfx = "Let's think step by step and output the final answer within \\boxed{}."

In [36]:
# API key sent to the model endpoints. Read from the environment so that a real key never has
# to be committed with the notebook; falls back to the placeholder value used previously.
VLLM_API_KEY = os.environ.get("VLLM_API_KEY", "token-abc123")

# Per-model endpoint, sampling and pricing configuration, keyed by model id.
#   model_base / api_key : passed to the OpenAI client in get_completion
#   default_*            : sampling settings used when no mode-specific settings apply
#   mode_settings[MODE]  : prompt suffix and sampling settings used for get_completion(mode=MODE)
#   model_costs          : price per million input/output tokens, read by estimate_cost
#                          (presumably USD, per the "estimated_cost_usd" log field -- confirm)
#
# NOTE(review): all three entries use port 8001 and differ only in the /v1, /v2, /v3 path
# suffix. Confirm the serving setup really exposes /v2 and /v3; if each model runs in its own
# server process, distinct ports with a /v1 path may have been intended.
AVAILABLE_MODEL_CONFIGS = {
    "Qwen/Qwen2.5-Math-1.5B-Instruct": {
        "model_base": "http://localhost:8001/v1",
        "api_key": VLLM_API_KEY,
        "default_temperature": 0.6,
        "default_max_tokens": 3000,
        "mode_settings": {
            "MATH": {
                "prompt_sfx": prompt_sfx,
                "default_temperature": 0.6,
                "default_max_tokens": 3000,
            }
        },
        "model_costs": {  # Based on Alibaba Cloud pricing
            "input_per_mill": 0.10,
            "output_per_mill": 0.10,
        },
    },
    "Qwen/Qwen2.5-Math-7B-Instruct": {
        "model_base": "http://localhost:8001/v2",
        "api_key": VLLM_API_KEY,
        "default_temperature": 0.6,
        "default_max_tokens": 3000,
        "mode_settings": {
            "MATH": {
                "prompt_sfx": prompt_sfx,
                "default_temperature": 0.6,
                "default_max_tokens": 3000,
            }
        },
        "model_costs": {
            "input_per_mill": 0.144,
            "output_per_mill": 0.287,
        },
    },
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": {
        "model_base": "http://localhost:8001/v3",
        "api_key": VLLM_API_KEY,
        "default_temperature": 0.6,
        "default_max_tokens": 32768,
        "mode_settings": {
            "MATH": {
                "prompt_sfx": prompt_sfx,
                "default_temperature": 0.6,
                # NOTE(review): MATH mode caps output at 3000 tokens although the model default
                # is 32768 -- confirm this limit is intended for this model.
                "default_max_tokens": 3000,
            }
        },
        "model_costs": {
            "input_per_mill": 0.574,
            "output_per_mill": 1.721,
        },
    },
}

In [None]:
def get_completion(prompt, chosen_model, mode="default"):
    """Request one text completion from the endpoint configured for ``chosen_model``.

    Args:
        prompt: Prompt text sent to the completions endpoint.
        chosen_model: Key into ``AVAILABLE_MODEL_CONFIGS``; also sent as the model name.
        mode: Name of an entry in the model's ``mode_settings`` (e.g. ``"MATH"``).
            When the model defines settings for this mode, its ``prompt_sfx`` is
            appended to the prompt and its temperature / max-token values are used.
            Any other value (including the default ``"default"``) uses the
            model-level ``default_temperature`` / ``default_max_tokens`` and sends
            the prompt unchanged.

    Returns:
        The completion object returned by ``client.completions.create``.

    Raises:
        KeyError: If ``chosen_model`` is not present in ``AVAILABLE_MODEL_CONFIGS``.
    """
    try:
        model_cfg = AVAILABLE_MODEL_CONFIGS[chosen_model]
    except KeyError:
        raise KeyError(
            f"Unknown model {chosen_model!r}; available: {sorted(AVAILABLE_MODEL_CONFIGS)}"
        ) from None

    # Mode-specific overrides; an empty dict means "use the model defaults".
    mode_cfg = model_cfg.get("mode_settings", {}).get(mode, {})

    suffix = mode_cfg.get("prompt_sfx", "")
    if suffix and prompt and not prompt[-1].isspace():
        # Keep the question and the instruction from being glued together
        # (previously "...7?" + "Let's think..." produced "...7?Let's think...").
        suffix = " " + suffix

    client = OpenAI(
        api_key=model_cfg["api_key"],
        base_url=model_cfg["model_base"],
    )
    return client.completions.create(
        model=chosen_model,
        prompt=prompt + suffix,
        temperature=mode_cfg.get("default_temperature", model_cfg["default_temperature"]),
        max_tokens=mode_cfg.get("default_max_tokens", model_cfg["default_max_tokens"]),
    )

In [26]:
# Smoke test: send one MATH-mode prompt to the 1.5B model and print the generated text.
completion = get_completion("What is 60 + 7?", chosen_model="Qwen/Qwen2.5-Math-1.5B-Instruct", mode="MATH")
print(completion.choices[0].text)

 To solve the problem \(60 + 7\), we can break it down into simpler steps:

1. Start with the number 60.
2. Add the number 7 to 60.

When we add 7 to 60, we get:

\[60 + 7 = 67\]

So, the final answer is \(\boxed{67}\).


In [28]:
# Display the full response object (choices, finish reason, token usage).
completion

Completion(id='cmpl-b52e9c3b0d70439d863de228f62f63d9', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' To solve the problem \\(60 + 7\\), we can break it down into simpler steps:\n\n1. Start with the number 60.\n2. Add the number 7 to 60.\n\nWhen we add 7 to 60, we get:\n\n\\[60 + 7 = 67\\]\n\nSo, the final answer is \\(\\boxed{67}\\).', stop_reason=None, prompt_logprobs=None)], created=1766591073, model='Qwen/Qwen2.5-Math-1.5B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=83, prompt_tokens=24, total_tokens=107, completion_tokens_details=None, prompt_tokens_details=None), service_tier=None, kv_transfer_params=None)

In [34]:
# Token counts reported in the response's usage field: (completion tokens, prompt tokens).
completion.usage.completion_tokens,completion.usage.prompt_tokens

(83, 24)

In [None]:
def temperature_calibrate(p, T, eps=1e-12):
    """Temperature-scale a probability in logit space.

    Computes ``sigmoid(logit(p) / T)``. With ``T > 1`` the result moves toward
    0.5; with ``T < 1`` it moves toward 0 or 1; ``T == 1`` returns ``p``.

    Args:
        p: Probability (scalar or array-like) in ``[0, 1]``.
        T: Positive temperature.
        eps: ``p`` is clipped to ``[eps, 1 - eps]`` before the logit is taken so
            that exact 0.0 / 1.0 inputs do not divide by zero or take ``log(0)``.

    Returns:
        The calibrated probability as a NumPy float (or array for array input).

    Raises:
        ValueError: If ``T`` is not strictly positive.
    """
    if np.any(np.asarray(T) <= 0):
        raise ValueError("T must be strictly positive")

    # Guard the boundaries: p == 1.0 used to raise ZeroDivisionError and
    # p == 0.0 produced log(0) = -inf with a runtime warning.
    p = np.clip(p, eps, 1.0 - eps)
    logit = np.log(p / (1 - p))
    return 1 / (1 + np.exp(-logit / T))


def best_of_k(p, K, alpha=0.6):
    """Predicted probability that at least one of ``K`` samples succeeds.

    Uses ``1 - (1 - p) ** (alpha * K)``: ``alpha`` scales ``K`` in the exponent,
    so ``alpha < 1`` makes the ``K`` samples count for fewer than ``K``
    independent attempts.

    Args:
        p: Per-sample success probability.
        K: Number of samples drawn.
        alpha: Multiplier applied to ``K`` in the exponent.

    Returns:
        The predicted success probability for ``K`` samples.
    """
    effective_samples = alpha * K
    all_fail_prob = (1 - p) ** effective_samples
    return 1 - all_fail_prob


In [37]:
def estimate_cost(
    model_name: str,
    prompt_tokens: int,
    output_tokens: int,
    K: int,
    prompt_billed_per_sample: bool = False,
) -> float:
    """Estimate the price of drawing ``K`` samples from ``model_name``.

    Prices come from ``AVAILABLE_MODEL_CONFIGS[model_name]["model_costs"]`` and
    are expressed per million tokens.

    Args:
        model_name: Key into ``AVAILABLE_MODEL_CONFIGS``.
        prompt_tokens: Number of input tokens in the prompt.
        output_tokens: Number of output tokens expected per sample.
        K: Number of samples.
        prompt_billed_per_sample: When ``False`` (default, the original
            behavior) the prompt is charged once and only the output is
            multiplied by ``K``. Set to ``True`` when each sample is a separate
            request that re-sends the prompt (as ``solve_with_routing`` does),
            so the input cost is also multiplied by ``K``.

    Returns:
        Estimated cost in the currency of the configured per-million prices.

    Raises:
        KeyError: If ``model_name`` is not configured.
    """
    prices = AVAILABLE_MODEL_CONFIGS[model_name]["model_costs"]

    input_cost = (prompt_tokens / 1_000_000) * prices["input_per_mill"]
    output_cost = (output_tokens / 1_000_000) * prices["output_per_mill"]

    prompt_multiplier = K if prompt_billed_per_sample else 1
    return prompt_multiplier * input_cost + K * output_cost

In [None]:
# Routing hyper-parameters read by route_question_with_pricing.
# Minimum predicted success an action must reach to be considered.
TARGET_SUCCESS = 0.8
# Temperature passed to temperature_calibrate for the probe score.
TEMP_T = 1.5
# alpha passed to best_of_k (same value as that function's default).
ALPHA = 0.6
# Default token counts used for cost estimation when the caller supplies none.
EXPECTED_PROMPT_TOKENS = 800      # MATH problems + CoT
EXPECTED_OUTPUT_TOKENS = 1200     # typical math reasoning


# Candidate (model id, K) actions; K is the number of samples drawn from that model.
# Model ids must be keys of AVAILABLE_MODEL_CONFIGS.
ROUTING_ACTIONS = [
    ("Qwen/Qwen2.5-Math-1.5B-Instruct", 1),
    ("Qwen/Qwen2.5-Math-1.5B-Instruct", 5),
    ("Qwen/Qwen2.5-Math-7B-Instruct", 1),
    ("Qwen/Qwen2.5-Math-7B-Instruct", 5),
    ("deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", 1),
]

def route_question_with_pricing(
    probe_score: float,
    prompt_tokens: int = EXPECTED_PROMPT_TOKENS,
    output_tokens: int = EXPECTED_OUTPUT_TOKENS,
):
    """Pick the cheapest routing action whose predicted success meets the target.

    The probe score is calibrated with ``temperature_calibrate(probe_score, TEMP_T)``.
    For every ``(model, K)`` in ``ROUTING_ACTIONS`` the predicted success is
    ``best_of_k(p, K, ALPHA)``; actions below ``TARGET_SUCCESS`` are discarded and
    the remaining one with the lowest ``estimate_cost`` wins (the earliest action
    wins a cost tie). If nothing qualifies, the 14B DeepSeek model with ``K=1`` is
    returned as a fallback.

    NOTE(review): the predicted success depends only on ``K`` and the calibrated
    probe score, not on the model, so for a given ``K`` every model is predicted
    to be equally good -- confirm whether a per-model probe score was intended.

    Args:
        probe_score: Raw probe output, treated as a probability.
        prompt_tokens: Prompt token count used for cost estimation.
        output_tokens: Per-sample output token count used for cost estimation.

    Returns:
        Dict with keys ``"model"``, ``"K"``, ``"pred_success"`` and ``"cost"``.
    """
    calibrated_p = temperature_calibrate(probe_score, TEMP_T)

    def _priced_action(model_name, num_samples, success):
        # Bundle an action with its predicted success and estimated cost.
        return {
            "model": model_name,
            "K": num_samples,
            "pred_success": success,
            "cost": estimate_cost(
                model_name=model_name,
                prompt_tokens=prompt_tokens,
                output_tokens=output_tokens,
                K=num_samples,
            ),
        }

    eligible = []
    for model_name, num_samples in ROUTING_ACTIONS:
        success = best_of_k(calibrated_p, num_samples, ALPHA)
        # Written as "not below target" to mirror the original skip condition exactly.
        if not (success < TARGET_SUCCESS):
            eligible.append(_priced_action(model_name, num_samples, success))

    if eligible:
        # min() keeps the first of equally cheap actions, like a strict "<" scan.
        return min(eligible, key=lambda action: action["cost"])

    # Nothing reaches the target: fall back to the strongest model with one sample.
    fallback_model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
    return _priced_action(fallback_model, 1, best_of_k(calibrated_p, 1, ALPHA))

In [None]:
def _estimate_prompt_tokens(prompt, chars_per_token=4):
    """Roughly estimate the token count of ``prompt`` (heuristic: ~4 characters per token)."""
    return max(1, len(prompt) // chars_per_token)


def solve_with_routing(prompt, probe_score):
    """Route ``prompt`` to a (model, K) action and collect K MATH-mode samples.

    Args:
        prompt: Question text; the MATH prompt suffix is added by ``get_completion``.
        probe_score: Probe output forwarded to ``route_question_with_pricing``.

    Returns:
        List of ``{"text": <completion text>}`` dicts, one per sample.
    """
    # The router expects a token COUNT. Passing the prompt string itself (as the code did
    # before) makes estimate_cost fail with a TypeError on `str / int`. No tokenizer is
    # available here, so use a character-based estimate; it ignores the short MATH suffix.
    prompt_len = _estimate_prompt_tokens(prompt)
    route = route_question_with_pricing(probe_score, prompt_len)

    solutions = []
    for _ in range(route["K"]):
        completion = get_completion(
            prompt,
            chosen_model=route["model"],
            mode="MATH"
        )
        solutions.append({
            "text": completion.choices[0].text,
        })

    # No aggregation happens here: every sample is returned so the caller can select one
    # (for math, a verifier is preferred over majority vote).
    return solutions


# Log per question
# {
#   "probe_score": 0.23,
#   "chosen_model": "Qwen/Qwen2.5-Math-7B-Instruct",
#   "K": 1,
#   "predicted_success": 0.81,
#   "actual_success": true,
#   "estimated_cost_usd": 0.0027
# }
