In [13]:
import pandas as pd
import numpy as np
import os
import random
import json
import seaborn as sns

import matplotlib.pyplot as plt
from pathlib import Path
import sys

repo_root = Path.cwd().parent.parent
sys.path.insert(0, str(repo_root))
from will_replication.my_utils.utils import SIMPLE_MODEL_POOL_CONFIG

In [41]:
# === Cost-aware routing strategies ===
# A score is read as a success probability: P(success) = score,
# so a higher score means more confidence that the model will succeed.

# Models available to the routers, listed from cheapest to most expensive.
# Each name maps to its entry in SIMPLE_MODEL_POOL_CONFIG, or to an empty
# dict when the pool config has no entry for that name.
MODELS_IN_USE = {
    model_name: SIMPLE_MODEL_POOL_CONFIG.get(model_name, {})
    for model_name in (
        "Qwen/Qwen2.5-Math-1.5B-Instruct",
        "Qwen/Qwen2.5-Math-7B-Instruct",
        "Qwen/Qwen2.5-Math-72B-Instruct",
    )
}

# Calculate cost ratios (relative to cheapest model)
def get_cost_ratios(models_dict):
    """Calculate cost multipliers relative to the cheapest model.

    Args:
        models_dict: mapping of model name -> config dict. The price is read
            from ``config["model_costs"]["output_per_mill"]`` (presumably the
            price per million output tokens -- confirm against the pool
            config). A model without that entry is given a price of 1.0.

    Returns:
        dict of {model_name: price / cheapest_price}; the cheapest model maps
        to 1.0. An empty ``models_dict`` yields an empty dict.

    Raises:
        ValueError: if the cheapest price is zero or negative, because a
            ratio against it would be undefined or meaningless.
    """
    costs = {
        model: config.get("model_costs", {}).get("output_per_mill", 1.0)
        for model, config in models_dict.items()
    }

    # min() on an empty sequence raises; nothing to normalise in that case.
    if not costs:
        return {}

    min_cost = min(costs.values())
    if min_cost <= 0:
        raise ValueError(
            f"Cheapest model cost must be positive to compute ratios, got {min_cost!r}"
        )

    return {model: cost / min_cost for model, cost in costs.items()}

# Module-level cost multipliers; the cost-aware routing functions in this cell
# fall back to this dict when their `cost_ratios` argument is None.
COST_RATIOS = get_cost_ratios(MODELS_IN_USE)
# NOTE(review): the ratios are relative to the *cheapest* model in MODELS_IN_USE;
# the "relative to 1.5B" label below is only accurate while 1.5B is the cheapest.
print(f"Cost ratios (relative to 1.5B): {COST_RATIOS}")

# Original routing strategies
def route_question_max_utility_bayesian_robust(row, target_conf):
    """
    Pick a model from the 1.5B and 7B scores, requiring agreement for 1.5B.

    - 1.5B and 7B both at or above ``target_conf`` -> 1.5B
    - 1.5B below ``target_conf`` but 7B at or above it -> 7B
    - anything else -> 72B (this includes a 1.5B score that compares as
      neither >= nor <= the threshold, e.g. NaN)

    Args:
        row: mapping-like row exposing "score_1.5B" and "score_7B"
        target_conf: confidence threshold
    """
    fallback = "Qwen/Qwen2.5-Math-72B-Instruct"
    small_score = row["score_1.5B"]

    if small_score >= target_conf:
        # The small model is confident; only trust it if 7B concurs.
        if row["score_7B"] >= target_conf:
            return "Qwen/Qwen2.5-Math-1.5B-Instruct"
        return fallback

    if small_score <= target_conf:
        # The small model is not confident; 7B decides on its own.
        if row["score_7B"] >= target_conf:
            return "Qwen/Qwen2.5-Math-7B-Instruct"
        return fallback

    # Score not comparable to the threshold (e.g. NaN).
    return fallback

def route_to_target_with_72B_cascade(row, target_conf):
    """
    Sequential cascade over the scored models, cheapest first.

    Returns the first of 1.5B, 7B whose score is at or above ``target_conf``;
    when neither qualifies the question is escalated to 72B.
    """
    cascade = (
        ("score_1.5B", "Qwen/Qwen2.5-Math-1.5B-Instruct"),
        ("score_7B", "Qwen/Qwen2.5-Math-7B-Instruct"),
    )
    for score_column, model_name in cascade:
        if row[score_column] >= target_conf:
            return model_name
    return "Qwen/Qwen2.5-Math-72B-Instruct"
    
def route_to_target_with_72B_robust(row, target_conf):
    """
    Robust variant of the cascade: 1.5B is used only when 7B also clears the bar.

    - 7B below ``target_conf`` -> 72B, regardless of the 1.5B score
    - 7B at or above ``target_conf`` -> 1.5B if it clears the bar too, else 7B
    """
    small_ok = row["score_1.5B"] >= target_conf
    mid_ok = row["score_7B"] >= target_conf

    if not mid_ok:
        return "Qwen/Qwen2.5-Math-72B-Instruct"
    return (
        "Qwen/Qwen2.5-Math-1.5B-Instruct"
        if small_ok
        else "Qwen/Qwen2.5-Math-7B-Instruct"
    )
    
def random_router(row):
    """
    Baseline router: pick one of the three models uniformly at random.

    ``row`` is accepted so the signature matches the other routers; it is
    not inspected. Uses the global ``random`` module state.
    """
    model_pool = (
        "Qwen/Qwen2.5-Math-1.5B-Instruct",
        "Qwen/Qwen2.5-Math-7B-Instruct",
        "Qwen/Qwen2.5-Math-72B-Instruct",
    )
    return random.choice(model_pool)


# === NEW: Cost-aware strategies ===

def route_with_confidence_disagreement(row, target_conf, disagreement_threshold=0.15):
    """
    Route using the gap between the 1.5B and 7B scores.

    - 1.5B at or above ``target_conf`` AND |score_1.5B - score_7B| strictly
      below ``disagreement_threshold`` -> 1.5B
    - otherwise, 7B at or above ``target_conf`` -> 7B
    - otherwise -> 72B

    NOTE(review): the first rule does not require the 7B score itself to reach
    ``target_conf``; a 7B score slightly under the threshold still counts as
    "agreeing" as long as the gap is small. Confirm this is intended.

    Args:
        row: data row with score_1.5B and score_7B
        target_conf: confidence threshold
        disagreement_threshold: exclusive upper bound on the score gap for
            the two models to be considered in agreement
    """
    small_score = row["score_1.5B"]
    mid_score = row["score_7B"]
    score_gap = abs(small_score - mid_score)

    if small_score >= target_conf and score_gap < disagreement_threshold:
        return "Qwen/Qwen2.5-Math-1.5B-Instruct"
    return (
        "Qwen/Qwen2.5-Math-7B-Instruct"
        if mid_score >= target_conf
        else "Qwen/Qwen2.5-Math-72B-Instruct"
    )

def route_with_cost_utility(row, target_conf, cost_ratios=None):
    """
    Route to the most cost-efficient model that clears the confidence bar.

    Efficiency of a model is ``success_prob / cost_ratio``. Only 1.5B and 7B
    are scored; each is a candidate when its score is at or above
    ``target_conf``. The candidate with the highest efficiency wins (1.5B on
    a tie). With no candidates the question goes to 72B.

    Args:
        row: data row with score_1.5B and score_7B
        target_conf: minimum acceptable confidence threshold
        cost_ratios: dict of {model_name: cost_multiplier}; defaults to the
            module-level COST_RATIOS when None
    """
    ratios = COST_RATIOS if cost_ratios is None else cost_ratios

    scored_models = (
        ("score_1.5B", "Qwen/Qwen2.5-Math-1.5B-Instruct"),
        ("score_7B", "Qwen/Qwen2.5-Math-7B-Instruct"),
    )
    # (model, efficiency) for every scored model that meets the threshold.
    efficiencies = [
        (model_name, row[score_column] / ratios[model_name])
        for score_column, model_name in scored_models
        if row[score_column] >= target_conf
    ]

    if not efficiencies:
        # Neither scored model is confident enough -> escalate.
        return "Qwen/Qwen2.5-Math-72B-Instruct"

    return max(efficiencies, key=lambda pair: pair[1])[0]

def route_with_adjusted_thresholds(row, target_conf, cost_ratios=None, sensitivity=0.05):
    """
    Cascade over 1.5B -> 7B -> 72B with per-model, cost-adjusted thresholds.

    Adjustment formula:
        adjusted_threshold = target_conf - (cost_multiplier - 1) * sensitivity

    With ratios normalised to the cheapest model (ratio 1.0), the cheapest
    model keeps ``target_conf`` unchanged and a model with a larger cost
    multiplier gets a *lower* threshold.

    NOTE(review): this is the opposite of the stated intent that cheaper
    models should be accepted at lower confidence. The formula is kept as-is
    so existing results do not change; confirm which direction is wanted.

    Args:
        row: data row with score_1.5B and score_7B
        target_conf: base confidence threshold (applies unchanged to a model
            whose cost multiplier is 1.0)
        cost_ratios: dict of {model_name: cost_multiplier}; defaults to the
            module-level COST_RATIOS when None
        sensitivity: threshold reduction per unit of cost multiplier above
            1.0. Defaults to 0.05, the value previously hard-coded here.

    Returns:
        Name of the selected model.
    """
    if cost_ratios is None:
        cost_ratios = COST_RATIOS

    def adjusted_threshold(model_name):
        # Shift the base threshold by how much more expensive the model is
        # than a ratio-1.0 model.
        return target_conf - (cost_ratios[model_name] - 1) * sensitivity

    threshold_1_5B = adjusted_threshold("Qwen/Qwen2.5-Math-1.5B-Instruct")
    threshold_7B = adjusted_threshold("Qwen/Qwen2.5-Math-7B-Instruct")

    if row["score_1.5B"] >= threshold_1_5B:
        return "Qwen/Qwen2.5-Math-1.5B-Instruct"
    elif row["score_7B"] >= threshold_7B:
        return "Qwen/Qwen2.5-Math-7B-Instruct"
    else:
        return "Qwen/Qwen2.5-Math-72B-Instruct"

def route_with_expected_cost(row, target_conf, cost_ratios=None):
    """
    Route to the confident model with the lowest expected cost of failure.

    Expected cost of a model is ``cost_ratio * (1 - success_prob)``. Only
    1.5B and 7B are scored; each is a candidate when its score is at or above
    ``target_conf``. The candidate with the smallest expected cost wins (1.5B
    on a tie). With no candidates the question goes to 72B.

    Args:
        row: data row with score_1.5B and score_7B
        target_conf: minimum acceptable confidence
        cost_ratios: dict of {model_name: cost_multiplier}; defaults to the
            module-level COST_RATIOS when None
    """
    ratios = COST_RATIOS if cost_ratios is None else cost_ratios

    scored_models = (
        ("score_1.5B", "Qwen/Qwen2.5-Math-1.5B-Instruct"),
        ("score_7B", "Qwen/Qwen2.5-Math-7B-Instruct"),
    )
    # (model, expected failure cost) for every scored model meeting the threshold.
    failure_costs = [
        (model_name, ratios[model_name] * (1 - row[score_column]))
        for score_column, model_name in scored_models
        if row[score_column] >= target_conf
    ]

    if not failure_costs:
        # Neither scored model is confident enough -> escalate.
        return "Qwen/Qwen2.5-Math-72B-Instruct"

    return min(failure_costs, key=lambda pair: pair[1])[0]

# merged_df[f"route_to_target_{TARGET_CONFIDENCE_EST}"] = merged_df.apply(
#     lambda row: "Qwen-Qwen2.5-Math-1.5B-Instruct" if row["score_1.5B"] >= TARGET_CONFIDENCE_EST else "Qwen-Qwen2.5-Math-7B-Instruct",
#     axis=1
# )


Cost ratios (relative to 1.5B): {'Qwen/Qwen2.5-Math-1.5B-Instruct': 1.0, 'Qwen/Qwen2.5-Math-7B-Instruct': 2.0, 'Qwen/Qwen2.5-Math-72B-Instruct': 9.0}


In [3]:
# Directory-safe model identifiers ("-" instead of "/"), used to build result paths.
MODELS = [
    "Qwen-Qwen2.5-Math-1.5B-Instruct",
    "Qwen-Qwen2.5-Math-7B-Instruct",
]
# Aliases currently mirror MODELS one-for-one (kept as a separate list object).
MODEL_ALIASES = list(MODELS)
# Hub-style model names, in the same order as MODELS.
MODEL_NAMES = [
    "Qwen/Qwen2.5-Math-1.5B-Instruct",
    "Qwen/Qwen2.5-Math-7B-Instruct",
]
PROBING_DATASET = "DigitalLearningGmbH_MATH-lighteval"

# Datasets that have labelled probe results available.
LABELLED_DATASET_LIST = [
    "opencompass_AIME2025",
    "gneubig_aime-1983-2024",
    "openai_gsm8k",
    "DigitalLearningGmbH_MATH-lighteval",
]
# Default dataset: the second entry of the list above.
LABELLED_DATASET = LABELLED_DATASET_LIST[1]

In [4]:
# Columns kept from each per-model scored.parquet before merging.
COLS_OF_INTEREST = ["problem_id", "problem", "score_raw", "score", "calibrated_raw_score", "calibrated_score", "original_solution"]

print("bayesian routing")

# For every labelled dataset: load the per-model probe scores, merge them into
# one frame keyed by problem_id, route each question with the
# "bayesian robust" rule and save the routed frame as parquet.
# NOTE(review): the loop variable rebinds the module-level LABELLED_DATASET; after
# the loop it holds the last entry of LABELLED_DATASET_LIST.
# NOTE(review): this load-and-merge loop is repeated verbatim in other cells of
# this notebook; consider factoring it into a single function.
for LABELLED_DATASET in LABELLED_DATASET_LIST:
    merged_df = None
    for MODEL_NAME in MODELS:
        DATA_PATH = f"../../will_replication/probe_results/DATA/Labelled_SR/DigitalLearningGmbH_MATH-lighteval_probe/{LABELLED_DATASET}/{MODEL_NAME}_maxlen_3000_k_1_temp_0.0"
        FILENAME = os.path.join(DATA_PATH, "scored.parquet")
        temp_df = pd.read_parquet(FILENAME)
        # Calibrated columns are optional in the input; fill with NaN so the
        # column selection below never fails.
        for col in ["calibrated_raw_score", "calibrated_score"]:
            if col not in temp_df.columns:
                temp_df[col] = np.nan
        temp_df = temp_df[COLS_OF_INTEREST].copy()
        # Second-to-last "-" separated token of the model id, e.g. "1.5B" or "7B".
        size = MODEL_NAME.split('-')[-2]
        temp_df = temp_df.rename(columns={
            "score": f"score_{size}", # score the routing functions treat as P(success)
            "score_raw": f"score_raw_{size}", # raw score; presumably the logit of `score` -- TODO confirm
            "calibrated_raw_score": f"calibrated_raw_score_{size}",
            "calibrated_score": f"calibrated_score_{size}",
        })

        if merged_df is None:
            merged_df = temp_df
        else:
            # Default (inner) join on problem_id: problems missing from either
            # model's file are dropped. Columns still shared by both frames
            # (original_solution) get the size suffix on the right-hand side.
            merged_df = pd.merge(
                merged_df,
                temp_df.drop(columns=["problem"]),
                on="problem_id",
                suffixes=('', f'_{size}')
            )
            
    print(f"Routing breakdown for {LABELLED_DATASET}:")
    TARGET_CONFIDENCE_EST = .90
    # Row-wise routing decision stored as the model name in `route_to`.
    merged_df[f"route_to"] = merged_df.apply(
        lambda row: route_question_max_utility_bayesian_robust(row, TARGET_CONFIDENCE_EST), axis=1
    )
    display(merged_df[f'route_to'].value_counts())
    print()
    # The threshold is embedded in the output file name (e.g. bayes_cascade_0.9.parquet).
    SAVE_DIR= f"../pika_cascade_trial/DigitalLearningGmbH_MATH-lighteval_probe/{LABELLED_DATASET}_routed"
    os.makedirs(SAVE_DIR, exist_ok=True)
    SAVE_PATH = f"{SAVE_DIR}/bayes_cascade_{TARGET_CONFIDENCE_EST}.parquet"
    merged_df.to_parquet(SAVE_PATH)

bayesian routing
Routing breakdown for opencompass_AIME2025:


route_to
Qwen/Qwen2.5-Math-72B-Instruct    15
Name: count, dtype: int64


Routing breakdown for gneubig_aime-1983-2024:


route_to
Qwen/Qwen2.5-Math-72B-Instruct     921
Qwen/Qwen2.5-Math-1.5B-Instruct      8
Qwen/Qwen2.5-Math-7B-Instruct        4
Name: count, dtype: int64


Routing breakdown for openai_gsm8k:


route_to
Qwen/Qwen2.5-Math-1.5B-Instruct    953
Qwen/Qwen2.5-Math-72B-Instruct     199
Qwen/Qwen2.5-Math-7B-Instruct      167
Name: count, dtype: int64


Routing breakdown for DigitalLearningGmbH_MATH-lighteval:


route_to
Qwen/Qwen2.5-Math-72B-Instruct     2459
Qwen/Qwen2.5-Math-1.5B-Instruct    2221
Qwen/Qwen2.5-Math-7B-Instruct       320
Name: count, dtype: int64




In [5]:
print("cascade routing")

# For every labelled dataset: load the per-model probe scores, merge them into
# one frame keyed by problem_id, route each question with the sequential
# cascade rule and save the routed frame as parquet.
# NOTE(review): the loop variable rebinds the module-level LABELLED_DATASET; after
# the loop it holds the last entry of LABELLED_DATASET_LIST, and `merged_df`
# holds that dataset's routed frame.
# NOTE(review): this load-and-merge loop is repeated verbatim in other cells of
# this notebook; consider factoring it into a single function.
for LABELLED_DATASET in LABELLED_DATASET_LIST:
    merged_df = None
    for MODEL_NAME in MODELS:
        DATA_PATH = f"../../will_replication/probe_results/DATA/Labelled_SR/DigitalLearningGmbH_MATH-lighteval_probe/{LABELLED_DATASET}/{MODEL_NAME}_maxlen_3000_k_1_temp_0.0"
        FILENAME = os.path.join(DATA_PATH, "scored.parquet")
        temp_df = pd.read_parquet(FILENAME)
        # Calibrated columns are optional in the input; fill with NaN so the
        # column selection below never fails.
        for col in ["calibrated_raw_score", "calibrated_score"]:
            if col not in temp_df.columns:
                temp_df[col] = np.nan
        temp_df = temp_df[COLS_OF_INTEREST].copy()
        # Second-to-last "-" separated token of the model id, e.g. "1.5B" or "7B".
        size = MODEL_NAME.split('-')[-2]
        temp_df = temp_df.rename(columns={
            "score": f"score_{size}", # score the routing functions treat as P(success)
            "score_raw": f"score_raw_{size}", # raw score; presumably the logit of `score` -- TODO confirm
            "calibrated_raw_score": f"calibrated_raw_score_{size}",
            "calibrated_score": f"calibrated_score_{size}",
        })

        if merged_df is None:
            merged_df = temp_df
        else:
            # Default (inner) join on problem_id: problems missing from either
            # model's file are dropped. Columns still shared by both frames
            # (original_solution) get the size suffix on the right-hand side.
            merged_df = pd.merge(
                merged_df,
                temp_df.drop(columns=["problem"]),
                on="problem_id",
                suffixes=('', f'_{size}')
            )
            
    print(f"Routing breakdown for {LABELLED_DATASET}:")
    TARGET_CONFIDENCE_EST = .90
    # Row-wise routing decision stored as the model name in `route_to`.
    merged_df[f"route_to"] = merged_df.apply(
        lambda row: route_to_target_with_72B_cascade(row, TARGET_CONFIDENCE_EST), axis=1
    )
    display(merged_df[f'route_to'].value_counts())
    print()
    # The threshold is embedded in the output file name (e.g. cascade_0.9.parquet).
    SAVE_DIR= f"../pika_cascade_trial/DigitalLearningGmbH_MATH-lighteval_probe/{LABELLED_DATASET}_routed"
    os.makedirs(SAVE_DIR, exist_ok=True)
    SAVE_PATH = f"{SAVE_DIR}/cascade_{TARGET_CONFIDENCE_EST}.parquet"
    merged_df.to_parquet(SAVE_PATH)

cascade routing
Routing breakdown for opencompass_AIME2025:


route_to
Qwen/Qwen2.5-Math-72B-Instruct    15
Name: count, dtype: int64


Routing breakdown for gneubig_aime-1983-2024:


route_to
Qwen/Qwen2.5-Math-72B-Instruct     898
Qwen/Qwen2.5-Math-1.5B-Instruct     31
Qwen/Qwen2.5-Math-7B-Instruct        4
Name: count, dtype: int64


Routing breakdown for openai_gsm8k:


route_to
Qwen/Qwen2.5-Math-1.5B-Instruct    1012
Qwen/Qwen2.5-Math-7B-Instruct       167
Qwen/Qwen2.5-Math-72B-Instruct      140
Name: count, dtype: int64


Routing breakdown for DigitalLearningGmbH_MATH-lighteval:


route_to
Qwen/Qwen2.5-Math-1.5B-Instruct    2506
Qwen/Qwen2.5-Math-72B-Instruct     2174
Qwen/Qwen2.5-Math-7B-Instruct       320
Name: count, dtype: int64




In [6]:
# Inspect whatever `merged_df` currently holds. When cells are run in order this
# is the routed frame for the last dataset processed by the preceding loop.
merged_df

Unnamed: 0,problem_id,problem,score_raw_1.5B,score_1.5B,calibrated_raw_score_1.5B,calibrated_score_1.5B,original_solution,score_raw_7B,score_7B,calibrated_raw_score_7B,calibrated_score_7B,original_solution_7B,route_to
0,SG93IG1hbnkgdmVydGljYWwgYXN5bXB0b3RlcyBkb2VzIH...,How many vertical asymptotes does the graph of...,4.099162,0.983684,,,The denominator of the rational function facto...,4.216038,0.985458,,,The denominator of the rational function facto...,Qwen/Qwen2.5-Math-1.5B-Instruct
1,V2hhdCBpcyB0aGUgcG9zaXRpdmUgZGlmZmVyZW5jZSBiZX...,What is the positive difference between $120\%...,5.184649,0.994429,,,One hundred twenty percent of 30 is $120\cdot3...,5.222956,0.994638,,,One hundred twenty percent of 30 is $120\cdot3...,Qwen/Qwen2.5-Math-1.5B-Instruct
2,RmluZCAkeCQgc3VjaCB0aGF0ICRcbGNlaWwgeCBccmNlaW...,Find $x$ such that $\lceil x \rceil + x = \dfr...,2.253886,0.904985,,,"First, we note that $x$ must be positive, sinc...",2.239801,0.903767,,,"First, we note that $x$ must be positive, sinc...",Qwen/Qwen2.5-Math-1.5B-Instruct
3,RXZhbHVhdGUgJGleNStpXnstMjV9K2leezQ1fSQuIExldC...,Evaluate $i^5+i^{-25}+i^{45}$. Let's think ste...,4.523513,0.989266,,,We have $i^5 = i^4\cdot i = 1\cdot (i) = i$. ...,3.773088,0.977535,,,We have $i^5 = i^4\cdot i = 1\cdot (i) = i$. ...,Qwen/Qwen2.5-Math-1.5B-Instruct
4,SWYgJDJeOD00XngkLCB3aGF0IGlzIHRoZSB2YWx1ZSBvZi...,"If $2^8=4^x$, what is the value of $x$? Let's ...",5.323208,0.995147,,,Rewrite $4$ as $2^2$ to find $4^x=2^{2x}$. Si...,5.178513,0.994395,,,Rewrite $4$ as $2^2$ to find $4^x=2^{2x}$. Si...,Qwen/Qwen2.5-Math-1.5B-Instruct
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,SWYgJFxzaW4geCArIFxjb3MgeCA9IFxmcmFjezF9ezV9JC...,If $\sin x + \cos x = \frac{1}{5}$ and $0 < x ...,1.564911,0.827057,,,"From the given equation, $\cos x = \frac{1}{5}...",2.713792,0.937836,,,"From the given equation, $\cos x = \frac{1}{5}...",Qwen/Qwen2.5-Math-7B-Instruct
4996,VGhlIG1hdHJpeCBmb3IgcHJvamVjdGluZyBvbnRvIGEgY2...,The matrix for projecting onto a certain plane...,0.270600,0.567240,,,Since $\begin{pmatrix} a \\ b \\ c \end{pmatri...,0.770409,0.683609,,,Since $\begin{pmatrix} a \\ b \\ c \end{pmatri...,Qwen/Qwen2.5-Math-72B-Instruct
4997,TGV0ICRcbWF0aGJme2F9LCQgJFxtYXRoYmZ7Yn0sJCBhbm...,"Let $\mathbf{a},$ $\mathbf{b},$ and $\mathbf{c...",2.765699,0.940794,,,Since $\mathbf{a} + \mathbf{b} + \mathbf{c} = ...,3.471947,0.969879,,,Since $\mathbf{a} + \mathbf{b} + \mathbf{c} = ...,Qwen/Qwen2.5-Math-1.5B-Instruct
4998,RmluZCB0aGUgc21hbGxlc3QgcG9zaXRpdmUgaW50ZWdlci...,Find the smallest positive integer solution to...,0.698380,0.667828,,,"By the tangent addition formula,\n\begin{align...",1.584898,0.829897,,,"By the tangent addition formula,\n\begin{align...",Qwen/Qwen2.5-Math-72B-Instruct


In [42]:
# === Routing Strategy Comparison ===
# Apply every routing strategy to a single dataset and compare the share of
# questions sent to each model and the resulting average relative cost.

print("=" * 80)
print("ROUTING STRATEGY COMPARISON")
print("=" * 80)

TARGET_CONFIDENCE_EST = 0.90
LABELLED_DATASET = LABELLED_DATASET_LIST[2]  # index 2 = third entry of the list (openai_gsm8k)

# Load data: one scored.parquet per model, merged on problem_id.
# NOTE(review): this load-and-merge loop is repeated verbatim in other cells of
# this notebook; consider factoring it into a single function.
merged_df = None
for MODEL_NAME in MODELS:
    DATA_PATH = f"../../will_replication/probe_results/DATA/Labelled_SR/DigitalLearningGmbH_MATH-lighteval_probe/{LABELLED_DATASET}/{MODEL_NAME}_maxlen_3000_k_1_temp_0.0"
    FILENAME = os.path.join(DATA_PATH, "scored.parquet")
    temp_df = pd.read_parquet(FILENAME)
    # Calibrated columns are optional in the input; fill with NaN so the
    # column selection below never fails.
    for col in ["calibrated_raw_score", "calibrated_score"]:
        if col not in temp_df.columns:
            temp_df[col] = np.nan
    temp_df = temp_df[COLS_OF_INTEREST].copy()
    # Second-to-last "-" separated token of the model id, e.g. "1.5B" or "7B".
    size = MODEL_NAME.split('-')[-2]
    temp_df = temp_df.rename(columns={
        "score": f"score_{size}",
        "score_raw": f"score_raw_{size}",
        "calibrated_raw_score": f"calibrated_raw_score_{size}",
        "calibrated_score": f"calibrated_score_{size}",
    })

    if merged_df is None:
        merged_df = temp_df
    else:
        # Default (inner) join: problems missing from either file are dropped.
        merged_df = pd.merge(
            merged_df,
            temp_df.drop(columns=["problem"]),
            on="problem_id",
            suffixes=('', f'_{size}')
        )

# Apply all routing strategies. Each entry maps a display name to a
# row -> model-name callable with the threshold / cost ratios already bound.
# NOTE(review): "Random" draws from the global `random` state and no seed is set
# in this cell, so its counts change from run to run.
strategies = {
    "Random": lambda row: random_router(row),
    "Cascade": lambda row: route_to_target_with_72B_cascade(row, TARGET_CONFIDENCE_EST),
    "Bayesian Robust": lambda row: route_question_max_utility_bayesian_robust(row, TARGET_CONFIDENCE_EST),
    "72B Robust": lambda row: route_to_target_with_72B_robust(row, TARGET_CONFIDENCE_EST),
    "Disagreement (±0.15)": lambda row: route_with_confidence_disagreement(row, TARGET_CONFIDENCE_EST, 0.15),
    "Cost-Utility": lambda row: route_with_cost_utility(row, TARGET_CONFIDENCE_EST, COST_RATIOS),
    "Adjusted Thresholds": lambda row: route_with_adjusted_thresholds(row, TARGET_CONFIDENCE_EST, COST_RATIOS),
    "Expected Cost": lambda row: route_with_expected_cost(row, TARGET_CONFIDENCE_EST, COST_RATIOS),
}

print(f"\nDataset: {LABELLED_DATASET}")
print(f"Target Confidence Threshold: {TARGET_CONFIDENCE_EST}")
print(f"Total Questions: {len(merged_df)}")
print(f"\nCost Ratios (relative to 1.5B):")
for model, ratio in COST_RATIOS.items():
    print(f"  {model.split('/')[-1]}: {ratio:.2f}x")

print("\n" + "-" * 80)

# One summary record per strategy; turned into a DataFrame at the end.
comparison_results = []
for strategy_name, strategy_func in strategies.items():
    # Adds a `route_<strategy>` column to merged_df holding the chosen model name.
    merged_df[f"route_{strategy_name}"] = merged_df.apply(strategy_func, axis=1)
    
    print(f"\n{strategy_name.upper()}:")
    counts = merged_df[f"route_{strategy_name}"].value_counts()
    
    # Calculate metrics: per-model share of questions and its cost contribution
    # (share * cost ratio).
    total = len(merged_df)
    for model, count in counts.items():
        pct = 100 * count / total
        cost_ratio = COST_RATIOS.get(model, 1.0)
        weighted_cost = (count / total) * cost_ratio
        print(f"  {model.split('/')[-1]}: {count:5d} ({pct:5.1f}%) | Weighted cost: {weighted_cost:.3f}x")
    
    # Average weighted cost: sum of (share * cost ratio) over every model in
    # COST_RATIOS; a model the strategy never picked contributes 0.
    avg_cost = sum((counts.get(model, 0) / total) * COST_RATIOS.get(model, 1.0) 
                   for model in COST_RATIOS.keys())
    print(f"  → Average Cost: {avg_cost:.3f}x")
    
    comparison_results.append({
        "Strategy": strategy_name,
        "Avg Cost": avg_cost,
        "1.5B %": 100 * counts.get("Qwen/Qwen2.5-Math-1.5B-Instruct", 0) / total,
        "7B %": 100 * counts.get("Qwen/Qwen2.5-Math-7B-Instruct", 0) / total,
        "72B %": 100 * counts.get("Qwen/Qwen2.5-Math-72B-Instruct", 0) / total,
    })

print("\n" + "=" * 80)
print("SUMMARY TABLE:")
comparison_df = pd.DataFrame(comparison_results)
print(comparison_df.to_string(index=False))

ROUTING STRATEGY COMPARISON

Dataset: openai_gsm8k
Target Confidence Threshold: 0.9
Total Questions: 1319

Cost Ratios (relative to 1.5B):
  Qwen2.5-Math-1.5B-Instruct: 1.00x
  Qwen2.5-Math-7B-Instruct: 2.00x
  Qwen2.5-Math-72B-Instruct: 9.00x

--------------------------------------------------------------------------------

RANDOM:
  Qwen2.5-Math-7B-Instruct:   445 ( 33.7%) | Weighted cost: 0.675x
  Qwen2.5-Math-1.5B-Instruct:   443 ( 33.6%) | Weighted cost: 0.336x
  Qwen2.5-Math-72B-Instruct:   431 ( 32.7%) | Weighted cost: 2.941x
  → Average Cost: 3.951x

CASCADE:
  Qwen2.5-Math-1.5B-Instruct:  1012 ( 76.7%) | Weighted cost: 0.767x
  Qwen2.5-Math-7B-Instruct:   167 ( 12.7%) | Weighted cost: 0.253x
  Qwen2.5-Math-72B-Instruct:   140 ( 10.6%) | Weighted cost: 0.955x
  → Average Cost: 1.976x

BAYESIAN ROBUST:
  Qwen2.5-Math-1.5B-Instruct:   953 ( 72.3%) | Weighted cost: 0.723x
  Qwen2.5-Math-72B-Instruct:   199 ( 15.1%) | Weighted cost: 1.358x
  Qwen2.5-Math-7B-Instruct:   167 ( 12.7%)