In [4]:
import os
import json
import pandas as pd
import sys
import numpy as np

import time
import gc
from dataclasses import dataclass
from typing import Dict, Any, List, Optional

import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams
from pathlib import Path


repo_root = Path.cwd().parent.parent
sys.path.insert(0, str(repo_root))

from will_replication.my_utils.utils import load_probe_data, sigmoid_np, load_labelled_probe_dataset, SIMPLE_MODEL_POOL_CONFIG, ModelConfig
from thom_replication.utils.verification_math import compute_score, extract_solution

In [2]:
# PROBING CONFIG SETUP
# Paths are relative to the notebook's working directory.
PROBE_RESULTS_DIR = "../../will_replication/probe_results/DATA"
LABELLED_SR_PATH = f"{PROBE_RESULTS_DIR}/Labelled_SR"
PROBE_DATA_PATH=f"{PROBE_RESULTS_DIR}/SR_DATA"
PROBING_DATASET = "MATH"

# Dataset id with "/" replaced by "_" (-> "gneubig_aime-1983-2024").
LABELLED_DATASET_FULL_NAME = "gneubig/aime-1983-2024"
LABELLED_DATASET_NAME = "_".join(LABELLED_DATASET_FULL_NAME.split("/"))

# Model id with "/" replaced by "-"; MODEL_ALIAS is not referenced again in the visible cells.
PROBE_MODEL_NAME = "Qwen/Qwen2.5-Math-1.5B-Instruct"
MODEL_ALIAS = "-".join(PROBE_MODEL_NAME.split("/"))
# K and TEMPERATURE are forwarded to both loaders below; TEMPERATURE is also reused for generation later.
K=1
TEMPERATURE=0.0


# NOTE(review): small_model_probe_data is not used by any later visible cell — confirm it is still needed.
small_model_probe_data = load_probe_data(MODEL_NAME=PROBE_MODEL_NAME, PROBING_DATASET=PROBING_DATASET, K=K, TEMPERATURE=TEMPERATURE, DATA_PATH=PROBE_DATA_PATH)

# Probe-labelled dataset; the routing cells below read its "score" and "prompt_scored" columns.
labelled_datast_df = load_labelled_probe_dataset(MODEL_NAME=PROBE_MODEL_NAME, PROBE_SOURCE_DATASET=PROBING_DATASET, LABELLED_DATASET=LABELLED_DATASET_NAME, K=K, TEMPERATURE=TEMPERATURE, DATA_PATH=LABELLED_SR_PATH)

In [3]:
# Work on a fixed-seed random subset of 100 labelled rows so the routing run is small and repeatable.
# sample(n=100) raises if the labelled frame holds fewer than 100 rows.
routing_dataset_df = labelled_datast_df.copy().sample(n=100, random_state=42)

# Route questions to a model

In [4]:
# Candidate models, in the order they are declared in SIMPLE_MODEL_POOL_CONFIG.
MODEL_POOL = [*SIMPLE_MODEL_POOL_CONFIG]
MODEL_POOL

['Qwen/Qwen2.5-Math-1.5B-Instruct',
 'Qwen/Qwen2.5-Math-7B-Instruct',
 'Qwen/Qwen2.5-Math-72B-Instruct']

In [5]:
# Placeholder: send every row to the probe model; a later cell overwrites this column via route_questions.
routing_dataset_df["route_to"] = PROBE_MODEL_NAME

In [6]:
# Show one row to confirm the route_to column is present.
routing_dataset_df.head(1)

Unnamed: 0,idx,dataset,prompt_scored,formatted,score_raw,score,layer,pos,original_solution,Year,Problem Number,route_to
830,830,gneubig/aime-1983-2024,Zou and Chou are practicing their $100$ -meter...,<|im_start|>system\nPlease reason step by step...,0.956127,0.722346,18,-1,97,2021,1,Qwen/Qwen2.5-Math-1.5B-Instruct


In [7]:
def route_questions(
    predicted_score: float,
    model_pool: list[str],
    easy_threshold: float = 0.8,
    medium_threshold: float = 0.5,
) -> str:
    """Pick a model from ``model_pool`` based on a predicted score.

    Args:
        predicted_score: Probe score for the question; higher means the
            question is routed to an earlier (index 0) entry of the pool.
        model_pool: At least three model names; index 0 receives the highest
            scores, index 2 the lowest.
        easy_threshold: Scores ``>=`` this value go to ``model_pool[0]``.
        medium_threshold: Scores ``>=`` this value (and below
            ``easy_threshold``) go to ``model_pool[1]``.

    Returns:
        The selected model name.

    Raises:
        ValueError: If ``easy_threshold`` is lower than ``medium_threshold``
            (the middle tier would be unreachable).
        IndexError: If the selected tier does not exist in ``model_pool``.
    """
    if easy_threshold < medium_threshold:
        raise ValueError(
            f"easy_threshold ({easy_threshold}) must be >= medium_threshold ({medium_threshold})"
        )

    if predicted_score >= easy_threshold:  # highest scores: [easy_threshold, 1.0]
        return model_pool[0]
    if predicted_score >= medium_threshold:  # middle band: [medium_threshold, easy_threshold)
        return model_pool[1]
    # Everything below medium_threshold lands here. NaN scores do too, because
    # every comparison against NaN is False.
    return model_pool[2]

In [8]:
# Replace the route_to column with the model chosen for each row's score.
routing_dataset_df["route_to"] = routing_dataset_df["score"].apply(route_questions, args=(MODEL_POOL,))

In [9]:
# How many questions each model will receive.
routing_dataset_df["route_to"].value_counts()

route_to
Qwen/Qwen2.5-Math-72B-Instruct     66
Qwen/Qwen2.5-Math-7B-Instruct      25
Qwen/Qwen2.5-Math-1.5B-Instruct     9
Name: count, dtype: int64

In [10]:
# Map each routed model name to the sub-frame of rows assigned to it.
route_to_subsets = dict(tuple(routing_dataset_df.groupby('route_to')))

In [11]:
# Group keys come back in sorted order, not in MODEL_POOL order (see the output below).
list(route_to_subsets.keys())

['Qwen/Qwen2.5-Math-1.5B-Instruct',
 'Qwen/Qwen2.5-Math-72B-Instruct',
 'Qwen/Qwen2.5-Math-7B-Instruct']

In [12]:
# Row counts of the first three subsets, in key order.
tuple(len(route_to_subsets[list(route_to_subsets)[pos]]) for pos in range(3))

(9, 66, 25)

# Answer each question with the model

In [13]:
# Environment for the inference cells below: expose only devices "0,3" to CUDA
# and set TOKENIZERS_PARALLELISM to "false".
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0,3",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [14]:
@dataclass
class VLLMModelRunCfg:
    """Per-model engine settings that the inference loop forwards to ``LLM(...)``."""

    # Passed as LLM(tensor_parallel_size=...).
    tensor_parallel_size: int = 1
    # Passed as LLM(gpu_memory_utilization=...).
    gpu_memory_utilization: float = 0.9
    # Passed as LLM(max_model_len=...).
    max_model_len: int = 4_096

In [15]:
# NOTE(review): `params` is not referenced by any later visible cell; run_routed_vllm_inference
# builds its own SamplingParams — confirm whether this cell is still needed.
params = SamplingParams(temperature=TEMPERATURE, n=1)

In [16]:
def unload_model(llm: "LLM") -> None:
    """Best-effort release of a vLLM model so another one can be loaded.

    Removes the ``llm_engine`` attribute from ``llm`` (when present), runs the
    garbage collector and, if CUDA is available, empties PyTorch's CUDA cache.

    Note that ``del llm`` below only drops this function's local reference;
    the caller must also drop its own reference for the wrapper object itself
    to become collectable.

    Args:
        llm: The model wrapper to tear down. Any object is accepted; objects
            without an ``llm_engine`` attribute are simply garbage-collected.
    """
    try:
        if hasattr(llm, "llm_engine"):
            del llm.llm_engine
    except Exception as exc:
        # Stay best-effort (never raise from cleanup), but do not hide the
        # failure: a half-unloaded engine explains a later out-of-memory error.
        print(f"⚠️ unload_model: could not drop llm_engine: {exc!r}")
    del llm
    # Two passes, as before: the first pass can make further objects collectable.
    gc.collect()
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

In [17]:
# Inspect one full prompt exactly as it will be handed to the chat template.
print(routing_dataset_df["prompt_scored"].iloc[0])

Zou and Chou are practicing their $100$ -meter sprints by running $6$ races against each other. Zou wins the first race, and after that, the probability that one of them wins a race is $\frac23$ if they won the previous race but only $\frac13$ if they lost the previous race. The probability that Zou will win exactly $5$ of the $6$ races is $\frac mn$ , where $m$ and $n$ are relatively prime positive integers. Find $m+n$ . Let's think step by step and output the final answer within \boxed{}.


In [18]:
def batch_apply_chat_template(problems, tokenizer):
    """Wrap each problem in a single-turn user chat and render it with the tokenizer's chat template.

    Returns one rendered prompt per problem, in input order. Rendering is done
    with ``tokenize=False`` and ``add_generation_prompt=True``.
    """

    def _render(question):
        conversation = [{"role": "user", "content": question}]
        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )

    return [_render(question) for question in problems]

def count_input_tokens_batch(prompts: list[str], tokenizer) -> list[int]:
    """Return the token count of each prompt, tokenized in one call without special tokens."""
    encoded = tokenizer(prompts, add_special_tokens=False)
    # encoded["input_ids"] holds one id sequence per prompt.
    return list(map(len, encoded["input_ids"]))

# Divisor that converts raw token counts into millions of tokens for the per-million price rates.
TOKENS_PER_MILLION = 10 ** 6

In [19]:
def _llm_init_kwargs(model_name: str, cfg: "VLLMModelRunCfg") -> Dict[str, Any]:
    """Build the keyword arguments for ``LLM(...)`` for one routed model."""
    kwargs: Dict[str, Any] = {
        "model": model_name,
        "tensor_parallel_size": cfg.tensor_parallel_size,
        "gpu_memory_utilization": cfg.gpu_memory_utilization,
        "max_model_len": cfg.max_model_len,
    }
    if "72" in model_name:
        # Models whose name contains "72" are loaded with explicit scheduler caps.
        kwargs["max_num_seqs"] = 64
        kwargs["max_num_batched_tokens"] = 8192
    return kwargs


def run_routed_vllm_inference(
    df: pd.DataFrame,
    *,
    route_col: str,
    prompt_col: str,
    out_text_col: str = "response_text",
    out_model_col: str = "response_model",
    input_num_tokens_col: str = "input_num_tokens",
    out_tok_col: str = "response_num_tokens",
    out_latency_col: str = "response_latency_s",
    out_err_col: str = "response_error",
    input_cost_col: str = "input_cost_usd",
    output_cost_col: str = "output_cost_usd",
    total_cost_col: str = "total_cost_usd",
    pricing_config: Optional[dict] = None,
    temperature: float = 0.0,
    max_tokens: int = 3000,
    n: int = 1,
    batch_size: int = 32,
    checkpoint_path: Optional[str] = None,
    model_run_cfgs: Optional[Dict[str, VLLMModelRunCfg]] = None,
) -> pd.DataFrame:
    """
    Runs vLLM inference grouped by df[route_col], and writes results back into df.
    Safe to resume if out_text_col already filled.

    ``df`` is modified in place and also returned. Rows are addressed by index
    label, so the index is expected to be unique. One model is loaded at a time
    and unloaded again (even if an exception escapes) before the next one.

    Args:
        df: Frame holding the prompts and the routing decision.
        route_col: Column with the model name each row is routed to; rows with
            a null route are skipped.
        prompt_col: Column with the raw problem text; each entry is wrapped in
            a single-turn user chat via ``batch_apply_chat_template``.
        out_text_col, out_model_col, input_num_tokens_col, out_tok_col,
        out_latency_col, out_err_col, input_cost_col, output_cost_col,
        total_cost_col: Names of the output columns (created when missing).
            Only rows whose ``out_text_col`` is null are processed.
        pricing_config: ``{model_name: {"model_costs": {"input_per_mill": ...,
            "output_per_mill": ...}}}`` in USD per million tokens. Models
            without both rates get NaN costs.
        temperature, max_tokens, n: Forwarded to ``SamplingParams``. Only the
            first completion of each request is stored, even when ``n > 1``.
        batch_size: Number of rows sent to ``llm.generate`` per call.
        checkpoint_path: If given, ``df`` is written there as parquet after
            every batch and after every model.
        model_run_cfgs: Per-model engine settings; models without an entry use
            ``VLLMModelRunCfg()`` defaults.

    Returns:
        The same ``df`` object with the output columns filled in. The recorded
        latency is the wall-clock time of the whole batch, written to every
        row of that batch. A failed batch keeps a null ``out_text_col`` (so it
        is retried on the next call) and stores ``repr(exception)`` in
        ``out_err_col``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently skip every row.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if model_run_cfgs is None:
        model_run_cfgs = {}

    if pricing_config is None:
        # pass SIMPLE_MODEL_POOL_CONFIG here when you call the function
        pricing_config = {}

    # Ensure output columns exist
    for col, default in [
        (out_text_col, None),
        (out_model_col, None),
        (input_num_tokens_col, np.nan),
        (out_tok_col, np.nan),
        (out_latency_col, np.nan),
        (out_err_col, None),
        (input_cost_col, np.nan),
        (output_cost_col, np.nan),
        (total_cost_col, np.nan),
    ]:
        if col not in df.columns:
            df[col] = default

    # Only process rows without outputs yet
    pending_mask = df[out_text_col].isna()
    if pending_mask.sum() == 0:
        print("✅ Nothing to do: all rows already have responses.")
        return df

    routes = df.loc[pending_mask, route_col].dropna().unique().tolist()
    print(f"Routes to run: {routes}")

    for model_name in routes:
        model_mask = pending_mask & (df[route_col] == model_name)
        idxs = df.index[model_mask].tolist()
        if not idxs:
            continue

        cfg = model_run_cfgs.get(model_name, VLLMModelRunCfg())
        print(f"\n=== Running model: {model_name} | rows: {len(idxs)} ===")
        print(f"vLLM cfg: {cfg}")

        # Pull pricing for this model (if available)
        model_costs = pricing_config.get(model_name, {}).get("model_costs", {})
        in_rate = model_costs.get("input_per_mill", None)   # USD per 1,000,000 input tokens
        out_rate = model_costs.get("output_per_mill", None) # USD per 1,000,000 output tokens
        has_pricing = (in_rate is not None) and (out_rate is not None)
        if not has_pricing:
            print(f"⚠️ No pricing found for {model_name} in pricing_config[...]['model_costs']; costs will be NaN.")

        llm = LLM(**_llm_init_kwargs(model_name, cfg))
        try:
            sampling = SamplingParams(
                temperature=temperature,
                max_tokens=max_tokens,
                n=n,
            )

            # Public accessor instead of reaching into llm.llm_engine internals,
            # whose attribute layout differs between vLLM versions.
            tokenizer = llm.get_tokenizer()

            # Process in batches
            for start in tqdm(range(0, len(idxs), batch_size), desc=f"Inferencing {model_name}"):
                batch_idxs = idxs[start : start + batch_size]
                problems = df.loc[batch_idxs, prompt_col].tolist()

                prompts = batch_apply_chat_template(problems, tokenizer)
                input_tok_counts = count_input_tokens_batch(prompts, tokenizer)

                t0 = time.time()
                try:
                    outputs = llm.generate(prompts, sampling_params=sampling)
                    latency = time.time() - t0

                    # vLLM returns outputs aligned with prompts
                    texts = []
                    out_tok_counts = []
                    for out in outputs:
                        # If n>1 you might want list; here we take first completion by default
                        comp = out.outputs[0]
                        texts.append(comp.text)
                        out_tok_counts.append(len(comp.token_ids) if comp.token_ids is not None else np.nan)

                    df.loc[batch_idxs, out_text_col] = texts
                    df.loc[batch_idxs, out_model_col] = model_name
                    df.loc[batch_idxs, input_num_tokens_col] = input_tok_counts
                    df.loc[batch_idxs, out_tok_col] = out_tok_counts
                    df.loc[batch_idxs, out_latency_col] = latency
                    df.loc[batch_idxs, out_err_col] = [None] * len(outputs)

                    # Compute + write costs (vectorized on the batch)
                    if has_pricing:
                        in_arr = np.array(input_tok_counts, dtype=float)
                        out_arr = np.array(out_tok_counts, dtype=float)

                        input_costs = (in_arr / TOKENS_PER_MILLION) * float(in_rate)
                        output_costs = (out_arr / TOKENS_PER_MILLION) * float(out_rate)
                        total_costs = input_costs + output_costs
                    else:
                        input_costs = output_costs = total_costs = np.nan

                    df.loc[batch_idxs, input_cost_col] = input_costs
                    df.loc[batch_idxs, output_cost_col] = output_costs
                    df.loc[batch_idxs, total_cost_col] = total_costs

                except Exception as e:
                    latency = time.time() - t0
                    # record error per-row so you can retry later
                    df.loc[batch_idxs, out_text_col] = None
                    df.loc[batch_idxs, out_model_col] = model_name
                    df.loc[batch_idxs, input_num_tokens_col] = np.nan
                    df.loc[batch_idxs, out_tok_col] = np.nan
                    df.loc[batch_idxs, out_latency_col] = latency
                    df.loc[batch_idxs, out_err_col] = repr(e)

                    # also blank costs on error
                    df.loc[batch_idxs, input_cost_col] = np.nan
                    df.loc[batch_idxs, output_cost_col] = np.nan
                    df.loc[batch_idxs, total_cost_col] = np.nan

                # checkpoint frequently (optional)
                if checkpoint_path is not None:
                    df.to_parquet(checkpoint_path, index=True)
        finally:
            # Always release the model, including when the loop above is
            # interrupted or raises, so the next load does not find the GPU full.
            unload_model(llm)
            # Drop this frame's reference too; otherwise the old wrapper stays
            # alive until `llm` is rebound, i.e. while the next model loads.
            del llm

        # refresh pending_mask for the next model
        pending_mask = df[out_text_col].isna()

        if checkpoint_path is not None:
            df.to_parquet(checkpoint_path, index=True)

    return df

In [20]:
# Engine settings per routed model: (model name, tensor-parallel size, GPU memory fraction).
# Every model uses the same 4096-token context limit.
model_run_cfgs = {
    model_name: VLLMModelRunCfg(
        tensor_parallel_size=tp_size,
        gpu_memory_utilization=mem_fraction,
        max_model_len=4096,
    )
    for model_name, tp_size, mem_fraction in [
        ("Qwen/Qwen2.5-Math-1.5B-Instruct", 1, 0.60),
        ("Qwen/Qwen2.5-Math-7B-Instruct", 1, 0.70),
        ("Qwen/Qwen2.5-Math-72B-Instruct", 2, 0.92),
    ]
}

In [21]:
# Check the routing decisions on the first few rows before starting the expensive inference run.
routing_dataset_df.head()

Unnamed: 0,idx,dataset,prompt_scored,formatted,score_raw,score,layer,pos,original_solution,Year,Problem Number,route_to
830,830,gneubig/aime-1983-2024,Zou and Chou are practicing their $100$ -meter...,<|im_start|>system\nPlease reason step by step...,0.956127,0.722346,18,-1,97,2021,1,Qwen/Qwen2.5-Math-7B-Instruct
70,70,gneubig/aime-1983-2024,Compute \[\frac{(10^4+324)(22^4+324)(34^4+324)...,<|im_start|>system\nPlease reason step by step...,-0.575689,0.359925,18,-1,373,1987,14,Qwen/Qwen2.5-Math-72B-Instruct
631,631,gneubig/aime-1983-2024,"Let $A=\{1,2,3,4\}$ , and $f$ and $g$ be rando...",<|im_start|>system\nPlease reason step by step...,0.533384,0.630272,18,-1,453,2014,12,Qwen/Qwen2.5-Math-7B-Instruct
506,506,gneubig/aime-1983-2024,The sequence $(a_n)$ satisfies $a_0=0$ and $a_...,<|im_start|>system\nPlease reason step by step...,-0.070733,0.482324,18,-1,983,2009,14,Qwen/Qwen2.5-Math-72B-Instruct
704,704,gneubig/aime-1983-2024,Triangle $ABC$ is inscribed in circle $\omega$...,<|im_start|>system\nPlease reason step by step...,-1.206015,0.230407,18,-1,43,2016,10,Qwen/Qwen2.5-Math-72B-Instruct


In [22]:
# 3) Generate answers model by model; results are written into the SAME df, which is also returned.
routing_dataset_df = run_routed_vllm_inference(
    routing_dataset_df,
    prompt_col="prompt_scored",
    route_col="route_to",
    out_text_col="routed_response_text",
    input_num_tokens_col="input_num_tokens",
    pricing_config=SIMPLE_MODEL_POOL_CONFIG,
    model_run_cfgs=model_run_cfgs,
    temperature=TEMPERATURE,
    max_tokens=3000,
    batch_size=16,  # raise if stable, lower if OOM
    checkpoint_path="aime_routed_outputs.parquet",
)

Routes to run: ['Qwen/Qwen2.5-Math-7B-Instruct', 'Qwen/Qwen2.5-Math-72B-Instruct', 'Qwen/Qwen2.5-Math-1.5B-Instruct']

=== Running model: Qwen/Qwen2.5-Math-7B-Instruct | rows: 25 ===
vLLM cfg: VLLMModelRunCfg(tensor_parallel_size=1, gpu_memory_utilization=0.7, max_model_len=4096)
INFO 12-26 19:51:49 [utils.py:326] non-default args: {'model': 'Qwen/Qwen2.5-Math-7B-Instruct', 'max_model_len': 4096, 'gpu_memory_utilization': 0.7, 'disable_log_stats': True}


INFO 12-26 19:51:58 [__init__.py:711] Resolved architecture: Qwen2ForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 12-26 19:51:59 [__init__.py:1750] Using max model len 4096


2025-12-26 19:52:00,356	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 12-26 19:52:00 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=16384.
[1;36m(EngineCore_0 pid=3059686)[0;0m INFO 12-26 19:52:01 [core.py:636] Waiting for init message from front-end.
[1;36m(EngineCore_0 pid=3059686)[0;0m INFO 12-26 19:52:01 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='Qwen/Qwen2.5-Math-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-Math-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observab

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:01,  2.05it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:00<00:00,  2.01it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:01<00:00,  1.95it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.91it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.94it/s]
[1;36m(EngineCore_0 pid=3059686)[0;0m 


[1;36m(EngineCore_0 pid=3059686)[0;0m INFO 12-26 19:52:09 [default_loader.py:262] Loading weights took 2.12 seconds
[1;36m(EngineCore_0 pid=3059686)[0;0m INFO 12-26 19:52:10 [gpu_model_runner.py:2007] Model loading took 14.2419 GiB and 2.782169 seconds
[1;36m(EngineCore_0 pid=3059686)[0;0m INFO 12-26 19:52:15 [backends.py:548] Using cache directory: /home/lina4335/.cache/vllm/torch_compile_cache/4858120f0f/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_0 pid=3059686)[0;0m INFO 12-26 19:52:15 [backends.py:559] Dynamo bytecode transform time: 5.47 s
[1;36m(EngineCore_0 pid=3059686)[0;0m INFO 12-26 19:52:19 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 3.199 s
[1;36m(EngineCore_0 pid=3059686)[0;0m INFO 12-26 19:52:19 [monitor.py:34] torch.compile takes 5.47 s in total
[1;36m(EngineCore_0 pid=3059686)[0;0m INFO 12-26 19:52:20 [gpu_worker.py:276] Available KV cache memory: 35.47 GiB
[1;36m(EngineCore_0 pid=3059686)

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:01<00:00, 37.91it/s]


[1;36m(EngineCore_0 pid=3059686)[0;0m INFO 12-26 19:52:23 [gpu_model_runner.py:2708] Graph capturing finished in 2 secs, took 0.64 GiB
[1;36m(EngineCore_0 pid=3059686)[0;0m INFO 12-26 19:52:23 [core.py:214] init engine (profile, create kv cache, warmup model) took 12.93 seconds
INFO 12-26 19:52:23 [llm.py:298] Supported_tasks: ['generate']


Adding requests: 100%|██████████| 16/16 [00:00<00:00, 2175.89it/s]0:00<?, ?it/s]
Processed prompts: 100%|██████████| 16/16 [00:20<00:00,  1.29s/it, est. speed input: 112.31 toks/s, output: 1017.09 toks/s]
Adding requests: 100%|██████████| 9/9 [00:00<00:00, 2406.06it/s][00:20<00:20, 20.64s/it]
Processed prompts: 100%|██████████| 9/9 [00:20<00:00,  2.24s/it, est. speed input: 57.88 toks/s, output: 605.29 toks/s]
Inferencing Qwen/Qwen2.5-Math-7B-Instruct: 100%|██████████| 2/2 [00:40<00:00, 20.45s/it]



=== Running model: Qwen/Qwen2.5-Math-72B-Instruct | rows: 66 ===
vLLM cfg: VLLMModelRunCfg(tensor_parallel_size=2, gpu_memory_utilization=0.92, max_model_len=4096)
INFO 12-26 19:53:08 [utils.py:326] non-default args: {'model': 'Qwen/Qwen2.5-Math-72B-Instruct', 'max_model_len': 4096, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.92, 'max_num_batched_tokens': 8192, 'max_num_seqs': 64, 'disable_log_stats': True}
INFO 12-26 19:53:09 [__init__.py:711] Resolved architecture: Qwen2ForCausalLM
INFO 12-26 19:53:09 [__init__.py:1750] Using max model len 4096
INFO 12-26 19:53:10 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 12-26 19:53:17 [__init__.py:241] Automatically detected platform cuda.
[1;36m(EngineCore_0 pid=3062104)[0;0m INFO 12-26 19:53:19 [core.py:636] Waiting for init message from front-end.
[1;36m(EngineCore_0 pid=3062104)[0;0m INFO 12-26 19:53:19 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='Qwen/Qwen2

Loading safetensors checkpoint shards:   0% Completed | 0/37 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   3% Completed | 1/37 [00:00<00:22,  1.63it/s]
Loading safetensors checkpoint shards:   5% Completed | 2/37 [00:01<00:20,  1.68it/s]
Loading safetensors checkpoint shards:   8% Completed | 3/37 [00:01<00:19,  1.74it/s]
Loading safetensors checkpoint shards:  11% Completed | 4/37 [00:02<00:20,  1.61it/s]
Loading safetensors checkpoint shards:  14% Completed | 5/37 [00:03<00:20,  1.54it/s]
Loading safetensors checkpoint shards:  16% Completed | 6/37 [00:03<00:19,  1.56it/s]
Loading safetensors checkpoint shards:  19% Completed | 7/37 [00:04<00:18,  1.66it/s]
Loading safetensors checkpoint shards:  22% Completed | 8/37 [00:04<00:16,  1.72it/s]
Loading safetensors checkpoint shards:  24% Completed | 9/37 [00:05<00:16,  1.66it/s]
Loading safetensors checkpoint shards:  27% Completed | 10/37 [00:05<00:14,  1.83it/s]
Loading safetensors checkpoint shards:  30% Completed | 11/37

[1;36m(VllmWorker TP1 pid=3062570)[0;0m INFO 12-26 19:53:53 [default_loader.py:262] Loading weights took 21.39 seconds


Loading safetensors checkpoint shards:  97% Completed | 36/37 [00:21<00:00,  1.64it/s]


[1;36m(VllmWorker TP1 pid=3062570)[0;0m INFO 12-26 19:53:54 [gpu_model_runner.py:2007] Model loading took 67.7935 GiB and 22.137338 seconds


Loading safetensors checkpoint shards: 100% Completed | 37/37 [00:21<00:00,  1.66it/s]
Loading safetensors checkpoint shards: 100% Completed | 37/37 [00:21<00:00,  1.70it/s]
[1;36m(VllmWorker TP0 pid=3062569)[0;0m 


[1;36m(VllmWorker TP0 pid=3062569)[0;0m INFO 12-26 19:53:54 [default_loader.py:262] Loading weights took 21.76 seconds
[1;36m(VllmWorker TP0 pid=3062569)[0;0m INFO 12-26 19:53:54 [gpu_model_runner.py:2007] Model loading took 67.7935 GiB and 22.839806 seconds
[1;36m(VllmWorker TP1 pid=3062570)[0;0m INFO 12-26 19:54:06 [backends.py:548] Using cache directory: /home/lina4335/.cache/vllm/torch_compile_cache/9b2b1d4bcb/rank_1_0/backbone for vLLM's torch.compile
[1;36m(VllmWorker TP1 pid=3062570)[0;0m INFO 12-26 19:54:06 [backends.py:559] Dynamo bytecode transform time: 12.04 s
[1;36m(VllmWorker TP0 pid=3062569)[0;0m INFO 12-26 19:54:07 [backends.py:548] Using cache directory: /home/lina4335/.cache/vllm/torch_compile_cache/9b2b1d4bcb/rank_0_0/backbone for vLLM's torch.compile
[1;36m(VllmWorker TP0 pid=3062569)[0;0m INFO 12-26 19:54:07 [backends.py:559] Dynamo bytecode transform time: 12.78 s
[1;36m(VllmWorker TP1 pid=3062570)[0;0m INFO 12-26 19:54:16 [backends.py:161] Directly 

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 19/19 [00:01<00:00, 15.13it/s]


[1;36m(VllmWorker TP1 pid=3062570)[0;0m INFO 12-26 19:54:23 [custom_all_reduce.py:196] Registering 3059 cuda graph addresses
[1;36m(VllmWorker TP0 pid=3062569)[0;0m INFO 12-26 19:54:23 [custom_all_reduce.py:196] Registering 3059 cuda graph addresses
[1;36m(VllmWorker TP1 pid=3062570)[0;0m INFO 12-26 19:54:23 [gpu_model_runner.py:2708] Graph capturing finished in 2 secs, took 0.44 GiB
[1;36m(VllmWorker TP0 pid=3062569)[0;0m INFO 12-26 19:54:23 [gpu_model_runner.py:2708] Graph capturing finished in 2 secs, took 0.44 GiB
[1;36m(EngineCore_0 pid=3062104)[0;0m INFO 12-26 19:54:23 [core.py:214] init engine (profile, create kv cache, warmup model) took 28.76 seconds
INFO 12-26 19:54:24 [llm.py:298] Supported_tasks: ['generate']


Adding requests: 100%|██████████| 16/16 [00:00<00:00, 2306.62it/s]00:00<?, ?it/s]
Processed prompts: 100%|██████████| 16/16 [01:17<00:00,  4.83s/it, est. speed input: 31.27 toks/s, output: 220.83 toks/s]
Adding requests: 100%|██████████| 16/16 [00:00<00:00, 2020.32it/s]01:17<05:09, 77.33s/it]
Processed prompts: 100%|██████████| 16/16 [00:48<00:00,  3.01s/it, est. speed input: 62.48 toks/s, output: 285.15 toks/s]
Adding requests: 100%|██████████| 16/16 [00:00<00:00, 3594.09it/s]02:05<03:00, 60.18s/it]
Processed prompts: 100%|██████████| 16/16 [01:29<00:00,  5.58s/it, est. speed input: 25.98 toks/s, output: 231.79 toks/s]
Adding requests: 100%|██████████| 16/16 [00:00<00:00, 3263.26it/s]03:34<02:26, 73.48s/it]
Processed prompts: 100%|██████████| 16/16 [01:29<00:00,  5.57s/it, est. speed input: 30.69 toks/s, output: 213.89 toks/s]
Adding requests: 100%|██████████| 2/2 [00:00<00:00, 2243.54it/s] [05:03<01:19, 79.68s/it]
Processed prompts: 100%|██████████| 2/2 [00:38<00:00, 19.15s/it, est. 

[1;36m(VllmWorker TP0 pid=3062569)[0;0m INFO 12-26 20:00:06 [multiproc_executor.py:520] Parent process exited, terminating worker
[1;36m(VllmWorker TP1 pid=3062570)[0;0m INFO 12-26 20:00:06 [multiproc_executor.py:520] Parent process exited, terminating worker

=== Running model: Qwen/Qwen2.5-Math-1.5B-Instruct | rows: 9 ===
vLLM cfg: VLLMModelRunCfg(tensor_parallel_size=1, gpu_memory_utilization=0.6, max_model_len=4096)
INFO 12-26 20:00:10 [utils.py:326] non-default args: {'model': 'Qwen/Qwen2.5-Math-1.5B-Instruct', 'max_model_len': 4096, 'gpu_memory_utilization': 0.6, 'disable_log_stats': True}
INFO 12-26 20:00:11 [__init__.py:711] Resolved architecture: Qwen2ForCausalLM
INFO 12-26 20:00:11 [__init__.py:1750] Using max model len 4096
INFO 12-26 20:00:11 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 12-26 20:00:20 [__init__.py:241] Automatically detected platform cuda.
[1;36m(EngineCore_0 pid=3075198)[0;0m INFO 12-26 20:00:22 [core.py:636] 

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.07it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.07it/s]
[1;36m(EngineCore_0 pid=3075198)[0;0m 


[1;36m(EngineCore_0 pid=3075198)[0;0m INFO 12-26 20:00:26 [default_loader.py:262] Loading weights took 0.59 seconds
[1;36m(EngineCore_0 pid=3075198)[0;0m INFO 12-26 20:00:26 [gpu_model_runner.py:2007] Model loading took 2.8798 GiB and 1.293867 seconds
[1;36m(EngineCore_0 pid=3075198)[0;0m INFO 12-26 20:00:32 [backends.py:548] Using cache directory: /home/lina4335/.cache/vllm/torch_compile_cache/63465de3ab/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_0 pid=3075198)[0;0m INFO 12-26 20:00:32 [backends.py:559] Dynamo bytecode transform time: 5.14 s
[1;36m(EngineCore_0 pid=3075198)[0;0m INFO 12-26 20:00:35 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 3.149 s
[1;36m(EngineCore_0 pid=3075198)[0;0m INFO 12-26 20:00:36 [monitor.py:34] torch.compile takes 5.14 s in total
[1;36m(EngineCore_0 pid=3075198)[0;0m INFO 12-26 20:00:36 [gpu_worker.py:276] Available KV cache memory: 38.98 GiB
[1;36m(EngineCore_0 pid=3075198)[

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:01<00:00, 48.52it/s]


[1;36m(EngineCore_0 pid=3075198)[0;0m INFO 12-26 20:00:38 [gpu_model_runner.py:2708] Graph capturing finished in 2 secs, took 0.58 GiB
[1;36m(EngineCore_0 pid=3075198)[0;0m INFO 12-26 20:00:38 [core.py:214] init engine (profile, create kv cache, warmup model) took 11.87 seconds
INFO 12-26 20:00:39 [llm.py:298] Supported_tasks: ['generate']


Adding requests: 100%|██████████| 9/9 [00:00<00:00, 2547.32it/s]1 [00:00<?, ?it/s]
Processed prompts: 100%|██████████| 9/9 [00:03<00:00,  2.91it/s, est. speed input: 337.52 toks/s, output: 1898.57 toks/s]
Inferencing Qwen/Qwen2.5-Math-1.5B-Instruct: 100%|██████████| 1/1 [00:03<00:00,  3.15s/it]


In [15]:
# Load saved routed outputs for scoring.
# NOTE(review): this file name differs from the checkpoint written by the inference cell
# ("aime_routed_outputs.parquet") and no visible cell writes it — confirm where it comes from.
ROUTED_DF = pd.read_parquet("gneubig_aime-1983-2024_routed_final.parquet")

In [16]:
# compute_score and extract_solution are already imported in the first cell (from thom_replication.utils.verification_math).

In [29]:
# Score each routed response against the reference answer, one row at a time.
# NOTE(review): rows whose generation failed may hold a null routed_response_text —
# confirm compute_score tolerates that input.
ROUTED_DF["routed_is_correct"] = ROUTED_DF.apply(lambda row: compute_score(row["routed_response_text"], row["original_solution"]), axis=1)

In [30]:
# Mean of the per-row scores; presumably the overall accuracy if compute_score returns 0/1 — TODO confirm.
ROUTED_DF["routed_is_correct"].mean()

0.3322615219721329