In [1]:
import json
import re
import argparse
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import vllm.envs as envs
from tqdm import tqdm
from utils.parser import extract_answer



In [2]:
import os
# Restrict this process to physical GPU 1; the value is a comma-separated list of device ids.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# One entry per visible device; the length is later used as the tensor-parallel size.
available_gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(",")

if len(available_gpus) == 1:
    # The original expression `"0.0.0.0" or "127.0.0.1"` always evaluates to its
    # first (truthy) operand, so the value actually assigned is "0.0.0.0".
    envs.VLLM_HOST_IP = "0.0.0.0"

In [3]:
# Local checkpoint of the judge model.
# A larger judge such as Qwen2.5-32B-Instruct may work better.
model_path = "/nas/shared/GAIR/ckpts/Qwen2.5/Qwen2.5-7B-Instruct"

# Engine options: shard across every visible GPU and let vLLM use 96% of GPU memory.
# (A `swap_space=60` setting was tried previously and is intentionally left disabled.)
engine_options = {
    "model": model_path,
    "tensor_parallel_size": len(available_gpus),
    "trust_remote_code": True,
    "gpu_memory_utilization": 0.96,
}
llm = LLM(**engine_options)

# Tokenizer from the same checkpoint; used later to render chat-template prompts.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# One completion per prompt, temperature 0 with no nucleus truncation,
# capped at 1024 generated tokens.
decoding_options = {
    "temperature": 0.0,
    "max_tokens": 1024,
    "n": 1,
    "top_p": 1,
}
sampling_params = SamplingParams(**decoding_options)


INFO 12-29 22:30:20 config.py:350] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
INFO 12-29 22:30:20 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='/nas/shared/GAIR/ckpts/Qwen2.5/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='/nas/shared/GAIR/ckpts/Qwen2.5/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 12-29 22:30:24 model_runner.py:1077] Loading model weights took 14.2487 GB
INFO 12-29 22:30:26 worker.py:232] Memory profiling results: total_gpu_memory=79.33GiB initial_memory_usage=14.79GiB peak_torch_memory=18.60GiB memory_usage_post_profile=14.81GiB non_torch_memory=0.55GiB kv_cache_size=57.00GiB gpu_memory_utilization=0.96
INFO 12-29 22:30:27 gpu_executor.py:113] # GPU blocks: 66706, # CPU blocks: 4681
INFO 12-29 22:30:27 gpu_executor.py:117] Maximum concurrency for 32768 tokens per request: 32.57x
INFO 12-29 22:30:29 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 12-29 22:30:29 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.


In [13]:
# Load the evaluation set: a JSON Lines file with one JSON object per line.
# The encoding is given explicitly because the file holds Chinese text and
# `open()` otherwise falls back to a locale-dependent default encoding.
# Blank lines (e.g. a trailing newline at end of file) are skipped so they do
# not crash `json.loads`.
with open("./data/gaokao/qwq_gaokao2024_新课标1.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Show the first record as a quick sanity check of the schema.
data[0]

{'question': '1. 已知集合 $A=\\left\\{x \\mid-5<x^3<5\\right\\}, B=\\{-3,-1,0,2,3\\}$, 则 $A \\cap B=$\nA. $\\{-1,0\\}$\nB. $\\{2,3\\}$\nC. $\\{-3,-1,0\\}$\nD. $\\{-1,0,2\\}$',
 'generated_responses': ["So I have this problem here: I need to find the intersection of sets A and B. Set A is defined as all x such that -5 < x^3 < 5, and set B is {-3, -1, 0, 2, 3}. I need to find A intersect B.\n\nFirst, I need to understand what set A contains. It's all the x values where x cubed is greater than -5 and less than 5. So, I need to find all x where -5 < x^3 < 5.\n\nTo get a better sense of this, maybe I should find the cube roots of -5 and 5 to see what range x is in.\n\nThe cube root of -5 is... let's see, (-1.710)^3 is approximately -5, and the cube root of 5 is about 1.710. So, x is between approximately -1.710 and 1.710.\n\nWait, but I should be precise. Let me denote c = cube root of 5, which is roughly 1.710, and -c for negative cube root of 5.\n\nSo, A = {x | -c < x < c}, where c is the cub

#### Model Judge

In [7]:
# System instructions for the judge model. They are identical for every
# example, so they are defined once instead of inside the loop.
JUDGE_SYSTEM_PROMPT = """You are an experienced examiner who evaluates whether a student's answer to a given question is correct. 
Your task is to determine if the student's final answer matches the standard answer provided, based solely on correctness and the question's specific requirements. 
Do not perform any additional calculations or reinterpret the question. Simply compare the student's answer to the standard answer to determine if it satisfies the question's requirements.

Focus strictly on:
1. Understanding the exact requirement of the question.
2. Comparing the student's final answer directly to the provided standard answer.
3. Your task is not to solve the problem but to determine whether the student's answer is correct based on the question's requirements. Avoid any unnecessary analysis, assumptions, or re-solving the problem.

Note:
- For intervals/ranges: The student's answer must cover the EXACT SAME range as the standard answer, NOT just any single value or subset within that range;
- If the standard answer contains multiple solutions connected by "or"/"and", all of them must be listed in the student's answer;
- You must be deterministic - always declare the answer as either CORRECT or WRONG;

Your response must include:
## Analysis
<Provide a brief and direct analysis that compares the student's answer to the standard answer>

## Correctness
<CORRECT/WRONG>"""


def build_judge_messages(problem, standard_answer, student_answer):
    """Return the system + user chat messages for judging one student answer."""
    user_content = f"""Question: {problem}

Standard Answer: {standard_answer}

Student's Final Answer: {student_answer}"""
    return [
        {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]


# One rendered prompt per example, in the same order as `data`.
prompt_batch = []

for example in tqdm(data, total=len(data)):
    problem = example["question"]
    standard_answer = example["gold_answer"]
    generated_responses = example["generated_responses"]

    # Answers extracted by a model. For rule-based extraction use instead:
    # generated_answers = [extract_answer(r) for r in generated_responses]
    generated_answers = example["generated_answers"]

    # Only the first extracted answer is compared against the gold answer.
    messages = build_judge_messages(problem, standard_answer, generated_answers[0])

    # Render the chat messages into one prompt string ending with the generation prompt.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Open-source model path: queue the rendered string.
    # For an API model, queue the raw messages instead:
    # prompt_batch.append(messages)
    prompt_batch.append(prompt)

# Display the first prompt for debugging
print(prompt_batch[0])

100%|██████████| 19/19 [00:00<00:00, 22112.04it/s]

<|im_start|>system
You are an experienced examiner who evaluates whether a student's answer to a given question is correct. 
Your task is to determine if the student's final answer matches the standard answer provided, based solely on correctness and the question's specific requirements. 
Do not perform any additional calculations or reinterpret the question. Simply compare the student's answer to the standard answer to determine if it satisfies the question's requirements.

Focus strictly on:
1. Understanding the exact requirement of the question.
2. Comparing the student's final answer directly to the provided standard answer.
3. Your task is not to solve the problem but to determine whether the student's answer is correct based on the question's requirements. Avoid any unnecessary analysis, assumptions, or re-solving the problem.

Your response must include:
## Analysis
<Provide a brief and direct analysis that compares the student's answer to the standard answer>

## Correctness
<C




In [8]:
# Judge every prompt in one batched call. Later cells index `completions` by the
# position of the example in `data`, so the results are assumed to come back in
# prompt order (vLLM documents this for `LLM.generate` -- confirm on upgrade).
completions = llm.generate(prompts=prompt_batch, sampling_params=sampling_params)

Processed prompts:   0%|          | 0/19 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 19/19 [00:03<00:00,  5.58it/s, est. speed input: 2029.84 toks/s, output: 370.37 toks/s]


In [9]:
import re

def parse_model_output(output):
    """Extract the judge's analysis and verdict from its raw text response.

    The response is expected to contain a ``## Analysis`` section followed by
    a ``## Correctness`` section whose content is a verdict such as
    ``CORRECT`` or ``WRONG``.

    The verdict is the first explicit verdict word found after the
    ``## Correctness`` heading. A plain substring test for "correct" is not
    used because it would also accept "INCORRECT", "not correct", or any
    trailing explanation such as "WRONG. The correct answer is C".

    Args:
        output: Raw text generated by the judge model. ``None`` or an empty
            string is treated as an unparseable response.

    Returns:
        A tuple ``(analysis, correctness)``. ``analysis`` is the stripped text
        between the two headings, or ``None`` if that section is not found.
        ``correctness`` is ``True`` only when the first verdict word after the
        ``## Correctness`` heading is "correct" (case-insensitive); it is
        ``False`` for any other verdict or when no verdict can be found.
    """
    # Nothing to parse: report "no analysis, not correct".
    if not output:
        return None, False

    # Regex patterns for the two sections of the judge's response.
    analysis_pattern = r"##\s*Analysis\s*(.*?)\s*##\s*Correctness"
    correctness_pattern = r"##\s*Correctness\s*(.*?)$"
    # Negative forms are listed first so that "not correct" is matched as one
    # verdict; the word boundaries stop "incorrect" from matching as "correct".
    verdict_pattern = r"\b(not\s+correct|incorrect|wrong|correct)\b"

    # Extract Analysis
    analysis = None
    analysis_match = re.search(analysis_pattern, output, re.DOTALL)
    if analysis_match:
        analysis = analysis_match.group(1).strip()

    # Extract Correctness
    correctness = False
    correctness_match = re.search(correctness_pattern, output, re.DOTALL)
    if correctness_match:
        # With DOTALL and no MULTILINE the group runs to the end of the text,
        # so only the first verdict word in it is trusted.
        verdict_match = re.search(
            verdict_pattern, correctness_match.group(1), re.IGNORECASE
        )
        if verdict_match:
            correctness = verdict_match.group(1).lower() == "correct"

    return analysis, correctness


# Running count of examples the judge marked as correct.
correct_cnt = 0

for idx, example in enumerate(data):
    problem = example["question"]
    standard_answer = example["gold_answer"]
    generated_responses = example["generated_responses"]

    # Answers extracted by a model. For rule-based extraction use instead:
    # generated_answers = [extract_answer(r) for r in generated_responses]
    generated_answers = example["generated_answers"]

    # Open-source model path: take the text of the first completion.
    # Assumes `completions` is index-aligned with `data`.
    # For an API model use instead: output = completions[idx]
    output = completions[idx].outputs[0].text

    analysis, correctness = parse_model_output(output)
    correct_cnt += 1 if correctness else 0

    # Assemble the per-example report and emit it with a single print; the
    # final "\n\n" entry reproduces the blank lines that separate examples.
    report_lines = [
        f"## ID: {idx + 1}",
        f"## Question: {problem}",
        "--------------------",
        f"## Standard Answer: {standard_answer}",
        f"## Student's Final Answer: {generated_answers[0]}",
        "--------------------",
        f"## Analysis: {analysis}",
        f"## Correctness: {correctness}",
        "\n\n",
    ]
    print("\n".join(report_lines))


# Total number of answers judged correct.
print(correct_cnt)

## ID: 1
## Question: 1. 已知集合 $A=\left\{x \mid-5<x^3<5\right\}, B=\{-3,-1,0,2,3\}$, 则 $A \cap B=$
A. $\{-1,0\}$
B. $\{2,3\}$
C. $\{-3,-1,0\}$
D. $\{-1,0,2\}$
--------------------
## Standard Answer: A
## Student's Final Answer: \[ \boxed{A} \]
--------------------
## Analysis: The student's answer is \(\boxed{A}\), which corresponds to the set \(\{-1,0\}\). The standard answer also provides the same set, \(\{-1,0\}\).
## Correctness: True



## ID: 2
## Question: 2. 若 $\frac{z}{z-1}=1+i$, 则 $z=$
A. $-1-\mathrm{i}$
B. $-1+\mathrm{i}$
C. $1-\mathrm{i}$
D. $1+\mathrm{i}$
--------------------
## Standard Answer: C
## Student's Final Answer: \[ \boxed{1 - \mathrm{i}} \]
--------------------
## Analysis: The student's final answer is \(1 - \mathrm{i}\), which matches the standard answer provided.
## Correctness: True



## ID: 3
## Question: 3. 已知向量 $\vec{a}=(0,1), \vec{b}=(2, x)$, 若 $\vec{b} \perp(\vec{b}-4 \vec{a})$, 则 $x=$
A. -2
B. -1
C. 1
D. 2
--------------------
## Standard Answer: D
#