In [1]:
import pandas as pd
import os
import sys
import json
import base64
import zlib
import pickle
import threading


from pathlib import Path
from datasets import load_dataset
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Make the repository root importable. The root is taken to be two directory
# levels above the current working directory — TODO confirm this matches where
# the notebook is launched from.
repo_root = Path.cwd().parent.parent
# Prepended (index 0) so the local `thom_replication` package imported below
# takes priority over any other copy on the path.
sys.path.insert(0, str(repo_root))

from thom_replication.utils.code_sandbox_verification import compute_score

  from .autonotebook import tqdm as notebook_tqdm


INFO 01-05 02:04:52 [__init__.py:241] Automatically detected platform cuda.


In [2]:
# Sandbox Fusion endpoint (make sure your sandbox fusion service is running).
# `setdefault` keeps a URL that was already exported before the notebook was
# started and only falls back to the local default when none is configured, so
# pointing the notebook at another sandbox does not require editing this cell.
os.environ.setdefault("SANDBOX_FUSION_URL", "http://localhost:8080/run_code")
print(f"Sandbox Fusion URL: {os.environ['SANDBOX_FUSION_URL']}")

Sandbox Fusion URL: http://localhost:8080/run_code


In [3]:
def format_test_case(test_case):
    """
    Normalise a test-case payload into the column-oriented dict layout
    (the format the original author notes is expected by compute_score).

    Accepted shapes:
      * str  -> treated as JSON text and parsed first, then handled as below
      * list -> [{"input": ..., "output": ...}, ...] is turned into
                {"inputs": [...], "outputs": [...]}
      * dict -> returned as-is (taken to be in the target layout already)

    Raises:
        ValueError: if a string is not valid JSON, or if the (parsed) payload
            is neither a list nor a dict.
        KeyError: if a dict entry of the list lacks "input" or "output".
    """
    payload = test_case

    # Strings are serialised JSON: decode before looking at the structure.
    if isinstance(payload, str):
        try:
            payload = json.loads(payload)
        except json.JSONDecodeError:
            raise ValueError(f"Invalid JSON string: {test_case[:100]}")

    # Already column-oriented: nothing to do.
    if isinstance(payload, dict):
        return payload

    # Row-oriented records: split into parallel input / output lists.
    if isinstance(payload, list):
        return {
            "inputs": [record["input"] for record in payload],
            "outputs": [record["output"] for record in payload],
        }

    raise ValueError(f"Unexpected test_case type: {type(payload)}")

In [53]:
def load_encoded_test_case(encoded_test_case):
    """
    Undo the base64 -> zlib -> pickle encoding wrapped around a stored test case.

    Args:
        encoded_test_case: String containing base64-encoded, zlib-compressed,
            pickled data.

    Returns:
        The unpickled object. In this notebook's recorded run the payload came
        back as a `str` that is subsequently passed to `format_test_case`;
        other payload types are not ruled out by this function.

    Raises:
        Whatever the failing decoding stage raised. The error type, message and
        traceback are printed first, then the exception is re-raised unchanged.
    """
    import traceback

    try:
        # NOTE(review): pickle.loads can execute arbitrary code embedded in the
        # payload — only use this on data from a source you trust.
        test_case = pickle.loads(
            zlib.decompress(
                base64.b64decode(encoded_test_case)
            )
        )
        print(f"✓ Unpickled: {type(test_case)}")
        return test_case
    except Exception as decode_error:
        print("✗ Error decoding test case:")
        print(f"  Type: {type(decode_error).__name__}")
        print(f"  Message: {decode_error}")
        traceback.print_exc()
        raise

def decode_public_test_case(public_test_case_str):
    """
    Parse JSON-encoded public test cases and normalise them with
    `format_test_case`.

    Args:
        public_test_case_str: JSON string containing test cases

    Returns:
        The value `format_test_case` produces for the parsed JSON. For a list
        of {"input": ..., "output": ...} records that is a dict of the form
        {"inputs": [...], "outputs": [...]}.

    Raises:
        json.JSONDecodeError: if the string is not valid JSON; a message is
            printed before the error is re-raised.
    """
    try:
        return format_test_case(json.loads(public_test_case_str))
    except json.JSONDecodeError as decode_error:
        print(f"✗ Error decoding public test case: {decode_error}")
        raise

In [5]:
# Process-wide settings: expose only CUDA device 0 and switch off the
# tokenizers library's own parallelism.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [6]:
# Hugging Face model id under evaluation. It is passed to both the vLLM engine
# and the tokenizer loader, and is embedded in the output parquet file names.
# Alternative previously tried:
# MODEL_TO_TEST = "openai/gpt-oss-20b"
MODEL_TO_TEST = "Qwen/Qwen2.5-Coder-7B"

In [7]:
# Build the vLLM engine for the chosen model; gpu_memory_utilization=0.7 caps
# the share of GPU memory vLLM is allowed to reserve.
model = LLM(model=MODEL_TO_TEST, gpu_memory_utilization=0.7)

INFO 01-05 02:04:53 [utils.py:326] non-default args: {'model': 'Qwen/Qwen2.5-Coder-7B', 'gpu_memory_utilization': 0.7, 'disable_log_stats': True}
INFO 01-05 02:05:01 [__init__.py:711] Resolved architecture: Qwen2ForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 01-05 02:05:02 [__init__.py:1750] Using max model len 32768


2026-01-05 02:05:03,121	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 01-05 02:05:03 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=16384.
[1;36m(EngineCore_0 pid=2734896)[0;0m INFO 01-05 02:05:04 [core.py:636] Waiting for init message from front-end.
[1;36m(EngineCore_0 pid=2734896)[0;0m INFO 01-05 02:05:04 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='Qwen/Qwen2.5-Coder-7B', speculative_config=None, tokenizer='Qwen/Qwen2.5-Coder-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=Ob

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:01,  1.74it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:00<00:00,  2.69it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:01<00:00,  2.02it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.69it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.83it/s]
[1;36m(EngineCore_0 pid=2734896)[0;0m 


[1;36m(EngineCore_0 pid=2734896)[0;0m INFO 01-05 02:05:11 [default_loader.py:262] Loading weights took 2.32 seconds
[1;36m(EngineCore_0 pid=2734896)[0;0m INFO 01-05 02:05:12 [gpu_model_runner.py:2007] Model loading took 14.2488 GiB and 3.263272 seconds
[1;36m(EngineCore_0 pid=2734896)[0;0m INFO 01-05 02:05:18 [backends.py:548] Using cache directory: /home/lina4335/.cache/vllm/torch_compile_cache/f29da84e97/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_0 pid=2734896)[0;0m INFO 01-05 02:05:18 [backends.py:559] Dynamo bytecode transform time: 5.57 s
[1;36m(EngineCore_0 pid=2734896)[0;0m INFO 01-05 02:05:21 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.943 s
[1;36m(EngineCore_0 pid=2734896)[0;0m INFO 01-05 02:05:21 [monitor.py:34] torch.compile takes 5.57 s in total
[1;36m(EngineCore_0 pid=2734896)[0;0m INFO 01-05 02:05:22 [gpu_worker.py:276] Available KV cache memory: 35.46 GiB
[1;36m(EngineCore_0 pid=2734896)

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:01<00:00, 36.84it/s]


[1;36m(EngineCore_0 pid=2734896)[0;0m INFO 01-05 02:05:25 [gpu_model_runner.py:2708] Graph capturing finished in 2 secs, took 0.64 GiB
[1;36m(EngineCore_0 pid=2734896)[0;0m INFO 01-05 02:05:25 [core.py:214] init engine (profile, create kv cache, warmup model) took 12.69 seconds
INFO 01-05 02:05:25 [llm.py:298] Supported_tasks: ['generate']


In [8]:
# Tokenizer of the same checkpoint; in this notebook it is used to render the
# chat template when building prompts.
tokenizer = AutoTokenizer.from_pretrained(MODEL_TO_TEST)

In [9]:
# LiveCodeBench code-generation problems, pinned to the "release_v4" version tag.
lcb_codegen = load_dataset("livecodebench/code_generation_lite", version_tag="release_v4")

In [10]:
# Prompt pieces shared by every problem.
# CODING_SYSTEM_PROMPT is sent as the "system" chat message; PROBLEM_STATEMENT
# is the template for the "user" message, with {PROMPT} replaced by the
# problem text.
CODING_SYSTEM_PROMPT = " ".join(
    [
        "You are an expert Python programmer.",
        "You will be given a question (problem specification) and will generate"
        " a correct Python program that matches the specification and passes all tests.",
        "You will NOT return anything except for the program.",
    ]
)

PROBLEM_STATEMENT = "\n".join(
    [
        "### Question:",
        "{PROMPT}",
        "",
        "### Format:",
        "Return ONLY the program in a single Python markdown code block.",
        "",
        "```python",
        "# YOUR CODE HERE",
        "```",
    ]
)

In [11]:
def format_prompts_for_eval(prompt):
    """
    Turn one problem statement into the final prompt string fed to the model.

    The problem text is inserted into PROBLEM_STATEMENT, paired with
    CODING_SYSTEM_PROMPT as a system/user conversation, and rendered to text
    with the tokenizer's chat template (generation prompt appended).

    Args:
        prompt: Problem text to substitute for {PROMPT}.

    Returns:
        The rendered prompt string.
    """
    conversation = [
        {"role": "system", "content": CODING_SYSTEM_PROMPT},
        {"role": "user", "content": PROBLEM_STATEMENT.format(PROMPT=prompt)},
    ]
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Added for gpt-oss models: raise the reasoning level wherever the rendered
    # text contains the "medium" marker. When the marker is absent this leaves
    # the prompt untouched.
    return rendered.replace("Reasoning: medium", "Reasoning: high")

In [12]:
# Inspect the feature names and row count of the test split.
lcb_codegen['test']

Dataset({
    features: ['question_title', 'question_content', 'platform', 'question_id', 'contest_id', 'contest_date', 'starter_code', 'difficulty', 'public_test_cases', 'private_test_cases', 'metadata'],
    num_rows: 713
})

In [13]:
# Row index (into the test split) of the single problem used for the
# one-off generation / sandbox check in the following cells.
CHOSEN_PROBLEM_IDX = 220

In [14]:
# Fetch the chosen problem row once and pick out the fields needed below.
chosen_problem = lcb_codegen["test"][CHOSEN_PROBLEM_IDX]
test_case = chosen_problem["private_test_cases"]
public_test_case = chosen_problem["public_test_cases"]
sample_prob = chosen_problem['question_content']

# Render the prompt exactly as the model will see it and show it for review.
templated_sample_prob = format_prompts_for_eval(sample_prob)

print(templated_sample_prob, "=" * 80, sep="\n")

<|im_start|>system
You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.<|im_end|>
<|im_start|>user
### Question:
Takahashi has decided to enjoy a wired full-course meal consisting of N courses in a restaurant.
The i-th course is:

- if X_i=0, an antidotal course with a tastiness of Y_i;
- if X_i=1, a poisonous course with a tastiness of Y_i.

When Takahashi eats a course, his state changes as follows:  

- Initially, Takahashi has a healthy stomach.
- When he has a healthy stomach,
- if he eats an antidotal course, his stomach remains healthy;
- if he eats a poisonous course, he gets an upset stomach.


- When he has an upset stomach,
- if he eats an antidotal course, his stomach becomes healthy;
- if he eats a poisonous course, he dies.



The meal progresses as follows.

- Repeat the following process

In [15]:
# Decode the stored private tests, then convert them to the
# {"inputs": [...], "outputs": [...]} layout in one step.
test_case_formatted = format_test_case(load_encoded_test_case(test_case))

✓ Unpickled: <class 'str'>


In [16]:
# Sampling settings for the single-problem check: temperature 0.2, top_p 0.95,
# at most 1200 generated tokens.
# NOTE(review): the 1200-token cap may cut off longer solutions — confirm it is
# large enough for the problems being evaluated.
# Alternative previously tried:
# params = SamplingParams(temperature=1.0, max_tokens=16384, top_p=1.0, top_k=0)
params = SamplingParams(temperature=0.2, max_tokens=1200, top_p=0.95)

In [17]:
# Generate a completion for the sample problem and print the first one
# framed by separator lines.
outputs = model.generate(templated_sample_prob, sampling_params=params)
response = outputs[0].outputs[0].text

rule = "=" * 80
print(rule, "GENERATED CODE:", rule, response, rule, sep="\n")

Adding requests: 100%|██████████| 1/1 [00:00<00:00, 332.96it/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.52s/it, est. speed input: 432.31 toks/s, output: 145.03 toks/s]

GENERATED CODE:
### Answer:
```python
def max_tastiness(N, courses):
    # Initialize the maximum tastiness to 0
    max_tastiness = 0
    
    # Initialize the current tastiness to 0
    current_tastiness = 0
    
    # Initialize the current state to "healthy"
    current_state = "healthy"
    
    # Iterate through the courses
    for i in range(N):
        # Get the X and Y values for the current course
        X, Y = courses[i]
        
        # If the current state is "healthy"
        if current_state == "healthy":
            # If the current course is antidotal
            if X == 0:
                # Add the tastiness of the current course to the current tastiness
                current_tastiness += Y
            # If the current course is poisonous
            else:
                # Set the current state to "upset"
                current_state = "upset"
        # If the current state is "upset"
        else:
            # If the current course is antidotal
            if




In [18]:
# Reload the module to get the updated version
# (development aid: re-imports the LiveCodeBench utils so that edits to the
# module take effect without restarting the kernel).
import importlib
import thom_replication.utils.LiveCodeBench.utils
importlib.reload(thom_replication.utils.LiveCodeBench.utils)
# Bind the checker under a notebook-local alias used by the evaluation cells.
from thom_replication.utils.LiveCodeBench.utils import check_correctness as lcb_check_correctness

In [19]:
# Execute the sampled completion against the decoded private tests
# via Sandbox Fusion.
results, metadata_list = lcb_check_correctness(
    in_outs=test_case_formatted,
    generation=response,
    timeout=15,
    debug=True,
    memory_limit_mb=1024,
    language="python"
)

# Only entries that are exactly `True` count as passed test cases.
total = len(results)
passed = sum(r is True for r in results)
pass_rate = passed / total if total else 0

rule = "=" * 80
print(rule)
print("SANDBOX FUSION EXECUTION RESULTS:")
print(rule)
print(f"Pass Rate: {pass_rate * 100:.1f}% ({passed}/{total})")
print(f"Passed cases: {passed}")
print(f"Failed cases: {total - passed}")
print(f"Sample results (first 5): {results[:5]}")
print(rule)

SANDBOX FUSION EXECUTION RESULTS:
Pass Rate: 0.0% (0/12)
Passed cases: 0
Failed cases: 12
Sample results (first 5): [False, False, False, False, False]


In [20]:
# Pull the columns of the test split that the evaluation table is built from.
lcb_test_split = lcb_codegen["test"]

PUBLIC_TEST_CASES_LIST = lcb_test_split["public_test_cases"]
PRIVATE_TEST_CASES_LIST = lcb_test_split["private_test_cases"]
FULL_PROMPT_LIST = lcb_test_split["question_content"]
DIFFICULTY_LIST = lcb_test_split["difficulty"]
CONTEST_ID_LIST = lcb_test_split["contest_id"]
QUESTION_ID_LIST = lcb_test_split["question_id"]

In [21]:
# Create a comprehensive DataFrame with all dataset information
evaluation_df = pd.DataFrame({
    'problem_idx': range(len(FULL_PROMPT_LIST)),
    'question_content': FULL_PROMPT_LIST,
    'difficulty': DIFFICULTY_LIST,
    'contest_id': CONTEST_ID_LIST,
    'question_id': QUESTION_ID_LIST,
    'public_test_cases': PUBLIC_TEST_CASES_LIST,
    'private_test_cases': PRIVATE_TEST_CASES_LIST
})

print(f"Dataset shape: {evaluation_df.shape}")
print("\nDataFrame columns:")
print(evaluation_df.columns.tolist())
print("\nFirst few rows:")
evaluation_df.head()

Dataset shape: (713, 7)

DataFrame columns:
['problem_idx', 'question_content', 'difficulty', 'contest_id', 'question_id', 'public_test_cases', 'private_test_cases']

First few rows:


Unnamed: 0,problem_idx,question_content,difficulty,contest_id,question_id,public_test_cases,private_test_cases
0,0,There are three cards with letters $\texttt{a}...,easy,1873,1873_A,"[{""input"": ""6\nabc\nacb\nbac\nbca\ncab\ncba\n""...",eJxrYJmaz8gABhEZQEZ0tVJmXkFpiZKVgpJhTF5iUnJMnp...
1,1,Slavic is preparing a present for a friend's b...,easy,1873,1873_B,"[{""input"": ""4\n4\n2 2 1 2\n3\n0 1 2\n5\n4 3 2 ...",eJyUvb2uNVvSlIuBi8UNDLWNUf9VgytB4jXBwOmDRGMcHR...
2,2,You are given a strip of paper $s$ that is $n$...,easy,1873,1873_D,"[{""input"": ""8\n6 3\nWBWWWB\n7 3\nWWBWBWW\n5 4\...",eJyUvb2SY8vWXSdDplz6O65No/APyMyXSEaoTNGgU2IELw...
3,3,"You are given a string $s$ of length $n$, cons...",medium,1883,1883_B,"[{""input"": ""14\n1 0\na\n2 0\nab\n2 1\nba\n3 1\...",eJylkUELwiAYhiP6FZ3M84i5Glvdu9ahS5EdpnObG9hg7h...
4,4,"You are given an array of integers $a_1, a_2, ...",medium,1883,1883_C,"[{""input"": ""15\n2 5\n7 3\n3 3\n7 4 1\n5 2\n9 7...",eJyc3TuOLluWJGYKHAaFhZJbaH9ud46EAFNkC60kG+hqgS...


In [22]:
# Render every problem statement into its final prompt string; the result is
# stored as a new column on the existing table.
evaluation_df["formatted_prompt"] = evaluation_df["question_content"].apply(format_prompts_for_eval)

In [23]:
# Sampling settings for the full run: n=10 completions per prompt, each capped
# at 1200 generated tokens. This rebinds the `params` name used earlier.
# NOTE(review): no seed is passed, so regenerated samples may differ between
# runs — confirm whether reproducibility matters here.
params = SamplingParams(temperature=0.2, max_tokens=1200, top_p=0.95, n=10)

In [41]:
# Sample params.n completions for every problem, sending prompts to the model
# in fixed-size batches, then store the completions on the table and on disk.
num_problems = len(evaluation_df)
print("Starting batch generation...")
print(f"Total problems: {num_problems}")
print(f"Generating {params.n} responses per problem")

batch_size = 256  # Prompts per generate() call; adjust based on GPU memory
all_responses = []  # One entry per problem: the list of its n completions

for batch_start in range(0, num_problems, batch_size):
    batch_end = min(batch_start + batch_size, num_problems)
    batch_prompts = evaluation_df["formatted_prompt"].iloc[batch_start:batch_end].tolist()

    print(f"Processing batch {batch_start}-{batch_end}...")

    batch_outputs = model.generate(batch_prompts, sampling_params=params)

    # Each request output holds all n completions for its prompt; keep their text.
    all_responses.extend(
        [completion.text for completion in request_output.outputs]
        for request_output in batch_outputs
    )

print(f"\n✓ Generated {len(all_responses)} sets of responses")
print(f"✓ Each set contains {len(all_responses[0])} responses")

# Raw completions, one list per problem.
evaluation_df["generated_code"] = all_responses


def _encode_solutions(codes):
    """Serialise a list of completions as '{"codes": [...]}'; pass other values through."""
    return json.dumps({"codes": codes}) if isinstance(codes, list) else codes


# JSON-encoded copy of the completions, one string per problem.
evaluation_df["generated_sols"] = evaluation_df["generated_code"].apply(_encode_solutions)

# Save to parquet; the model id is made file-name safe by replacing '/'.
generated_parquet_path = f"LCB_df_{MODEL_TO_TEST.replace('/', '-')}.parquet"
evaluation_df.to_parquet(generated_parquet_path)

first_problem_codes = evaluation_df["generated_code"].iloc[0]
print("\n✓ Added generated_sols column to dataframe")
print(f"Dataframe shape: {evaluation_df.shape}")
print(f"\nFirst problem - Generated {len(first_problem_codes)} solutions")
print("First solution preview:")
print(first_problem_codes[0][:200])

Starting batch generation...
Total problems: 713
Generating 10 responses per problem
Processing batch 0-256...


Adding requests: 100%|██████████| 256/256 [00:00<00:00, 747.76it/s]
Processed prompts: 100%|██████████| 2560/2560 [01:50<00:00, 23.21it/s, est. speed input: 12239.60 toks/s, output: 12300.62 toks/s]


Processing batch 256-512...


Adding requests: 100%|██████████| 256/256 [00:00<00:00, 740.79it/s]
Processed prompts: 100%|██████████| 2560/2560 [02:55<00:00, 14.62it/s, est. speed input: 8391.22 toks/s, output: 12432.79 toks/s] 


Processing batch 512-713...


Adding requests: 100%|██████████| 201/201 [00:00<00:00, 799.80it/s]
Processed prompts: 100%|██████████| 2010/2010 [01:59<00:00, 16.84it/s, est. speed input: 8989.64 toks/s, output: 12675.23 toks/s]



✓ Generated 713 sets of responses
✓ Each set contains 10 responses

✓ Added generated_sols column to dataframe
Dataframe shape: (713, 10)

First problem - Generated 10 solutions
First solution preview:
### Answer:
```python
def can_make_abc(cards):
    if cards == "abc":
        return "YES"
    elif cards == "acb":
        return "YES"
    elif cards == "bac":
        return "YES"
    elif cards ==


In [42]:
# Peek at the JSON-encoded solutions stored for the first problem.
evaluation_df["generated_sols"].iloc[0]

'{"codes": ["### Answer:\\n```python\\ndef can_make_abc(cards):\\n    if cards == \\"abc\\":\\n        return \\"YES\\"\\n    elif cards == \\"acb\\":\\n        return \\"YES\\"\\n    elif cards == \\"bac\\":\\n        return \\"YES\\"\\n    elif cards == \\"bca\\":\\n        return \\"NO\\"\\n    elif cards == \\"cab\\":\\n        return \\"NO\\"\\n    elif cards == \\"cba\\":\\n        return \\"YES\\"\\n    else:\\n        return \\"NO\\"\\n\\nt = int(input())\\nfor _ in range(t):\\n    cards = input()\\n    print(can_make_abc(cards))\\n```", "### Answer:\\n```python\\ndef can_make_abc(cards):\\n    if cards == \\"abc\\":\\n        return \\"YES\\"\\n    elif cards == \\"acb\\":\\n        return \\"YES\\"\\n    elif cards == \\"bac\\":\\n        return \\"YES\\"\\n    elif cards == \\"bca\\":\\n        return \\"NO\\"\\n    elif cards == \\"cab\\":\\n        return \\"NO\\"\\n    elif cards == \\"cba\\":\\n        return \\"YES\\"\\n    else:\\n        return \\"NO\\"\\n\\nt = int(i

In [37]:
# Sanity check: number of problems (rows) in the evaluation table.
len(evaluation_df)

713

In [26]:
# Row count of the evaluation table (this cell repeats the preceding check).
len(evaluation_df)

713

In [27]:
# Persist the table, including the generated solutions, in two formats.
output_csv = "evaluation_results_with_generated_code.csv"
output_pickle = "evaluation_results_with_generated_code.pkl"

# CSV copy, written without the index column.
evaluation_df.to_csv(output_csv, index=False)
print(f"✓ Saved full results to {output_csv}")

# Pickle copy for easy loading.
evaluation_df.to_pickle(output_pickle)
print(f"✓ Saved full results to {output_pickle}")

# Display summary
print(
    "\nDataFrame Summary:",
    f"Total problems: {len(evaluation_df)}",
    f"Columns: {evaluation_df.columns.tolist()}",
    "\nDataframe Info:",
    sep="\n",
)
evaluation_df.info()

✓ Saved full results to evaluation_results_with_generated_code.csv
✓ Saved full results to evaluation_results_with_generated_code.pkl

DataFrame Summary:
Total problems: 713
Columns: ['problem_idx', 'question_content', 'difficulty', 'contest_id', 'question_id', 'public_test_cases', 'private_test_cases', 'formatted_prompt', 'generated_code', 'generated_sols']

Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 713 entries, 0 to 712
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   problem_idx         713 non-null    int64 
 1   question_content    713 non-null    object
 2   difficulty          713 non-null    object
 3   contest_id          713 non-null    object
 4   question_id         713 non-null    object
 5   public_test_cases   713 non-null    object
 6   private_test_cases  713 non-null    object
 7   formatted_prompt    713 non-null    object
 8   generated_code      713 non-null

In [44]:
# Decode the first problem's stored JSON back into a dict with a "codes" list.
gen_code_obj = json.loads(evaluation_df['generated_sols'].iloc[0])

In [45]:
# Show the generated solution strings for that problem.
gen_code_obj['codes']

['### Answer:\n```python\ndef can_make_abc(cards):\n    if cards == "abc":\n        return "YES"\n    elif cards == "acb":\n        return "YES"\n    elif cards == "bac":\n        return "YES"\n    elif cards == "bca":\n        return "NO"\n    elif cards == "cab":\n        return "NO"\n    elif cards == "cba":\n        return "YES"\n    else:\n        return "NO"\n\nt = int(input())\nfor _ in range(t):\n    cards = input()\n    print(can_make_abc(cards))\n```',
 '### Answer:\n```python\ndef can_make_abc(cards):\n    if cards == "abc":\n        return "YES"\n    elif cards == "acb":\n        return "YES"\n    elif cards == "bac":\n        return "YES"\n    elif cards == "bca":\n        return "NO"\n    elif cards == "cab":\n        return "NO"\n    elif cards == "cba":\n        return "YES"\n    else:\n        return "NO"\n\nt = int(input())\nfor _ in range(t):\n    cards = input()\n    print(can_make_abc(cards))\n```',
 '### Response:\n```python\ndef can_make_abc(cards):\n    if cards

In [51]:
# Check the first stored solution of the first problem against that problem's
# public test cases via Sandbox Fusion.
first_public_tests = decode_public_test_case(evaluation_df['public_test_cases'].iloc[0])
first_solution = gen_code_obj['codes'][0]

results, metadata_list = lcb_check_correctness(
    in_outs=first_public_tests,
    generation=first_solution,
    timeout=15,
    debug=True,
    memory_limit_mb=1024,
    language="python"
)

# Only entries that are exactly `True` count as passed test cases.
total = len(results)
passed = sum(r is True for r in results)
pass_rate = passed / total if total else 0

rule = "=" * 80
print(rule)
print("SANDBOX FUSION EXECUTION RESULTS:")
print(rule)
print(f"Pass Rate: {pass_rate * 100:.1f}% ({passed}/{total})")
print(f"Passed cases: {passed}")
print(f"Failed cases: {total - passed}")
print(f"Sample results (first 5): {results[:5]}")
print(rule)

✓ Decoded 1 public test cases
SANDBOX FUSION EXECUTION RESULTS:
Pass Rate: 100.0% (1/1)
Passed cases: 1
Failed cases: 0
Sample results (first 5): [True]


In [52]:
# Evaluate all generated solutions for the full dataframe.
#
# Each generated solution is run against its problem's PUBLIC test cases via
# `lcb_check_correctness`. A solution passes only when the checker returned at
# least one result and every result is exactly `True`.
#
# NOTE: "pass@k" here means "at least one of the FIRST k samples passed". It is
# a prefix-based estimate, not the combinatorial estimator
# 1 - C(n-c, k) / C(n, k) computed over all n samples.
print("Starting evaluation of all generated solutions...")
print(f"Total problems: {len(evaluation_df)}")
print(f"Solutions per problem: {params.n}")

# Store results
pass_results_all = []         # per problem: list of bools, one per solution
pass_at_k_results = []        # per problem: {"pass@k": bool, ...}
eval_errors = []              # (problem idx, solution idx, repr(exception))
problems_without_results = [] # problems where the checker returned no results

for idx, row in evaluation_df.iterrows():
    if idx % 50 == 0:
        print(f"Evaluating problem {idx}/{len(evaluation_df)}...")

    # Get test cases and generated solutions
    public_tests = decode_public_test_case(row['public_test_cases'])
    generated_codes = json.loads(row['generated_sols'])['codes']

    # Evaluate each solution
    pass_results = []
    saw_empty_results = False
    for sol_idx, solution in enumerate(generated_codes):
        try:
            # `_metadata` instead of `_`: at cell scope `_` is IPython's
            # "last output" variable and should not be overwritten.
            results, _metadata = lcb_check_correctness(
                in_outs=public_tests,
                generation=solution,
                timeout=10,
                debug=False,
                memory_limit_mb=1024,
                language="python"
            )
        except Exception as e:
            # Still counted as a failure, but recorded so that infrastructure
            # problems (e.g. an unreachable sandbox) are not silently mistaken
            # for wrong answers.
            eval_errors.append((idx, sol_idx, repr(e)))
            pass_results.append(False)
            continue

        # all() over an empty sequence is True, so a problem with no test
        # results would otherwise be counted as solved. Require at least one
        # result before declaring a pass.
        has_results = len(results) > 0
        if not has_results:
            saw_empty_results = True
        pass_results.append(has_results and all(r is True for r in results))

    if saw_empty_results:
        problems_without_results.append(idx)

    pass_results_all.append(pass_results)

    # Calculate pass@k metrics
    pass_at_k = {}
    for k in range(1, len(generated_codes) + 1):
        # pass@k = True if at least one of first k solutions passes
        pass_at_k[f'pass@{k}'] = any(pass_results[:k])

    pass_at_k_results.append(pass_at_k)

print(f"\n✓ Evaluated {len(pass_results_all)} problems")
if eval_errors:
    print(f"⚠ {len(eval_errors)} solution evaluations raised an exception and were counted as failures")
    print(f"  First few (problem, solution, error): {eval_errors[:3]}")
if problems_without_results:
    print(
        f"⚠ {len(problems_without_results)} problems returned no test results "
        f"and were counted as unsolved (first few: {problems_without_results[:10]})"
    )

# Add results to dataframe
evaluation_df['pass_results'] = pass_results_all
evaluation_df['pass_at_k'] = pass_at_k_results

# Extract pass@k columns
for k in range(1, params.n + 1):
    evaluation_df[f'pass@{k}'] = evaluation_df['pass_at_k'].apply(lambda x: x[f'pass@{k}'])

# Calculate additional metrics
evaluation_df['num_passing_solutions'] = evaluation_df['pass_results'].apply(sum)
# 1-based position of the first passing solution, or None when none passed.
evaluation_df['best_pass_at_k'] = evaluation_df['pass_results'].apply(
    lambda x: next((i+1 for i, p in enumerate(x) if p), None)
)

print("\n✓ Calculated pass@k metrics")
print(f"\nDataframe columns: {evaluation_df.columns.tolist()}")

# Calculate overall statistics
print("\n" + "="*80)
print("OVERALL PASS@K STATISTICS")
print("="*80)
for k in range(1, params.n + 1):
    pass_rate = evaluation_df[f'pass@{k}'].mean()
    print(f"pass@{k}: {pass_rate * 100:.2f}% ({evaluation_df[f'pass@{k}'].sum()}/{len(evaluation_df)})")

# Save updated dataframe
evaluated_parquet_path = f"LCB_df_evaluated_{MODEL_TO_TEST.replace('/', '-')}.parquet"
evaluation_df.to_parquet(evaluated_parquet_path)
print(f"\n✓ Saved results to {evaluated_parquet_path}")

Starting evaluation of all generated solutions...
Total problems: 713
Solutions per problem: 10
Evaluating problem 0/713...
✓ Decoded 1 public test cases
✓ Decoded 1 public test cases
✓ Decoded 1 public test cases
✓ Decoded 1 public test cases
✓ Decoded 1 public test cases
✓ Decoded 1 public test cases
✓ Decoded 1 public test cases
✓ Decoded 1 public test cases
✓ Decoded 1 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 3 public test cases
✓ Decoded 3 public test cases
✓ Decoded 2 public test cases
✓ Decoded 3 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Deco

Empty inputs provided.
Empty inputs provided.
Empty inputs provided.
Empty inputs provided.
Empty inputs provided.
Empty inputs provided.
Empty inputs provided.
Empty inputs provided.
Empty inputs provided.
Empty inputs provided.


✓ Decoded 0 public test cases
✓ Decoded 3 public test cases
✓ Decoded 2 public test cases
✓ Decoded 3 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 3 public test cases
✓ Decoded 3 public test cases
✓ Decoded 3 public test cases
✓ Decoded 3 public test cases
✓ Decoded 3 public test cases
✓ Decoded 2 public test cases
✓ Decoded 3 public test cases
✓ Decoded 3 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 2 public test cases
✓ Decoded 3 public test cases
✓ Decoded 2 public test cases
Evaluating problem 450/713...
✓ Decoded 3 public test cases
✓ Decoded 3 public test cases
✓ Decoded 1 public test cases
✓ Decoded 3 public test cases
✓ Decoded 3 public test cases
✓ Decoded 3 public test cases
✓ Decoded 3 public test cases
✓ Decoded 2 public test cases
✓ Decoded 3 public test cases
✓ Decoded 2 public test cases
✓ Decoded 3 public test cases
✓ Decoded 

In [54]:
# Re-print the aggregate pass@k table from the per-problem pass@k columns.
rule = "=" * 80
num_problems = len(evaluation_df)

print(f"\n{rule}")
print("OVERALL PASS@K STATISTICS")
print(rule)
for k in range(1, params.n + 1):
    solved_flags = evaluation_df[f'pass@{k}']
    pass_rate = solved_flags.mean()
    print(f"pass@{k}: {pass_rate * 100:.2f}% ({solved_flags.sum()}/{num_problems})")



OVERALL PASS@K STATISTICS
pass@1: 7.85% (56/713)
pass@2: 11.78% (84/713)
pass@3: 13.18% (94/713)
pass@4: 14.73% (105/713)
pass@5: 15.71% (112/713)
pass@6: 16.55% (118/713)
pass@7: 16.97% (121/713)
pass@8: 18.23% (130/713)
pass@9: 19.07% (136/713)
pass@10: 19.35% (138/713)
