In [1]:
import pandas as pd
import os
import sys
import json
import base64
import zlib
import pickle
import threading


from pathlib import Path
from datasets import load_dataset
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

repo_root = Path.cwd().parent.parent
sys.path.insert(0, str(repo_root))

from thom_replication.utils.code_sandbox_verification import compute_score

  from .autonotebook import tqdm as notebook_tqdm


INFO 01-05 00:20:41 [__init__.py:241] Automatically detected platform cuda.


In [2]:
def format_test_case(test_case):
    """
    Normalise a test-case payload into the dict layout expected by compute_score.

    Accepted shapes:
      * a JSON string that decodes to one of the shapes below,
      * a list of per-case dicts: [{"input": "...", "output": "...", ...}],
      * a dict, which is handed back unchanged.

    Returns:
        {"inputs": [...], "outputs": [...]} for list input, otherwise the dict itself.

    Raises:
        ValueError: if a string payload is not valid JSON, or if the (decoded)
            payload is neither a list nor a dict.
    """
    payload = test_case

    # String payloads are JSON text and must be decoded before inspection.
    if isinstance(payload, str):
        try:
            payload = json.loads(payload)
        except json.JSONDecodeError:
            raise ValueError(f"Invalid JSON string: {test_case[:100]}")

    # A dict is treated as already being in the target layout.
    if isinstance(payload, dict):
        return payload

    if not isinstance(payload, list):
        raise ValueError(f"Unexpected test_case type: {type(payload)}")

    # Pivot the list of per-case records into two parallel lists.
    return {
        "inputs": [case["input"] for case in payload],
        "outputs": [case["output"] for case in payload],
    }

In [3]:
def load_encoded_test_case(encoded_test_case):
    """
    Undo the base64 -> zlib -> pickle encoding of a stored test case.

    Args:
        encoded_test_case: Base64 text (or bytes) wrapping zlib-compressed, pickled data.

    Returns:
        The unpickled object. Its type is printed on success; in this notebook's
        recorded run the payload unpickled to a str.

    Raises:
        Exception: any failure while decoding is reported (type, message and
            traceback are printed) and then re-raised unchanged.
    """
    import traceback

    try:
        # base64 text -> compressed bytes -> pickled bytes
        pickled_bytes = zlib.decompress(base64.b64decode(encoded_test_case))

        # SECURITY: pickle.loads can execute arbitrary code while loading;
        # only feed this function payloads from a trusted source.
        test_case = pickle.loads(pickled_bytes)
        print(f"✓ Unpickled: {type(test_case)}")
        return test_case
    except Exception as e:
        print("✗ Error decoding test case:")
        print(f"  Type: {type(e).__name__}")
        print(f"  Message: {e}")
        traceback.print_exc()
        raise

In [4]:
# Process-wide settings applied before the model and tokenizer are created:
# expose only GPU 0 and switch off tokenizers' internal parallelism.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [5]:
# Model identifier used for both the vLLM engine and the tokenizer in the following cells.
MODEL_TO_TEST = "openai/gpt-oss-20b"

In [6]:
# Start a vLLM engine for MODEL_TO_TEST.
# NOTE(review): gpu_memory_utilization=0.7 is presumably the fraction of GPU memory vLLM may claim — confirm against the vLLM docs.
model = LLM(model=MODEL_TO_TEST, gpu_memory_utilization=0.7)

INFO 01-05 00:20:43 [utils.py:326] non-default args: {'model': 'openai/gpt-oss-20b', 'gpu_memory_utilization': 0.7, 'disable_log_stats': True}
INFO 01-05 00:20:51 [__init__.py:711] Resolved architecture: GptOssForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!
Parse safetensors files: 100%|██████████| 3/3 [00:00<00:00,  9.83it/s]


INFO 01-05 00:20:52 [__init__.py:1750] Using max model len 131072


2026-01-05 00:20:54,084	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 01-05 00:20:54 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 01-05 00:20:54 [config.py:273] Overriding max cuda graph capture size to 1024 for performance.
[1;36m(EngineCore_0 pid=2564457)[0;0m INFO 01-05 00:20:55 [core.py:636] Waiting for init message from front-end.
[1;36m(EngineCore_0 pid=2564457)[0;0m INFO 01-05 00:20:55 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='openai/gpt-oss-20b', speculative_config=None, tokenizer='openai/gpt-oss-20b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=mxfp4, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_white

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  33% Completed | 1/3 [00:00<00:00,  2.18it/s]
Loading safetensors checkpoint shards:  67% Completed | 2/3 [00:01<00:00,  1.88it/s]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:01<00:00,  1.80it/s]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:01<00:00,  1.84it/s]
[1;36m(EngineCore_0 pid=2564457)[0;0m 


[1;36m(EngineCore_0 pid=2564457)[0;0m INFO 01-05 00:21:01 [default_loader.py:262] Loading weights took 1.68 seconds
[1;36m(EngineCore_0 pid=2564457)[0;0m INFO 01-05 00:21:01 [gpu_model_runner.py:2007] Model loading took 13.7194 GiB and 2.538651 seconds
[1;36m(EngineCore_0 pid=2564457)[0;0m INFO 01-05 00:21:06 [backends.py:548] Using cache directory: /home/lina4335/.cache/vllm/torch_compile_cache/45375ed44b/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_0 pid=2564457)[0;0m INFO 01-05 00:21:06 [backends.py:559] Dynamo bytecode transform time: 4.45 s
[1;36m(EngineCore_0 pid=2564457)[0;0m INFO 01-05 00:21:09 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.875 s
[1;36m(EngineCore_0 pid=2564457)[0;0m INFO 01-05 00:21:09 [monitor.py:34] torch.compile takes 4.45 s in total
[1;36m(EngineCore_0 pid=2564457)[0;0m INFO 01-05 00:21:10 [gpu_worker.py:276] Available KV cache memory: 34.24 GiB
[1;36m(EngineCore_0 pid=2564457)

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 83/83 [00:03<00:00, 26.38it/s]


[1;36m(EngineCore_0 pid=2564457)[0;0m INFO 01-05 00:21:14 [gpu_model_runner.py:2708] Graph capturing finished in 3 secs, took 0.91 GiB
[1;36m(EngineCore_0 pid=2564457)[0;0m INFO 01-05 00:21:14 [core.py:214] init engine (profile, create kv cache, warmup model) took 12.77 seconds
INFO 01-05 00:21:16 [llm.py:298] Supported_tasks: ['generate']


In [7]:
# Tokenizer for the same checkpoint; used later to render the chat template into a prompt string.
tokenizer = AutoTokenizer.from_pretrained(MODEL_TO_TEST)

In [8]:
# Load the LiveCodeBench code-generation dataset, requesting the release_v4 version tag.
lcb_codegen = load_dataset("livecodebench/code_generation_lite", version_tag="release_v4")

In [9]:
# System prompt placed in the "system" message when the chat prompt is built.
# NOTE(review): "neccesary" is misspelled and the string ends with a trailing space; both are left
# untouched here because editing the prompt changes the text the model is conditioned on.
CODING_SYSTEM_PROMPT = "You are a helpful coding assistant. Please reason step by step and end your answer with a python code block of the neccesary solution. "
# CODING_SYSTEM_PROMPT = "You are a helpful coding assistant."

In [10]:
# Index into the dataset's "test" split of the problem to evaluate.
# (A previous assignment of 5 was immediately overwritten by this value, so it has been dropped.)
CHOSEN_PROBLEM_IDX = 400

In [11]:
# Look the chosen problem up once and read both fields from the same row,
# instead of indexing the split separately for each field.
sample_row = lcb_codegen["test"][CHOSEN_PROBLEM_IDX]
test_case = sample_row["private_test_cases"]
sample_prob = sample_row['question_content']

# Chat messages: the coding system prompt followed by the problem statement.
sample_messages = [
    {"role": "system", "content": CODING_SYSTEM_PROMPT},
    {"role": "user", "content": sample_prob},
]
# sample_messages=[{"role": "user", "content": sample_prob}]

# Render to a prompt string (not token ids), ending with the generation prompt.
templated_sample_prob = tokenizer.apply_chat_template(sample_messages, tokenize=False, add_generation_prompt=True)

# Raise the reasoning effort by rewriting the rendered text. A str.replace that
# matches nothing is silent, so verify the override is present afterwards.
templated_sample_prob = templated_sample_prob.replace("Reasoning: medium", "Reasoning: high")
assert "Reasoning: high" in templated_sample_prob, (
    "Reasoning-effort override did not apply: 'Reasoning: medium' was not found in the rendered prompt"
)

print(templated_sample_prob)

<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2026-01-05

Reasoning: high

# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions

You are a helpful coding assistant. Please reason step by step and end your answer with a python code block of the neccesary solution. 

<|end|><|start|>user<|message|>You initially have an empty string S.
Additionally, there are bags 1, 2, \dots, N, each containing some strings.
Bag i contains A_i strings S_{i,1}, S_{i,2}, \dots, S_{i,A_i}.
You will repeat the following steps for i = 1, 2, \dots, N:

- Choose and perform one of the following two actions:
- Pay 1 yen, select exactly one string from bag i, and concatenate it to the end of S.
- Do nothing.



Given a string T, find the minimum amount of money required to make the final S equal T.
If there is no way to make the final S equal T, pri

In [12]:
# Decode the stored private test cases, then convert them to the {"inputs", "outputs"} layout.
test_case_formatted = format_test_case(load_encoded_test_case(test_case))

✓ Unpickled: <class 'str'>


In [13]:
# Sampling configuration for generation: temperature 1.0, at most 16384 generated tokens, top_p=1.0, top_k=0.
# NOTE(review): no seed is passed here, so repeated runs may produce different completions — confirm that is intended.
params = SamplingParams(temperature=1.0, max_tokens=16384, top_p=1.0, top_k=0)

In [14]:
# Generate for the single templated prompt and keep the text of its first completion.
outputs = model.generate(templated_sample_prob, sampling_params=params)
response = outputs[0].outputs[0].text

# Show the raw response framed by 80-character rulers.
print("=" * 80, "GENERATED CODE:", "=" * 80, response, "=" * 80, sep="\n")

Adding requests: 100%|██████████| 1/1 [00:00<00:00, 459.20it/s]
Processed prompts: 100%|██████████| 1/1 [00:56<00:00, 56.45s/it, est. speed input: 11.66 toks/s, output: 240.61 toks/s]

GENERATED CODE:
analysisWe need to parse: We start with empty string S. There are N bags numbered 1..N. For each bag i we may choose to pay 1 yen and select exactly one string from that bag, concatenate it to end of S; or do nothing. We must produce final S equal to given target T. We can choose at most once per bag: either take one string or skip. We cannot re-use a string after taking it; only one from each bag at most. We can choose to skip many; we can only pay if we select one. We need to minimize cost (# selections). Essentially, we need to choose at most one string per bag such that concatenation of selected strings in order (bag index order) equals T, with minimal selections amount.

Because we may skip some bags, but the order of strings in final S is the increasing order of bag indices where we selected. So T is partitioned into substrings which correspond to selections: for each selected bag i, we select one string from bag i, and the concatenated sequence must equal T exact




In [15]:
import ast
import json
import re


def _extract_solution_code(response_text):
    """Pull the candidate solution source out of a raw model response.

    Takes the last ```python fenced block if there is one, otherwise the first
    generic fenced block (minus a bare language tag on the fence line),
    otherwise the whole response. Anything from the first 'if __name__'
    onwards is dropped, as is everything from the first line that starts with
    '# Example' or '# -'.
    """
    code = response_text
    if "```python" in response_text:
        code = response_text.split("```python")[-1].split("```")[0]
    elif "```" in response_text:
        parts = response_text.split("```")
        if len(parts) >= 2:
            code = parts[1]
            if "\n" in code:
                first_line, rest = code.split("\n", 1)
                # A purely alphabetic first line is a language tag, not code.
                if first_line.strip().isalpha():
                    code = rest

    # Remove any example usage or if __name__ == "__main__" blocks
    if 'if __name__' in code:
        code = code.split('if __name__')[0]

    # Remove trailing comments or examples
    kept_lines = []
    for line in code.split('\n'):
        stripped = line.strip()
        if stripped.startswith('# Example') or stripped.startswith('# -'):
            break
        kept_lines.append(line)
    return '\n'.join(kept_lines).rstrip()


def _required_positional_params(solution_code, function_name):
    """Return the required positional parameter names of `function_name`.

    Returns None when the signature cannot be determined (the code does not
    parse, or no function with that name is found). 'self' and 'cls' are
    omitted, as are parameters that have default values.
    """
    try:
        tree = ast.parse(solution_code)
    except (SyntaxError, ValueError):
        return None

    definitions = [
        node
        for node in ast.walk(tree)
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == function_name
    ]
    if not definitions:
        return None

    # The regex used for detection picks the first textual match, so mirror that here.
    first_def = min(definitions, key=lambda node: node.lineno)
    positional = [arg.arg for arg in first_def.args.posonlyargs + first_def.args.args]
    n_defaults = len(first_def.args.defaults)
    if n_defaults:
        # Defaults always belong to the trailing positional parameters.
        positional = positional[:-n_defaults]
    return [name for name in positional if name not in ("self", "cls")]


def _build_wrapped_solution(solution_code, function_name, params):
    """Wrap the solution in a fenced script that feeds stdin to `function_name`.

    With parameters, one JSON value is read from stdin per parameter and the
    return value is printed. With no parameters the function is called as-is
    (it is expected to read stdin itself) and its return value is printed only
    when it is not None.
    """
    lines = ["```python", "import json", "", solution_code, ""]
    if params:
        lines.append("# Read input from stdin")
        lines.extend(f"{name} = json.loads(input())" for name in params)
        lines.append("")
        lines.append("# Call the solution function")
        lines.append(f"result = {function_name}({', '.join(params)})")
        lines.append("print(result)")
    else:
        lines.append("# Call the solution function")
        lines.append(f"result = {function_name}()")
        lines.append("if result is not None:")
        lines.append("    print(result)")
    lines.append("```")
    return "\n".join(lines)


solution = _extract_solution_code(response)

# Dynamically detect the function name from the solution
# Look for "def function_name(" pattern
function_match = re.search(r'def\s+(\w+)\s*\(', solution)
if function_match:
    function_name = function_match.group(1)
    print(f"Detected function name: {function_name}")
    entry_params = _required_positional_params(solution, function_name)
else:
    function_name = "solve"  # fallback
    print("Warning: Could not detect function name, using 'solve'")
    entry_params = None

# When the signature is unknown, keep the previous hard-coded two-argument form.
if entry_params is None:
    entry_params = ["cost", "time"]

# Wrap the solution with I/O handling to read from stdin and output to stdout
wrapped_solution = _build_wrapped_solution(solution, function_name, entry_params)

print("Cleaned and wrapped solution:")
print("=" * 80)
print(wrapped_solution)
print("=" * 80)

Detected function name: solve
Cleaned and wrapped solution:
```python
import json


import sys

def solve() -> None:
    data = sys.stdin.read().split()
    if not data:
        return

    t = data[0]             # target string
    n = int(data[1])        # number of bags
    idx = 2

    bags = []               # bags[i] : list of strings in bag i (0‑based)
    for _ in range(n):
        a = int(data[idx]); idx += 1
        strings = data[idx:idx + a]
        idx += a
        bags.append(strings)

    m = len(t)
    INF = 10**9

    dp = [INF] * (m + 1)
    dp[0] = 0

    for bag in bags:
        next_dp = dp[:]            # skip case
        for pos in range(m + 1):
            if dp[pos] == INF:
                continue
            for s in bag:
                l = len(s)
                if pos + l <= m and t[pos:pos + l] == s:
                    nxt = pos + l
                    cost = dp[pos] + 1
                    if cost < next_dp[nxt]:
                        next_dp[nxt] =

In [16]:
from thom_replication.utils.LiveCodeBench import compute_code_score

In [19]:
# Score the raw model response (not the wrapped solution) against the formatted test cases.
# NOTE(review): the meaning of continuous=True is defined by compute_code_score, which is not shown in this notebook.
compute_code_score(completion=response, test_cases=test_case_formatted, continuous=True)

TRYING THIS HERE
test cases: {'inputs': ['okpamfcgesjtsgunfftpgxlymkrbvlzzymaojkpjiukycnapawhoumzccyvtushhgoujzpsxvxaftieobigzbtejjodzrsfivqwg\n100\n10 f q r d q j d w q o\n10 r k d v m u i b h x\n10 z y e x f j z a p p\n10 d p n z q t r f t a\n10 a d y z m x y t m v\n10 r x w e v f d w x f\n10 i j d j j z t u c c\n10 q o g y w w l p r m\n10 r e p v d p z j t w\n10 n h a n v s i l n d\n10 x c n l p f e j l n\n10 y r p g q t z z w t\n10 r x c l f s d k s c\n10 n c m w a w r h y g\n10 v f b c c s q g u b\n10 j v y w n u i t m w\n10 s s t j s r g f q i\n10 r i p f b y g k a v\n10 p c l t j x e s h z\n10 k c q q p p g f t t\n10 s x g p h j e q k k\n10 c n x h v t w o f d\n10 l p x w s o g d t y\n10 a y s c u s t d u n\n10 e z f q m q t d k o\n10 r k v d q v b x x n\n10 x k l r a n q t g f\n10 j q f b e b u s b g\n10 m v n e u a y e x k\n10 y b j v a y k l p j\n10 i y k n z z e z l k\n10 d k s f a d c g z q\n10 v t l a n b t e y s\n10 q a i q j p u c m i\n10 s x c s a v y s a v\n10 c o p u 

(True,
 {'test_case': {'is_correct': '[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]'}})