In [1]:
import torch
from vllm import LLM, SamplingParams
from huggingface_hub import snapshot_download
from datasets import load_dataset
from transformers import AutoTokenizer

import os
import re
from pprint import pprint
from tqdm.auto import tqdm
from typing import Optional, List

Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0             Please see https://github.com/pytorch/ao/issues/2919 for more info


In [3]:
DATASETS_DIR = "./datasets"
os.makedirs(DATASETS_DIR, exist_ok=True)

In [4]:
mbpp = load_dataset("google-research-datasets/mbpp", "sanitized", cache_dir=DATASETS_DIR)
humaneval = load_dataset("openai/openai_humaneval", cache_dir=DATASETS_DIR)

In [5]:
mbpp["train"][0]

{'source_file': 'Benchmark Questions Verification V2.ipynb',
 'task_id': 602,
 'prompt': 'Write a python function to find the first repeated character in a given string.',
 'code': 'def first_repeated_char(str1):\n  for index,c in enumerate(str1):\n    if str1[:index+1].count(c) > 1:\n      return c',
 'test_imports': [],
 'test_list': ['assert first_repeated_char("abcabc") == "a"',
  'assert first_repeated_char("abc") == None',
  'assert first_repeated_char("123123") == "1"']}

In [6]:
pprint(humaneval["test"][0])

{'canonical_solution': '    for idx, elem in enumerate(numbers):\n'
                       '        for idx2, elem2 in enumerate(numbers):\n'
                       '            if idx != idx2:\n'
                       '                distance = abs(elem - elem2)\n'
                       '                if distance < threshold:\n'
                       '                    return True\n'
                       '\n'
                       '    return False\n',
 'entry_point': 'has_close_elements',
 'prompt': 'from typing import List\n'
           '\n'
           '\n'
           'def has_close_elements(numbers: List[float], threshold: float) -> '
           'bool:\n'
           '    """ Check if in given list of numbers, are any two numbers '
           'closer to each other than\n'
           '    given threshold.\n'
           '    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n'
           '    False\n'
           '    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n'

Важные параметры для обучения модели:
- model
- tokenizer
- max_model_len
- dtype
- gpu_memory_utilization
- enforce_eager
- seed
- enable_prefix_caching

In [7]:
print(mbpp)
print(humaneval)

DatasetDict({
    train: Dataset({
        features: ['source_file', 'task_id', 'prompt', 'code', 'test_imports', 'test_list'],
        num_rows: 120
    })
    test: Dataset({
        features: ['source_file', 'task_id', 'prompt', 'code', 'test_imports', 'test_list'],
        num_rows: 257
    })
    validation: Dataset({
        features: ['source_file', 'task_id', 'prompt', 'code', 'test_imports', 'test_list'],
        num_rows: 43
    })
    prompt: Dataset({
        features: ['source_file', 'task_id', 'prompt', 'code', 'test_imports', 'test_list'],
        num_rows: 7
    })
})
DatasetDict({
    test: Dataset({
        features: ['task_id', 'prompt', 'canonical_solution', 'test', 'entry_point'],
        num_rows: 164
    })
})


In [8]:
os.makedirs("models", exist_ok=True)
load_sft = True

# MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"
# MODEL_PATH = "models/qwen3-0.6b"
MODEL_PATH = "models/qwen3-4B-instruct-2507"

if load_sft:
	MODEL_PATH += "-sft"

In [9]:
snapshot_download(
	repo_id=MODEL_NAME,
	local_dir=MODEL_PATH,
	local_dir_use_symlinks=False,
	local_files_only=load_sft,
)

Returning existing local_dir `models/qwen3-4B-instruct-2507-sft` as remote repo cannot be accessed in `snapshot_download` (None).


'/home/pavel/projects/mouse-learning/cl-rl-course-work/models/qwen3-4B-instruct-2507-sft'

In [12]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

The tokenizer you are loading from 'models/qwen3-4B-instruct-2507-sft' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


In [11]:
llm = LLM(
	model=MODEL_PATH,
	max_model_len=1024,
	dtype="auto",
	gpu_memory_utilization=0.7,
	enforce_eager=False,
	seed=42,
	enable_prefix_caching=False,
	tokenizer_mode="auto",
	trust_remote_code=True,
)

INFO 01-16 02:26:46 [utils.py:253] non-default args: {'trust_remote_code': True, 'seed': 42, 'max_model_len': 1024, 'enable_prefix_caching': False, 'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'model': 'models/qwen3-4B-instruct-2507-sft'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 01-16 02:26:46 [model.py:514] Resolved architecture: Qwen3ForCausalLM
INFO 01-16 02:26:46 [model.py:1661] Using max model len 1024


INFO 01-16 02:26:47 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.


The tokenizer you are loading from 'models/qwen3-4B-instruct-2507-sft' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.




Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0             Please see https://github.com/pytorch/ao/issues/2919 for more info


[0;36m(EngineCore_DP0 pid=13594)[0;0m INFO 01-16 02:26:53 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='models/qwen3-4B-instruct-2507-sft', speculative_config=None, tokenizer='models/qwen3-4B-instruct-2507-sft', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metric

[W116 02:27:04.662807908 socket.cpp:209] [c10d] The hostname of the client socket cannot be retrieved. err=-3


[0;36m(EngineCore_DP0 pid=13594)[0;0m INFO 01-16 02:27:04 [parallel_state.py:1411] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0
[0;36m(EngineCore_DP0 pid=13594)[0;0m INFO 01-16 02:27:05 [gpu_model_runner.py:3562] Starting to load model models/qwen3-4B-instruct-2507-sft...


[0;36m(EngineCore_DP0 pid=13594)[0;0m We recommend installing via `pip install torch-c-dlpack-ext`


[0;36m(EngineCore_DP0 pid=13594)[0;0m INFO 01-16 02:27:07 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')


Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  33% Completed | 1/3 [00:02<00:04,  2.36s/it]
Loading safetensors checkpoint shards:  67% Completed | 2/3 [00:02<00:01,  1.16s/it]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:06<00:00,  2.42s/it]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:06<00:00,  2.20s/it]
[0;36m(EngineCore_DP0 pid=13594)[0;0m 


[0;36m(EngineCore_DP0 pid=13594)[0;0m INFO 01-16 02:27:14 [default_loader.py:308] Loading weights took 6.78 seconds
[0;36m(EngineCore_DP0 pid=13594)[0;0m INFO 01-16 02:27:14 [gpu_model_runner.py:3659] Model loading took 7.6065 GiB memory and 9.345989 seconds
[0;36m(EngineCore_DP0 pid=13594)[0;0m INFO 01-16 02:27:21 [backends.py:643] Using cache directory: /home/pavel/.cache/vllm/torch_compile_cache/7c7b730590/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=13594)[0;0m INFO 01-16 02:27:21 [backends.py:703] Dynamo bytecode transform time: 5.83 s
[0;36m(EngineCore_DP0 pid=13594)[0;0m INFO 01-16 02:27:27 [backends.py:261] Cache the graph of compile range (1, 8192) for later use
[0;36m(EngineCore_DP0 pid=13594)[0;0m INFO 01-16 02:27:29 [backends.py:278] Compiling a graph for compile range (1, 8192) takes 2.18 s
[0;36m(EngineCore_DP0 pid=13594)[0;0m INFO 01-16 02:27:29 [monitor.py:34] torch.compile takes 8.02 s in total
[0;36m(EngineCore_DP0 pid=13594)[0;

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:02<00:00, 17.34it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:01<00:00, 21.54it/s]


[0;36m(EngineCore_DP0 pid=13594)[0;0m INFO 01-16 02:27:35 [gpu_model_runner.py:4587] Graph capturing finished in 5 secs, took 0.53 GiB
[0;36m(EngineCore_DP0 pid=13594)[0;0m INFO 01-16 02:27:35 [core.py:259] init engine (profile, create kv cache, warmup model) took 20.97 seconds


[0;36m(EngineCore_DP0 pid=13594)[0;0m The tokenizer you are loading from 'models/qwen3-4B-instruct-2507-sft' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


INFO 01-16 02:27:36 [llm.py:360] Supported tasks: ['generate']


In [None]:
def extract_signature_from_mbpp_code(code: str) -> str:
    # Берём первую строку, начинающуюся с def ...
    for line in code.splitlines():
        line = line.strip()
        if line.startswith("def "):
            return line.rstrip(":")  # можно оставить или убрать :
    raise ValueError("No function signature found")


def build_mbpp_prompt(example: dict) -> str:
	task_text = example["prompt"]
	signature_line = extract_signature_from_mbpp_code(example["code"])

	system_msg = (
        "You are an expert Python coding assistant. "
        "Given a problem description and function signature, "
        "implement the function body so that it passes all tests."
    )
	user_msg = (
		"Problem:\n"
		f"{task_text}\n\n"
		"Use the following function signature:\n"
		f"{signature_line}:\n\n"
		"Write the full Python function implementation. "
		"Do NOT change the function name or arguments. "
		"Return only Python code."
	)

	messages = [
		{"role": "system", "content": system_msg},
		{"role": "user", "content": user_msg},
	]

	prompt = tokenizer.apply_chat_template(
		messages,
		tokenize=False,
		add_generation_prompt=True,
		enable_thinking=False,
	)
	return prompt


def build_humaneval_prompt(example: dict) -> str:
    """
    HumanEval: example["prompt"] уже содержит импорты + def + docstring.
    Модели нужно дописать реализацию и вернуть ТОЛЬКО Python-код без markdown.
    """
    task_id = example.get("task_id", "")
    task_text = example["prompt"].rstrip()  # убираем лишние пустые строки в конце
    entry_point = example.get("entry_point", "")

    system_msg = (
        "You are an expert Python coding assistant. "
        "You will be given a Python file snippet containing imports and a single function "
        "signature with a docstring. Complete the function implementation so it is correct."
    )

    user_msg = (
        f"Task ID: {task_id}\n"
        f"Function to implement: {entry_point}\n\n"
        "Complete the following code by writing the function body.\n"
        "- Keep all existing imports, the function name, and its arguments unchanged.\n"
        "- Do not modify the docstring.\n"
        "- You may add local helper functions if needed, but do not change the target signature.\n"
        "- Return ONLY valid Python code (no explanations, no markdown, no code fences).\n\n"
        "Code:\n"
        f"{task_text}\n"
    )

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    return prompt

In [13]:
sampling_params = SamplingParams(
	n=1,
	temperature=0.7,
	min_p=0.1,
	max_tokens=1024,
	ignore_eos=False,
	detokenize=True,
	logprobs=1,
	stop_token_ids=[
		tokenizer.eos_token_id,
		tokenizer.pad_token_id,
	],
	skip_special_tokens=False,
	repetition_penalty=1.05,
)

In [14]:
import multiprocessing as mp
import queue as queue_mod

def _run_code_and_test_worker(
    result_queue: mp.Queue,
    code: str,
    test: str,
):
    """
    Дочерний процесс: выполняет код + один тест.
    В result_queue кладёт True (успех) или False (исключение).
    """
    ns = {}
    try:
        exec(code, ns, ns)
        exec(test, ns, ns)
        result_queue.put(True)
    except Exception:
        result_queue.put(False)


def run_single_test_with_timeout(
    code: str,
    test: str,
    timeout_sec: float = 1.0,
) -> bool:
    """
    Запускает один тест в отдельном процессе.
    Возвращает True, если тест прошёл, False — если упал или истёк таймаут.
    """
    result_queue: mp.Queue = mp.Queue()
    p = mp.Process(
        target=_run_code_and_test_worker,
        args=(result_queue, code, test),
    )
    p.start()

    try:
        passed = result_queue.get(timeout=timeout_sec)
    except queue_mod.Empty:
        # таймаут — считаем тест не пройденным
        passed = False
    finally:
        # на всякий случай всегда прибиваем процесс
        if p.is_alive():
            p.terminate()
        p.join()

    return bool(passed)


In [15]:
def extract_code_from_completion(text: str) -> str:
    """
    Вырезаем код из ответа модели.
    Если есть блок ```python ... ```, забираем его.
    Иначе возвращаем текст как есть.
    """
    m = re.search(r"```(?:python)?\s*(.*?)```", text, re.DOTALL)
    if m:
        return m.group(1)
    return text


def extract_tests(test_str: str) -> List[str]:
    """
    Достаёт все строки/блоки, начинающиеся с `assert`, и возвращает их как отдельные тесты.
    Работает для типичного HumanEval, где asserts идут одной строкой.
    Поддерживает простые многострочные asserts в скобках.
    """
    if not test_str:
        return []

    lines = test_str.splitlines()
    tests: List[str] = []

    i = 0
    while i < len(lines):
        line = lines[i]
        if re.match(r"^\s*assert\b", line):
            buf = [line.rstrip()]
            i += 1

            # Если assert многострочный (обычно из-за открытых скобок), добираем строки,
            # пока баланс скобок не станет нулевым.
            text = "\n".join(buf)
            balance = text.count("(") - text.count(")")
            balance += text.count("[") - text.count("]")
            balance += text.count("{") - text.count("}")

            while i < len(lines) and balance > 0:
                buf.append(lines[i].rstrip())
                text = "\n".join(buf)
                balance = text.count("(") - text.count(")")
                balance += text.count("[") - text.count("]")
                balance += text.count("{") - text.count("}")
                i += 1

            tests.append("\n".join(buf).strip())
        else:
            i += 1

    return tests


def run_mbpp_tests_for_sample(
		raw_text: str,
		test_list: str,
) -> tuple[int, int]:
    """
    Выполнить сгенерированный код и прогнать тесты.
    Возвращает (num_passed, num_tests).
    """
    code = extract_code_from_completion(raw_text)
    tests = test_list

    num_tests = len(tests)
    num_passed = 0

    for t in tests:
        ok = run_single_test_with_timeout(
            code=code,
            test=t,
        )
        if ok:
            num_passed += 1
        # если False — тест либо кинул исключение, либо ушёл в таймаут

    return num_passed, num_tests


def run_humaneval_tests_for_sample(
		raw_text: str,
		test_str: list,
        entry_point: str,
) -> tuple[int, int]:
    """
    Выполнить сгенерированный код и прогнать тесты.
    Возвращает (num_passed, num_tests).
    """
    code = extract_code_from_completion(raw_text)
    tests = extract_tests(test_str)

    num_tests = len(tests)
    num_passed = 0

    for t in tests:
        ok = run_single_test_with_timeout(
            code=code,
            test=f"candidate = {entry_point}\n" + t,
        )
        if ok:
            num_passed += 1
        # если False — тест либо кинул исключение, либо ушёл в таймаут

    return num_passed, num_tests

In [16]:
def run_mbpp_bench(
		mbpp_dataset,
		split: str,
		llm: LLM,
		sampling_params: dict,
		batch_size: int = 2048,
):
	dataset = mbpp_dataset[split]
	tests = dataset["test_list"]

	outputs = []
	metrics = []
	batch_bar = tqdm(range(0, len(dataset), batch_size))
	for i in batch_bar:
		begin = i
		end = min(i + batch_size, len(dataset))
		batch = dataset.select(range(begin, end))
		prompts = [build_mbpp_prompt(x) for x in batch]
		outs = llm.generate(prompts, sampling_params)
		test_values = [
			run_mbpp_tests_for_sample(outs[j].outputs[0].text, tests[i+j])
			for j in range(
				batch_size
				if end == batch_size
				else len(dataset) - batch_size * i
			)
		]
		mets = [{"pass@1": passed // overall, "%passed": passed / overall,} for passed, overall in test_values]

		metrics.extend(mets)
		outputs.extend(outs)
	return outputs, metrics

In [17]:
def run_humaneval_bench(
		humaneval_dataset,
		split: str,
		llm: LLM,
		sampling_params: dict,
		batch_size: int = 32,
):
	dataset = humaneval_dataset[split]
	tests = dataset["test"]
	entry_point = dataset["entry_point"]

	outputs = []
	metrics = []
	batch_bar = tqdm(range(0, len(dataset), batch_size))
	for i in batch_bar:
		begin = i
		end = min(i + batch_size, len(dataset))
		batch = dataset.select(range(begin, end))
		prompts = [build_humaneval_prompt(x) for x in batch]
		outs = llm.generate(prompts, sampling_params)
		test_values = [
			run_humaneval_tests_for_sample(outs[j].outputs[0].text, tests[i+j], entry_point[j])
			for j in range(
				batch_size
				if end == batch_size
				else len(dataset) - batch_size * i
			)
		]
		mets = [{"pass@1": passed // overall, "%passed": passed / overall,} for passed, overall in test_values]

		metrics.extend(mets)
		outputs.extend(outs)
	return outputs, metrics

In [18]:
index = 10
code = humaneval["test"][index]["canonical_solution"]
task = humaneval["test"][index]["prompt"]
entry_point = humaneval["test"][index]["entry_point"]
test_str = humaneval["test"][index]["test"]
aboba = task + code

run_humaneval_tests_for_sample(aboba, test_str, entry_point)

(5, 5)

## Implementation

### MBPP

In [19]:
results_mbpp, metrics_mbpp = run_mbpp_bench(mbpp, "validation", llm, sampling_params)

  0%|          | 0/1 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/43 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/43 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

In [20]:
pprint(results_mbpp)
pprint(metrics_mbpp)

[RequestOutput(request_id=0, prompt='<|im_start|>system\nYou are an expert Python coding assistant. Given a problem description and function signature, implement the function body so that it passes all tests.<|im_end|>\n<|im_start|>user\nProblem:\nWrite a python function which takes a list of integers and only returns the odd ones.\n\nUse the following function signature:\ndef Split(list):\n\nWrite the full Python function implementation. Do NOT change the function name or arguments. Return only Python code.<|im_end|>\n<|im_start|>assistant\n', prompt_token_ids=[151644, 8948, 198, 2610, 525, 458, 6203, 13027, 10822, 17847, 13, 16246, 264, 3491, 4008, 323, 729, 11957, 11, 4211, 279, 729, 2487, 773, 429, 432, 16211, 678, 7032, 13, 151645, 198, 151644, 872, 198, 31198, 510, 7985, 264, 10135, 729, 892, 4990, 264, 1140, 315, 25780, 323, 1172, 4675, 279, 10322, 6174, 382, 10253, 279, 2701, 729, 11957, 510, 750, 27810, 7307, 7731, 7985, 279, 2480, 13027, 729, 8129, 13, 3155, 4183, 2297, 279, 

In [21]:
pprint(mbpp["train"][0]["test_list"])
pprint(results_mbpp[0].outputs[0].text)
pprint(metrics_mbpp[0])

['assert first_repeated_char("abcabc") == "a"',
 'assert first_repeated_char("abc") == None',
 'assert first_repeated_char("123123") == "1"']
'assistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantas

In [22]:
mean_pass1 = sum([x["pass@1"] for x in metrics_mbpp]) / len(metrics_mbpp)
mean_percent_passed = sum([x["%passed"] for x in metrics_mbpp]) / len(metrics_mbpp)

print(f"Mean pass@1: {mean_pass1}")
print(f"Mean % of passed tests: {mean_percent_passed}")

Mean pass@1: 0.0
Mean % of passed tests: 0.0


In [23]:
def calculate_entropy_from_logprobs(seq) -> float:
    logprobs_list = []
    for token_id, token_logprobs in zip(seq.token_ids, seq.logprobs):
        if token_id in token_logprobs:
            logprob_obj = token_logprobs[token_id]
            if hasattr(logprob_obj, 'logprob'):
                logprob_value = logprob_obj.logprob
            else:
                logprob_value = float(logprob_obj)
            logprobs_list.append(logprob_value)

    logprobs_tensor = torch.tensor(logprobs_list, dtype=torch.float)
    entropy = -logprobs_tensor.mean().item()
    return entropy

In [24]:
aboba = []
for output in results_mbpp:
	seq = output.outputs[0]
	generated_text = seq.text
	stop_type = seq.finish_reason
	response_tokens = len(seq.token_ids)
	entropy = calculate_entropy_from_logprobs(seq)

	aboba.append({
		"generated_text": generated_text,
		"stop_type": stop_type,
		"response_tokens": response_tokens,
		"entropy": entropy,
	})
mean_entropy = 1.0 / len(aboba) * sum([x["entropy"] for x in aboba])

In [25]:
pprint(mean_entropy)

0.00017820066418502138


In [26]:
pprint(aboba[0])

{'entropy': 0.00011850244482047856,
 'generated_text': 'assistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistan

### Humaneval

In [27]:
results_humaneval, metrics_humaneval = run_humaneval_bench(humaneval, "test", llm, sampling_params)

  0%|          | 0/6 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/32 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



KeyboardInterrupt: 

ERROR 01-16 02:28:50 [core_client.py:606] Engine core proc EngineCore_DP0 died unexpectedly, shutting down client.


In [None]:
pprint(results_humaneval)
pprint(metrics_humaneval)

In [None]:
pprint(humaneval["test"][0]["test"])
pprint(results_humaneval[0].outputs[0].text)
pprint(metrics_humaneval[0])

In [None]:
mean_pass1_he = sum([x["pass@1"] for x in metrics_humaneval]) / len(metrics_humaneval)
mean_percent_passed_he = sum([x["%passed"] for x in metrics_humaneval]) / len(metrics_humaneval)

print(f"Mean pass@1: {mean_pass1_he}")
print(f"Mean % of passed tests: {mean_percent_passed_he}")