In [1]:
!pip install human_eval

Collecting human_eval
  Downloading human_eval-1.0.3-py3-none-any.whl.metadata (153 bytes)
Collecting fire (from human_eval)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Downloading human_eval-1.0.3-py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25ldone
[?25h  Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114248 sha256=ba4039b8c5007bc722f7943e264a12b41e9433243848c1f3e62420a6370bbcd3
  Stored in directory: /root/.cache/pip/wheels/19/39/2f/2d3cadc408a8804103f1c34ddd4b9f6a93497b11fa96fe738e
Successfully built fire
Installing collected packages: fire, human_eval
Successfully installed fire-0.7.0 human_eval-1.0.3


In [2]:
import os
import json
import human_eval
import time
import requests
import json

from human_eval.data import write_jsonl, read_problems
from human_eval.evaluation import evaluate_functional_correctness
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

In [3]:
def benchmark_model(model, tokenizer, model_name, dataset_path=None, max_new_tokens=None):
    """Generate one completion per HumanEval problem and score pass@1.

    Args:
        model: Causal language model exposing ``generate`` and ``device``
            (e.g. loaded with ``AutoModelForCausalLM``).
        tokenizer: Tokenizer matching ``model``.
        model_name: Label used in the returned summary and as the prefix of the
            solutions file. It may contain ``/`` (Hub ids such as ``org/name``);
            the implied sub-directory is created when missing.
        dataset_path: Path to a HumanEval-format problem file. Defaults to the
            ``HumanEval.jsonl.gz`` file shipped inside the ``human_eval`` package.
        max_new_tokens: Optional cap on the number of generated tokens per
            problem. ``None`` keeps the model's own generation settings.

    Returns:
        dict with keys ``model_name``, ``pass_at_1``, ``total_problems``,
        ``solutions_path`` and ``time`` (seconds spent inside ``model.generate``).

    Note:
        Scoring is delegated to ``evaluate_functional_correctness``, which is
        expected to execute the generated programs; run this only in an
        environment where executing untrusted code is acceptable.
    """
    if dataset_path is None:
        dataset_path = os.path.join(
            os.path.dirname(human_eval.__file__),
            'data',
            'HumanEval.jsonl.gz'
        )

    problems = read_problems(dataset_path)

    # Only pass@1 is requested from the evaluator below, so a single sample
    # per task is sufficient.
    num_generations_per_problem = 1

    generate_kwargs = {'pad_token_id': tokenizer.eos_token_id}
    if max_new_tokens is not None:
        generate_kwargs['max_new_tokens'] = max_new_tokens

    # Follow the model's placement instead of assuming CUDA.
    device = model.device

    solutions = []
    cumulative_time = 0.0
    for task_id, problem in tqdm(problems.items()):
        prompt = problem['prompt']

        # Tokenization does not depend on the sample index, so do it once per task.
        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs["attention_mask"].to(device)
        prompt_length = input_ids.shape[-1]

        for _ in range(num_generations_per_problem):
            start = time.time()
            generate_ids = model.generate(
                input_ids,
                attention_mask=attention_mask,
                **generate_kwargs
            )
            cumulative_time += time.time() - start

            # For a causal LM the returned sequence starts with the prompt tokens.
            # HumanEval expects `completion` to hold only the continuation (the
            # evaluator prepends the prompt itself), so drop the prompt tokens.
            completion = tokenizer.batch_decode(
                generate_ids[:, prompt_length:],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )[0]
            solutions.append({
                'task_id': task_id,
                'prompt': prompt,
                'completion': completion
            })

    output_path = f'{model_name}_humaneval_solutions.jsonl'
    # A model name like "org/name" places the file in a sub-directory; create it
    # so the write does not fail after the whole generation pass has finished.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    write_jsonl(output_path, solutions)

    results = evaluate_functional_correctness(
        output_path,
        n_workers=4,
        timeout=3.0,
        k=[1],
        # Score against the same problems that were used for generation.
        problem_file=dataset_path
    )

    return {
        'model_name': model_name,
        'pass_at_1': results['pass@1'],
        'total_problems': len(problems),
        'solutions_path': output_path,
        'time': cumulative_time
    }

In [None]:
# Candidate checkpoints; only `model_name` below is benchmarked by this cell.
models = ['Qwen/Qwen2.5-Coder-3B', 'bigcode/starcoder2-3b', 'stabilityai/stable-code-3b', 'ibm-granite/granite-3b-code-base-2k', 'openbmb/MiniCPM3-4B']
model_name = 'Qwen/Qwen2.5-Coder-3B'

# Hub ids contain "/", so every file named after the model lands in an
# "<org>/" sub-directory. Create it up front, before the long-running
# benchmark, so neither the solutions file nor the results file fails to open.
results_path = f'{model_name}_benchmark_results.json'
os.makedirs(os.path.dirname(results_path) or '.', exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
results = benchmark_model(
    model,
    tokenizer,
    model_name=model_name
)

# Show the summary once, in readable form.
print(json.dumps(results, indent=2))

with open(results_path, 'w') as f:
    json.dump(results, f, indent=2)

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/139 [00:00<?, ?B/s]

  1%|          | 2/164 [01:39<2:36:04, 57.81s/it]