In [1]:
import os
import re
import numpy as np
import pandas as pd
import copy
import glob
import json
import cmbagent
from human_eval.data import write_jsonl, read_problems, HUMAN_EVAL, stream_jsonl
from human_eval.evaluation import evaluate_functional_correctness, estimate_pass_at_k
from human_eval.execution import check_correctness

from dotenv import load_dotenv

from collections import defaultdict, Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Union, Iterable, Dict
import itertools
import tqdm

In [10]:
# Load the HumanEval problems and preview one prompt to see what the agent will receive.
problems = read_problems()
print(problems["HumanEval/10"]["prompt"])



def is_palindrome(string: str) -> bool:
    """ Test if given string is a palindrome """
    return string == string[::-1]


def make_palindrome(string: str) -> str:
    """ Find the shortest palindrome that begins with a supplied string.
    Algorithm idea is simple:
    - Find the longest postfix of supplied string that is a palindrome.
    - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.
    >>> make_palindrome('')
    ''
    >>> make_palindrome('cat')
    'catac'
    >>> make_palindrome('cata')
    'catac'
    """



In [None]:
# The problem I am solving here is simply benchmarking cmbagent's responses.

# Architecture:
# Basically, load the problems in a pandas dataframe.
# Iterate over the dataframe and prompts to get the cmbagent responses
# As I saw in the tests, the code written by cmbagent is output to a file inside the cmbagent_output folder, which is created locally.
# The human-eval dataset comes with a method to benchmark the model.
# What I need to do is simply write a function (or a set of functions) that gets the response from cmbagent, stores it, and then compares that response to the expected result using the method provided.

# Non-Parallelized Code

## Modified functions from the Humaneval Benchmark

In [7]:

def evaluate_functional_correctness(
    sample_file: str,
    k: List[int] = [1, 10, 100],
    n_workers: int = 4,
    timeout: float = 3.0,
    problem_file: str = HUMAN_EVAL,
    num_problems: int = 5  # added to limit the number of problems evaluated for cmbagent
):
    """
    Evaluate the functional correctness of generated samples and write the
    per-sample results to f"{sample_file}_results.jsonl".

    Only the first ``num_problems`` problems of ``problem_file`` are evaluated.
    Samples in ``sample_file`` whose ``task_id`` lies outside that subset are
    skipped (both when scoring and when writing the results file) instead of
    raising ``KeyError``.

    Args:
        sample_file: Path of the JSONL file of samples; every record must have
            ``task_id`` and ``completion`` keys.
        k: Values of k for which pass@k is reported. A value is only reported
            when every evaluated problem has at least k samples.
        n_workers: Number of threads used to run ``check_correctness``.
        timeout: Timeout passed to ``check_correctness`` for each sample.
        problem_file: Problem file passed to ``read_problems``.
        num_problems: Number of leading problems to keep (``None`` keeps all).

    Returns:
        Dict mapping ``"pass@{k}"`` to the mean pass@k estimate.

    Raises:
        AssertionError: If an evaluated problem has no sample in ``sample_file``.
    """
    # Keep only the first `num_problems` problems, in file order.
    all_problems = read_problems(problem_file)
    problems = dict(list(all_problems.items())[:num_problems])

    # Check the generated samples against test suites.
    with ThreadPoolExecutor(max_workers=n_workers) as executor:

        futures = []
        completion_id = Counter()
        n_samples = 0
        results = defaultdict(list)

        print("Reading samples...")
        for sample in tqdm.tqdm(stream_jsonl(sample_file)):
            task_id = sample["task_id"]
            if task_id not in problems:
                # The sample belongs to a problem outside the evaluated subset.
                continue
            completion = sample["completion"]
            args = (problems[task_id], completion, timeout, completion_id[task_id])
            futures.append(executor.submit(check_correctness, *args))
            completion_id[task_id] += 1
            n_samples += 1

        assert len(completion_id) == len(problems), "Some problems are not attempted."

        print("Running test suites...")
        for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            results[result["task_id"]].append((result["completion_id"], result))

    # Calculate pass@k.
    total, correct = [], []
    for task_results in results.values():
        # Futures finish in arbitrary order; restore the submission order per task.
        task_results.sort(key=lambda entry: entry[0])
        passed = [entry[1]["passed"] for entry in task_results]
        total.append(len(passed))
        correct.append(sum(passed))
    total = np.array(total)
    correct = np.array(correct)

    pass_at_k = {f"pass@{k_value}": estimate_pass_at_k(total, correct, k_value).mean()
                 for k_value in k if (total >= k_value).all()}

    # Finally, save the results in one file:
    def combine_results():
        # Re-read the samples in file order and attach the matching result; the
        # per-task lists are sorted by completion id, so pop(0) pairs them up.
        for sample in stream_jsonl(sample_file):
            task_id = sample["task_id"]
            if task_id not in problems:
                continue
            result = results[task_id].pop(0)
            sample["result"] = result[1]["result"]
            sample["passed"] = result[1]["passed"]
            yield sample

    out_file = sample_file + "_results.jsonl"
    print(f"Writing results to {out_file}...")
    write_jsonl(out_file, tqdm.tqdm(combine_results(), total=n_samples))

    return pass_at_k


## code

In [3]:


def load_KEY():
    """Call ``load_dotenv()`` and return the ``OPENAI_API_KEY`` environment variable.

    Returns:
        The key as a string, or ``None`` when the variable is not set.
    """
    load_dotenv()
    api_key = os.environ.get('OPENAI_API_KEY')
    return api_key


def my_agent(task, api_key):
    """Send ``task`` to cmbagent's one-shot mode and return whatever it returns.

    Args:
        task: Task text handed to ``cmbagent.one_shot``.
        api_key: OpenAI key, forwarded under the ``'OPENAI'`` entry of ``api_keys``.
    """
    one_shot_options = {
        "task": task,
        "agent": 'engineer',
        "engineer_model": 'gpt-4o-mini',
        "api_keys": {'OPENAI': api_key},
    }
    return cmbagent.one_shot(**one_shot_options)


def extract_code(result, function_signature):
    """Return the first fenced Python block written by the ``engineer`` agent.

    Messages in ``result["chat_history"]`` are scanned in order. Messages whose
    ``content`` is not a string (for example ``None``) are skipped rather than
    raising ``TypeError``.

    Args:
        result: Mapping with an optional ``"chat_history"`` list of message dicts.
        function_signature: Text prepended (right-stripped, followed by a
            newline) when the extracted code does not start with ``"def "``.

    Returns:
        The extracted code as a string, or ``None`` when no engineer message
        contains a complete fenced Python block (a notice is printed in that case).
    """
    for message in result.get("chat_history", []):
        if message.get("name") != "engineer":
            continue
        content = message.get("content")
        # `.get("content", "")` still yields None when the key exists with a
        # None value, so guard on the type instead of relying on the default.
        if not isinstance(content, str):
            continue
        match = re.search(r"```python(.*?)```", content, re.DOTALL)
        if match is None:
            continue
        code = match.group(1).strip()
        if not code.startswith("def "):
            code = function_signature.rstrip() + "\n" + code
        return code
    print("CMBagent failed to create an output")
    return None


def save_completions(completions, data, path="completions.jsonl"):
    """Write one JSONL record per generated completion.

    Failed generations (``None`` or empty completions) are written with an
    empty ``completion`` string instead of being dropped, so every attempt
    stays in the file and in the per-task sample count. Dropping them would
    leave only the successful generations and overstate pass@k.

    Args:
        completions: Mapping from problem index ``i`` to either a list of
            completions or a single completion for ``HumanEval/{i}``.
        data: Mapping from task id to the problem dict (``prompt``, ``test``,
            ``entry_point`` and optionally ``canonical_solution``).
        path: Destination file passed to ``write_jsonl``.
    """
    records = []
    for i, problem_completions in completions.items():
        problem_name = f'HumanEval/{i}'
        problem = data[problem_name]

        # Accept a single completion as well as a list of them.
        if not isinstance(problem_completions, list):
            problem_completions = [problem_completions]

        for completion in problem_completions:
            records.append({
                "task_id": problem_name,
                "prompt": problem["prompt"],
                "canonical_solution": problem.get("canonical_solution", ""),
                "test": problem["test"],
                "entry_point": problem["entry_point"],
                # A missing completion is recorded as "" so it is still evaluated.
                "completion": completion or "",
            })
    write_jsonl(path, records)


def run_benchmark(data, n_tasks, n_completions_per_task=10):
    """Generate completions sequentially, save them, and evaluate pass@k.

    For each of the problems ``HumanEval/0`` .. ``HumanEval/{n_tasks - 1}`` the
    prompt is sent to ``my_agent`` ``n_completions_per_task`` times and the code
    is pulled out with ``extract_code``. All completions are then handed to
    ``save_completions`` and scored from ``completions.jsonl``.

    Args:
        data: Mapping from task id to problem dict (must contain ``prompt``).
        n_tasks: Number of leading problems to run.
        n_completions_per_task: Completions requested per problem.

    Returns:
        The result of ``evaluate_functional_correctness`` (also printed as JSON).
    """
    completions = {}
    api_key = load_KEY()

    for task_index in range(n_tasks):
        prompt = data[f'HumanEval/{task_index}']["prompt"]

        samples = []
        for attempt in range(n_completions_per_task):
            print(f"Generating completion {attempt + 1}/{n_completions_per_task} for problem {task_index}")
            agent_output = my_agent(task=prompt, api_key=api_key)
            # The prompt doubles as the signature used when only a body comes back.
            samples.append(extract_code(agent_output, function_signature=prompt))

        completions[task_index] = samples

    save_completions(completions, data)

    eval_results = evaluate_functional_correctness(
        sample_file="completions.jsonl",
        k=[1, 2, 3, 5, 10, 13, 15],
        timeout=20.0,
        num_problems=n_tasks,
    )

    print(json.dumps(eval_results, indent=2))
    return eval_results

In [None]:
# Small sequential run: the first 3 problems, one completion each.
data = read_problems()
run_benchmark(data, n_tasks=3, n_completions_per_task=1)

# Parallelized code

In [None]:
import os
import re
import json
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed 

from human_eval.data import write_jsonl, read_problems, HUMAN_EVAL, stream_jsonl
from benchmark import benchmark_problem
from benchmark import evaluate_functional_correctness

def load_key():
    """Call ``load_dotenv()`` and return the ``OPENAI_API_KEY`` environment variable.

    Returns:
        The non-empty key string.

    Raises:
        RuntimeError: If the variable is unset or empty.
    """
    load_dotenv()
    api_key = os.environ.get('OPENAI_API_KEY')
    if api_key:
        return api_key
    raise RuntimeError("OPENAI_API_KEY not set in worker")

def my_agent(task, api_key):
    """Run cmbagent's one-shot mode on ``task`` and return its result unchanged.

    Args:
        task: Task text handed to ``cmbagent.one_shot``.
        api_key: OpenAI key, forwarded as ``api_keys={'OPENAI': api_key}``.
    """
    api_keys = {'OPENAI': api_key}
    one_shot_result = cmbagent.one_shot(
        task=task,
        agent='engineer',
        engineer_model='gpt-4o-mini',
        api_keys=api_keys,
    )
    return one_shot_result

def extract_code(result, function_signature):
    """Return the first fenced Python block written by the ``engineer`` agent.

    Messages in ``result["chat_history"]`` are scanned in order. Messages whose
    ``content`` is not a string (for example ``None``) are skipped rather than
    raising ``TypeError``.

    Args:
        result: Mapping with an optional ``"chat_history"`` list of message dicts.
        function_signature: Text prepended (right-stripped, followed by a
            newline) when the extracted code does not start with ``"def "``.

    Returns:
        The extracted code as a string, or ``None`` when no engineer message
        contains a complete fenced Python block.
    """
    for message in result.get("chat_history", []):
        if message.get("name") != "engineer":
            continue
        content = message.get("content")
        # `.get("content", "")` still yields None when the key exists with a
        # None value, so guard on the type instead of relying on the default.
        if not isinstance(content, str):
            continue
        m = re.search(r"```python(.*?)```", content, re.DOTALL)
        if m is None:
            continue
        code = m.group(1).strip()
        if not code.startswith("def "):
            code = function_signature.rstrip() + "\n" + code
        return code
    return None

def save_completions(all_completions, data, path="completions.jsonl"):
    """Print a completion banner and write one JSONL record per completion.

    Failed generations (``None`` or empty completions) are written with an
    empty ``completion`` string instead of being dropped, so every attempt
    stays in the file and in the per-task sample count. Dropping them would
    leave only the successful generations and overstate pass@k.

    Args:
        all_completions: Mapping from problem index ``i`` to the list of
            completions generated for ``HumanEval/{i}``.
        data: Mapping from task id to the problem dict (``prompt``, ``test``,
            ``entry_point`` and optionally ``canonical_solution``).
        path: Destination file passed to ``write_jsonl``.
    """
    banner = "******************************************************************************"
    print(banner)
    print(banner)
    print("ALL THREADS COMPLETED")
    print(banner)
    print(banner)

    records = []
    for i, comp_list in all_completions.items():
        pname = f"HumanEval/{i}"
        prob = data[pname]
        for comp in comp_list:
            records.append({
                "task_id": pname,
                "prompt": prob["prompt"],
                "canonical_solution": prob.get("canonical_solution", ""),
                "test": prob["test"],
                "entry_point": prob["entry_point"],
                # A missing completion is recorded as "" so it is still evaluated.
                "completion": comp or "",
            })
    write_jsonl(path, records)

def run_benchmark_parallel(data, n_tasks, n_completions_per_task=10, max_workers=4):
    """Generate completions for several problems concurrently, then evaluate.

    One ``benchmark_problem`` call per problem ``HumanEval/0`` ..
    ``HumanEval/{n_tasks - 1}`` is submitted to a thread pool. A problem whose
    call raises is reported on stdout and left out of the saved completions.
    The collected completions are handed to ``save_completions`` and scored
    from ``completions.jsonl``.

    Args:
        data: Mapping from task id to problem dict.
        n_tasks: Number of leading problems to run.
        n_completions_per_task: Completions requested per problem.
        max_workers: Size of the thread pool.

    Returns:
        The result of ``evaluate_functional_correctness`` (also printed as JSON).
    """
    # Resolve every problem up front so a missing task id fails before any work starts.
    tasks = []
    for task_index in range(n_tasks):
        tasks.append((task_index, data[f"HumanEval/{task_index}"]))

    all_results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        index_by_future = {}
        for task_index, problem in tasks:
            future = pool.submit(benchmark_problem, task_index, problem, n_completions_per_task)
            index_by_future[future] = task_index

        for finished in as_completed(index_by_future):
            i = index_by_future[finished]
            try:
                idx, completions = finished.result()
                all_results[idx] = completions
                print(f">>> Completed problem {idx}")
            except Exception as e:
                print(f"! problem {i} failed:", e)

    save_completions(all_results, data)

    eval_results = evaluate_functional_correctness(
        sample_file="completions.jsonl",
        k=[1, 2, 3, 5, 10, 13, 15],
        timeout=20.0,
        num_problems=n_tasks,
    )
    print(json.dumps(eval_results, indent=2))
    return eval_results


## Benchmark Results

**Model being benchmarked:** cmbagent engineer agent with GPT-4o-mini

**Test Configuration:**
- num_questions: 30
- completions_per: 15  
- Total_completions: 450

**Note:** These results may not be fully representative, because cmbagent was evaluated on only the first 30 problems, about 18% of the HumanEval dataset. More comprehensive testing may be required for definitive results.

### Pass@k Metrics

| Metric | Score | Percentage |
|--------|-------|------------|
| **pass@1** | 0.9933 | **99.33%** |
| **pass@2** | 0.9990 | **99.90%** |
| **pass@3** | 0.9999 | **99.99%** |
| **pass@5** | 1.0000 | **100.00%** |
| **pass@10** | 1.0000 | **100.00%** |
| **pass@13** | 1.0000 | **100.00%** |
| **pass@15** | 1.0000 | **100.00%** |

### Summary

These are exceptionally strong results! The model achieves near-perfect performance:

- **99.33%** success rate with just 1 attempt
- **99.90%** success rate with 2 attempts  
- **Perfect 100%** success rate with 5 or more attempts

These results show exceptional performance on the evaluated subset of the HumanEval benchmark using the cmbagent multi-agent system in one-shot mode, with the agent generating a correct solution on the first attempt 99.33% of the time and achieving perfect accuracy when given 5 or more attempts.

**Important:** Each completion was generated independently with no learning or improvement between attempts.

In [None]:

# Full parallel run: the first 30 problems, 15 completions each, on 3 worker threads.
data = read_problems()
metrics = run_benchmark_parallel(
    data,
    n_tasks=30,
    n_completions_per_task=15,
    max_workers=3,
)


from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.
    >>> separate_paren_groups('( ) (( )) (( )( ))')
    ['()', '(())', '(()())']
    """
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



def truncate_number(number: float) -> float:
    """ Given a positive floating point number, it can be decomposed into
    and integer part (la

450it [00:00, 61434.00it/s]


Running test suites...


100%|██████████| 450/450 [00:23<00:00, 19.15it/s]


Writing results to completions.jsonl_results.jsonl...


100%|██████████| 450/450 [00:00<00:00, 77293.78it/s]

{
  "pass@1": 0.9933333333333334,
  "pass@2": 0.9990476190476191,
  "pass@3": 0.99992673992674,
  "pass@5": 1.0,
  "pass@10": 1.0,
  "pass@13": 1.0,
  "pass@15": 1.0
}



