In [None]:
def pass_at_K(passed_results_by_qid, k=[1, 10]):
    """Return the mean pass@k, as a percentage, for each requested k.

    Args:
        passed_results_by_qid: Mapping from question ID to the list of
            per-sample pass flags recorded for that question.
        k: The k values to evaluate.

    Returns:
        A dict keyed by ``"pass@<k>"`` whose values are the mean estimate
        over all questions, times 100 and rounded to one decimal. A k is
        left out unless every question has at least k samples.
    """
    per_question = list(passed_results_by_qid.values())
    total = np.array([len(flags) for flags in per_question])
    correct = np.array([sum(flags) for flags in per_question])

    scores = {}
    for cutoff in k:
        # pass@k is only reported when every question has enough samples.
        if not (total >= cutoff).all():
            continue
        mean_estimate = float(_estimate_pass_at_k(total, correct, cutoff).mean())
        scores[f"pass@{cutoff}"] = round(mean_estimate * 100, 1)
    return scores
            
def compute_score(execution_results):
    """Print pass@k, the parsing success rate and the IDs of passed questions.

    Args:
        execution_results: List of dicts, each with a ``'qid'``, a
            ``'predictions'`` list and a ``'results'`` list whose entries
            carry ``'passed'`` and a ``'result'`` string.

    Returns:
        None. The scores are printed.
    """
    if not execution_results:
        # An empty list (what load_json returns for a missing file) would
        # make the parsing-rate denominator zero and raise ZeroDivisionError.
        print("No execution results to score.")
        return

    passed_results_by_qid = defaultdict(list)
    pylint_failed_count = 0
    sample_num = 1
    for item in execution_results:
        qid = item['qid']
        # The sample count of the last item decides which k is reported.
        sample_num = len(item['predictions'])
        for result in item['results']:
            passed_results_by_qid[qid].append(result['passed'])
            if result['result'].startswith("pylint fail"):
                pylint_failed_count += 1

    k = [1] if sample_num == 1 else [10]
    scores = pass_at_K(passed_results_by_qid, k)
    passed_qids = sorted(
        qid for qid, flags in passed_results_by_qid.items() if any(flags)
    )

    print(f"Pass@{k[0]}: {scores}")
    attempted = sample_num * len(execution_results)
    if attempted:
        rate = round(100 * (1 - pylint_failed_count / attempted), 1)
        print(f"Parsing Success Rate: {rate}%")
    else:
        # The last item had no predictions, so the rate is undefined.
        print("Parsing Success Rate: n/a")
    print(f"Passed QIDs: {passed_qids}")

def load_json(file_path):
    """Load and return the JSON content of *file_path*.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or an empty list when the path does not exist.
    """
    if not os.path.exists(file_path):
        return []
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def main(prediction_file, score_only=False):
    """Score the executed results that belong to *prediction_file*.

    Args:
        prediction_file: Path of the prediction file. A trailing ``.json``
            is replaced with ``_executed.json`` to find the results file.
        score_only: Currently unused by this function.
    """
    suffix = ".json"
    if prediction_file.endswith(suffix):
        # Replace only the trailing extension; str.replace would also rewrite
        # any ".json" appearing earlier in the path (e.g. in a directory name).
        executed_file = prediction_file[:-len(suffix)] + "_executed.json"
    else:
        # No .json extension: the path is used unchanged.
        executed_file = prediction_file

    execution_results = load_json(executed_file)
    compute_score(execution_results)

In [2]:
import os  
import json  
from collections import defaultdict  
import numpy as np  
import pandas as pd 
import itertools


def _estimator(n: int, c: int, k: int) -> float:
    """
    Evaluate comb(n - c, k) / comb(n, k) as a running product.

    Returns 0 when fewer than k of the n samples are incorrect, i.e. when
    ``n - c < k``.
    """
    incorrect = n - c
    if incorrect >= k:
        denominators = np.arange(incorrect + 1, n + 1)
        return np.prod(1.0 - k / denominators)
    return 0
def _estimate_pass_at_k(num_samples, num_correct, k) -> np.ndarray:
    """
    Compute the pass@k estimate for every problem.

    ``num_samples`` is either a single int applied to every problem or a
    sequence with one entry per problem; ``num_correct`` holds the number of
    correct samples per problem. Returns an array with one estimate per
    problem.
    """
    if not isinstance(num_samples, int):
        assert len(num_samples) == len(num_correct)
        sample_counts = iter(num_samples)
    else:
        # A single int means the same sample count for each problem.
        sample_counts = itertools.repeat(num_samples, len(num_correct))

    estimates = []
    for n, c in zip(sample_counts, num_correct):
        estimates.append(1.0 - _estimator(int(n), int(c), k))
    return np.array(estimates)


def pass_at_K(passed_results_by_qid, k=[1, 10]):
    """Compute mean pass@k percentages across all questions.

    Args:
        passed_results_by_qid: Mapping from question ID to the list of
            per-sample pass flags for that question.
        k: The k values to evaluate.

    Returns:
        A dict mapping ``"pass@<k>"`` to the mean estimate over questions,
        multiplied by 100 and rounded to one decimal. Any k larger than the
        sample count of some question is omitted.
    """
    sample_counts = []
    pass_counts = []
    for flags in passed_results_by_qid.values():
        sample_counts.append(len(flags))
        pass_counts.append(sum(flags))
    total = np.array(sample_counts)
    correct = np.array(pass_counts)

    report = {}
    for requested_k in k:
        if (total >= requested_k).all():
            per_question = _estimate_pass_at_k(total, correct, requested_k)
            report[f"pass@{requested_k}"] = round(float(per_question.mean()) * 100, 1)
    return report
            
def compute_score(execution_results):
    """Compute Pass@1 and the parsing success rate for one results file.

    Args:
        execution_results: List of dicts, each with a ``'qid'``, a
            ``'predictions'`` list and a ``'results'`` list whose entries
            carry ``'passed'`` and a ``'result'`` string.

    Returns:
        A dict with keys ``"Pass@1"`` and ``"Parsing Success Rate"``. Both
        are None for empty input. ``"Pass@1"`` is also None when the last
        item has a prediction count other than 1, because only pass@10 is
        requested in that case.
    """
    if not execution_results:
        # An empty list (what load_json returns for a missing file) would
        # make the denominator below zero and raise ZeroDivisionError.
        return {"Pass@1": None, "Parsing Success Rate": None}

    passed_results_by_qid = defaultdict(list)
    pylint_failed_count = 0
    sample_num = 1
    for item in execution_results:
        qid = item['qid']
        # The sample count of the last item decides which k is requested.
        sample_num = len(item['predictions'])
        for result in item['results']:
            passed_results_by_qid[qid].append(result['passed'])
            if result['result'].startswith("pylint fail"):
                pylint_failed_count += 1

    k = [1] if sample_num == 1 else [10]
    scores = pass_at_K(passed_results_by_qid, k)

    attempted = sample_num * len(execution_results)
    if attempted:
        parsing_success_rate = round(100 * (1 - pylint_failed_count / attempted), 1)
    else:
        # The last item had no predictions, so the rate is undefined.
        parsing_success_rate = None

    # Return only relevant scores
    return {
        "Pass@1": scores.get('pass@1', None),
        "Parsing Success Rate": parsing_success_rate,
    }
 
def load_json(file_path):
    """Read *file_path* and return its decoded JSON content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or an empty list when the path does not exist.
    """
    if not os.path.exists(file_path):
        return []
    # Decode as UTF-8 explicitly instead of relying on the locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
def process_files_in_output_directory(output_dir='output', csv_path='results_human_v.csv'):
    """Score every ``*_executed.json`` file in a directory and save a CSV.

    Args:
        output_dir: Directory that is scanned for files ending in
            ``_executed.json``.
        csv_path: Destination of the CSV, one row per scored file, indexed
            by the file name without the ``_executed.json`` suffix.
    """
    suffix = '_executed.json'
    results = {}
    # os.listdir returns names in arbitrary order; sorting keeps the CSV row
    # order stable between runs.
    for file_name in sorted(os.listdir(output_dir)):
        if not file_name.endswith(suffix):
            continue
        execution_results = load_json(os.path.join(output_dir, file_name))
        # Strip only the trailing suffix so names that contain it elsewhere
        # still get distinct keys.
        key = file_name[:-len(suffix)]
        results[key] = compute_score(execution_results)

    # Save results to CSV
    df = pd.DataFrame.from_dict(results, orient='index')
    df.to_csv(csv_path)
# Score all executed-result files and write the CSV when this code runs.
process_files_in_output_directory()