In [1]:
import os
import json
import random

In [2]:
benchmarks = ["APPS", "HumanEval"]
models = ["llama3", "llama3.1", "mistral-nemo", "qwen2.5-coder"]
modes = [None, "instruction", "rule"]

In [4]:
def tie_break(data):
    ties = [i for i in data if i[1][0] == data[0][1][0]]
    return random.choice(ties)


def vote(mode, benchmark, model, prompt_mode=None):
    if not prompt_mode:
        with open(f"./preprocessed_results/{mode}/{benchmark}_{model}.json", "r") as f:
            data =json.load(f)
    if prompt_mode:
        with open(f"./preprocessed_results/{mode}/{benchmark}_{model}_{prompt_mode}.json", "r") as f:
            data =json.load(f)
    result = []

    for problem_id, samples in data.items():
        sample_dict = dict()
        for sample in samples:
            if sample[0] not in sample_dict:
                sample_dict[sample[0]] = [1, sample[1]["result"]]
            elif sample[0] in sample_dict:
                sample_dict[sample[0]][0] += 1
        major = sorted(sample_dict.items(), key=lambda x:x[1][0], reverse=True)
        if "passed" in tie_break(major)[1][1]:
            result.append(True)
        else:
            result.append(False)
        
    return result

## Augmented

In [8]:
for bm in benchmarks:
    print(bm)
    for model in models:
        for mode in modes:
            result = vote("augmented", bm, model + "_augmented")
            print(f"{bm}_{model}_{mode}: {result.count(True)}/{len(result)}")

APPS
APPS_llama3_None: 19/300
APPS_llama3_instruction: 20/300
APPS_llama3_rule: 21/300
APPS_llama3.1_None: 18/300
APPS_llama3.1_instruction: 21/300
APPS_llama3.1_rule: 20/300
APPS_mistral-nemo_None: 25/300
APPS_mistral-nemo_instruction: 29/300
APPS_mistral-nemo_rule: 28/300
APPS_qwen2.5-coder_None: 18/300
APPS_qwen2.5-coder_instruction: 21/300
APPS_qwen2.5-coder_rule: 18/300
HumanEval
HumanEval_llama3_None: 53/164
HumanEval_llama3_instruction: 56/164
HumanEval_llama3_rule: 56/164
HumanEval_llama3.1_None: 83/164
HumanEval_llama3.1_instruction: 82/164
HumanEval_llama3.1_rule: 85/164
HumanEval_mistral-nemo_None: 55/164
HumanEval_mistral-nemo_instruction: 54/164
HumanEval_mistral-nemo_rule: 55/164
HumanEval_qwen2.5-coder_None: 109/164
HumanEval_qwen2.5-coder_instruction: 108/164
HumanEval_qwen2.5-coder_rule: 107/164


## AST-only

In [6]:
for bm in benchmarks:
    print(bm)
    for model in models:
        for mode in modes:
            result = vote("ast_only", bm, model)
            print(f"{bm}_{model}_{mode}: {result.count(True)}/{len(result)}")

APPS
APPS_llama3_None: 18/300
APPS_llama3_instruction: 21/300
APPS_llama3_rule: 23/300
APPS_llama3.1_None: 21/300
APPS_llama3.1_instruction: 18/300
APPS_llama3.1_rule: 19/300
APPS_mistral-nemo_None: 28/300
APPS_mistral-nemo_instruction: 34/300
APPS_mistral-nemo_rule: 32/300
APPS_qwen2.5-coder_None: 19/300
APPS_qwen2.5-coder_instruction: 21/300
APPS_qwen2.5-coder_rule: 18/300
HumanEval
HumanEval_llama3_None: 59/164
HumanEval_llama3_instruction: 60/164
HumanEval_llama3_rule: 61/164
HumanEval_llama3.1_None: 88/164
HumanEval_llama3.1_instruction: 90/164
HumanEval_llama3.1_rule: 88/164
HumanEval_mistral-nemo_None: 55/164
HumanEval_mistral-nemo_instruction: 58/164
HumanEval_mistral-nemo_rule: 58/164
HumanEval_qwen2.5-coder_None: 108/164
HumanEval_qwen2.5-coder_instruction: 108/164
HumanEval_qwen2.5-coder_rule: 108/164


## Variable Unified

In [7]:
for bm in benchmarks:
    print(bm)
    for model in models:
        for mode in modes:
            result = vote("var_unif", bm, model)
            print(f"{bm}_{model}_{mode}: {result.count(True)}/{len(result)}")

APPS
APPS_llama3_None: 19/300
APPS_llama3_instruction: 20/300
APPS_llama3_rule: 19/300
APPS_llama3.1_None: 20/300
APPS_llama3.1_instruction: 21/300
APPS_llama3.1_rule: 20/300
APPS_mistral-nemo_None: 31/300
APPS_mistral-nemo_instruction: 30/300
APPS_mistral-nemo_rule: 30/300
APPS_qwen2.5-coder_None: 19/300
APPS_qwen2.5-coder_instruction: 18/300
APPS_qwen2.5-coder_rule: 21/300
HumanEval
HumanEval_llama3_None: 62/164
HumanEval_llama3_instruction: 62/164
HumanEval_llama3_rule: 60/164
HumanEval_llama3.1_None: 91/164
HumanEval_llama3.1_instruction: 89/164
HumanEval_llama3.1_rule: 93/164
HumanEval_mistral-nemo_None: 56/164
HumanEval_mistral-nemo_instruction: 59/164
HumanEval_mistral-nemo_rule: 57/164
HumanEval_qwen2.5-coder_None: 111/164
HumanEval_qwen2.5-coder_instruction: 108/164
HumanEval_qwen2.5-coder_rule: 107/164
