In [31]:
import os
import json
import random
import numpy as np

In [40]:
benchmarks = ["APPS", "HumanEval"]
models = ["llama3", "llama3.1", "mistral-nemo", "qwen2.5-coder"]
modes = [None, "instruction", "rule"]

In [33]:
def tie_break(data):
    ties = [i for i in data if i[1][0] == data[0][1][0]]
    return random.choice(ties)

def get_entropy(data):
    probs = np.array(data)[:, 0].astype(float)
    probs = probs / np.sum(probs)
    entropy = -np.sum(probs * np.log2(probs))
    return entropy

def vote(mode, benchmark, model, prompt_mode=None):
    if not prompt_mode:
        with open(f"./preprocessed_results/{mode}/{benchmark}_{model}.json", "r") as f:
            data =json.load(f)
    if prompt_mode:
        with open(f"./preprocessed_results/{mode}/{benchmark}_{model}_{prompt_mode}.json", "r") as f:
            data =json.load(f)
    result = []
    entropy = []

    for problem_id, samples in data.items():
        sample_dict = dict()
        for sample in samples:
            if sample[0] not in sample_dict:
                sample_dict[sample[0]] = [1, sample[1]["result"]]
            elif sample[0] in sample_dict:
                sample_dict[sample[0]][0] += 1
        major = sorted(sample_dict.items(), key=lambda x:x[1][0], reverse=True)
        entropy.append(get_entropy(list(sample_dict.values())))
        if "passed" in tie_break(major)[1][1]:
            result.append(True)
        else:
            result.append(False)
        
    return result, entropy

## Augmented

In [41]:
for bm in benchmarks:
    print(bm)
    for model in models:
        for mode in modes:
            result, entr = vote("augmented", bm, model + "_augmented")
            print(f"{model}: {result.count(True)}/{len(result)}, entropy: {np.mean(entr):.4f}")

APPS
llama3: 22/300, entropy: 4.2075
llama3: 21/300, entropy: 4.2075
llama3: 19/300, entropy: 4.2075
llama3.1: 19/300, entropy: 4.2119
llama3.1: 18/300, entropy: 4.2119
llama3.1: 18/300, entropy: 4.2119
mistral-nemo: 29/300, entropy: 4.2141
mistral-nemo: 26/300, entropy: 4.2141
mistral-nemo: 27/300, entropy: 4.2141
qwen2.5-coder: 19/300, entropy: 4.1487
qwen2.5-coder: 17/300, entropy: 4.1487
qwen2.5-coder: 21/300, entropy: 4.1487
HumanEval
llama3: 60/164, entropy: 3.5220
llama3: 57/164, entropy: 3.5220
llama3: 52/164, entropy: 3.5220
llama3.1: 85/164, entropy: 3.2418
llama3.1: 88/164, entropy: 3.2418
llama3.1: 82/164, entropy: 3.2418
mistral-nemo: 56/164, entropy: 3.2060
mistral-nemo: 57/164, entropy: 3.2060
mistral-nemo: 55/164, entropy: 3.2060
qwen2.5-coder: 106/164, entropy: 2.2601
qwen2.5-coder: 106/164, entropy: 2.2601
qwen2.5-coder: 105/164, entropy: 2.2601


## AST-only

In [42]:
for bm in benchmarks:
    print(bm)
    for model in models:
        for mode in modes:
            result, entr = vote("ast_only", bm, model)
            print(f"{model}: {result.count(True)}/{len(result)}, entropy: {np.mean(entr):.4f}")

APPS
llama3: 20/300, entropy: 4.1782
llama3: 18/300, entropy: 4.1782
llama3: 19/300, entropy: 4.1782
llama3.1: 20/300, entropy: 4.1933
llama3.1: 19/300, entropy: 4.1933
llama3.1: 20/300, entropy: 4.1933
mistral-nemo: 28/300, entropy: 4.1939
mistral-nemo: 34/300, entropy: 4.1939
mistral-nemo: 31/300, entropy: 4.1939
qwen2.5-coder: 17/300, entropy: 4.1335
qwen2.5-coder: 22/300, entropy: 4.1335
qwen2.5-coder: 22/300, entropy: 4.1335
HumanEval
llama3: 60/164, entropy: 3.3802
llama3: 58/164, entropy: 3.3802
llama3: 61/164, entropy: 3.3802
llama3.1: 88/164, entropy: 3.1383
llama3.1: 90/164, entropy: 3.1383
llama3.1: 86/164, entropy: 3.1383
mistral-nemo: 56/164, entropy: 3.0822
mistral-nemo: 57/164, entropy: 3.0822
mistral-nemo: 58/164, entropy: 3.0822
qwen2.5-coder: 107/164, entropy: 2.0959
qwen2.5-coder: 108/164, entropy: 2.0959
qwen2.5-coder: 107/164, entropy: 2.0959


## Variable Unified

In [43]:
for bm in benchmarks:
    print(bm)
    for model in models:
        for mode in modes:
            result, entr = vote("var_unif", bm, model)
            print(f"{model}: {result.count(True)}/{len(result)}, entropy: {np.mean(entr):.4f}")

APPS
llama3: 21/300, entropy: 4.1645
llama3: 17/300, entropy: 4.1645
llama3: 19/300, entropy: 4.1645
llama3.1: 20/300, entropy: 4.1611
llama3.1: 20/300, entropy: 4.1611
llama3.1: 22/300, entropy: 4.1611
mistral-nemo: 35/300, entropy: 4.1611
mistral-nemo: 33/300, entropy: 4.1611
mistral-nemo: 33/300, entropy: 4.1611
qwen2.5-coder: 19/300, entropy: 4.1175
qwen2.5-coder: 21/300, entropy: 4.1175
qwen2.5-coder: 20/300, entropy: 4.1175
HumanEval
llama3: 63/164, entropy: 3.2169
llama3: 63/164, entropy: 3.2169
llama3: 57/164, entropy: 3.2169
llama3.1: 91/164, entropy: 2.9190
llama3.1: 91/164, entropy: 2.9190
llama3.1: 90/164, entropy: 2.9190
mistral-nemo: 58/164, entropy: 2.9634
mistral-nemo: 58/164, entropy: 2.9634
mistral-nemo: 56/164, entropy: 2.9634
qwen2.5-coder: 106/164, entropy: 1.8344
qwen2.5-coder: 107/164, entropy: 1.8344
qwen2.5-coder: 108/164, entropy: 1.8344
