In [21]:
import os
import json
import random
import numpy as np

In [22]:
# Benchmark names and model tags iterated over by the evaluation cells below.
benchmarks = [
    "HumanEval",
]
models = [
    "llama3",
    "llama3.1",
    "mistral-nemo",
    "qwen2.5-coder",
]

def tie_break(data, rng=None):
    """Pick one of the top-voted candidates, breaking ties uniformly at random.

    Args:
        data: Non-empty sequence of ``(key, [count, ...])`` pairs. The pairs
            do not need to be pre-sorted; the highest count is computed here.
        rng: Optional object exposing ``choice(seq)`` (e.g. ``random.Random(seed)``)
            used to make the tie-break reproducible. Defaults to the global
            ``random`` module, which matches the original behaviour.

    Returns:
        One element of ``data`` whose count equals the maximum count.

    Raises:
        IndexError: If ``data`` is empty (same exception type the original
            raised via ``random.choice`` on an empty list).
    """
    if not data:
        raise IndexError("tie_break() requires at least one candidate")

    # Derive the winning count from the data itself instead of trusting that
    # data[0] is the maximum; for input sorted by descending count this yields
    # exactly the same list of ties, in the same order.
    top_count = max(item[1][0] for item in data)
    ties = [item for item in data if item[1][0] == top_count]

    chooser = random if rng is None else rng
    return chooser.choice(ties)

def get_entropy(data):
    """Return the Shannon entropy (in bits) of a vote-count distribution.

    Args:
        data: Sequence of rows whose first element is a count (anything
            ``float()``-convertible). Remaining elements of each row are
            ignored and may be of any type or length.

    Returns:
        ``-sum(p * log2(p))`` over the normalised positive counts, as a
        non-negative float. Returns 0.0 when there are no positive counts
        (including empty ``data``).
    """
    # Pull the counts out in Python rather than coercing whole rows into one
    # NumPy array: mixed [count, result] rows would otherwise round-trip the
    # counts through strings, and ragged/non-scalar results make np.array fail.
    counts = np.array([row[0] for row in data], dtype=float)

    # Zero counts contribute nothing to the entropy; dropping them avoids
    # 0 * log2(0) = NaN.
    counts = counts[counts > 0]
    if counts.size == 0:
        return np.float64(0.0)

    probs = counts / np.sum(counts)
    entropy = -np.sum(probs * np.log2(probs))
    # A single-class distribution yields -0.0 after negation; abs() normalises
    # the sign without changing any other value (entropy is never negative).
    return abs(entropy)

def vote(mode, benchmark, model):
    """Majority-vote each problem's samples and report correctness and entropy.

    Loads ``./preprocessed_results/{mode}/{benchmark}_{model}.json``, which is
    read as a mapping from problem id to a list of samples. Each sample is
    indexed as ``sample[0]`` (the grouping key) and ``sample[1]["result"]``
    (the outcome recorded for that key the first time it is seen).
    # NOTE(review): the key is presumably a normalised form of the generated
    # program -- confirm against the preprocessing step.

    Args:
        mode: Sub-directory of ``preprocessed_results`` to read from.
        benchmark: Benchmark name used as the file-name prefix.
        model: Model tag used as the file-name suffix.

    Returns:
        ``(result, entropy)``: two lists with one entry per problem.
        ``result`` holds True when the winning group's outcome contains
        ``"passed"``; ``entropy`` holds ``get_entropy`` of the group counts.
    """
    path = f"./preprocessed_results/{mode}/{benchmark}_{model}.json"
    with open(path, "r") as f:
        data = json.load(f)

    result, entropy = [], []
    for samples in data.values():
        # key -> [number of samples with this key, outcome of the first one]
        tally = {}
        for sample in samples:
            key = sample[0]
            if key in tally:
                tally[key][0] += 1
            else:
                tally[key] = [1, sample[1]["result"]]

        # Stable sort by descending count so the most frequent group is first.
        ranked = list(tally.items())
        ranked.sort(key=lambda item: item[1][0], reverse=True)

        entropy.append(get_entropy(list(tally.values())))

        winner = tie_break(ranked)
        result.append("passed" in winner[1][1])

    return result, entropy

## Augmented

In [28]:
# Majority-vote score and mean entropy per model on the "augmented" results.
for bm in benchmarks:
    print(bm)
    for model in models:
        # Result files for this mode carry an "_augmented" suffix on the model tag.
        result, entr = vote(mode="augmented", benchmark=bm, model=model + "_augmented")
        print(f"{model}: {result.count(True)}/{len(result)}, entropy: {np.mean(entr):.4f}")

HumanEval
llama3: 55/164, entropy: 3.5220
llama3.1: 88/164, entropy: 3.2418
mistral-nemo: 54/164, entropy: 3.2060
qwen2.5-coder: 105/164, entropy: 2.2601


## AST-only

In [29]:
# Majority-vote score and mean entropy per model on the "ast_only" results.
for bm in benchmarks:
    print(bm)
    for model in models:
        result, entr = vote(mode="ast_only", benchmark=bm, model=model)
        print(f"{model}: {result.count(True)}/{len(result)}, entropy: {np.mean(entr):.4f}")

HumanEval
llama3: 57/164, entropy: 3.3802
llama3.1: 88/164, entropy: 3.1383
mistral-nemo: 57/164, entropy: 3.0822
qwen2.5-coder: 107/164, entropy: 2.0959


## Variable Unified

In [30]:
# Majority-vote score and mean entropy per model on the "var_unif" results.
for bm in benchmarks:
    print(bm)
    for model in models:
        result, entr = vote(mode="var_unif", benchmark=bm, model=model)
        print(f"{model}: {result.count(True)}/{len(result)}, entropy: {np.mean(entr):.4f}")

HumanEval
llama3: 60/164, entropy: 3.2169
llama3.1: 88/164, entropy: 2.9190
mistral-nemo: 55/164, entropy: 2.9634
qwen2.5-coder: 109/164, entropy: 1.8344
