In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
from collections import Counter, defaultdict
from tqdm.auto import tqdm
import numpy as np
import random

from metrics_evaluation.analysis import bootstrap

In [3]:
# Seed NumPy's and the standard library's global random generators
# with the same fixed value.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [4]:
def synthesize_models(model_dictionary, models, metrics_list):
    """Request synthetic variants of every model in ``models``.

    For each base model, ``bootstrap.synthesize_model`` is called once per
    (improve flag, percentage) combination: first every percentage with
    ``improve=True``, then every percentage with ``improve=False``.

    Args:
        model_dictionary: Passed through unchanged to
            ``bootstrap.synthesize_model`` (presumably the per-snippet grade
            entries, extended in place -- TODO confirm against the helper).
        models: Names of the base models. Each one is synthesized against
            all the other names in this list.
        metrics_list: Metric names, passed through unchanged.

    Returns:
        None. Whatever ``bootstrap.synthesize_model`` returns is discarded.
    """
    step_sizes = (1, 3, 5, 10, 15, 20, 25, 30)
    # Flattened (improve, percentage) schedule; the order matches nested
    # loops over the flag first and the percentage second.
    variants = [(improve, step) for improve in (True, False) for step in step_sizes]

    for base_model in tqdm(models):
        rivals = [name for name in models if name != base_model]
        for improve, step in variants:
            bootstrap.synthesize_model(
                model_dictionary,
                base_model,
                rivals,
                step,
                metrics_list,
                improve=improve,
            )

In [5]:
def remove_redundant_models(model_dictionary, metrics_list, model):
    """Delete one model's columns from every entry, in place.

    For each entry the key ``model`` is removed first, followed by one
    ``"<metric>-<model>"`` key per metric in ``metrics_list``.

    Args:
        model_dictionary: Iterable of dict entries to modify.
        metrics_list: Metric names used to build the per-metric keys.
        model: Name of the model whose columns are dropped.

    Raises:
        KeyError: If an entry lacks any of the keys being deleted.
    """
    doomed_keys = [model] + [f"{metric}-{model}" for metric in metrics_list]
    for entry in model_dictionary:
        for key in doomed_keys:
            del entry[key]

def clean_models(model_dictionary, metrics_list):
    """Collect model names and drop models whose outputs duplicate another's.

    Model names are read from the keys of the first entry: a key counts as a
    model unless the text before its first ``'-'`` is a metric name, and the
    keys ``'intent'`` and ``'snippet'`` are excluded. Two models are treated
    as equivalent when their values are equal in every entry.

    Prints the candidate list, then the name of each model found redundant
    (a name is printed once per match, so it may appear several times).

    Args:
        model_dictionary: Sequence of dict entries; redundant models'
            columns are deleted from it in place.
        metrics_list: Metric names.

    Returns:
        The list of model names that remain.
    """
    all_models_list = [
        key
        for key in model_dictionary[0].keys()
        if key.split('-')[0] not in metrics_list
    ]
    for reserved in ('intent', 'snippet'):
        if reserved in all_models_list:
            all_models_list.remove(reserved)

    print(all_models_list)

    models_to_remove = set()
    for position, later in enumerate(all_models_list):
        for earlier in all_models_list[:position]:
            differs = any(item[later] != item[earlier] for item in model_dictionary)
            if not differs:
                # NOTE(review): the earlier name of an equivalent pair is the
                # one dropped, which could be a base model -- confirm that
                # this is the intended choice.
                models_to_remove.add(earlier)
                print(earlier)

    for model in models_to_remove:
        all_models_list.remove(model)
        remove_redundant_models(model_dictionary, metrics_list, model)

    return all_models_list

In [6]:
def run_bootstrap(model_dictionary, metrics_list, all_models_list, models, dataset_name,
                  bootstrap_sampling=500):
    """Run the bootstrap comparison, save its results and print base-model pairs.

    The two values returned by ``bootstrap.bootstrap`` are written as JSON to
    ``data/to-grade/<dataset_name>-model-pairs.json`` and
    ``data/to-grade/<dataset_name>-model-scores.json``. Afterwards, the pair
    entry of every unordered pair of ``models`` is printed.

    Args:
        model_dictionary: Passed through to ``bootstrap.bootstrap``.
        metrics_list: Metric names, passed through to ``bootstrap.bootstrap``.
        all_models_list: Names of all models to compare.
        models: Base model names whose pairwise results are printed; every
            pair must be present in the returned pairs mapping.
        dataset_name: Prefix of the two output file names.
        bootstrap_sampling: Forwarded as ``bootstrap_sampling`` to
            ``bootstrap.bootstrap`` (presumably the number of resampling
            rounds -- TODO confirm). Defaults to 500, the value previously
            hard-coded; lower it for quick trial runs.

    Returns:
        Tuple ``(model_scores, model_pairs)``. Note that the order is the
        reverse of the order in which ``bootstrap.bootstrap`` returns them.
    """
    model_pairs, model_scores = bootstrap.bootstrap(
        model_dictionary,
        all_models_list,
        metrics_list,
        bootstrap_sampling=bootstrap_sampling,
    )

    for suffix, payload in (("model-pairs", model_pairs), ("model-scores", model_scores)):
        with open(f"data/to-grade/{dataset_name}-{suffix}.json", "w") as out_file:
            json.dump(payload, out_file)

    for i, model_1 in enumerate(models):
        for model_2 in models[:i]:
            print(model_1, model_2, model_pairs[model_1][model_2])

    return model_scores, model_pairs

In [7]:
def diff_into_bins(score1, score2):
    """Return the bucket label for the absolute difference of two scores.

    Buckets are half-open on the right: ``"0-2"`` for a difference below 2,
    ``"2-5"`` for [2, 5), ``"5-10"`` for [5, 10), and ``"10-100"`` for
    everything else.
    """
    gap = abs(score1 - score2)
    # Upper bounds are checked in ascending order; the first one that the
    # gap stays below decides the label.
    for upper_bound, label in ((2, "0-2"), (5, "2-5"), (10, "5-10")):
        if gap < upper_bound:
            return label
    return "10-100"


def get_splitting(model_scores, model_pairs, metrics_list, all_models_list, models):
    """Print a report relating bootstrap results of metrics to human grades.

    Five sections are printed, in this order: "Significance splitting",
    "Bins distribution", "Bins aggregated", "Statistics" and "Deviations".
    Every metric except ``"grade"`` takes part in the first four sections.

    Args:
        model_scores: Mapping ``model -> {metric: score, metric + '-low': ...,
            metric + '-high': ...}``.
        model_pairs: Mapping ``model_1 -> model_2 -> {metric: value}``; the
            value is compared against the 0.05 / 0.95 thresholds.
        metrics_list: All metric names, including ``"grade"``.
        all_models_list: Names of all models whose pairs are analysed.
        models: Base model names used in the "Deviations" section.

    Returns:
        None. All results are printed.
    """
    bucket_labels = ["0-2", "2-5", "5-10", "10-100"]
    bootstrap_metrics = [metric for metric in metrics_list if metric != "grade"]

    # Per metric and score-difference bucket, a two-element counter:
    # index 0 -- pair value is outside (or on the edge of) the (0.05, 0.95) band,
    # index 1 -- pair value is strictly inside that band.
    significance_splitting = dict()
    for metric in bootstrap_metrics:
        significance_splitting[metric] = {bucket: [0, 0] for bucket in bucket_labels}

        for i, model_1 in enumerate(all_models_list):
            for model_2 in all_models_list[:i]:
                # Scores are multiplied by 100 before bucketing (presumably
                # they are fractions in [0, 1] -- TODO confirm).
                model1_score = model_scores[model_1][metric] * 100
                model2_score = model_scores[model_2][metric] * 100

                metric_significance = model_pairs[model_1][model_2][metric]
                bucket = diff_into_bins(model1_score, model2_score)
                if 0.95 > metric_significance > 0.05:
                    significance_splitting[metric][bucket][1] += 1
                else:
                    significance_splitting[metric][bucket][0] += 1

    print("Significance splitting:")
    print(significance_splitting)
    print()

    splitting = bootstrap.split_into_bins(
        model_pairs,
        model_scores,
        bootstrap_metrics,
        all_models_list
    )

    # Count (value, grade) pairs per metric. Numeric values are multiplied by
    # 100 and truncated to int; the marker 'NS' is kept as its own key.
    bins_distribution = {}
    for metric in bootstrap_metrics:
        bins_distribution[metric] = defaultdict(Counter)
        for value, grade in splitting[metric]:
            if value != 'NS':
                value = int(value * 100)
            bins_distribution[metric][value][grade] += 1

    print("Bins distribution:")
    print(bins_distribution)
    print()

    # Meaning of the grade codes counted above:
    #   -1 -- 1st type error: metric says A > B, graders say A < B
    #    0 -- ok: metric agrees with graders
    #    1 -- 1st type error: metric says A > B, graders say A ~= B
    #    2 -- 2nd type error: metric says A ~= B, graders say A != B
    error_types = (-1, 0, 1, 2)

    # Integer bins merged into coarse buckets; each bucket is keyed by its
    # first member (0, 2, 5, 10). Values outside 0..99 are not aggregated.
    bins_aggregated = dict()
    all_bins = [
        [0, 1],
        list(range(2, 5)),
        list(range(5, 10)),
        list(range(10, 100))
    ]

    for metric in bootstrap_metrics:
        bins_aggregated[metric] = defaultdict(Counter)
        bins_aggregated[metric]['NS'] = bins_distribution[metric]['NS']
        for bucket in all_bins:
            bucket_name = bucket[0]
            for item in bucket:
                for error in error_types:
                    bins_aggregated[metric][bucket_name][error] += bins_distribution[metric][item][error]

    print("Bins aggregated:")
    print(bins_aggregated)
    print()

    print("Statistics:")
    for metric in bootstrap_metrics:
        num_where_metric_fails = 0
        num_all_bootstrap_pairs = 0
        for (_, value) in bins_aggregated[metric].items():
            for (key1, value1) in value.items():
                num_all_bootstrap_pairs += value1
                if key1 != 0:
                    num_where_metric_fails += value1

        # With no pairs at all (for example fewer than two models) the rate
        # is undefined; report NaN instead of raising ZeroDivisionError.
        if num_all_bootstrap_pairs:
            failure_rate = num_where_metric_fails / num_all_bootstrap_pairs
        else:
            failure_rate = float("nan")
        print(metric, num_all_bootstrap_pairs, failure_rate)
    print()

    print("Deviations:")
    # Score followed by the signed offsets of its '-low' and '-high' values.
    for model in models:
        for metric in metrics_list:
            print(model, metric, model_scores[model][metric], - model_scores[model][metric] + model_scores[model][metric+'-low'], - model_scores[model][metric] + model_scores[model][metric+'-high'])

    # The grade is printed a second time, multiplied by 25 (presumably to
    # rescale a 0-4 grade to 0-100 -- TODO confirm).
    for model in models:
        for metric in ['grade']:
            print(
                model,
                metric,
                25 * model_scores[model][metric],
                25 * (- model_scores[model][metric] + model_scores[model][metric + '-low']),
                25 * (- model_scores[model][metric] + model_scores[model][metric + '-high'])
            )

In [8]:
# Metric names; 'grade' is the one that get_splitting leaves out of its
# bootstrap metrics.
metrics = [
    'grade',
    'rougel',
    'codebleu',
    'chrf',
    'meteor',
    'ruby',
    'bleu',
]

# Base models passed to the pipeline for the "conala" dataset.
conala_models = [
    'baseline',
    'tranx-annot',
    'best-tranx',
    'best-tranx-rerank',
    'codex',
]

# Base models passed to the pipeline for the "hs" dataset.
hs_models = [
    'gcnn',
    'nl2code',
]

In [9]:
# CoNaLa pipeline: load the grades, add synthetic models, drop duplicate
# models, run the bootstrap and print the report.
# The file is opened in a `with` block so the handle is closed right after
# loading instead of being left open.
with open("data/to-grade/conala/conala-all-grades.json") as grades_file:
    conala_grades = json.load(grades_file)
synthesize_models(conala_grades, conala_models, metrics)
all_models_list = clean_models(conala_grades, metrics)
model_scores, model_pairs = run_bootstrap(conala_grades, metrics, all_models_list, conala_models, "conala")
get_splitting(model_scores, model_pairs, metrics, all_models_list, conala_models)

  0%|          | 0/5 [00:00<?, ?it/s]

Failed to generate enough changed snippets. 23 new snippets are lacking.
Failed to generate enough changed snippets. 47 new snippets are lacking.
Failed to generate enough changed snippets. 71 new snippets are lacking.
Failed to generate enough changed snippets. 4 new snippets are lacking.
['baseline', 'tranx-annot', 'best-tranx', 'best-tranx-rerank', 'codex', 'baseline_1_1', 'baseline_3_1', 'baseline_5_1', 'baseline_10_1', 'baseline_15_1', 'baseline_20_1', 'baseline_25_1', 'baseline_30_1', 'baseline_1_0', 'baseline_3_0', 'baseline_5_0', 'baseline_10_0', 'baseline_15_0', 'baseline_20_0', 'baseline_25_0', 'baseline_30_0', 'tranx-annot_1_1', 'tranx-annot_3_1', 'tranx-annot_5_1', 'tranx-annot_10_1', 'tranx-annot_15_1', 'tranx-annot_20_1', 'tranx-annot_25_1', 'tranx-annot_30_1', 'tranx-annot_1_0', 'tranx-annot_3_0', 'tranx-annot_5_0', 'tranx-annot_10_0', 'tranx-annot_15_0', 'tranx-annot_20_0', 'tranx-annot_25_0', 'tranx-annot_30_0', 'best-tranx_1_1', 'best-tranx_3_1', 'best-tranx_5_1', 'be

100%|██████████| 500/500 [48:49<00:00,  5.86s/it]


tranx-annot baseline {'grade': 1.0, 'rougel': 1.0, 'codebleu': 0.986, 'chrf': 1.0, 'meteor': 1.0, 'ruby': 0.462, 'bleu': 1.0}
best-tranx baseline {'grade': 1.0, 'rougel': 1.0, 'codebleu': 1.0, 'chrf': 1.0, 'meteor': 1.0, 'ruby': 0.834, 'bleu': 1.0}
best-tranx tranx-annot {'grade': 1.0, 'rougel': 0.994, 'codebleu': 0.894, 'chrf': 1.0, 'meteor': 0.99, 'ruby': 0.918, 'bleu': 0.998}
best-tranx-rerank baseline {'grade': 1.0, 'rougel': 1.0, 'codebleu': 0.998, 'chrf': 1.0, 'meteor': 1.0, 'ruby': 0.978, 'bleu': 1.0}
best-tranx-rerank tranx-annot {'grade': 1.0, 'rougel': 1.0, 'codebleu': 0.922, 'chrf': 1.0, 'meteor': 1.0, 'ruby': 0.998, 'bleu': 1.0}
best-tranx-rerank best-tranx {'grade': 0.998, 'rougel': 0.998, 'codebleu': 0.706, 'chrf': 1.0, 'meteor': 0.996, 'ruby': 0.998, 'bleu': 0.992}
codex baseline {'grade': 1.0, 'rougel': 1.0, 'codebleu': 1.0, 'chrf': 1.0, 'meteor': 1.0, 'ruby': 1.0, 'bleu': 1.0}
codex tranx-annot {'grade': 1.0, 'rougel': 1.0, 'codebleu': 1.0, 'chrf': 1.0, 'meteor': 1.0, 

In [10]:
# "hs" pipeline: load the grades, add synthetic models, drop duplicate
# models, run the bootstrap and print the report.
# The file is opened in a `with` block so the handle is closed right after
# loading instead of being left open.
with open("data/to-grade/hs/hs-all-grades.json") as grades_file:
    hs_grades = json.load(grades_file)
synthesize_models(hs_grades, hs_models, metrics)
all_models_list = clean_models(hs_grades, metrics)
model_scores, model_pairs = run_bootstrap(hs_grades, metrics, all_models_list, hs_models, "hs")
get_splitting(model_scores, model_pairs, metrics, all_models_list, hs_models)

  0%|          | 0/2 [00:00<?, ?it/s]

Failed to generate enough changed snippets. 1 new snippets are lacking.
Failed to generate enough changed snippets. 3 new snippets are lacking.
Failed to generate enough changed snippets. 7 new snippets are lacking.
Failed to generate enough changed snippets. 3 new snippets are lacking.
Failed to generate enough changed snippets. 7 new snippets are lacking.
Failed to generate enough changed snippets. 1 new snippets are lacking.
['gcnn', 'nl2code', 'gcnn_1_1', 'gcnn_3_1', 'gcnn_5_1', 'gcnn_10_1', 'gcnn_15_1', 'gcnn_20_1', 'gcnn_25_1', 'gcnn_30_1', 'gcnn_1_0', 'gcnn_3_0', 'gcnn_5_0', 'gcnn_10_0', 'gcnn_15_0', 'gcnn_20_0', 'gcnn_25_0', 'gcnn_30_0', 'nl2code_1_1', 'nl2code_3_1', 'nl2code_5_1', 'nl2code_10_1', 'nl2code_15_1', 'nl2code_20_1', 'nl2code_25_1', 'nl2code_30_1', 'nl2code_1_0', 'nl2code_3_0', 'nl2code_5_0', 'nl2code_10_0', 'nl2code_15_0', 'nl2code_20_0', 'nl2code_25_0', 'nl2code_30_0']
gcnn_20_0
gcnn_20_0
gcnn_25_0
nl2code_20_1
nl2code_20_1
nl2code_25_1


100%|██████████| 500/500 [04:48<00:00,  1.73it/s]


nl2code gcnn {'grade': 0.844, 'rougel': 0.946, 'codebleu': 0.66, 'chrf': 0.444, 'meteor': 0.998, 'ruby': 0.836, 'bleu': 0.998}
Significance splitting:
{'rougel': {'0-2': [109, 183], '2-5': [143, 0], '5-10': [0, 0], '10-100': [0, 0]}, 'codebleu': {'0-2': [61, 347], '2-5': [13, 14], '5-10': [0, 0], '10-100': [0, 0]}, 'chrf': {'0-2': [169, 167], '2-5': [99, 0], '5-10': [0, 0], '10-100': [0, 0]}, 'meteor': {'0-2': [46, 177], '2-5': [203, 9], '5-10': [0, 0], '10-100': [0, 0]}, 'ruby': {'0-2': [91, 252], '2-5': [89, 3], '5-10': [0, 0], '10-100': [0, 0]}, 'bleu': {'0-2': [42, 144], '2-5': [153, 19], '5-10': [77, 0], '10-100': [0, 0]}}

Bins distribution:
{'rougel': defaultdict(<class 'collections.Counter'>, {'NS': Counter({2: 98, 0: 85}), 0: Counter({0: 49, 1: 1}), 1: Counter({0: 57, 1: 2}), 2: Counter({0: 115, 1: 1}), 3: Counter({0: 27})}), 'codebleu': defaultdict(<class 'collections.Counter'>, {'NS': Counter({2: 273, 0: 88}), 0: Counter({0: 22, -1: 4, 1: 1}), 1: Counter({0: 34}), 2: Counter