In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
from tqdm.auto import tqdm
import nltk
# Download the WordNet corpus for NLTK if it is not already available locally.
# NOTE(review): presumably required by the `meteor` metric used below — confirm.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Egor.Bogomolov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from metrics_evaluation.metrics import (
    codebleu,
    ruby,
    sentence_chrf,
    sentence_bleu,
    meteor,
    rouge,
    tokenize_tranx
)

In [4]:
def compute_metric_all(model_dictionary, model_list, metric_name, metric_function):
    """Score each model output against every snippet and keep the best score.

    For every item and every model field, ``metric_function(snippet, hypothesis)``
    is evaluated once per entry of ``item["snippet"]``. Each value is rounded to
    three decimals, and the largest one is written back into the item under the
    key ``f"{metric_name}-{field}"``. The running best starts at 0, so the stored
    score is never negative and is 0 when ``item["snippet"]`` is empty.

    Args:
        model_dictionary: iterable of mutable mappings; each provides a
            "snippet" entry (iterated over) and one entry per name in model_list.
        model_list: keys whose values are the hypotheses to be scored.
        metric_name: prefix of the result key; also shown in the progress line.
        metric_function: callable invoked as ``metric_function(snippet, hypothesis)``
            that returns a number.

    Returns:
        None. The items are updated in place.
    """
    print(f"Computing {metric_name}...")
    for entry in tqdm(model_dictionary):
        for model in model_list:
            candidate = entry[model]
            best = 0
            for reference in entry["snippet"]:
                # Round first, then compare; on ties the newest score wins.
                best = max(round(metric_function(reference, candidate), 3), best)
            entry[f"{metric_name}-{model}"] = best

In [5]:
# ROUGE-L
def rougel_all(model_dictionary, model_list):
    """Add "rougel-<model>" scores to every item via ``compute_metric_all``.

    The per-pair score is the ``fmeasure`` attribute of the result of
    ``rouge._score_lcs`` applied to the ``tokenize_tranx`` tokens of the
    snippet (first argument) and of the hypothesis (second argument).
    """
    def lcs_fmeasure(snippet, hypothesis):
        score_lcs = rouge._score_lcs
        snippet_tokens = tokenize_tranx(snippet)
        hypothesis_tokens = tokenize_tranx(hypothesis)
        return score_lcs(snippet_tokens, hypothesis_tokens).fmeasure

    compute_metric_all(model_dictionary, model_list, "rougel", lcs_fmeasure)

In [6]:
# codebleu
def codebleu_all(model_dictionary, model_list):
    """Add "codebleu-<model>" scores to every item via ``compute_metric_all``.

    ``codebleu`` is called as ``codebleu(snippet, hypothesis)`` for each pair.
    """
    # compute_metric_all calls the metric positionally as (snippet, hypothesis),
    # which is codebleu's own argument order, so no adapter is needed.
    compute_metric_all(model_dictionary, model_list, "codebleu", codebleu)

In [7]:
# ruby
def ruby_all(model_dictionary, model_list):
    """Add "ruby-<model>" scores to every item via ``compute_metric_all``.

    The per-pair score is element 0 of the value returned by
    ``ruby(hypothesis, snippet)``.
    """
    def ruby_score(snippet, hypothesis):
        # Unlike the other metrics here, ruby() receives the hypothesis first.
        result = ruby(hypothesis, snippet)
        return result[0]

    compute_metric_all(model_dictionary, model_list, "ruby", ruby_score)

In [8]:
# METEOR
def meteor_all(model_dictionary, model_list):
    """Add "meteor-<model>" scores to every item via ``compute_metric_all``.

    The per-pair score is ``meteor`` applied to the ``tokenize_tranx`` tokens
    of the snippet (first argument) and of the hypothesis (second argument).
    """
    def meteor_score(snippet, hypothesis):
        snippet_tokens = tokenize_tranx(snippet)
        hypothesis_tokens = tokenize_tranx(hypothesis)
        return meteor(snippet_tokens, hypothesis_tokens)

    compute_metric_all(model_dictionary, model_list, "meteor", meteor_score)

In [9]:
# chrF
def chrf_all(model_dictionary, model_list):
    """Add a "chrf-<model>" score to every item, in place.

    For each model field, ``sentence_chrf`` is called once with the hypothesis
    and the whole ``item["snippet"]`` value. Its ``score`` attribute is divided
    by 100 and rounded to three decimals before being stored.
    NOTE(review): the division suggests ``score`` is on a 0-100 scale — confirm.
    """
    print("Computing chrf...")
    for entry in tqdm(model_dictionary):
        for model in model_list:
            chrf_result = sentence_chrf(entry[model], entry["snippet"])
            entry[f"chrf-{model}"] = round(chrf_result.score / 100, 3)

In [10]:
# BLEU
def bleu_all(model_dictionary, model_list):
    """Add a "bleu-<model>" score to every item, in place.

    For each model field, ``sentence_bleu`` is called once with the hypothesis
    and the whole ``item["snippet"]`` value. Its ``score`` attribute is divided
    by 100 and rounded to three decimals before being stored.
    NOTE(review): the division suggests ``score`` is on a 0-100 scale — confirm.
    """
    print("Computing bleu...")
    for entry in tqdm(model_dictionary):
        for model in model_list:
            bleu_result = sentence_bleu(entry[model], entry["snippet"])
            entry[f"bleu-{model}"] = round(bleu_result.score / 100, 3)

In [11]:
def run_all_metrics(model_dictionary, model_list):
    """Run every metric helper over the items, one after another.

    Each helper receives the same ``model_dictionary`` and ``model_list``.
    The order is ROUGE-L, BLEU, chrF, METEOR, CodeBLEU, then RUBY.
    """
    metric_runners = (
        rougel_all,
        bleu_all,
        chrf_all,
        meteor_all,
        codebleu_all,
        # Ruby is slow as it computes graph edit distances which might take a while
        ruby_all,
    )
    for runner in metric_runners:
        runner(model_dictionary, model_list)

In [12]:
# Item fields that hold the outputs of the CoNaLa models to be scored.
conala_models_list = ['baseline', 'tranx-annot', 'best-tranx', 'best-tranx-rerank', 'codex']
# Open via a context manager so the handle is closed as soon as parsing ends;
# the encoding is explicit so reading does not depend on the locale default.
with open("data/to-grade/conala/conala-aggregated-grades.json", encoding="utf-8") as conala_input:
    conala_data = json.load(conala_input)
# Adds the "<metric>-<model>" score fields to conala_data in place.
run_all_metrics(conala_data, conala_models_list)

Computing rougel...


  0%|          | 0/472 [00:00<?, ?it/s]

Computing bleu...


  0%|          | 0/472 [00:00<?, ?it/s]

Computing chrf...


  0%|          | 0/472 [00:00<?, ?it/s]

Computing meteor...


  0%|          | 0/472 [00:00<?, ?it/s]

Computing codebleu...


  0%|          | 0/472 [00:00<?, ?it/s]



Computing ruby...


  0%|          | 0/472 [00:00<?, ?it/s]



In [13]:
# Write the scored CoNaLa items; leaving the `with` block closes the file, so
# buffered data is flushed to disk before any later cell reads the file.
with open("data/to-grade/conala/conala-all-grades.json", "w") as conala_output:
    json.dump(conala_data, conala_output)

In [14]:
# Item fields that hold the outputs of the models to be scored for the hs data.
hs_models_list = ['gcnn', 'nl2code']
# Open via a context manager so the handle is closed as soon as parsing ends;
# the encoding is explicit so reading does not depend on the locale default.
with open("data/to-grade/hs/hs-aggregated-grades.json", encoding="utf-8") as hs_input:
    hs_data = json.load(hs_input)
# Adds the "<metric>-<model>" score fields to hs_data in place.
run_all_metrics(hs_data, hs_models_list)

Computing rougel...


  0%|          | 0/66 [00:00<?, ?it/s]

Computing bleu...


  0%|          | 0/66 [00:00<?, ?it/s]

Computing chrf...


  0%|          | 0/66 [00:00<?, ?it/s]

Computing meteor...


  0%|          | 0/66 [00:00<?, ?it/s]

Computing codebleu...


  0%|          | 0/66 [00:00<?, ?it/s]

Computing ruby...


  0%|          | 0/66 [00:00<?, ?it/s]

In [15]:
# Write the scored hs items; leaving the `with` block closes the file, so
# buffered data is flushed to disk before any later cell reads the file.
with open("data/to-grade/hs/hs-all-grades.json", "w") as hs_output:
    json.dump(hs_data, hs_output)