In [1]:
import sys
from pathlib import Path
# Extend the import search path with the sibling ../modules directory
# (presumably where data_utils / evaluation_utils live — confirm).
modules_dir = Path("../modules").absolute()
sys.path.append(str(modules_dir))

from data_utils import eval_dataset, get_nested_values, load_dataset, safe_loc, make_dataframe_from_sparql_response
from evaluation_utils import (
    average_precision_wrapper,
    compute_metrics_for_two_df,
    compute_metrics_for_two_list,
    cross_product_func,
    is_correct_SPARQL_query,
    keep_id_columns,
    load_and_merge_evaluation_and_gold_dataset,
    precision_recall_fscore_support_wrapper,
)
import argparse
import json
import logging
import nltk
import os
import pandas as pd
import evaluate


In [2]:
# Notebook stand-in for command-line options; the keys become attributes of `args`.
arguments = dict(
    # Executed model outputs to evaluate (existence is checked before use).
    dataset="../outputs/batch_run/experiment_basic_vs_template_2/execution/Mistral-7B-Instruct-v0.2_rv32-ld0-bs1-p0-nta0-e3-basic-basic_engpeft-t0.2-topp0.95_executed.parquet.gzip",
    # Pre-processed gold answers. NOTE(review): this lives under
    # "experiment_basic_vs_template" while `dataset` is under
    # "experiment_basic_vs_template_2" — confirm the mismatch is intended.
    preprocess_gold="../outputs/batch_run/experiment_basic_vs_template/preprocessed_gold.json",
    # Label reported as "model_name" in the summary.
    model="Mistral-7B-Instruct-v0.2",
    output=".",
    save_name="test",
    # Name of a `logging` level; it is upper-cased and looked up on the logging module.
    log_level="warning",
    # Empty string: no log file is passed to logging.basicConfig.
    log_file="",
)

In [3]:
# Expose the settings as attributes (args.dataset, args.model, ...) so they can be
# passed to helpers that take an argparse-style namespace.
args = argparse.Namespace(**arguments)
print(args)

# Translate the textual level (e.g. "warning") into logging's numeric constant;
# anything that does not resolve to an int is rejected.
numeric_log_level = getattr(logging, args.log_level.upper(), None)
if not isinstance(numeric_log_level, int):
    raise ValueError(f"Invalid log level: {args.log_level}.")
# An empty log_file is turned into None, i.e. no log file is configured.
logging.basicConfig(filename=args.log_file or None, level=numeric_log_level)

# Fail fast when an input file is missing.
if not os.path.exists(args.dataset):
    raise FileNotFoundError(f"The dataset file not found with path: {args.dataset}")

# The gold file is optional: only check the path when one was given.
if args.preprocess_gold is not None and not os.path.exists(args.preprocess_gold):
    raise FileNotFoundError(f"The preprocess gold dataset file not found with path: {args.preprocess_gold}")

# Fetch the WordNet corpus up front (presumably needed by the METEOR metric used
# later — confirm). quiet=True suppresses the download log.
nltk.download('wordnet', quiet=True)



True

In [4]:
# Load the predictions and the gold data in one call. The helper returns twelve
# frames, unpacked positionally below: first those for the model outputs, then
# their gold counterparts, and finally the merged frame.
# Note: df_merged_eval is rebuilt by hand in the following cell.
(
    df,
    df_exec_timeout,
    df_exec_fail,
    df_exec_empty,
    df_exec_to_eval,
    df_eval,
    df_gold_eval,
    df_gold_exec_timeout,
    df_gold_exec_fail,
    df_gold_exec_empty,
    df_gold_exec_to_eval,
    df_merged_eval,
) = load_and_merge_evaluation_and_gold_dataset(args)

In [5]:
# Merging manually: start from a copy of the evaluated predictions and attach
# the gold columns one by one.
df_merged_eval = df_eval.copy()

# Gold column -> factory producing the fallback passed to safe_loc as `default`.
# Factories (not shared instances) are used so that every row gets its own fresh
# list / DataFrame, exactly as when the default is written inline in a lambda.
gold_default_factories = {
    "gold_eval": lambda: None,
    "gold_get_nested_values": list,
    "gold_eval_df": pd.DataFrame,
    "gold_id_columns": pd.DataFrame,
}

for gold_column, make_default in gold_default_factories.items():
    # Bind the loop variables as lambda defaults so each pass uses its own pair.
    df_merged_eval[gold_column] = df_merged_eval.apply(
        lambda row, column=gold_column, factory=make_default: safe_loc(
            row, df_gold_eval, column, default=factory()
        ),
        axis=1,
    )

In [6]:
# Computing metrics using scikit-learn
# Each prediction row is scored against its gold counterpart three times:
# on the flattened nested values, on the full result frame, and on the id columns.
top_k = 5  # forwarded as `k` to every metric helper

metric_specs = (
    # (output column, scoring helper, prediction column, gold column)
    ('nested_metrics', compute_metrics_for_two_list, 'get_nested_values', 'gold_get_nested_values'),
    ('cross_metrics', compute_metrics_for_two_df, 'eval_df', 'gold_eval_df'),
    ('id_metrics', compute_metrics_for_two_df, 'id_columns', 'gold_id_columns'),
)

for metrics_column, scorer, results_column, gold_column in metric_specs:
    df_merged_eval[metrics_column] = df_merged_eval.apply(
        lambda row, fn=scorer, res=results_column, ref=gold_column: fn(
            results=row[res], gold=row[ref], k=top_k
        ),
        axis=1,
    )


In [7]:
def _metrics_to_frame(metric_records):
    """Build a DataFrame with one row per record, using each record's `_asdict()` fields as columns."""
    return pd.DataFrame(data=[record._asdict() for record in metric_records])


# One flat frame per metric family, so the individual scores can be averaged column-wise.
nested_metrics = _metrics_to_frame(df_merged_eval['nested_metrics'])
cross_metrics = _metrics_to_frame(df_merged_eval['cross_metrics'])
id_metrics = _metrics_to_frame(df_merged_eval['id_metrics'])

In [8]:
def _mean_scores(metrics_frame):
    """Return the column means, in order: mean_average_precision, precision_k, recall_k, mean_reciprocal_rank."""
    score_columns = ('mean_average_precision', 'precision_k', 'recall_k', 'mean_reciprocal_rank')
    return tuple(metrics_frame[column].mean() for column in score_columns)


# Average every score over all evaluated rows and print the four values, one per
# line, for each metric family in turn (nested values, cross, id).
gnv_map, gnv_precision, gnv_recall, gnv_rr = _mean_scores(nested_metrics)
print(gnv_map, gnv_precision, gnv_recall, gnv_rr, sep="\n")

cross_map, cross_precision, cross_recall, cross_rr = _mean_scores(cross_metrics)
print(cross_map, cross_precision, cross_recall, cross_rr, sep="\n")

id_map, id_precision, id_recall, id_rr = _mean_scores(id_metrics)
print(id_map, id_precision, id_recall, id_rr, sep="\n")

0.006215837539534543
0.008372093023255815
0.009302325581395349
0.012209302325581395
0.008229536591004113
0.011162790697674419
0.01627906976744186
0.01627906976744186
0.007919452064788817
0.008837209302325582
0.011627906976744186
0.009767441860465118


In [9]:
# Sentence-tokenizer data used by nltk.sent_tokenize below.
nltk.download("punkt", quiet=True)
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")
meteor_metric = evaluate.load("meteor")


def _one_sentence_per_line(text):
    """Strip the text, split it into sentences and re-join them with newlines."""
    return "\n".join(nltk.sent_tokenize(text.strip()))


# References come from the 'target_raw' column, predictions from the 'output' column.
decoded_labels = [_one_sentence_per_line(text) for text in df_merged_eval['target_raw']]
decoded_preds = [_one_sentence_per_line(text) for text in df_merged_eval['output']]

rouge_dict = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
bleu_dict = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
meteor_dict = meteor_metric.compute(predictions=decoded_preds, references=decoded_labels)

# Share of rows in `df` whose output is accepted by is_correct_SPARQL_query.
# NOTE(review): this is computed over `df`, whereas the text metrics above use
# `df_merged_eval` — confirm the two different denominators are intended.
correct_syntax = sum(int(is_correct_SPARQL_query(query)) for query in df['output']) / len(df)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [10]:
def _count_empty(frame, column):
    """Number of rows of `frame` whose value in `column` has length zero."""
    return len(frame.loc[frame[column].map(len) == 0])


# Row counts for the model outputs.
prediction_counts = {
    "model_name": args.model,
    "num_rows": len(df),
    "num_gen_fail": len(df.loc[df['has_error'] == True]),
    "num_exec_timeout": len(df_exec_timeout),
    "num_exec_fail": len(df_exec_fail),
    "num_exec_empty": len(df_exec_empty),
    "num_exec_to_eval": len(df_exec_to_eval),
    "num_eval": len(df_eval),
    "num_eval_empty": _count_empty(df_eval, 'eval'),
}

# The same counts for the gold data.
gold_counts = {
    "gold_num_rows": len(df_gold_eval),
    "gold_num_exec_timeout": len(df_gold_exec_timeout),
    "gold_num_exec_fail": len(df_gold_exec_fail),
    "gold_num_exec_empty": len(df_gold_exec_empty),
    "gold_num_exec_to_eval": len(df_gold_exec_to_eval),
    "gold_num_eval_empty": _count_empty(df_gold_eval, 'gold_eval'),
}

# Text-overlap scores; every entry returned by the ROUGE metric is included as-is.
text_scores = {
    "bleu_score": bleu_dict["bleu"],
    "meteor_score": meteor_dict['meteor'],
    **rouge_dict,
}

# Averaged result-set scores for the three comparison views.
ranking_scores = {
    "get_nested_values_precision": gnv_precision,
    "get_nested_values_recall": gnv_recall,
    "get_nested_values_mean_reciprocal_rank": gnv_rr,
    "get_nested_values_mean_average_precision": gnv_map,
    "id_precision": id_precision,
    "id_recall": id_recall,
    "id_mean_reciprocal_rank": id_rr,
    "id_mean_average_precision": id_map,
    "cross_precision": cross_precision,
    "cross_recall": cross_recall,
    "cross_mean_reciprocal_rank": cross_rr,
    "cross_mean_average_precision": cross_map,
}

# Merge in a fixed order so the index of the summary keeps the same layout.
serie = pd.Series(data={
    **prediction_counts,
    **gold_counts,
    **text_scores,
    **ranking_scores,
    "correct_syntax": correct_syntax,
})

In [11]:
# Display the one-run summary assembled in the previous cell.
serie

model_name                                  Mistral-7B-Instruct-v0.2
num_rows                                                         513
num_gen_fail                                                      46
num_exec_timeout                                                  15
num_exec_fail                                                     22
num_exec_empty                                                     0
num_exec_to_eval                                                 430
num_eval                                                         430
num_eval_empty                                                   406
gold_num_rows                                                    503
gold_num_exec_timeout                                             10
gold_num_exec_fail                                                 0
gold_num_exec_empty                                                0
gold_num_exec_to_eval                                            503
gold_num_eval_empty               