In [1]:
import argparse
import io
import json
import logging
import os
import sys
from pathlib import Path

import evaluate
import nltk
import pandas as pd

# The project modules live outside the notebook directory; extend the import
# path before importing from them.
sys.path.append(Path("../modules").absolute().__str__())

from data_utils import eval_dataset, get_nested_values, load_dataset, safe_loc, make_dataframe_from_sparql_response
from evaluation_utils import  is_correct_SPARQL_query, keep_id_columns, cross_product_func, precision_recall_fscore_support_wrapper, average_precision_wrapper


In [2]:
# Notebook-level configuration, mirroring what a command-line invocation
# would provide. An empty "log_file" means no log file is configured.
arguments = dict(
    dataset="../datasets/Mistral-7B-Instruct-v0.2_rv16-ld0-bs1-p0-nta0-e3-basic_engpeft-t0.2-topp0.95_executed.parquet.gzip",
    preprocess_gold="../datasets/preprocessed_gold.json",
    model="Mistral-7B-Instruct-v0.2",
    output=".",
    save_name="test",
    log_level="warning",
    log_file="",
)

In [3]:
# Expose the configuration dictionary through attribute access, the same way
# parsed command-line arguments would be.
args = argparse.Namespace(**arguments)
print(args)

# Translate the textual log level (e.g. "warning") into the numeric constant
# of the logging module; anything that does not resolve to an int is rejected.
numeric_log_level = getattr(logging, args.log_level.upper(), None)
if not isinstance(numeric_log_level, int):
    raise ValueError(f"Invalid log level: {args.log_level}.")
# An empty log file name is passed on as None.
logging.basicConfig(filename=args.log_file or None, level=numeric_log_level)

# Fail early when an input file is missing. The gold file is optional and is
# only checked when a path was provided.
dataset_found = os.path.exists(args.dataset)
if not dataset_found:
    raise FileNotFoundError(f"The dataset file not found with path: {args.dataset}")

if args.preprocess_gold is not None:
    if not os.path.exists(args.preprocess_gold):
        raise FileNotFoundError(f"The preprocess gold dataset file not found with path: {args.preprocess_gold}")

nltk.download('wordnet', quiet=True)



True

In [4]:
# Load the executed predictions and split the rows by execution outcome.
df = load_dataset(args.dataset)

df_exec_timeout = df.loc[df['execution'] == 'timeout']
# na=False: a missing `execution` value must give False here. Without it the
# mask contains NaN for such rows and boolean indexing with `.loc` raises.
# Those rows are collected separately in `df_exec_empty` below.
df_exec_fail = df.loc[df['execution'].str.startswith('exception', na=False)]
df_exec_empty = df.loc[df['execution'].isnull()]

# Only rows that neither timed out, failed, nor lack an execution result are
# passed on to evaluation.
df_exec_to_eval = df.drop(df_exec_timeout.index).drop(df_exec_fail.index).drop(df_exec_empty.index)
df_eval = eval_dataset(df_exec_to_eval)

# Derived per-row views of the evaluated response, used by the metric cells:
# the nested values, a DataFrame built from the response, and the result of
# keep_id_columns on that DataFrame.
df_eval['get_nested_values'] = df_eval.apply(lambda x: get_nested_values(x['eval']), axis=1)
df_eval['eval_df'] = df_eval.apply(lambda x: make_dataframe_from_sparql_response(x['eval']), axis=1)
df_eval['id_columns'] = df_eval.apply(lambda x: keep_id_columns(x['eval_df']), axis=1)

In [5]:
# Load the pre-computed gold evaluation from the preprocessed JSON file.
# (Recomputing it from a raw gold dataset is not available here: this
# notebook's arguments define no `gold` entry.)
# The encoding is explicit so the read does not depend on the platform default.
with open(args.preprocess_gold, "r", encoding="utf-8") as f:
    data = json.load(f)

# NOTE(review): assumes data['df_gold_eval'] holds the serialized DataFrame as
# a JSON string (not a file path) - confirm against the preprocessing script.
# It is wrapped in StringIO because passing a literal JSON string directly to
# pd.read_json is deprecated.
df_gold_eval = pd.read_json(io.StringIO(data['df_gold_eval']))

# TODO: newer version shouldn't need those 2 lines below
df_gold_eval['gold_eval_df'] = df_gold_eval.apply(lambda x: make_dataframe_from_sparql_response(x['gold_eval']), axis=1)
df_gold_eval['gold_id_columns'] = df_gold_eval.apply(lambda x: keep_id_columns(x['gold_eval_df']), axis=1)

In [32]:
# Display the column labels of the gold evaluation frame.
df_gold_eval.columns

Index(['query', 'description', 'context', 'prompt', 'num_tokens',
       'start_with_SELECT', 'result', 'full_answer', 'is_skipped',
       'is_prompt_too_long', 'execution', 'executed_query', 'input',
       'target_template', 'target_raw', 'gold_eval', 'gold_get_nested_values',
       'gold_eval_df', 'gold_id_columns'],
      dtype='object')

In [34]:
# Split the gold rows by execution outcome, mirroring the split applied to the
# predictions, so the same counts can be reported for both.
df_gold_exec_timeout = df_gold_eval.loc[df_gold_eval['execution'] == 'timeout']
# na=False: a missing `execution` value must give False here. Without it the
# mask contains NaN for such rows and boolean indexing with `.loc` raises.
df_gold_exec_fail = df_gold_eval.loc[df_gold_eval['execution'].str.startswith('exception', na=False)]
df_gold_exec_empty = df_gold_eval.loc[df_gold_eval['execution'].isnull()]
df_gold_exec_to_eval = df_gold_eval.drop(df_gold_exec_timeout.index).drop(df_gold_exec_fail.index).drop(df_gold_exec_empty.index)

In [6]:
df_merged_eval = df_eval.copy()

# Merging manually: each gold column is looked up row by row with safe_loc.
# The fallbacks are given as factories so that every row receives its own
# freshly created default object.
gold_column_defaults = {
    "gold_eval": lambda: None,
    "gold_get_nested_values": list,
    "gold_eval_df": pd.DataFrame,
    "gold_id_columns": pd.DataFrame,
}
for gold_column, make_default in gold_column_defaults.items():
    df_merged_eval[gold_column] = df_merged_eval.apply(
        lambda row: safe_loc(row, df_gold_eval, gold_column, default=make_default()),
        axis=1,
    )

In [7]:
# Computing metrics using scikit-learn

def _cross_prf(row, gold_column, pred_column):
    """Run cross_product_func with the precision/recall/F-score wrapper.

    The two DataFrames stored in ``row[gold_column]`` and ``row[pred_column]``
    get their missing cells replaced by "" column by column before being
    passed as ``y_true`` and ``y_pred``.
    """
    gold_frame = row[gold_column].apply(lambda col: col.fillna(value=""))
    pred_frame = row[pred_column].apply(lambda col: col.fillna(value=""))
    return cross_product_func(
        func=precision_recall_fscore_support_wrapper,
        y_true=gold_frame,
        y_pred=pred_frame,
        maximization=True,
        use_binarizer=True,
        average="samples",
    )


# Nested values are compared directly, without cross_product_func.
df_merged_eval['get_nested_values_precision_recall_fscore'] = df_merged_eval.apply(
    lambda row: precision_recall_fscore_support_wrapper(
        row['gold_get_nested_values'],
        row['get_nested_values'],
    ),
    axis=1,
)

# Full result frames.
df_merged_eval['cross_precision_recall_fscore'] = df_merged_eval.apply(
    lambda row: _cross_prf(row, 'gold_eval_df', 'eval_df'),
    axis=1,
)

# Frames restricted by keep_id_columns.
df_merged_eval['id_precision_recall_fscore'] = df_merged_eval.apply(
    lambda row: _cross_prf(row, 'gold_id_columns', 'id_columns'),
    axis=1,
)

In [8]:
# Computing average precision with custom function
df_merged_eval['get_nested_values_average_precision'] = df_merged_eval.apply(
    lambda row: average_precision_wrapper(
        y_true=row['gold_get_nested_values'],
        y_pred=row['get_nested_values'],
    ),
    axis=1,
)

# The two frame-based variants differ only in which columns they read, so they
# are driven from one table: output column -> (gold column, predicted column).
average_precision_inputs = {
    'cross_average_precision': ('gold_eval_df', 'eval_df'),
    'id_average_precision': ('gold_id_columns', 'id_columns'),
}
for metric_column, (gold_column, pred_column) in average_precision_inputs.items():
    df_merged_eval[metric_column] = df_merged_eval.apply(
        lambda row: cross_product_func(
            func=average_precision_wrapper,
            # Missing cells are replaced by "" column by column.
            y_true=row[gold_column].apply(lambda col: col.fillna(value="")),
            y_pred=row[pred_column].apply(lambda col: col.fillna(value="")),
            maximization=True,
        ),
        axis=1,
    )

In [9]:
def _mean_of_component(scores, position):
    """Average the element at ``position`` of every tuple in ``scores``.

    Entries that are not tuples contribute 0 to the mean.
    """
    return scores.map(lambda entry: entry[position] if isinstance(entry, tuple) else 0).mean()


# Positions 0, 1 and 2 of each stored tuple are read as precision, recall and
# F-score respectively.
gnv_scores = df_merged_eval['get_nested_values_precision_recall_fscore']
gnv_precision, gnv_recall, gnv_fscore = (_mean_of_component(gnv_scores, i) for i in range(3))

cross_scores = df_merged_eval['cross_precision_recall_fscore']
cross_precision, cross_recall, cross_fscore = (_mean_of_component(cross_scores, i) for i in range(3))

id_scores = df_merged_eval['id_precision_recall_fscore']
id_precision, id_recall, id_fscore = (_mean_of_component(id_scores, i) for i in range(3))

# Printed one value per line: nested values, then cross, then id.
for averaged_score in (
    gnv_precision, gnv_recall, gnv_fscore,
    cross_precision, cross_recall, cross_fscore,
    id_precision, id_recall, id_fscore,
):
    print(averaged_score)

0.009584664536741214
0.00044414841004824075
0.0008038206890048989
0.009584664536741214
0.0015160835830373266
0.002464304326856159
0.006389776357827476
4.285253227221984e-05
8.500628832784739e-05


In [10]:
# Mean of the per-row average precision for each of the three comparisons.
gnv_map, cross_map, id_map = (
    df_merged_eval[f"{prefix}_average_precision"].mean()
    for prefix in ("get_nested_values", "cross", "id")
)

for mean_average_precision in (gnv_map, cross_map, id_map):
    print(mean_average_precision)

0.000356735740227207
0.0007430034287544037
3.925278995766968e-05


In [26]:
def _one_sentence_per_line(text):
    """Strip ``text``, split it into sentences with NLTK and join them with newlines."""
    return "\n".join(nltk.sent_tokenize(text.strip()))


nltk.download("punkt", quiet=True)
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")
meteor_metric = evaluate.load("meteor")

# References and predictions, both reformatted to one sentence per line.
decoded_labels = [_one_sentence_per_line(text) for text in df_merged_eval['target_raw']]
decoded_preds = [_one_sentence_per_line(text) for text in df_merged_eval['output']]

rouge_dict = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
bleu_dict = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
meteor_dict = meteor_metric.compute(predictions=decoded_preds, references=decoded_labels)

# Share of rows in the full dataset for which is_correct_SPARQL_query,
# converted to int, is 1 for the 'translated_prompt' value.
correct_syntax = sum(int(is_correct_SPARQL_query(query)) for query in df['translated_prompt']) / len(df)

[nltk_data] Downloading package wordnet to C:\Users\Alexis
[nltk_data]     Strappazzon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Alexis
[nltk_data]     Strappazzon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Alexis
[nltk_data]     Strappazzon\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [42]:
# Row counts for the predictions at each stage of the pipeline.
prediction_counts = {
    "model_name": args.model,
    "num_rows": len(df),
    "num_gen_fail": len(df.loc[df['has_error'] == True]),
    "num_exec_timeout": len(df_exec_timeout),
    "num_exec_fail": len(df_exec_fail),
    "num_exec_empty": len(df_exec_empty),
    "num_exec_to_eval": len(df_exec_to_eval),
    "num_eval": len(df_eval),
    "num_eval_empty": len(df_eval.loc[df_eval['eval'].map(len) == 0]),
}

# The same kind of counts for the gold data.
gold_counts = {
    "gold_num_rows": len(df_gold_eval),
    "gold_num_exec_timeout": len(df_gold_exec_timeout),
    "gold_num_exec_fail": len(df_gold_exec_fail),
    "gold_num_exec_empty": len(df_gold_exec_empty),
    "gold_num_exec_to_eval": len(df_gold_exec_to_eval),
    "gold_num_eval_empty": len(df_gold_eval.loc[df_gold_eval['gold_eval'].map(len) == 0]),
}

# Text-similarity scores; every entry of the ROUGE result is included as is.
text_scores = {
    "bleu_score": bleu_dict["bleu"],
    "meteor_score": meteor_dict['meteor'],
    **rouge_dict,
}

# Averaged result-comparison scores, followed by the syntax correctness ratio.
result_scores = {
    "get_nested_values_precision": gnv_precision,
    "get_nested_values_recall": gnv_recall,
    "get_nested_values_f1score": gnv_fscore,
    "get_nested_values_mean_average_precision": gnv_map,
    "id_precision": id_precision,
    "id_recall": id_recall,
    "id_f1score": id_fscore,
    "id_mean_average_precision": id_map,
    "cross_precision": cross_precision,
    "cross_recall": cross_recall,
    "cross_f1score": cross_fscore,
    "cross_mean_average_precision": cross_map,
    "correct_syntax": correct_syntax,
}

# The sections are merged in order, which fixes the order of the Series index.
serie = pd.Series(data={**prediction_counts, **gold_counts, **text_scores, **result_scores})

In [43]:
# Display the assembled summary Series.
serie

model_name                                  Mistral-7B-Instruct-v0.2
num_rows                                                         426
num_gen_fail                                                      69
num_exec_timeout                                                  22
num_exec_fail                                                     91
num_exec_empty                                                     0
num_exec_to_eval                                                 313
num_eval                                                         313
num_eval_empty                                                   294
gold_num_rows                                                    382
gold_num_exec_timeout                                              0
gold_num_exec_fail                                                 0
gold_num_exec_empty                                                0
gold_num_exec_to_eval                                            382
gold_num_eval_empty               