In [41]:
import sys
from pathlib import Path
sys.path.append(Path("../modules").absolute().__str__())

import argparse
import json
import logging
import os
import warnings
from io import StringIO

import nltk
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import MultiLabelBinarizer

from evaluation_utils import compute_precision, compute_recall, corpus_meteor, average_precision, is_correct_SPARQL_query, find_id_column, cross_product_func
from data_utils import eval_dataset, get_nested_values, load_dataset, safe_loc, series_or_dataframe_to_list, make_dataframe_from_sparql_response

In [2]:
# Run settings for this notebook; the next cell copies them into an
# argparse.Namespace so they can be read as attributes.
arguments = dict(
    # Executed-predictions file passed to load_dataset().
    dataset="../outputs/batch_run/faith_template5/execution/Mistral-7B-Instruct-v0.2_rv16-ld0.05-bs1-p0-nta1-e3-template_engpeft-t0.2-topp0.95_executed.parquet.gzip",
    # JSON file holding the pre-computed gold evaluation frame.
    preprocess_gold="../outputs/preprocessed_gold.json",
    # Model label reported in the summary Series.
    model="Mistral-7B-Instruct-v0.2",
    # `output` and `save_name` are not read by any cell visible in this notebook.
    output=".",
    save_name="test",
    # Logging configuration: level name and optional log file ("" means no file).
    log_level="warning",
    log_file="",
)

In [3]:
# Expose the settings dict through attribute access (args.dataset, args.model, ...).
args = argparse.Namespace(**arguments)
print(args)

# Resolve the textual level ("warning") to the numeric constant of the logging
# module; anything that does not map to an int is rejected up front.
numeric_log_level = getattr(logging, args.log_level.upper(), None)
if not isinstance(numeric_log_level, int):
    raise ValueError(f"Invalid log level: {args.log_level}.")
# An empty log_file is turned into None so basicConfig is not given "" as a filename.
logging.basicConfig(filename=args.log_file or None, level=numeric_log_level)

# Fail early, with an explicit message, when an input file is missing.
if not os.path.exists(args.dataset):
    raise FileNotFoundError(f"The dataset file not found with path: {args.dataset}")

# Identity comparison is the correct way to test for None (PEP 8).
if args.preprocess_gold is not None and not os.path.exists(args.preprocess_gold):
    raise FileNotFoundError(f"The preprocess gold dataset file not found with path: {args.preprocess_gold}")

# Fetch the WordNet corpus via NLTK — presumably required by corpus_meteor; confirm in evaluation_utils.
nltk.download('wordnet', quiet=True)



True

In [4]:
# Load the executed predictions and split them by execution outcome.
df = load_dataset(args.dataset)
df_no_gen_fail = df # df.drop(failed_generation_index(df))  (generation failures are currently kept)
execution_outcome = df_no_gen_fail['execution']
df_exec_timeout = df_no_gen_fail.loc[execution_outcome == 'timeout']
# na=False: .str.startswith yields NaN for missing entries, and a mask containing
# NaN cannot be used with .loc. Null rows are collected separately just below.
df_exec_fail = df_no_gen_fail.loc[execution_outcome.str.startswith('exception', na=False)]
df_exec_empty = df_no_gen_fail.loc[execution_outcome.isnull()]
# Everything that is neither a timeout, an exception nor empty is evaluated.
df_exec_to_eval = df_no_gen_fail.drop(df_exec_timeout.index).drop(df_exec_fail.index).drop(df_exec_empty.index)
df_eval = eval_dataset(df_exec_to_eval)
df_eval['get_nested_values'] = df_eval.apply(lambda x: get_nested_values(x['eval']), axis=1)
df_eval['eval_df'] = df_eval.apply(lambda x: make_dataframe_from_sparql_response(x['eval']), axis=1)


def _select_id_column(result_frame):
    """Return the column of ``result_frame`` named by find_id_column, or None.

    find_id_column is called once per frame instead of twice.
    """
    id_column_name = find_id_column(result_frame)
    if id_column_name is None:
        return None
    return result_frame[id_column_name]


df_eval['id_column'] = df_eval.apply(lambda x: _select_id_column(x['eval_df']), axis=1)

In [5]:
df_gold_eval = None
# Disabled alternative: rebuild the gold evaluation frame from an executed gold
# dataset instead of reading the pre-processed JSON file.
# if args.gold != None:
#     df_gold = load_dataset(args.gold)
#     df_gold_exec_timeout = df_gold.loc[df_gold['execution'] == 'timeout']
#     df_gold_exec_fail = df_gold.loc[df_gold['execution'].str.startswith('exception')]
#     df_gold_exec_empty = df_gold.loc[df_gold['execution'].isnull()]
#     df_gold_exec_to_eval = df_gold.drop(df_gold_exec_timeout.index).drop(df_gold_exec_fail.index).drop(df_gold_exec_empty.index)
#     df_gold_eval = eval_dataset(df_gold_exec_to_eval, "gold_eval")
#     df_gold_eval['gold_get_nested_values'] = df_gold_eval.apply(lambda x: get_nested_values(x['gold_eval']), axis=1)
# else:
with open(args.preprocess_gold, "r") as f:
    data = json.load(f)
# Passing a literal JSON string straight to pd.read_json is deprecated; wrapping
# it in a file-like object works across pandas versions.
# NOTE(review): assumes data['df_gold_eval'] is a JSON literal, not a path — confirm.
df_gold_eval = pd.read_json(StringIO(data['df_gold_eval']))
df_gold_eval['gold_eval_df'] = df_gold_eval.apply(lambda x: make_dataframe_from_sparql_response(x['gold_eval']), axis=1)


def _gold_id_frame(gold_frame):
    """Return the id column of ``gold_frame`` wrapped in a DataFrame, or None.

    The column name comes from find_id_column, which is called once per frame
    instead of twice.
    """
    id_column_name = find_id_column(gold_frame)
    if id_column_name is None:
        return None
    return pd.DataFrame(data=gold_frame[id_column_name])


df_gold_eval['gold_id_column'] = df_gold_eval.apply(lambda x: _gold_id_frame(x['gold_eval_df']), axis=1)

In [6]:
# Work on a copy so the gold columns are added without touching df_eval.
df_merged_eval = df_eval.copy()

# Merging manually
# Each gold column is fetched row by row through safe_loc with a per-column default.
# NOTE(review): safe_loc presumably looks up the gold row matching the current
# prediction row and falls back to `default` — confirm in data_utils.
df_merged_eval["gold_eval"] = df_merged_eval.apply(
    lambda row: safe_loc(row, df_gold_eval, "gold_eval", default=None),
    axis=1,
)
df_merged_eval["gold_get_nested_values"] = df_merged_eval.apply(
    lambda row: safe_loc(row, df_gold_eval, "gold_get_nested_values", default=[]),
    axis=1,
)
df_merged_eval["gold_eval_df"] = df_merged_eval.apply(
    lambda row: safe_loc(row, df_gold_eval, "gold_eval_df", default=pd.DataFrame()),
    axis=1,
)
# The gold id column is wrapped in a DataFrame and then flattened to a list.
df_merged_eval["gold_id_column"] = df_merged_eval.apply(
    lambda row: series_or_dataframe_to_list(
        pd.DataFrame(data=safe_loc(row, df_gold_eval, "gold_id_column", default=None, as_serie=True))
    ),
    axis=1,
)

In [46]:
# Computing metrics using scikit-learn

# Per-row precision/recall/F-score over the flattened answer values.
# NOTE(review): `precision_recall_fscore_support_wrapper` is not defined or
# imported in any visible cell — presumably it comes from a removed cell or a
# project module; confirm before relying on Restart & Run All.
df_merged_eval['get_nested_values_precision_recall_fscore'] = df_merged_eval.apply(
    lambda row: precision_recall_fscore_support_wrapper(
        row['gold_get_nested_values'],
        row['get_nested_values'],
    ),
    axis=1,
)

# Same metric through cross_product_func: every column of each result frame is
# passed as a single-element list holding the column's values (NaN replaced by "").
df_merged_eval['cross_precision_recall_fscore'] = df_merged_eval.apply(
    lambda row: cross_product_func(
        func=precision_recall_fscore_support,
        y_true=row['gold_eval_df'].apply(lambda column: [column.fillna(value="").to_list()]),
        y_pred=row['eval_df'].apply(lambda column: [column.fillna(value="").to_list()]),
        maximization=True,
        use_binarizer=True,
        average="samples",
    ),
    axis=1,
)

In [15]:
# Computing metrics
df_merged_eval["precision"] = df_merged_eval.apply(lambda x: compute_precision(x['get_nested_values'], x['gold_get_nested_values']), axis=1)
df_merged_eval["recall"] = df_merged_eval.apply(lambda x: compute_recall(x['get_nested_values'], x['gold_get_nested_values']), axis=1)
df_merged_eval["average_precision"] = df_merged_eval.apply(lambda x: average_precision(x['get_nested_values'], x['gold_get_nested_values'], k_max=100000), axis=1)

df_merged_eval["id_precision"] = df_merged_eval.apply(lambda x: compute_precision(series_or_dataframe_to_list(x['id_column']), series_or_dataframe_to_list(x['gold_id_column'])), axis=1)
df_merged_eval["id_recall"] = df_merged_eval.apply(lambda x: compute_recall(series_or_dataframe_to_list(x['id_column']), series_or_dataframe_to_list(x['gold_id_column'])), axis=1)
df_merged_eval["id_average_precision"] = df_merged_eval.apply(lambda x: average_precision(series_or_dataframe_to_list(x['id_column']), series_or_dataframe_to_list(x['gold_id_column']), k_max=100000), axis=1)

df_merged_eval["cross_precision"] = df_merged_eval.apply(lambda x: cross_product_func(compute_precision, x['eval_df'], x['gold_eval_df'], maximization=True), axis=1)
df_merged_eval["cross_recall"] = df_merged_eval.apply(lambda x: cross_product_func(compute_recall, x['eval_df'], x['gold_eval_df'], maximization=True), axis=1)
df_merged_eval["cross_average_precision"] = df_merged_eval.apply(lambda x: cross_product_func(average_precision, x['eval_df'], x['gold_eval_df'], maximization=True, k_max=100000), axis=1)

In [16]:
def _f_score(precision, recall):
    """Harmonic mean of ``precision`` and ``recall``.

    Returns 0.0 when both are zero, so a run with no correct answers does not
    divide by zero. NaN inputs still propagate to a NaN result.
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2 * precision * recall / denominator


# Each F-score is derived from the mean precision and mean recall over all
# evaluated rows; it is not the mean of per-row F-scores.
m_precision = df_merged_eval['precision'].mean()
m_recall = df_merged_eval['recall'].mean()
m_fscore = _f_score(m_precision, m_recall)

id_precision = df_merged_eval['id_precision'].mean()
id_recall = df_merged_eval['id_recall'].mean()
id_fscore = _f_score(id_precision, id_recall)

cross_precision = df_merged_eval['cross_precision'].mean()
cross_recall = df_merged_eval['cross_recall'].mean()
cross_fscore = _f_score(cross_precision, cross_recall)

# Mean of the per-row average-precision columns.
mean_average_precision = df_merged_eval['average_precision'].mean()
mean_id_average_precision = df_merged_eval['id_average_precision'].mean()
mean_cross_average_precision = df_merged_eval['cross_average_precision'].mean()

In [55]:
# Unpack the per-row (precision, recall, fscore, ...) tuples into three parallel
# lists; any row whose stored result is not a tuple counts as 0.
gnv_precision = [
    result[0] if isinstance(result, tuple) else 0
    for result in df_merged_eval['get_nested_values_precision_recall_fscore'].to_list()
]
gnv_recall = [
    result[1] if isinstance(result, tuple) else 0
    for result in df_merged_eval['get_nested_values_precision_recall_fscore'].to_list()
]
gnv_fscore = [
    result[2] if isinstance(result, tuple) else 0
    for result in df_merged_eval['get_nested_values_precision_recall_fscore'].to_list()
]

In [56]:
# Arithmetic mean of each per-row score list.
gnv_prec = sum(gnv_precision) / len(gnv_precision)
gnv_rec = sum(gnv_recall) / len(gnv_recall)
gnv_fsc = sum(gnv_fscore) / len(gnv_fscore)

# One value per line: precision, recall, F-score.
print(gnv_prec, gnv_rec, gnv_fsc, sep="\n")

0.20161290322580644
0.08576367429737423
0.10232636509442887


In [17]:
# Text-level metrics comparing the generated queries with the target templates.
# BLEU: one whitespace-tokenised reference per whitespace-tokenised hypothesis.
bleu_score = corpus_bleu(
    [[reference.split()] for reference in df_no_gen_fail['target_template']],
    [hypothesis.split() for hypothesis in df_no_gen_fail['translated_prompt']],
)
meteor_score = corpus_meteor(df_no_gen_fail['target_template'], df_no_gen_fail['translated_prompt'])
# Fraction of rows whose generated query is accepted by is_correct_SPARQL_query.
correct_syntax = sum(
    int(is_valid)
    for _, is_valid in df_no_gen_fail.apply(
        lambda row: is_correct_SPARQL_query(row['translated_prompt']), axis=1
    ).items()
) / len(df_no_gen_fail)

In [18]:
# Collect the run statistics and every aggregate score into one labelled Series.
serie = pd.Series(
    data=dict(
        model_name=args.model,
        # Row counts for each stage of the pipeline.
        num_rows=len(df),
        num_gen_fail=len(df.loc[df['has_error'] == True]),
        num_exec_timeout=len(df_exec_timeout),
        num_exec_fail=len(df_exec_fail),
        num_exec_empty=len(df_exec_empty),
        num_exec_to_eval=len(df_exec_to_eval),
        num_eval=len(df_eval),
        # Evaluated rows whose `eval` value has length 0.
        num_eval_empty=len(df_eval.loc[df_eval['eval'].map(len) == 0]),
        # Text-level metrics.
        bleu_score=bleu_score,
        meteor_score=meteor_score,
        # Scores over the flattened answer values.
        precision=m_precision,
        recall=m_recall,
        f1score=m_fscore,
        mean_average_precision=mean_average_precision,
        # Scores restricted to the id column.
        id_precision=id_precision,
        id_recall=id_recall,
        id_f1score=id_fscore,
        mean_id_average_precision=mean_id_average_precision,
        # Scores from cross_product_func.
        cross_precision=cross_precision,
        cross_recall=cross_recall,
        cross_f1score=cross_fscore,
        mean_cross_average_precision=mean_cross_average_precision,
        correct_syntax=correct_syntax,
    )
)

In [19]:
# Bare last-line expression: the notebook renders the summary Series built in the previous cell.
serie

model_name                      Mistral-7B-Instruct-v0.2
num_rows                                             511
num_gen_fail                                         139
num_exec_timeout                                      17
num_exec_fail                                        246
num_exec_empty                                         0
num_exec_to_eval                                     248
num_eval                                             248
num_eval_empty                                       125
bleu_score                                       0.03689
meteor_score                                    0.240151
precision                                       0.092549
recall                                          0.085764
f1score                                         0.089027
mean_average_precision                          0.069879
id_precision                                    0.090183
id_recall                                       0.085074
id_f1score                     

In [20]:
# Show the first rows where the flattened-value precision is higher than the
# cross-product precision, with the predicted and gold frames side by side.
df_merged_eval.loc[
    df_merged_eval['precision'] > df_merged_eval['cross_precision'],
    ['eval_df', 'gold_eval_df'],
].head()

Unnamed: 0_level_0,eval_df,gold_eval_df
index,Unnamed: 1_level_1,Unnamed: 2_level_1
1223,probe \ 0 ...,item ...
1860,diseaseLabel 0 influenz...,diseaseLabel numOfCases 0 ...
1949,person ...,item...
