In [1]:
import sys
from pathlib import Path
sys.path.append(Path("../modules").absolute().__str__())

from data_utils import eval_dataset, get_nested_values, load_dataset, safe_loc, series_or_dataframe_to_list, make_dataframe_from_sparql_response
from evaluation_utils import compute_precision, compute_recall, corpus_meteor, average_precision, is_correct_SPARQL_query, keep_id_columns, cross_product_func, precision_recall_fscore_support_wrapper, average_precision_wrapper
from nltk.translate.bleu_score import corpus_bleu
from sklearn.metrics import f1_score, precision_recall_fscore_support, average_precision_score
from sklearn.preprocessing import MultiLabelBinarizer
import argparse
import json
import logging
import nltk
import numpy as np
import os
import pandas as pd
import warnings

In [2]:
arguments = {
    "dataset": "../outputs/batch_run/faith_template5/execution/Mistral-7B-Instruct-v0.2_rv16-ld0.05-bs1-p0-nta1-e3-template_engpeft-t0.2-topp0.95_executed.parquet.gzip",
    "preprocess_gold": "../outputs/preprocessed_gold.json",
    "model": "Mistral-7B-Instruct-v0.2",
    "output": ".",
    "save_name": "test",
    "log_level": "warning",
    "log_file": "",
}

In [3]:
args = argparse.Namespace()
args.__dict__.update(arguments)
print(args)

numeric_log_level = getattr(logging, args.log_level.upper(), None)
if not isinstance(numeric_log_level, int):
    raise ValueError(f"Invalid log level: {args.log_level}.")
logging.basicConfig(filename=args.log_file if args.log_file else None, level=numeric_log_level)

if not os.path.exists(args.dataset):
    raise FileNotFoundError(f"The dataset file not found with path: {args.dataset}")

if args.preprocess_gold != None and not os.path.exists(args.preprocess_gold):
    raise FileNotFoundError(f"The preprocess gold dataset file not found with path: {args.preprocess_gold}")

nltk.download('wordnet', quiet=True)



True

In [4]:
df = load_dataset(args.dataset)
df_no_gen_fail = df # df.drop(failed_generation_index(df))
df_exec_timeout = df_no_gen_fail.loc[df_no_gen_fail['execution'] == 'timeout']
df_exec_fail = df_no_gen_fail.loc[df_no_gen_fail['execution'].str.startswith('exception')]
df_exec_empty = df_no_gen_fail.loc[df_no_gen_fail['execution'].isnull()]
df_exec_to_eval = df_no_gen_fail.drop(df_exec_timeout.index).drop(df_exec_fail.index).drop(df_exec_empty.index)
df_eval = eval_dataset(df_exec_to_eval)
df_eval['get_nested_values'] = df_eval.apply(lambda x: get_nested_values(x['eval']), axis=1)
df_eval['eval_df'] = df_eval.apply(lambda x: make_dataframe_from_sparql_response(x['eval']), axis=1)
df_eval['id_columns'] = df_eval.apply(lambda x: keep_id_columns(x['eval_df']), axis=1)

In [5]:
df_gold_eval = None
# if args.gold != None:
#     df_gold = load_dataset(args.gold)
#     df_gold_exec_timeout = df_gold.loc[df_gold['execution'] == 'timeout']
#     df_gold_exec_fail = df_gold.loc[df_gold['execution'].str.startswith('exception')]
#     df_gold_exec_empty = df_gold.loc[df_gold['execution'].isnull()]
#     df_gold_exec_to_eval = df_gold.drop(df_gold_exec_timeout.index).drop(df_gold_exec_fail.index).drop(df_gold_exec_empty.index)
#     df_gold_eval = eval_dataset(df_gold_exec_to_eval, "gold_eval")
#     df_gold_eval['gold_get_nested_values'] = df_gold_eval.apply(lambda x: get_nested_values(x['gold_eval']), axis=1)
# else:
with open(args.preprocess_gold, "r") as f:
    data = json.load(f)
df_gold_eval = pd.read_json(data['df_gold_eval'])
df_gold_eval['gold_eval_df'] = df_gold_eval.apply(lambda x: make_dataframe_from_sparql_response(x['gold_eval']), axis=1)
df_gold_eval['gold_id_columns'] = df_gold_eval.apply(lambda x: keep_id_columns(x['gold_eval_df']), axis=1)

In [6]:
df_merged_eval = df_eval.copy()

# Merging manually
df_merged_eval["gold_eval"] = df_merged_eval.apply(lambda x: safe_loc(x, df_gold_eval, "gold_eval", default=None), axis=1)
df_merged_eval["gold_get_nested_values"] = df_merged_eval.apply(lambda x: safe_loc(x, df_gold_eval, "gold_get_nested_values", default=[]), axis=1)
df_merged_eval["gold_eval_df"] = df_merged_eval.apply(lambda x: safe_loc(x, df_gold_eval, "gold_eval_df", default=pd.DataFrame()), axis=1)
df_merged_eval["gold_id_columns"] = df_merged_eval.apply(lambda x: safe_loc(x, df_gold_eval, "gold_id_columns", default=pd.DataFrame()), axis=1)

In [7]:
# Computing metrics using scikit-learn

df_merged_eval['get_nested_values_precision_recall_fscore'] = df_merged_eval.apply(lambda x: precision_recall_fscore_support_wrapper(
    x['gold_get_nested_values'],
    x['get_nested_values']
), axis=1)

df_merged_eval['cross_precision_recall_fscore'] = df_merged_eval.apply(lambda x: cross_product_func(
    func=precision_recall_fscore_support_wrapper,
    y_true=x['gold_eval_df'].apply(lambda y: y.fillna(value="")),
    y_pred=x['eval_df'].apply(lambda y: y.fillna(value="")),
    maximization=True,
    use_binarizer=True,
    average="samples"
)
, axis=1)

df_merged_eval['id_precision_recall_fscore'] = df_merged_eval.apply(lambda x: cross_product_func(
    func=precision_recall_fscore_support_wrapper,
    y_true=x['gold_id_columns'].apply(lambda y: y.fillna(value="")),
    y_pred=x['id_columns'].apply(lambda y: y.fillna(value="")),
    maximization=True,
    use_binarizer=True,
    average="samples"
)
, axis=1)

In [22]:
df_merged_eval['cross_mean_average_precision'] = df_merged_eval.apply(lambda x: cross_product_func(
    func=average_precision_wrapper,
    y_true=x['gold_eval_df'].apply(lambda y: y.fillna(value="")),
    y_pred=x['eval_df'].apply(lambda y: y.fillna(value="")),
    maximization=True,
    use_binarizer=True,
    average="macro"
)
, axis=1)

df_merged_eval['id_mean_average_precision'] = df_merged_eval.apply(lambda x: cross_product_func(
    func=average_precision_wrapper,
    y_true=x['gold_id_columns'].apply(lambda y: y.fillna(value="")),
    y_pred=x['id_columns'].apply(lambda y: y.fillna(value="")),
    maximization=True,
    use_binarizer=True,
    average="macro"
)
, axis=1)

In [25]:
entry = df_merged_eval.iloc[0]
average_precision_score(
    y_true = [entry['gold_id_columns']],
    y_score = [entry['id_columns']],
    average='samples'
)

ValueError: unknown format is not supported

In [23]:
df_merged_eval['cross_mean_average_precision'].value_counts()

cross_mean_average_precision
0.0    129
1.0    119
Name: count, dtype: int64

In [9]:
gnv_precision = np.array([r[0] if isinstance(r, tuple) else 0 for r in df_merged_eval['get_nested_values_precision_recall_fscore'].to_list()])
gnv_recall = np.array([r[1] if isinstance(r, tuple) else 0 for r in df_merged_eval['get_nested_values_precision_recall_fscore'].to_list()])
gnv_fscore = np.array([r[2] if isinstance(r, tuple) else 0 for r in df_merged_eval['get_nested_values_precision_recall_fscore'].to_list()])

gnv_prec = gnv_precision.mean()
gnv_rec = gnv_recall.mean()
gnv_fsc = gnv_fscore.mean()

print(gnv_prec)
print(gnv_rec)
print(gnv_fsc)

cross_precision = np.array([r[0] if isinstance(r, tuple) else 0 for r in df_merged_eval['cross_precision_recall_fscore'].to_list()])
cross_recall = np.array([r[1] if isinstance(r, tuple) else 0 for r in df_merged_eval['cross_precision_recall_fscore'].to_list()])
cross_fscore = np.array([r[2] if isinstance(r, tuple) else 0 for r in df_merged_eval['cross_precision_recall_fscore'].to_list()])

cross_prec = cross_precision.mean()
cross_rec = cross_recall.mean()
cross_fsc = cross_fscore.mean()

print(cross_prec)
print(cross_rec)
print(cross_fsc)

id_precision = np.array([r[0] if isinstance(r, tuple) else 0 for r in df_merged_eval['id_precision_recall_fscore'].to_list()])
id_recall = np.array([r[1] if isinstance(r, tuple) else 0 for r in df_merged_eval['id_precision_recall_fscore'].to_list()])
id_fscore = np.array([r[2] if isinstance(r, tuple) else 0 for r in df_merged_eval['id_precision_recall_fscore'].to_list()])

id_prec = id_precision.mean()
id_rec = id_recall.mean()
id_fsc = id_fscore.mean()

print(id_prec)
print(id_rec)
print(id_fsc)

0.20161290322580644
0.08576367429737422
0.10232636509442887
0.2217741935483871
0.11874501879595092
0.13573364312221764
0.16129032258064516
0.08542306238329235
0.0958221468492116


In [10]:
bleu_score = corpus_bleu([[x.split()] for x in df_no_gen_fail['target_template']], [x.split() for x in df_no_gen_fail['translated_prompt']])
meteor_score = corpus_meteor(df_no_gen_fail['target_template'], df_no_gen_fail['translated_prompt'])
correct_syntax = sum(list(map(lambda y: int(y[1]), df_no_gen_fail.apply(lambda x: is_correct_SPARQL_query(x['translated_prompt']), axis=1).items()))) / len(df_no_gen_fail)

In [11]:
serie = pd.Series(data=
    {
        "model_name": args.model,
        "num_rows": len(df),
        "num_gen_fail": len(df.loc[df['has_error'] == True]),
        "num_exec_timeout": len(df_exec_timeout),
        "num_exec_fail": len(df_exec_fail),
        "num_exec_empty": len(df_exec_empty),
        "num_exec_to_eval": len(df_exec_to_eval),
        "num_eval": len(df_eval),
        "num_eval_empty": len(df_eval.loc[df_eval['eval'].map(len) == 0]),
        "bleu_score": bleu_score,
        "meteor_score": meteor_score,
        "precision": gnv_prec,
        "recall": gnv_rec,
        "f1score": gnv_fsc,
        "mean_average_precision": mean_average_precision,
        "id_precision": id_prec,
        "id_recall": id_rec,
        "id_f1score": id_fsc,
        "mean_id_average_precision": mean_id_average_precision,
        "cross_precision": cross_prec,
        "cross_recall": cross_rec,
        "cross_f1score": cross_fsc,
        "mean_cross_average_precision": mean_cross_average_precision,
        "correct_syntax": correct_syntax,
    })

NameError: name 'mean_average_precision' is not defined

In [None]:
serie

model_name                      Mistral-7B-Instruct-v0.2
num_rows                                             511
num_gen_fail                                         139
num_exec_timeout                                      17
num_exec_fail                                        246
num_exec_empty                                         0
num_exec_to_eval                                     248
num_eval                                             248
num_eval_empty                                       125
bleu_score                                       0.03689
meteor_score                                    0.240151
precision                                       0.092549
recall                                          0.085764
f1score                                         0.089027
mean_average_precision                          0.069879
id_precision                                    0.092603
id_recall                                       0.085121
id_f1score                     

In [None]:
df_merged_eval[['eval_df', 'gold_eval_df']].loc[df_merged_eval['precision'] > df_merged_eval['cross_precision']].head()

Unnamed: 0_level_0,eval_df,gold_eval_df
index,Unnamed: 1_level_1,Unnamed: 2_level_1
1223,probe \ 0 ...,item ...
1860,diseaseLabel 0 influenz...,diseaseLabel numOfCases 0 ...
1949,person ...,item...
