In [1]:
import argparse
import json
import logging
import os
import sys
from io import StringIO
from pathlib import Path

# The local modules imported below live in ../modules, so extend the path first.
sys.path.append(Path("../modules").absolute().__str__())

import nltk
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu

from data_utils import eval_dataset, get_nested_values, load_dataset, safe_loc, series_or_dataframe_to_list
from evaluation_utils import compute_precision, compute_recall, corpus_meteor, average_precision, is_correct_SPARQL_query

In [2]:
def get_value_from_sparql_key(data_dict):
    """Convert a single SPARQL binding into a plain Python value.

    Args:
        data_dict: Mapping describing one bound value. It must contain a
            "type" key and a "value" key, and may contain a "datatype" key.

    Returns:
        The raw "value" entry for "uri" and "literal" bindings, except that a
        literal whose datatype is xsd:integer is converted with ``int()``.

    Raises:
        ValueError: If ``data_dict`` has no "type" key.
        NotImplementedError: If the type is neither "uri" nor "literal".
    """
    if "type" not in data_dict.keys():
        raise ValueError("Type is not in data_dict.")

    ttype = data_dict["type"]
    if ttype not in ("uri", "literal"):
        raise NotImplementedError(f"This type was not implemented, found: {ttype}.")

    raw_value = data_dict["value"]

    # Only integer-typed literals are converted; everything else stays as stored.
    is_integer_literal = (
        ttype == "literal"
        and data_dict.get("datatype") == 'http://www.w3.org/2001/XMLSchema#integer'
    )
    if is_integer_literal:
        return int(raw_value)

    return raw_value

def get_columns_from_sparql_response(response):
    """List the column (variable) names present in an evaluated SPARQL response.

    The names are collected from every row, not only the first one, because a
    row may lack a key that other rows have (make_dataframe_from_sparql_response
    already fills such gaps with None). Names are returned in order of first
    appearance, so the first row's keys always come first.

    Args:
        response: List of rows, each a mapping from column name to binding.

    Returns:
        A list of column names, or None when ``response`` is an empty list.

    Raises:
        TypeError: If ``response`` is not a list.
    """
    if not isinstance(response, list):
        raise TypeError("The response needs to be evaluated or at least a List.")

    if len(response) == 0:
        return None

    # dict.fromkeys removes duplicates while preserving first-seen order.
    return list(dict.fromkeys(key for row in response for key in row.keys()))

def make_dataframe_from_sparql_response(response):
    """Turn an evaluated SPARQL response into a pandas DataFrame.

    One DataFrame column is created per name reported by
    get_columns_from_sparql_response. Each cell holds the value extracted by
    get_value_from_sparql_key, or None when the row has no entry for that column.

    Args:
        response: List of rows, each a mapping from column name to binding.

    Returns:
        A DataFrame with one row per response row, or an empty DataFrame when
        no columns are reported.
    """
    columns = get_columns_from_sparql_response(response)
    if columns is None:
        return pd.DataFrame()

    values_by_column = {name: [] for name in columns}

    # Walk the response row by row so every column list stays aligned.
    for binding in response:
        for name, values in values_by_column.items():
            values.append(get_value_from_sparql_key(binding[name]) if name in binding.keys() else None)

    return pd.DataFrame(data=values_by_column)

def unique_metric(column: pd.Series):
    """Return the ratio of distinct values to total entries in ``column``."""
    distinct_count = len(column.unique())
    total_count = len(column)
    return distinct_count / total_count

def is_entity_column(column: pd.Series):
    """Tell whether ``column`` holds Wikidata entity URIs.

    The first element is inspected positionally (``.iloc[0]``) so the check
    does not depend on the Series having an index label ``0``.

    Args:
        column: Series of values to inspect.

    Returns:
        False when the column is empty or its first element is not a string;
        otherwise whether every entry, lower-cased, starts with
        "http://www.wikidata.org/entity/".
    """
    if column.empty:
        return False

    if not isinstance(column.iloc[0], str):
        return False

    return all(column.str.lower().str.startswith("http://www.wikidata.org/entity/"))

def find_id_column(response_df):
    """Pick the column of ``response_df`` that most likely identifies its rows.

    Selection steps, stopping as soon as a single candidate remains:
      1. A DataFrame with one column returns that column.
      2. Keep the columns with the highest unique_metric score.
      3. Among those, prefer names that start or end with "id" (case-insensitive).
      4. Among those, prefer columns accepted by is_entity_column.
    If several candidates are still left, the first one is returned.

    Args:
        response_df: DataFrame built from a SPARQL response.

    Returns:
        The name of the chosen column, or None when ``response_df`` is empty.

    Raises:
        TypeError: If ``response_df`` is not a pandas DataFrame.
    """
    if not isinstance(response_df, pd.DataFrame):
        raise TypeError("response_df must be a pandas DataFrame.")

    if response_df.empty:
        return None

    all_columns = response_df.columns
    if len(all_columns) == 1:
        return all_columns[0]

    # Score every column, then keep those tied for the best score (in column order).
    scored_columns = [(name, unique_metric(response_df[name])) for name in all_columns]
    best_score = max(score for _, score in scored_columns)
    candidates = [name for name, score in scored_columns if score == best_score]
    if len(candidates) == 1:
        return candidates[0]

    # Tie-break 1: names that look like identifiers.
    id_named = [
        name for name in candidates
        if name.lower().startswith('id') or name.lower().endswith('id')
    ]
    if id_named:
        candidates = id_named
    if len(candidates) == 1:
        return candidates[0]

    # Tie-break 2: columns whose values are entity URIs.
    entity_like = [name for name in candidates if is_entity_column(response_df[name])]
    if entity_like:
        candidates = entity_like

    return candidates[0]

# def get_id_column_values
    

In [3]:
# Notebook stand-in for command-line arguments; the next cell copies these
# entries onto an argparse.Namespace.
arguments = dict(
    dataset="../datasets/Mistral-7B-Instruct-v0.2_rv16-ld0-bs1-p0-nta0-e3-basic_engpeft-t0.2-topp0.95_executed.parquet.gzip",  # executed predictions to evaluate
    preprocess_gold="../datasets/preprocessed_gold.json",  # JSON file holding the gold evaluation data
    model="Mistral-7B-Instruct-v0.2",  # reported as model_name in the summary
    output=".",
    save_name="test",
    log_level="warning",  # name of a level defined by the logging module
    log_file="",  # empty string means no log file is passed to logging
)

In [4]:
# Expose the configuration dictionary as attributes, as argparse would.
args = argparse.Namespace(**arguments)
print(args)

# Translate the textual log level into the numeric constant used by logging.
numeric_log_level = getattr(logging, args.log_level.upper(), None)
if not isinstance(numeric_log_level, int):
    raise ValueError(f"Invalid log level: {args.log_level}.")
# An empty log_file becomes None, i.e. no filename is given to basicConfig.
logging.basicConfig(filename=args.log_file or None, level=numeric_log_level)

# Fail early when an input file is missing.
if not os.path.exists(args.dataset):
    raise FileNotFoundError(f"The dataset file not found with path: {args.dataset}")

if args.preprocess_gold is not None and not os.path.exists(args.preprocess_gold):
    raise FileNotFoundError(f"The preprocess gold dataset file not found with path: {args.preprocess_gold}")

# Fetch the WordNet corpus (presumably required by corpus_meteor; confirm).
# As the last expression of the cell, the call's return value is displayed.
nltk.download('wordnet', quiet=True)



True

In [5]:
def _predicted_id_column(frame):
    """Return the column of ``frame`` selected by find_id_column, or None if none is found."""
    id_name = find_id_column(frame)
    return frame[id_name] if id_name is not None else None


# Load the executed predictions and split them by execution outcome.
df = load_dataset(args.dataset)
# Generation failures are currently kept: the drop below is disabled.
df_no_gen_fail = df  # df.drop(failed_generation_index(df))
df_exec_timeout = df_no_gen_fail.loc[df_no_gen_fail['execution'] == 'timeout']
# na=False: a missing 'execution' value would otherwise produce NaN in the mask
# and make .loc raise; those rows are collected separately in df_exec_empty.
df_exec_fail = df_no_gen_fail.loc[df_no_gen_fail['execution'].str.startswith('exception', na=False)]
df_exec_empty = df_no_gen_fail.loc[df_no_gen_fail['execution'].isnull()]
# Whatever is neither a timeout, an exception nor empty can be evaluated.
df_exec_to_eval = df_no_gen_fail.drop(df_exec_timeout.index).drop(df_exec_fail.index).drop(df_exec_empty.index)

df_eval = eval_dataset(df_exec_to_eval)
df_eval['get_nested_values'] = df_eval.apply(lambda x: get_nested_values(x['eval']), axis=1)
df_eval['eval_df'] = df_eval.apply(lambda x: make_dataframe_from_sparql_response(x['eval']), axis=1)
# find_id_column is evaluated once per row inside the helper.
df_eval['id_column'] = df_eval.apply(lambda x: _predicted_id_column(x['eval_df']), axis=1)

In [6]:
def _gold_id_column(frame):
    """Return the column of ``frame`` selected by find_id_column, or None if none is found."""
    id_name = find_id_column(frame)
    return frame[id_name] if id_name is not None else None


# Reset first so a failed load cannot leave a value from an earlier run behind.
df_gold_eval = None
# if args.gold != None:
#     df_gold = load_dataset(args.gold)
#     df_gold_exec_timeout = df_gold.loc[df_gold['execution'] == 'timeout']
#     df_gold_exec_fail = df_gold.loc[df_gold['execution'].str.startswith('exception')]
#     df_gold_exec_empty = df_gold.loc[df_gold['execution'].isnull()]
#     df_gold_exec_to_eval = df_gold.drop(df_gold_exec_timeout.index).drop(df_gold_exec_fail.index).drop(df_gold_exec_empty.index)
#     df_gold_eval = eval_dataset(df_gold_exec_to_eval, "gold_eval")
#     df_gold_eval['gold_get_nested_values'] = df_gold_eval.apply(lambda x: get_nested_values(x['gold_eval']), axis=1)
# else:
# Explicit encoding so the file is read the same way on every platform.
with open(args.preprocess_gold, "r", encoding="utf-8") as f:
    data = json.load(f)
# Passing literal JSON text to read_json is deprecated in recent pandas; wrap it
# in a file-like object instead.
# NOTE(review): assumes data['df_gold_eval'] is serialized JSON text, not a path; confirm.
df_gold_eval = pd.read_json(StringIO(data['df_gold_eval']))
df_gold_eval['gold_eval_df'] = df_gold_eval.apply(lambda x: make_dataframe_from_sparql_response(x['gold_eval']), axis=1)
# find_id_column is evaluated once per row inside the helper.
df_gold_eval['gold_id_column'] = df_gold_eval.apply(lambda x: _gold_id_column(x['gold_eval_df']), axis=1)

In [7]:
# Work on a copy so the gold columns are not added to df_eval itself.
df_merged_eval = df_eval.copy()

# Merging manually
# Each gold column is looked up row by row through safe_loc.
# NOTE(review): `default` is presumably the value used when no gold row matches; confirm in data_utils.safe_loc.
df_merged_eval["gold_eval"] = df_merged_eval.apply(lambda x: safe_loc(x, df_gold_eval, "gold_eval", default=None), axis=1)
df_merged_eval["gold_get_nested_values"] = df_merged_eval.apply(lambda x: safe_loc(x, df_gold_eval, "gold_get_nested_values", default=[]), axis=1)
# The default is an empty DataFrame instance (the DataFrame class itself was passed
# before), matching the other defaults, which are all values.
df_merged_eval["gold_eval_df"] = df_merged_eval.apply(lambda x: safe_loc(x, df_gold_eval, "gold_eval_df", default=pd.DataFrame()), axis=1)
df_merged_eval["gold_id_column"] = df_merged_eval.apply(lambda x: series_or_dataframe_to_list(pd.DataFrame(data=safe_loc(x, df_gold_eval, "gold_id_column", default=None, as_serie=True))), axis=1)

In [8]:
# Computing metrics
# Every metric is computed per row twice: on the flattened result values and on
# the values of the detected id column.
metric_specs = [
    ("precision", "id_precision", compute_precision),
    ("recall", "id_recall", compute_recall),
    ("average_precision", "id_average_precision", lambda predicted, gold: average_precision(predicted, gold, k_max=100000)),
]

# Metrics over all returned values.
for value_metric_name, _, metric in metric_specs:
    df_merged_eval[value_metric_name] = df_merged_eval.apply(
        lambda x: metric(x['get_nested_values'], x['gold_get_nested_values']),
        axis=1,
    )

# Metrics restricted to the id column of prediction and gold.
for _, id_metric_name, metric in metric_specs:
    df_merged_eval[id_metric_name] = df_merged_eval.apply(
        lambda x: metric(
            series_or_dataframe_to_list(x['id_column']),
            series_or_dataframe_to_list(x['gold_id_column']),
        ),
        axis=1,
    )

In [9]:
def _f_score(precision, recall):
    """Harmonic mean of precision and recall; 0.0 when both are zero (avoids 0/0)."""
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2 * precision * recall / denominator


# The F-scores are derived from the mean precision and mean recall over all rows.
m_precision = df_merged_eval['precision'].mean()
m_recall = df_merged_eval['recall'].mean()
m_fscore = _f_score(m_precision, m_recall)

id_precision = df_merged_eval['id_precision'].mean()
id_recall = df_merged_eval['id_recall'].mean()
id_fscore = _f_score(id_precision, id_recall)

mean_average_precision = df_merged_eval['average_precision'].mean()
mean_id_average_precision = df_merged_eval['id_average_precision'].mean()

In [10]:
# BLEU: each target template is the single reference for its translated prompt,
# both split on whitespace.
reference_tokens = [[x.split()] for x in df_no_gen_fail['target_template']]
hypothesis_tokens = [x.split() for x in df_no_gen_fail['translated_prompt']]
bleu_score = corpus_bleu(reference_tokens, hypothesis_tokens)

meteor_score = corpus_meteor(df_no_gen_fail['target_template'], df_no_gen_fail['translated_prompt'])

# Share of rows whose translated prompt passes is_correct_SPARQL_query.
syntax_flags = df_no_gen_fail.apply(lambda x: is_correct_SPARQL_query(x['translated_prompt']), axis=1)
correct_syntax = sum(int(flag) for _index, flag in syntax_flags.items()) / len(df_no_gen_fail)

In [11]:
# Collect the row counts and aggregated scores of this run into one Series.
summary_values = {
    "model_name": args.model,
    # Row counts at each stage of the pipeline.
    "num_rows": len(df),
    "num_gen_fail": len(df.loc[df['has_error'] == True]),
    "num_exec_timeout": len(df_exec_timeout),
    "num_exec_fail": len(df_exec_fail),
    "num_exec_empty": len(df_exec_empty),
    "num_exec_to_eval": len(df_exec_to_eval),
    "num_eval": len(df_eval),
    "num_eval_empty": len(df_eval.loc[df_eval['eval'].map(len) == 0]),
    # Text-level scores.
    "bleu_score": bleu_score,
    "meteor_score": meteor_score,
    # Scores over all result values.
    "precision": m_precision,
    "recall": m_recall,
    "f1score": m_fscore,
    "mean_average_precision": mean_average_precision,
    # Scores over the id column only.
    "id_precision": id_precision,
    "id_recall": id_recall,
    "id_f1score": id_fscore,
    "mean_id_average_precision": mean_id_average_precision,
    "correct_syntax": correct_syntax,
}
serie = pd.Series(data=summary_values)

In [12]:
# Display the summary Series built in the previous cell.
serie

model_name                   Mistral-7B-Instruct-v0.2
num_rows                                          426
num_gen_fail                                       69
num_exec_timeout                                   22
num_exec_fail                                      91
num_exec_empty                                      0
num_exec_to_eval                                  313
num_eval                                          313
num_eval_empty                                    294
bleu_score                                   0.009727
meteor_score                                 0.147948
precision                                    0.651757
recall                                       0.651757
f1score                                      0.651757
mean_average_precision                            0.0
id_precision                                 0.651757
id_recall                                    0.651757
id_f1score                                   0.651757
mean_id_average_precision   