In [1]:
import argparse
import io
import json
import logging
import os

import nltk
# %load_ext cudf.pandas
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu

from evaluation_utils import failed_generation_index, eval_dataset, get_nested_values, load_dataset, safe_loc, compute_precision, compute_recall, corpus_meteor, average_precision

In [2]:
# Notebook configuration, mirroring what a command-line invocation would pass.
arguments = dict(
    # Executed-generation dataset; checked for existence and loaded below.
    dataset="./Mistral-7B-Instruct-v0.2_rv4-ld0-bs2-p0-nta0-e1_engpeft-pipbasic-t0.2-topp0.95_executed.parquet.gzip",
    # JSON file holding the pre-evaluated gold results.
    preprocess_gold="./preprocessed_gold.json",
    # Reported as "model_name" in the summary Series.
    model="Mistral-7B-Instruct-v0.2",
    # Not read by the cells in this notebook section.
    output=".",
    save_name="test",
    # Logging setup: level name and optional log file ("" disables the file).
    log_level="warning",
    log_file="",
)

In [3]:
# Wrap the configuration dict in a Namespace so the remaining cells can use
# attribute access (args.dataset, args.model, ...), as with parsed CLI arguments.
args = argparse.Namespace(**arguments)
print(args)

# Translate the textual level (e.g. "warning") into logging's numeric constant;
# anything that does not resolve to an int is rejected.
numeric_log_level = getattr(logging, args.log_level.upper(), None)
if not isinstance(numeric_log_level, int):
    raise ValueError(f"Invalid log level: {args.log_level}.")
# An empty log_file is turned into None so that no log file is configured.
logging.basicConfig(filename=args.log_file or None, level=numeric_log_level)

# Fail early, with a clear message, when an input file is missing.
if not os.path.exists(args.dataset):
    raise FileNotFoundError(f"The dataset file not found with path: {args.dataset}")

if args.preprocess_gold is not None and not os.path.exists(args.preprocess_gold):
    raise FileNotFoundError(f"The preprocess gold dataset file not found with path: {args.preprocess_gold}")

# Fetch the WordNet corpus (presumably required by corpus_meteor -- TODO confirm).
nltk.download('wordnet', quiet=True)



True

In [4]:
# Load the executed generations and split them by execution outcome.
df = load_dataset(args.dataset)
# The generation-failure filter is currently disabled, so this is the full frame.
df_no_gen_fail = df # df.drop(failed_generation_index(df))
df_exec_timeout = df_no_gen_fail.loc[df_no_gen_fail['execution'] == 'timeout']
# na=False: a null 'execution' would otherwise yield NaN in the mask, and .loc
# refuses boolean masks containing NaN. Null rows are collected separately below.
df_exec_fail = df_no_gen_fail.loc[df_no_gen_fail['execution'].str.startswith('exception', na=False)]
df_exec_empty = df_no_gen_fail.loc[df_no_gen_fail['execution'].isnull()]
# Whatever is neither a timeout, an exception nor empty is passed on to evaluation.
df_exec_to_eval = df_no_gen_fail.drop(df_exec_timeout.index).drop(df_exec_fail.index).drop(df_exec_empty.index)
df_eval = eval_dataset(df_exec_to_eval)
# Element-wise map over the 'eval' column; unlike a row-wise apply this always
# returns a Series, including when df_eval has no rows.
df_eval['get_nested_values'] = df_eval['eval'].map(get_nested_values)

In [5]:
# Gold results come from the preprocessed JSON file only. A previous variant
# that loaded and evaluated a raw gold dataset through an `args.gold` option
# was disabled (commented out) and has been dropped here; `arguments` defines
# no such option.
with open(args.preprocess_gold, "r", encoding="utf-8") as f:
    data = json.load(f)
# data['df_gold_eval'] is presumably a JSON-serialised DataFrame stored as a
# string (TODO confirm against the preprocessing script). Literal JSON strings
# passed directly to read_json are deprecated, hence the StringIO wrapper.
df_gold_eval = pd.read_json(io.StringIO(data['df_gold_eval']))

In [6]:
# Show the gold evaluation frame rebuilt from the preprocessed JSON file.
df_gold_eval

Unnamed: 0,input,target_template,target_raw,execution,executed_query,gold_eval,gold_get_nested_values
2419,"[""\""Can you find me all the mayors who are any...",SELECT ?image ?speciesLabel ?mayorLabel ?place...,SELECT ?image ?speciesLabel ?mayorLabel ?place...,"[{'speciesLabel': {'xml:lang': 'en', 'type': '...",SELECT ?image ?speciesLabel ?mayorLabel ?place...,"[{'speciesLabel': {'xml:lang': 'en', 'type': '...","[dog, Bosco the dog, Sunol, dog, Duke the Dog,..."
2421,"[""\""Can you retrieve a map of the highest poin...",SELECT DISTINCT ?item ?itemLabel ?highestPoint...,SELECT DISTINCT ?item ?itemLabel ?highestPoint...,"[{'item': {'type': 'uri', 'value': 'http://www...",SELECT DISTINCT ?item ?itemLabel ?highestPoint...,"[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q107356467, ht..."
2423,"[""\""Can you write a SparQL query to generate a...",SELECT ?a $TIMES ?b $EQUALS ?ab WITH {\n# equi...,SELECT ?a $TIMES ?b $EQUALS ?ab WITH {\n# equi...,"[{'TIMES': {'type': 'literal', 'value': 'TIMES...",SELECT ?a $TIMES ?b $EQUALS ?ab WITH {\n# equi...,"[{'TIMES': {'type': 'literal', 'value': 'TIMES...","[TIMES, EQUALS, 1, 1, 1, TIMES, EQUALS, 2, 1, ..."
2425,"[""\""Can you retrieve a list of Ashmolean items...",SELECT ?item ?itemLabel ?inventory (GROUP_CONC...,SELECT ?item ?itemLabel ?inventory (GROUP_CONC...,"[{'item': {'type': 'uri', 'value': 'http://www...",SELECT ?item ?itemLabel ?inventory (GROUP_CONC...,"[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q50819525, Alb..."
2426,"[""\""Can you find all GLAMs with open access po...",SELECT ?item ?itemLabel ?coordinate_location ?...,SELECT ?item ?itemLabel ?coordinate_location ?...,"[{'item': {'type': 'uri', 'value': 'http://www...",SELECT ?item ?itemLabel ?coordinate_location ?...,"[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q422, Point(1...."
...,...,...,...,...,...,...,...
2840,"[""\""Can you retrieve a list of movies directed...",SELECT ?film ?filmLabel ?seriesLabel ?duration...,SELECT ?film ?filmLabel ?seriesLabel ?duration...,"[{'film': {'type': 'uri', 'value': 'http://www...",SELECT ?film ?filmLabel ?seriesLabel ?duration...,"[{'film': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q2085, Twin Pe..."
2841,"[""\""Can you find the rail link between Narvik,...",SELECT ?station ?stationLabel ?cds ?line ?laye...,SELECT ?station ?stationLabel ?cds ?line ?laye...,"[{'station': {'type': 'uri', 'value': 'http://...",SELECT ?station ?stationLabel ?cds ?line ?laye...,"[{'station': {'type': 'uri', 'value': 'http://...","[http://www.wikidata.org/entity/Q98908332, Q98..."
2842,"[""\""Can you retrieve a list of fictional sword...",SELECT ?item ?itemLabel ?ownerLabel ?workLabel...,SELECT ?item ?itemLabel ?ownerLabel ?workLabel...,"[{'item': {'type': 'uri', 'value': 'http://www...",SELECT ?item ?itemLabel ?ownerLabel ?workLabel...,"[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q118836476, Q1..."
2843,"[""\""Can you write a SparQL query to count the ...",SELECT ?partof (COUNT(*) as ?count)\n{\n?item ...,SELECT ?partof (COUNT(*) as ?count)\n{\n?item ...,"[{'partof': {'type': 'uri', 'value': 'https://...",SELECT ?partof (COUNT(*) as ?count)\n{\n?item ...,"[{'partof': {'type': 'uri', 'value': 'https://...","[https://kk.wikipedia.org/, 41942, https://lt...."


In [7]:
def _gold_column(frame, column, make_default):
    """Look up `column` of df_gold_eval for every row of `frame` via safe_loc.

    `make_default` is called once per row so each lookup receives its own
    fallback object (e.g. a fresh list) rather than a shared one.
    """
    return frame.apply(
        lambda row: safe_loc(row, df_gold_eval, column, default=make_default()),
        axis=1,
    )


# Merging manually: start from a copy of the model evaluation and attach the
# gold columns row by row.
df_merged_eval = df_eval.copy()
df_merged_eval["gold_eval"] = _gold_column(df_merged_eval, "gold_eval", lambda: None)
df_merged_eval["gold_get_nested_values"] = _gold_column(df_merged_eval, "gold_get_nested_values", list)



In [8]:
# Computing metrics
# Per-row precision and recall of the generated query's values against the gold
# values. The two columns are iterated directly instead of using a row-wise
# DataFrame.apply: on an empty frame apply(axis=1) can return a DataFrame
# rather than a Series, which makes the column assignment fail.
df_merged_eval["precision"] = [
    compute_precision(predicted, gold)
    for predicted, gold in zip(df_merged_eval['get_nested_values'], df_merged_eval['gold_get_nested_values'])
]
df_merged_eval["recall"] = [
    compute_recall(predicted, gold)
    for predicted, gold in zip(df_merged_eval['get_nested_values'], df_merged_eval['gold_get_nested_values'])
]

In [9]:
# Per-row average precision with k_max=10000. Iterating over the two columns
# directly (rather than a row-wise DataFrame.apply) keeps the assignment valid
# when the frame is empty, where apply(axis=1) can return a DataFrame.
df_merged_eval["average_precision"] = [
    average_precision(predicted, gold, k_max=10000)
    for predicted, gold in zip(df_merged_eval['get_nested_values'], df_merged_eval['gold_get_nested_values'])
]

In [10]:
# Display the merged frame: model evaluation rows with the gold values and per-row metrics attached.
df_merged_eval

Unnamed: 0,row,last_executed_step,to_be_executed_step,translated_prompt,status,has_error,input,target_template,target_raw,execution,executed_query,eval,get_nested_values,gold_eval,gold_get_nested_values,precision,recall,average_precision
2418,"""Can you find me all the dead authors who have...",LLMTranslator,,SELECT ?item ?itemLabel WHERE {\n ?item w...,,False,"""Can you find me all the dead authors who have...",SELECT DISTINCT ?person ?personLabel ?personDe...,SELECT DISTINCT ?person ?personLabel ?personDe...,"[{'item': {'type': 'uri', 'value': 'http://www...",SELECT ?item ?itemLabel WHERE {\n ?item w...,"[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q23, George Wa...",,[],0.0,0.0,0.0
2419,"""Can you find me all the mayors who are any ki...",LLMTranslator,,SELECT ?item ?itemLabel WHERE {\n ?item w...,,False,"""Can you find me all the mayors who are any ki...",SELECT ?image ?speciesLabel ?mayorLabel ?place...,SELECT ?image ?speciesLabel ?mayorLabel ?place...,"[{'item': {'type': 'uri', 'value': 'http://www...",SELECT ?item ?itemLabel WHERE {\n ?item w...,"[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q23, George Wa...","[{'speciesLabel': {'xml:lang': 'en', 'type': '...","[dog, Bosco the dog, Sunol, dog, Duke the Dog,...",0.0,0.0,0.0
2420,"""Can you retrieve a list of distinct instituti...",LLMTranslator,,SELECT ?item ?itemLabel WHERE {\n ?item w...,,False,"""Can you retrieve a list of distinct instituti...",select distinct ?item ?itemLabel (sample(?logo...,select distinct ?item ?itemLabel (sample(?logo...,"[{'item': {'type': 'uri', 'value': 'http://www...",SELECT ?item ?itemLabel WHERE {\n ?item w...,"[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q23, George Wa...",,[],0.0,0.0,0.0
2421,"""Can you retrieve a map of the highest points ...",LLMTranslator,,SELECT ?item ?itemLabel WHERE {\n ?item w...,,False,"""Can you retrieve a map of the highest points ...",SELECT DISTINCT ?item ?itemLabel ?highestPoint...,SELECT DISTINCT ?item ?itemLabel ?highestPoint...,"[{'item': {'type': 'uri', 'value': 'http://www...",SELECT ?item ?itemLabel WHERE {\n ?item w...,"[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q23, George Wa...","[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q107356467, ht...",0.0,0.0,0.0
2422,"""Can you retrieve information about long-runni...",LLMTranslator,,SELECT ?item ?itemLabel WHERE {\n ?item w...,,False,"""Can you retrieve information about long-runni...",SELECT ?family ?familyLabel (MAX(?age) AS ?age...,SELECT ?family ?familyLabel (MAX(?age) AS ?age...,"[{'item': {'type': 'uri', 'value': 'http://www...",SELECT ?item ?itemLabel WHERE {\n ?item w...,"[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q23, George Wa...",,[],0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2840,"""Can you retrieve a list of movies directed by...",LLMTranslator,,SELECT ?item ?itemLabel WHERE {\n ?item w...,,False,"""Can you retrieve a list of movies directed by...",SELECT ?film ?filmLabel ?seriesLabel ?duration...,SELECT ?film ?filmLabel ?seriesLabel ?duration...,"[{'item': {'type': 'uri', 'value': 'http://www...",SELECT ?item ?itemLabel WHERE {\n ?item w...,"[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q23, George Wa...","[{'film': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q2085, Twin Pe...",0.0,0.0,0.0
2841,"""Can you find the rail link between Narvik, No...",LLMTranslator,,SELECT ?item ?itemLabel WHERE {\n ?item w...,,False,"""Can you find the rail link between Narvik, No...",SELECT ?station ?stationLabel ?cds ?line ?laye...,SELECT ?station ?stationLabel ?cds ?line ?laye...,"[{'item': {'type': 'uri', 'value': 'http://www...",SELECT ?item ?itemLabel WHERE {\n ?item w...,"[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q23, George Wa...","[{'station': {'type': 'uri', 'value': 'http://...","[http://www.wikidata.org/entity/Q98908332, Q98...",0.0,0.0,0.0
2842,"""Can you retrieve a list of fictional swords, ...",LLMTranslator,,SELECT ?item ?itemLabel WHERE {\n ?item w...,,False,"""Can you retrieve a list of fictional swords, ...",SELECT ?item ?itemLabel ?ownerLabel ?workLabel...,SELECT ?item ?itemLabel ?ownerLabel ?workLabel...,"[{'item': {'type': 'uri', 'value': 'http://www...",SELECT ?item ?itemLabel WHERE {\n ?item w...,"[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q23, George Wa...","[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q118836476, Q1...",0.0,0.0,0.0
2843,"""Can you write a SparQL query to count the num...",LLMTranslator,,SELECT ?item ?itemLabel WHERE {\n ?item w...,,False,"""Can you write a SparQL query to count the num...",SELECT ?partof (COUNT(*) as ?count)\n{\n?item ...,SELECT ?partof (COUNT(*) as ?count)\n{\n?item ...,"[{'item': {'type': 'uri', 'value': 'http://www...",SELECT ?item ?itemLabel WHERE {\n ?item w...,"[{'item': {'type': 'uri', 'value': 'http://www...","[http://www.wikidata.org/entity/Q23, George Wa...","[{'partof': {'type': 'uri', 'value': 'https://...","[https://kk.wikipedia.org/, 41942, https://lt....",0.0,0.0,0.0


In [11]:
# Spot check: recompute average precision for the first merged row directly
# from its raw 'eval' and 'gold_eval' payloads, with k_max=1.
y = df_merged_eval.iloc[0]
average_precision(get_nested_values(y['eval']), get_nested_values(y['gold_eval']), k_max=1)

0.0

In [12]:
# NOTE(review): this overwrites the "average_precision" column that an earlier
# cell computed with k_max=10000, this time using average_precision's default
# k_max. TODO confirm which cutoff is intended and keep a single computation.
df_merged_eval["average_precision"] = df_merged_eval.apply(lambda x: average_precision(x['get_nested_values'], x['gold_get_nested_values']), axis=1)

In [25]:
# Inspect the natural-language input of the row at position 6.
# NOTE(review): this cell carries execution count 25 while its neighbours are
# 12 and 14, i.e. it was run out of order; re-run the notebook top to bottom.
df_no_gen_fail.iloc[6]['input']

'"Can you find me all the movies on Wikidata that have a German title that implies someone or something doesn\'t answer?"'

In [14]:
# Print the 'translated_prompt' text at position 10; print() is used so the
# line breaks inside the string are rendered instead of shown as "\n".
print(df_no_gen_fail['translated_prompt'].iloc[10])

SELECT ?item ?itemLabel WHERE {
      ?item wdt:P31 wd:Q5.  # P31 is "instance of", Q5 is "human"
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
    }


In [15]:
# Mean of the per-row precision / recall over all evaluated rows.
m_precision = df_merged_eval['precision'].mean()
m_recall = df_merged_eval['recall'].mean()
# F1 as the harmonic mean of the two means. When both are exactly 0 the formula
# is 0/0 (NaN), so F1 is defined as 0.0 in that case. NaN inputs (e.g. no
# evaluated rows) still propagate as NaN.
m_fscore = 0.0 if (m_precision + m_recall) == 0 else 2*m_precision*m_recall/(m_precision+m_recall)

# Text-level similarity between the reference query and the generated one,
# computed on whitespace tokens. corpus_bleu receives one single-element list
# of references per hypothesis.
bleu_score = corpus_bleu([[x.split()] for x in df_no_gen_fail['target_template']], [x.split() for x in df_no_gen_fail['translated_prompt']])
meteor_score = corpus_meteor(df_no_gen_fail['target_template'], df_no_gen_fail['translated_prompt'])



In [16]:
# One-record summary of the run: row counts per pipeline stage followed by the
# aggregate text and result metrics.
serie = pd.Series({
    "model_name": args.model,
    # Row counts at each stage of the pipeline.
    "num_rows": df.shape[0],
    "num_gen_fail": int((df['has_error'] == True).sum()),
    "num_exec_timeout": df_exec_timeout.shape[0],
    "num_exec_fail": df_exec_fail.shape[0],
    "num_exec_empty": df_exec_empty.shape[0],
    "num_exec_to_eval": df_exec_to_eval.shape[0],
    "num_eval": df_eval.shape[0],
    # Evaluated rows whose 'eval' entry has length zero.
    "num_eval_empty": int((df_eval['eval'].map(len) == 0).sum()),
    # Aggregate scores computed in the previous cell.
    "bleu_score": bleu_score,
    "meteor_score": meteor_score,
    "precision": m_precision,
    "recall": m_recall,
    "f1score": m_fscore,
})

In [17]:
# Display the summary Series of run statistics and metrics.
serie

model_name          Mistral-7B-Instruct-v0.2
num_rows                                 427
num_gen_fail                               0
num_exec_timeout                           0
num_exec_fail                              0
num_exec_empty                             0
num_exec_to_eval                         427
num_eval                                 427
num_eval_empty                             0
bleu_score                          0.001934
meteor_score                        0.104065
precision                           0.001756
recall                               0.00084
f1score                             0.001136
dtype: object