In [31]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import single_meteor_score
from typing import List, Dict, Union
import argparse
import logging
import os
import pandas as pd
from ast import literal_eval
import nltk

def failed_generation_index(dataset: pd.DataFrame):
    return dataset.loc[dataset['has_error'] == True].index

def corpus_meteor(references: List, hypotheses: List):
    meteor_scores = 0.
    for ref, hyp in zip(references, hypotheses):
        meteor_scores += single_meteor_score(ref.split(), hyp.split())
    return meteor_scores / float(len(references))

def safe_eval(execution: str):
    """Evaluates """
    try:
        return literal_eval(execution)
    except Exception as inst:
        logging.error(f"Exception occured while evaluating: {inst}.")
        print(f"Exception occured while evaluating: {inst}.")
        return None

def eval_dataset(dataset: pd.DataFrame, col_name: str = "eval"):
    df_eval = dataset.copy()
    df_eval[col_name] = df_eval.apply(lambda x: safe_eval(x['execution']), axis=1)
    return df_eval[~df_eval[col_name].isnull()]

def get_nested_values(element: Union[Dict, str, None]):
    values = []
    if isinstance(element, dict):
        for k, v in element.items():
            if isinstance(v, dict):
                values += get_nested_values(v)
            elif isinstance(v, str):
                if 'value' in k:
                    values.append(v)
    elif isinstance(element, list):
        for el in element:
            values += get_nested_values(el)
    elif element is None:
        values = []
    else:
        logging.error(f"get_nested_values doesn't have an implementation for: {type(element)}.")
        raise TypeError(f"Compatible types are Dict and List, found: {type(element)}.")
    return values

def compute_precision(hypothesis: List, gold: List):
    shypothesis = set(hypothesis)
    sgold = set(gold)
    
    if len(shypothesis) == 0:
        return 1. if len(sgold) == 0 else 0.
    
    relevant = shypothesis.intersection(sgold)
    return len(relevant)/len(shypothesis)

def compute_recall(hypothesis: List, gold: List):
    shypothesis = set(hypothesis)
    sgold = set(gold)
    
    if len(sgold) == 0:
        return 1. if len(shypothesis) == 0 else 0.
    
    relevant = shypothesis.intersection(sgold)
    return len(relevant)/len(sgold)

def load_dataset(path: str):
    if path.endswith(('.parquet', '.parquet.gzip')):
        return pd.read_parquet(path, engine='auto')
    elif path.endswith('.json'):
        return pd.read_json(path)
    
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
arguments = {
    "dataset": "../outputs/batch_run/test2/execution/Mistral-7B-Instruct-v0.2_rv16-bs2-p0_t0.2-topp0.95_executed.parquet.gzip",
    "gold": "../datasets/queries_with_execution_results_with_limit.parquet.gzip",
    "model": "Mistral-7B-Instruct-v0.2",
    "output": ".",
    "save_name": "test",
    "log_level": "warning",
    "log_file": "",
}

In [3]:
args = argparse.Namespace()
args.__dict__.update(arguments)
print(args)

numeric_log_level = getattr(logging, args.log_level.upper(), None)
if not isinstance(numeric_log_level, int):
    raise ValueError(f"Invalid log level: {args.log_level}.")
logging.basicConfig(filename=args.log_file if args.log_file else None, level=numeric_log_level)

if not os.path.exists(args.dataset):
    raise FileNotFoundError(f"The dataset file not found with path: {args.dataset}")

if not os.path.exists(args.gold):
    raise FileNotFoundError(f"The gold dataset file not found with path: {args.gold}")



In [4]:
df = load_dataset(args.dataset)
df_gold = load_dataset(args.gold)

In [5]:
df_no_gen_fail = df.drop(failed_generation_index(df))
df_exec_timeout = df_no_gen_fail.loc[df_no_gen_fail['execution'] == 'timeout']
df_exec_fail = df_no_gen_fail.loc[df_no_gen_fail['execution'].str.startswith('exception')]
df_exec_empty = df_no_gen_fail.loc[df_no_gen_fail['execution'].isnull()]
df_exec_to_eval = df_no_gen_fail.drop(df_exec_timeout.index).drop(df_exec_fail.index).drop(df_exec_empty.index)
df_eval = eval_dataset(df_exec_to_eval)

In [6]:
df_gold_exec_timeout = df_gold.loc[df_gold['execution'] == 'timeout']
df_gold_exec_fail = df_gold.loc[df_gold['execution'].str.startswith('exception')]
df_gold_exec_empty = df_gold.loc[df_gold['execution'].isnull()]
df_gold_exec_to_eval = df_gold.drop(df_gold_exec_timeout.index).drop(df_gold_exec_fail.index).drop(df_gold_exec_empty.index)
df_gold_eval = eval_dataset(df_gold_exec_to_eval, "gold_eval")

In [7]:
df_eval['row']

2421    "Can you find out where the authors of Wikimam...
2429    "Can you retrieve all Wikipedia sites using Sp...
2431    "Can you find me all the historical buildings ...
2432    "Can you retrieve a list of alumni from the Un...
2435    "Can you retrieve the distinct names of Willem...
                              ...                        
2811    "Can you find out where people born in Brittan...
2814    "Can you retrieve all properties used to descr...
2819    "Can you write a SparQL query to find all item...
2822    "Can you write a SparQL query to find scholarl...
2836    "Can you find me all the groups of fictional c...
Name: row, Length: 87, dtype: object

In [56]:
print(df_exec_to_eval['translated_prompt'].iloc[0])

SELECT ?author ?authorLabel ?birthPlaceLabel WHERE {
  ?author wdt:P31 wd:Q1145.  # Q1145 is "person"
  ?author wdt:P19 wd:Q13563.  # Q13563 is "author"
  ?author wdt:P569 ?birthPlace.  # P569 is "date of birth"
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". bd:serviceParam wikibase:entityId ?birthPlace . }
}



In [10]:
df_gold_eval[['gold_eval']].loc['2421']

gold_eval    [{'item': {'type': 'uri', 'value': 'http://www...
Name: 2421, dtype: object

In [8]:
df_eval[['eval']].join(df_gold_eval[['gold_eval']], rsuffix="_gold")

Unnamed: 0,eval,gold_eval
2421,[],
2429,[],
2431,[],
2432,[],
2435,[],
...,...,...
2811,[],
2814,[],
2819,[],
2822,[],


In [14]:
def safe_loc(x, df, column, default=None):
    try:
        ans = df[[column]].loc[str(x.name)]
    except:
        ans = default
    return ans

In [15]:
df_eval[['eval']].apply(lambda x: safe_loc(x, df_gold_eval, "gold_eval", None), axis=1)

Unnamed: 0,gold_eval
2421,"[{'item': {'type': 'uri', 'value': 'http://www..."
2429,"[{'consort': {'type': 'uri', 'value': 'http://..."
2431,"[{'item': {'type': 'uri', 'value': 'http://www..."
2432,"[{'item': {'type': 'uri', 'value': 'http://www..."
2435,
...,...
2811,"[{'lighthouse': {'type': 'uri', 'value': 'http..."
2814,"[{'item': {'type': 'uri', 'value': 'http://www..."
2819,"[{'airport': {'type': 'uri', 'value': 'http://..."
2822,"[{'item': {'type': 'uri', 'value': 'http://www..."


In [73]:
df_eval['eval'].loc[df_eval['eval'].map(len) != 0]

2570    [{'state': {'type': 'uri', 'value': 'http://ww...
2581    [{'count': {'datatype': 'http://www.w3.org/200...
2607    [{'numDirectors': {'datatype': 'http://www.w3....
2629    [{'numSpouses': {'datatype': 'http://www.w3.or...
2668    [{'num': {'datatype': 'http://www.w3.org/2001/...
2795    [{'region': {'type': 'uri', 'value': 'http://w...
Name: eval, dtype: object

In [17]:
df_merged_eval = df_eval.copy()
df_merged_eval["gold_eval"] = df_merged_eval.apply(lambda x: safe_loc(x, df_gold_eval, "gold_eval", None), axis=1)

In [19]:
df_merged_eval[['eval', 'gold_eval']]

Unnamed: 0,eval,gold_eval
2421,[],"[{'item': {'type': 'uri', 'value': 'http://www..."
2429,[],"[{'consort': {'type': 'uri', 'value': 'http://..."
2431,[],"[{'item': {'type': 'uri', 'value': 'http://www..."
2432,[],"[{'item': {'type': 'uri', 'value': 'http://www..."
2435,[],
...,...,...
2811,[],"[{'lighthouse': {'type': 'uri', 'value': 'http..."
2814,[],"[{'item': {'type': 'uri', 'value': 'http://www..."
2819,[],"[{'airport': {'type': 'uri', 'value': 'http://..."
2822,[],"[{'item': {'type': 'uri', 'value': 'http://www..."


In [23]:
df_merged_eval['precision'] = df_merged_eval.apply(lambda x: compute_precision(get_nested_values(x['eval']), get_nested_values(x['gold_eval'])), axis=1)

In [22]:
df_merged_eval['recall'] = df_merged_eval.apply(lambda x: compute_recall(get_nested_values(x['eval']), get_nested_values(x['gold_eval'])), axis=1)


In [24]:
df_merged_eval[['precision', 'recall']]

Unnamed: 0,precision,recall
2421,0.0,0.0
2429,0.0,0.0
2431,0.0,0.0
2432,0.0,0.0
2435,1.0,1.0
...,...,...
2811,0.0,0.0
2814,0.0,0.0
2819,0.0,0.0
2822,0.0,0.0


In [25]:
m_precision = df_merged_eval['precision'].mean()
m_recall = df_merged_eval['recall'].mean()
m_fscore = 2*m_precision*m_recall/(m_precision+m_recall)

print(f"{m_precision=}, {m_recall=}, {m_fscore=}")

m_precision=0.09195402298850575, m_recall=0.09195402298850575, m_fscore=0.09195402298850575


In [33]:
bleu_score = corpus_bleu([[x.split()] for x in df_no_gen_fail['target']], [x.split() for x in df_no_gen_fail['translated_prompt']])
meteor_score = corpus_meteor(df_no_gen_fail['target'], df_no_gen_fail['translated_prompt'])

print(f"{bleu_score=}, {meteor_score=}")

bleu_score=0.01493558729594148, meteor_score=0.1299193095224561


In [34]:
serie = pd.Series(data=
                {
                    "model_name": args.model,
                    "num_rows": len(df),
                    "num_gen_fail": len(df.loc[df['has_error'] == True]),
                    "num_exec_timeout": len(df_exec_timeout),
                    "num_exec_fail": len(df_exec_fail),
                    "num_exec_empty": len(df_exec_empty),
                    "num_exec_to_eval": len(df_exec_to_eval),
                    "num_eval": len(df_eval),
                    "bleu_score": bleu_score,
                    "meteor_score": meteor_score,
                    "precision": m_precision,
                    "recall": m_recall,
                    "f1score": m_fscore
                })
serie

model_name          Mistral-7B-Instruct-v0.2
num_rows                                 427
num_gen_fail                             107
num_exec_timeout                           1
num_exec_fail                            232
num_exec_empty                             0
num_exec_to_eval                          87
num_eval                                  87
bleu_score                          0.014936
meteor_score                        0.129919
precision                           0.091954
recall                              0.091954
f1score                             0.091954
dtype: object

81

In [35]:
os.makedirs(args.output, exist_ok=True)
serie.to_json(os.path.join(args.output, f"{args.save_name}.json"))