# Results for RQ1

In [1]:
import os 
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import prettytable

# The project imports below are resolved with '../../' as the working
# directory (presumably the repository root that holds the errorAPI package).
cwd = os.getcwd()
os.chdir('../../')
try:
    import errorAPI
    from errorAPI.dataset import Dataset
finally:
    # Restore the notebook's directory even when the import fails, so a broken
    # import does not leave the kernel resolving relative paths elsewhere.
    os.chdir(cwd)

In [2]:
# Connection URL of the results database. ERROR_DETECTION_DB_URL overrides the
# local default so real credentials never have to be written into the notebook.
sql_string = os.environ.get(
    "ERROR_DETECTION_DB_URL",
    'postgresql://postgres:postgres@localhost:5432/error_detection',
)

# Load the "results" table and keep the last occurrence of every
# (dataset, tool, configuration) combination.
performance_results = pd.read_sql_table("results", create_engine(sql_string)).drop_duplicates(
    ['dataset', 'tool_name', 'tool_configuration'], keep='last'
)

In [3]:
## Config

# Columns that identify one group of runs, the metrics kept for display, and
# the metric whose maximum picks the run shown for each group.
group_by_cols = ["dataset", "tool_name"]
show_cols = ["cell_prec", "cell_rec", "cell_f1", "runtime", "error_text"]
max_col = "cell_f1"

# Bounds on the recorded human effort / accuracy and on the runtime of a run.
max_human_cost = 0
min_human_accuracy = 1
max_human_accuracy = 1
max_runtime = 1900  # NOTE(review): unit is not stated here, presumably seconds - confirm

## Filtered tools
# Only tools named here are reported.
filtered_tools = [
    'ActiveClean',
    'FAHES',
    'ForbiddenItemSets',
    'KATARA',
    'Raha',
    'dBoost',
]

# Connection URL of the database. ERROR_DETECTION_DB_URL overrides the local
# default so real credentials never have to be written into the notebook.
sql_string = os.environ.get(
    "ERROR_DETECTION_DB_URL",
    'postgresql://postgres:postgres@localhost:5432/error_detection',
)

# Read the "datasets" table without its first column and collect the values
# of its "name" column.
dataset_stats = pd.read_sql_table("datasets", create_engine(sql_string)).iloc[:, 1:]
dataset_names = dataset_stats["name"].tolist()

In [4]:
# Keep only the runs inside the configured bounds. Missing values are replaced
# before comparing: by 0 for the upper bounds and by 1 for the lower accuracy
# bound.
within_limits = (
    (performance_results["human_cost"].fillna(0) <= max_human_cost)
    & (performance_results["human_accuracy"].fillna(0) <= max_human_accuracy)
    & (performance_results["human_accuracy"].fillna(1) >= min_human_accuracy)
    & (performance_results["runtime"].fillna(0) <= max_runtime)
)
performance_results = performance_results[within_limits]

# Keep, per group, the rows that reach the group's best value of max_col.
# The aggregation is passed by name ("max"): handing the Python builtin to
# transform() is deprecated in recent pandas releases.
best_per_group = performance_results.groupby(group_by_cols)[max_col].transform("max")
max_idx = best_per_group == performance_results[max_col]
performance_results = performance_results[max_idx]

# Several rows can tie for the best value; collapse them column-wise.
results_df = performance_results.groupby(group_by_cols)[show_cols].min()

# Tools that have results and are part of the configured selection, sorted by
# name. Index level 0 holds the first group_by column, level 1 the second.
tool_names = sorted(
    tool for tool in set(results_df.index.get_level_values(1)) if tool in filtered_tools
)

print("Tools:", tool_names)
print("Datasets:", dataset_names)

# Restrict the result table to the selected tools and the known datasets.
results_df = results_df[results_df.index.get_level_values(1).isin(tool_names)]
results_df = results_df[results_df.index.get_level_values(0).isin(dataset_names)]

Tools: ['FAHES', 'ForbiddenItemSets', 'KATARA', 'Raha', 'dBoost']
Datasets: ['airbnb', 'beers', 'eeg', 'flights', 'hospital', 'marketing', 'movie', 'movies', 'rayyan', 'restaurant', 'restaurants', 'toy', 'university', 'uscensus']


In [5]:
## Columns = datasets

# Build one row per tool and one column per dataset. Every cell is a string
# with the displayed metrics of that (dataset, tool) pair, where the best value
# of a metric within a dataset is wrapped in \textbf{...}. Failed runs show
# their error instead, and pairs without any result stay empty.

# Metrics that are printed as numbers; "runtime" and "error_text" are skipped.
metric_cols = [col for col in show_cols if col not in ("error_text", "runtime")]

data_dict = []

for tool_name in tool_names:
    row = {}
    for dataset_name in dataset_names:
        # Guard only the lookup, so that a KeyError raised anywhere else is
        # not silently turned into an empty cell.
        try:
            values = results_df.loc[(dataset_name, tool_name)]
        except KeyError:
            row[dataset_name] = ""
            continue

        error_text = values["error_text"]
        if error_text != "":
            # Timeout messages are shown verbatim, every other error is
            # collapsed into a generic label.
            row[dataset_name] = error_text if "Timeout" in error_text else "Other error"
            continue

        # All tools' results for this dataset, used to find the best values.
        dataset_results = results_df.loc[dataset_name]

        result_string = ""
        for show_col in metric_cols:
            # Label-based access; positional access on a labelled Series is
            # deprecated in pandas.
            score = values[show_col]
            if dataset_results[show_col].max() == score:
                # Raw string: in a normal literal "\t" would become a TAB and
                # corrupt the LaTeX macro.
                result_string += r"\textbf{" + "{:.2f}".format(score) + "}"
            else:
                result_string += "{:.2f} ".format(score)
            result_string += " "
        row[dataset_name] = result_string
    data_dict.append(row)

output_df_datacols = pd.DataFrame(data_dict, columns = dataset_names, index = tool_names)

In [6]:
# Add a header row (index label "") that names the three numbers of each cell
# and move it to the top of the table.
output_df_datacols_header = output_df_datacols.copy()
first_index = output_df_datacols_header.index
# Raw string: in a normal literal "\t" of "\textbf" would become a TAB and
# "\s" is an invalid escape sequence.
output_df_datacols_header.loc["", :] = r"\textbf{\space\space\space P \space\space\space\space R \space\space\space F1}"
output_df_datacols_header = output_df_datacols_header.reindex([""] + first_index.tolist())
output_df_datacols_header

Unnamed: 0,airbnb,beers,eeg,flights,hospital,marketing,movie,movies,rayyan,restaurant,restaurants,toy,university,uscensus
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
FAHES,\textbf{0.54} 0.01 0.02,\textbf{0.83} 0.02 0.04,0.00 0.00 0.00,0.23 0.01 0.02,0.02 0.09 0.04,0.24 0.01 0.01,0.00 0.00 0.00,0.01 0.10 0.02,0.07 0.04 0.05,0.00 0.00 0.00,\textbf{0.00} 0.07 \textbf{0.01},0.00 0.00 0.00,0.00 0.00 0.00,0.01 0.18 0.02
ForbiddenItemSets,0.13 0.29 0.18,0.34 0.30 0.32,0.02 0.26 0.04,0.56 0.16 0.24,0.01 0.06 0.02,0.25 0.46 0.33,0.31 0.08 0.13,0.01 0.06 0.01,Other error,0.01 0.07 0.01,Other error,0.00 0.00 0.00,Other error,0.02 0.26 0.04
KATARA,Other error,0.14 0.26 0.18,0.00 0.00 0.00,0.09 0.09 0.09,\textbf{0.08} 0.37 \textbf{0.13},0.21 0.32 0.25,\textbf{0.43} 0.43 0.43,\textbf{0.02} 0.16 \textbf{0.03},0.01 0.02 0.01,0.00 0.13 0.01,0.00 0.22 0.00,0.21 0.75 0.33,0.06 0.29 0.10,0.00 0.00 0.00
Raha,Other error,0.16 \textbf{1.00} 0.28,Other error,0.30 \textbf{1.00} 0.46,0.03 \textbf{1.00} 0.05,Other error,Other error,0.01 \textbf{1.00} 0.02,0.09 \textbf{1.00} 0.16,0.00 \textbf{1.00} 0.01,0.00 \textbf{1.00} 0.00,0.22 \textbf{1.00} 0.36,0.03 \textbf{1.00} 0.05,Other error
dBoost,0.23 \textbf{0.38} \textbf{0.28},0.68 0.55 \textbf{0.61},\textbf{0.13} \textbf{1.00} \textbf{0.23},\textbf{0.94} 0.59 \textbf{0.72},0.03 0.43 0.06,\textbf{0.34} \textbf{0.67} \textbf{0.45},0.37 \textbf{1.00} \textbf{0.54},0.01 0.09 0.03,\textbf{0.22} 0.77 \textbf{0.34},\textbf{0.03} 0.03 \textbf{0.03},0.00 0.08 0.00,\textbf{0.33} 0.75 \textbf{0.50},\textbf{0.32} \textbf{1.00} \textbf{0.49},\textbf{0.41} \textbf{1.00} \textbf{0.58}


In [7]:
# Transposed variant (datasets as rows, tools as columns) with a header row
# (index label "") that names the three numbers of each cell, placed on top.
output_df_datacols_header_T = output_df_datacols.T.copy()
first_index = output_df_datacols_header_T.index
# Raw string: in a normal literal "\t" of "\textbf" would become a TAB and
# "\s" is an invalid escape sequence.
output_df_datacols_header_T.loc["", :] = r"\textbf{\space\space\space P \space\space\space\space R \space\space\space F1}"
output_df_datacols_header_T = output_df_datacols_header_T.reindex([""] + first_index.tolist())
output_df_datacols_header_T

Unnamed: 0,FAHES,ForbiddenItemSets,KATARA,Raha,dBoost
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
airbnb,\textbf{0.54} 0.01 0.02,0.13 0.29 0.18,Other error,Other error,0.23 \textbf{0.38} \textbf{0.28}
beers,\textbf{0.83} 0.02 0.04,0.34 0.30 0.32,0.14 0.26 0.18,0.16 \textbf{1.00} 0.28,0.68 0.55 \textbf{0.61}
eeg,0.00 0.00 0.00,0.02 0.26 0.04,0.00 0.00 0.00,Other error,\textbf{0.13} \textbf{1.00} \textbf{0.23}
flights,0.23 0.01 0.02,0.56 0.16 0.24,0.09 0.09 0.09,0.30 \textbf{1.00} 0.46,\textbf{0.94} 0.59 \textbf{0.72}
hospital,0.02 0.09 0.04,0.01 0.06 0.02,\textbf{0.08} 0.37 \textbf{0.13},0.03 \textbf{1.00} 0.05,0.03 0.43 0.06
marketing,0.24 0.01 0.01,0.25 0.46 0.33,0.21 0.32 0.25,Other error,\textbf{0.34} \textbf{0.67} \textbf{0.45}
movie,0.00 0.00 0.00,0.31 0.08 0.13,\textbf{0.43} 0.43 0.43,Other error,0.37 \textbf{1.00} \textbf{0.54}
movies,0.01 0.10 0.02,0.01 0.06 0.01,\textbf{0.02} 0.16 \textbf{0.03},0.01 \textbf{1.00} 0.02,0.01 0.09 0.03
rayyan,0.07 0.04 0.05,Other error,0.01 0.02 0.01,0.09 \textbf{1.00} 0.16,\textbf{0.22} 0.77 \textbf{0.34}


In [8]:
# Print the tools-as-rows table as LaTeX. The caption is a raw string because
# "\&" is not a valid escape sequence in a normal string literal.
captionstr1 = r"|Precision Recall F1-score| for dataset as columns \& tool as row"
# Raise the column width limit so the cell strings are not truncated, and
# disable escaping so the LaTeX macros inside the cells are emitted as-is.
with pd.option_context("display.max_colwidth", 1000):
    print(output_df_datacols_header.to_latex(escape=False, caption=captionstr1))

\begin{table}
\centering
\caption{|Precision Recall F1-score| for dataset as columns \& tool as row}
\begin{tabular}{lllllllllllllll}
\toprule
{} &                                                                          airbnb &                                                                           beers &                                                                             eeg &                                                                         flights &                                                                        hospital &                                                                       marketing &                                                                           movie &                                                                          movies &                                                                          rayyan &                                                                      restaurant &                                 

In [9]:
# Print the datasets-as-rows table as LaTeX. The caption is a raw string
# because "\&" is not a valid escape sequence in a normal string literal.
captionstr2 = r"|Precision Recall F1-score| for tool as column \& dataset as row"
# Raise the column width limit so the cell strings are not truncated, and
# disable escaping so the LaTeX macros inside the cells are emitted as-is.
with pd.option_context("display.max_colwidth", 1000):
    print(output_df_datacols_header_T.to_latex(escape=False, caption=captionstr2))

\begin{table}
\centering
\caption{|Precision Recall F1-score| for tool as column \& dataset as row}
\begin{tabular}{llllll}
\toprule
{} &                                                                           FAHES &                                                               ForbiddenItemSets &                                                                          KATARA &                                                                            Raha &                                                                          dBoost \\
\midrule
            &  \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} &  \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} &  \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} &  \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} &  \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} \\
airbnb      &    