# Results for RQ1 - by error type

In [1]:
import os 
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import prettytable
import re 

cwd = os.getcwd()
os.chdir('../../')
import errorAPI
from errorAPI.dataset import Dataset
os.chdir(cwd)

In [2]:
sql_string = 'postgresql://postgres:postgres@localhost:5432/error_detection'
performance_results = pd.read_sql_table("results", create_engine(sql_string)).drop_duplicates(['dataset', 'tool_name', 'tool_configuration'], keep='last')

In [3]:
## Config

group_by_cols = ["dataset", "tool_name"]
show_cols = ["cell_prec", "cell_rec", "cell_f1", "runtime", "error_text"]
max_col = "cell_f1"

max_human_cost = 20
min_human_accuracy = 1
max_human_accuracy = 1
max_runtime = 1900

## Filtered tools
filtered_tools = [
     'ActiveClean',
     'FAHES',
     'ForbiddenItemSets',
     'KATARA',
     'Raha',
     'dBoost'
]

sql_string = 'postgresql://postgres:postgres@localhost:5432/error_detection'
dataset_stats = pd.read_sql_table("datasets", create_engine(sql_string)).iloc[:, 1:]
dataset_names = dataset_stats["name"].tolist()
tool_stats = pd.read_sql_table("tools",  create_engine(sql_string))

In [4]:
performance_results = performance_results[performance_results["human_cost"].fillna(0) <= max_human_cost]
performance_results = performance_results[performance_results["human_accuracy"].fillna(0) <= max_human_accuracy]
performance_results = performance_results[performance_results["human_accuracy"].fillna(1) >= min_human_accuracy]
performance_results = performance_results[performance_results["runtime"].fillna(0) <= max_runtime]

max_idx = performance_results.groupby(group_by_cols)[max_col].transform(max) == performance_results[max_col]
performance_results = performance_results[max_idx]

results_df = performance_results.groupby(group_by_cols)[show_cols].min()

# dataset_names = list(set([x[0] for x in results_df.index]))
# dataset_names.sort()
tool_names = list(set([x[1] for x in results_df.index]))
tool_names.sort()

tool_names = [x for x in tool_names if x in filtered_tools]
# dataset_names = [x for x in dataset_names if x not in exclude_datasets]

print("Tools:", tool_names)
print("Datasets:", dataset_names)

results_df = results_df[[x[1] in tool_names for x in results_df.index]]
results_df = results_df[[x[0] in dataset_names for x in results_df.index]]

Tools: ['ActiveClean', 'FAHES', 'ForbiddenItemSets', 'KATARA', 'Raha', 'dBoost']
Datasets: ['university', 'movies', 'restaurant', 'beers', 'uscensus', 'restaurants', 'eeg', 'flights', 'movie', 'hospital', 'toy', 'airbnb', 'marketing', 'rayyan']


In [12]:
err_types = [x for x in tool_stats.columns if "err" in x]

In [26]:
## Columns = datasets
output_df_datacols = {}

for err_type in err_types:
    data_dict = []
    
    tool_names_err_type = tool_stats[tool_stats[err_type]]["name"].tolist()
    dataset_names_err_type = dataset_stats[dataset_stats[err_type]]["name"].tolist()
    temp_df = results_df.reset_index()
    results_df_filtered = temp_df[temp_df["tool_name"].isin(tool_names_err_type) & temp_df["dataset"].isin(dataset_names_err_type)].set_index(["dataset", "tool_name"])
    
    for tool_name in tool_names_err_type:
        row = {}
        for dataset_name in dataset_names_err_type:
            try:
                values = results_df_filtered.loc[(dataset_name, tool_name)]
                result_string = ""
                for i, show_col in enumerate(show_cols):
                    if show_col in ["error_text", "runtime"]:
                        continue

                    is_max = results_df_filtered.loc[dataset_name][show_col].max() == values[i]
                    if values["error_text"] != "":
                        if "Timeout" in values["error_text"]:
                            result_string += values["error_text"]
                        else:
                            result_string += "Other error"
                        break
                    elif is_max:
                        result_string += "\textbf{" + "{:.2f}".format(values[i]) + "}"
                    else:
                        result_string += "{:.2f} ".format(values[i])

                    result_string += " "
                row[dataset_name] = result_string
            except KeyError as e:
                row[dataset_name] = ""
        data_dict.append(row)

    output_df_datacols[err_type] = pd.DataFrame(data_dict, columns = dataset_names_err_type, index = tool_names_err_type)

In [27]:
output_df_datacols_headers = {}

for err_type in output_df_datacols:
    print(err_type)
    output_df_datacols_header = output_df_datacols[err_type].copy()
    first_index = output_df_datacols_header.index
    output_df_datacols_header.loc["", :] = "\textbf{\space\space\space P \space\space\space\space R \space\space\space F1}"
    output_df_datacols_header = output_df_datacols_header.reindex([""] + first_index.tolist())
    display(output_df_datacols_header)
    output_df_datacols_headers[err_type] = output_df_datacols_header

err_pattern


Unnamed: 0,movies,restaurant,beers,restaurants,flights,hospital,rayyan
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
ActiveClean,0.02 0.00 0.01,0.01 \textbf{0.83} 0.02,0.16 \textbf{1.00} 0.28,0.00 0.00 0.00,0.30 \textbf{0.98} 0.46,0.03 0.47 0.05,0.09 \textbf{1.00} 0.16
Raha,\textbf{0.72} \textbf{0.76} \textbf{0.74},\textbf{0.14} 0.11 \textbf{0.12},\textbf{0.97} 0.69 \textbf{0.80},0.00 \textbf{1.00} \textbf{0.00},0.90 0.83 \textbf{0.86},\textbf{0.98} \textbf{0.57} \textbf{0.72},\textbf{0.86} 0.84 \textbf{0.85}
dBoost,0.01 0.09 0.03,0.03 0.03 0.03,0.68 0.55 0.61,\textbf{0.00} 0.08 0.00,\textbf{0.94} 0.59 0.72,0.03 0.43 0.06,0.22 0.77 0.34


err_rules


Unnamed: 0,university,movies,restaurant,beers,uscensus,flights,movie,toy,airbnb,marketing,rayyan
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
FAHES,0.00 0.00 0.00,0.01 0.10 0.02,0.00 0.00 0.00,0.83 0.02 0.04,0.01 0.18 0.02,0.23 0.01 0.02,0.00 0.00 0.00,0.00 0.00 0.00,\textbf{0.54} 0.01 0.02,0.24 0.01 0.01,0.07 0.04 0.05
KATARA,0.06 0.29 0.10,0.02 0.16 0.03,0.00 0.13 0.01,0.14 0.26 0.18,0.00 0.00 0.00,0.09 0.09 0.09,0.43 0.43 0.43,0.21 0.75 0.33,Other error,0.21 0.32 0.25,0.01 0.02 0.01
ActiveClean,0.03 0.09 0.04,0.02 0.00 0.01,0.01 \textbf{0.83} 0.02,0.16 \textbf{1.00} 0.28,0.02 0.00 0.00,0.30 \textbf{0.98} 0.46,0.37 \textbf{1.00} 0.54,Other error,0.15 \textbf{1.00} \textbf{0.26},0.25 0.36 0.30,0.09 \textbf{1.00} 0.16
ForbiddenItemSets,Other error,0.01 0.06 0.01,0.01 0.07 0.01,0.34 0.30 0.32,0.02 0.26 0.04,0.56 0.16 0.24,0.31 0.08 0.13,0.00 0.00 0.00,0.13 0.29 0.18,0.25 \textbf{0.46} 0.33,Other error
Raha,\textbf{0.99} \textbf{0.91} \textbf{0.95},\textbf{0.72} \textbf{0.76} \textbf{0.74},\textbf{0.14} 0.11 \textbf{0.12},\textbf{0.97} 0.69 \textbf{0.80},\textbf{1.00} \textbf{1.00} \textbf{1.00},\textbf{0.90} 0.83 \textbf{0.86},\textbf{0.47} 0.64 \textbf{0.54},\textbf{0.22} \textbf{1.00} \textbf{0.36},0.42 0.13 0.20,\textbf{0.50} 0.32 \textbf{0.39},\textbf{0.86} 0.84 \textbf{0.85}


err_outliers


Unnamed: 0,eeg,airbnb
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
ForbiddenItemSets,0.02 0.26 0.04,0.13 0.29 0.18
Raha,\textbf{0.56} 0.71 \textbf{0.63},\textbf{0.42} 0.13 0.20
dBoost,0.13 \textbf{1.00} 0.23,0.23 \textbf{0.38} \textbf{0.28}


err_duplicates


Unnamed: 0,movie,airbnb
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
ForbiddenItemSets,0.31 0.08 0.13,0.13 \textbf{0.29} 0.18
Raha,\textbf{0.47} \textbf{0.64} \textbf{0.54},\textbf{0.42} 0.13 \textbf{0.20}


In [28]:
output_df_datacols_headers_T = {}
for err_type in output_df_datacols:
    print(err_type)
    output_df_datacols_header_T = output_df_datacols[err_type].T.copy()
    first_index = output_df_datacols_header_T.index
    output_df_datacols_header_T.loc["", :] = "\textbf{\space\space\space P \space\space\space\space R \space\space\space F1}"
    output_df_datacols_header_T = output_df_datacols_header_T.reindex([""] + first_index.tolist())
    display(output_df_datacols_header_T)
    
    output_df_datacols_headers_T[err_type] = output_df_datacols_header

err_pattern


Unnamed: 0,ActiveClean,Raha,dBoost
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
movies,0.02 0.00 0.01,\textbf{0.72} \textbf{0.76} \textbf{0.74},0.01 0.09 0.03
restaurant,0.01 \textbf{0.83} 0.02,\textbf{0.14} 0.11 \textbf{0.12},0.03 0.03 0.03
beers,0.16 \textbf{1.00} 0.28,\textbf{0.97} 0.69 \textbf{0.80},0.68 0.55 0.61
restaurants,0.00 0.00 0.00,0.00 \textbf{1.00} \textbf{0.00},\textbf{0.00} 0.08 0.00
flights,0.30 \textbf{0.98} 0.46,0.90 0.83 \textbf{0.86},\textbf{0.94} 0.59 0.72
hospital,0.03 0.47 0.05,\textbf{0.98} \textbf{0.57} \textbf{0.72},0.03 0.43 0.06
rayyan,0.09 \textbf{1.00} 0.16,\textbf{0.86} 0.84 \textbf{0.85},0.22 0.77 0.34


err_rules


Unnamed: 0,FAHES,KATARA,ActiveClean,ForbiddenItemSets,Raha
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
university,0.00 0.00 0.00,0.06 0.29 0.10,0.03 0.09 0.04,Other error,\textbf{0.99} \textbf{0.91} \textbf{0.95}
movies,0.01 0.10 0.02,0.02 0.16 0.03,0.02 0.00 0.01,0.01 0.06 0.01,\textbf{0.72} \textbf{0.76} \textbf{0.74}
restaurant,0.00 0.00 0.00,0.00 0.13 0.01,0.01 \textbf{0.83} 0.02,0.01 0.07 0.01,\textbf{0.14} 0.11 \textbf{0.12}
beers,0.83 0.02 0.04,0.14 0.26 0.18,0.16 \textbf{1.00} 0.28,0.34 0.30 0.32,\textbf{0.97} 0.69 \textbf{0.80}
uscensus,0.01 0.18 0.02,0.00 0.00 0.00,0.02 0.00 0.00,0.02 0.26 0.04,\textbf{1.00} \textbf{1.00} \textbf{1.00}
flights,0.23 0.01 0.02,0.09 0.09 0.09,0.30 \textbf{0.98} 0.46,0.56 0.16 0.24,\textbf{0.90} 0.83 \textbf{0.86}
movie,0.00 0.00 0.00,0.43 0.43 0.43,0.37 \textbf{1.00} 0.54,0.31 0.08 0.13,\textbf{0.47} 0.64 \textbf{0.54}
toy,0.00 0.00 0.00,0.21 0.75 0.33,Other error,0.00 0.00 0.00,\textbf{0.22} \textbf{1.00} \textbf{0.36}
airbnb,\textbf{0.54} 0.01 0.02,Other error,0.15 \textbf{1.00} \textbf{0.26},0.13 0.29 0.18,0.42 0.13 0.20


err_outliers


Unnamed: 0,ForbiddenItemSets,Raha,dBoost
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
eeg,0.02 0.26 0.04,\textbf{0.56} 0.71 \textbf{0.63},0.13 \textbf{1.00} 0.23
airbnb,0.13 0.29 0.18,\textbf{0.42} 0.13 0.20,0.23 \textbf{0.38} \textbf{0.28}


err_duplicates


Unnamed: 0,ForbiddenItemSets,Raha
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
movie,0.31 0.08 0.13,\textbf{0.47} \textbf{0.64} \textbf{0.54}
airbnb,0.13 \textbf{0.29} 0.18,\textbf{0.42} 0.13 \textbf{0.20}


In [29]:
for err_type in output_df_datacols:
    print("-="*20)
    print(err_type)
    print("-="*20)
    print()
    captionstr1 = "|Precision Recall F1-score| for dataset as columns \& tool as row"
    with pd.option_context("max_colwidth", 1000):
        print(output_df_datacols_headers[err_type].to_latex(escape=False, caption=captionstr1))
    
    print()
    print()
    print()

-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
err_pattern
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

\begin{table}
\centering
\caption{|Precision Recall F1-score| for dataset as columns \& tool as row}
\begin{tabular}{llllllll}
\toprule
{} &                                                                          movies &                                                                      restaurant &                                                                           beers &                                                                     restaurants &                                                                         flights &                                                                        hospital &                                                                          rayyan \\
\midrule
            &  \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} &  \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} &  \

In [30]:
for err_type in output_df_datacols_headers_T:
    print("-="*20)
    print(err_type)
    print("-="*20)
    print()

    captionstr2 = "|Precision Recall F1-score| for tool as column \& dataset as row"
    with pd.option_context("max_colwidth", 1000):
        print(re.sub(' +', ' ', output_df_datacols_headers_T[err_type].to_latex(escape=False, caption=captionstr2)))

-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
err_pattern
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

\begin{table}
\centering
\caption{|Precision Recall F1-score| for tool as column \& dataset as row}
\begin{tabular}{lll}
\toprule
{} & movie & airbnb \\
\midrule
 & \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} & \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} \\
ForbiddenItemSets & 0.31 0.08 0.13 & 0.13 \textbf{0.29} 0.18 \\
Raha & \textbf{0.47} \textbf{0.64} \textbf{0.54} & \textbf{0.42} 0.13 \textbf{0.20} \\
\bottomrule
\end{tabular}
\end{table}

-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
err_rules
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

\begin{table}
\centering
\caption{|Precision Recall F1-score| for tool as column \& dataset as row}
\begin{tabular}{lll}
\toprule
{} & movie & airbnb \\
\midrule
 & \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} & \textbf{\space\space\space P \space\space\space\