# Results for RQ1 - by error type

In [1]:
import os 
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import prettytable
import re 

# errorAPI is imported with the working directory temporarily moved two
# levels up (presumably the package sits there — confirm against the repo layout).
cwd = os.getcwd()
os.chdir('../../')
try:
    import errorAPI
    from errorAPI.dataset import Dataset
finally:
    # Always return to the notebook's directory, even when the import fails;
    # otherwise re-running this cell would start from the wrong directory
    # and climb two levels further.
    os.chdir(cwd)

In [2]:
# Connection URL of the PostgreSQL database this notebook reads from.
sql_string = 'postgresql://postgres:postgres@localhost:5432/error_detection'

# Read the whole "results" table, then keep only the last row seen for each
# (dataset, tool_name, tool_configuration) combination.
performance_results = pd.read_sql_table(
    "results",
    create_engine(sql_string),
).drop_duplicates(
    subset=['dataset', 'tool_name', 'tool_configuration'],
    keep='last',
)

In [3]:
## Config

# Index levels of the summary table and the columns carried into it.
group_by_cols = ["dataset", "tool_name"]
show_cols = ["cell_prec", "cell_rec", "cell_f1", "runtime", "error_text"]
# Column whose maximum selects the run kept per (dataset, tool_name) group.
max_col = "cell_prec"

# Thresholds applied to the raw results before the best run is selected.
max_human_cost = 20
min_human_accuracy = 1
max_human_accuracy = 1
max_runtime = 1900

## Filtered tools
# Only tools listed here are kept in the summary table.
filtered_tools = [
     'ActiveClean',
     'FAHES',
     'ForbiddenItemSets',
     'KATARA',
     'Raha',
     'dBoost'
]

# Same connection URL as the one used to load the "results" table.
sql_string = 'postgresql://postgres:postgres@localhost:5432/error_detection'
# One engine serves both metadata reads instead of building a fresh engine per table.
metadata_engine = create_engine(sql_string)
# The first column of "datasets" is dropped (presumably a surrogate id — confirm).
dataset_stats = pd.read_sql_table("datasets", metadata_engine).iloc[:, 1:]
dataset_names = dataset_stats["name"].tolist()
tool_stats = pd.read_sql_table("tools", metadata_engine)

In [4]:
# Keep only runs within the configured human-effort and runtime limits.
# Missing values are replaced by a value that satisfies the current thresholds,
# so rows without a recorded cost/accuracy/runtime are not discarded.
within_limits = (
    (performance_results["human_cost"].fillna(0) <= max_human_cost)
    & (performance_results["human_accuracy"].fillna(0) <= max_human_accuracy)
    & (performance_results["human_accuracy"].fillna(1) >= min_human_accuracy)
    & (performance_results["runtime"].fillna(0) <= max_runtime)
)
performance_results = performance_results[within_limits]

# For every (dataset, tool_name) group keep the run(s) reaching the group's best max_col.
# The aggregation is named by string: handing the builtin `max` to transform is
# deprecated in pandas and emits a FutureWarning.
best_per_group = performance_results.groupby(group_by_cols)[max_col].transform("max")
max_idx = best_per_group == performance_results[max_col]
performance_results = performance_results[max_idx]

# Runs tied on max_col are collapsed by taking the column-wise minimum.
results_df = performance_results.groupby(group_by_cols)[show_cols].min()

# Index level 0 is the dataset, level 1 the tool (order given by group_by_cols).
index_datasets = results_df.index.get_level_values(0)
index_tools = results_df.index.get_level_values(1)

# Tools that have results and are on the allow-list, in alphabetical order.
tool_names = sorted(name for name in set(index_tools) if name in filtered_tools)

print("Tools:", tool_names)
print("Datasets:", dataset_names)

# Boolean masks built from the index levels; unlike a plain Python list they
# still select rows (not columns) when the frame happens to be empty.
results_df = results_df[index_tools.isin(tool_names) & index_datasets.isin(dataset_names)]

Tools: ['ActiveClean', 'FAHES', 'ForbiddenItemSets', 'KATARA', 'Raha', 'dBoost']
Datasets: ['university', 'movies', 'restaurant', 'beers', 'uscensus', 'restaurants', 'eeg', 'flights', 'movie', 'hospital', 'toy', 'airbnb', 'marketing', 'rayyan']


In [5]:
# Error-type flags: every column of the tools table whose name contains "err".
err_types = list(filter(lambda column: "err" in column, tool_stats.columns))

In [6]:
## Columns = datasets
# For every error type, build a table (rows = tools, columns = datasets) whose
# cells are "P R F1" strings, with the best value per dataset wrapped in \textbf{}.
output_df_datacols = {}

# The flattened results are identical for every error type, so compute them once.
results_flat = results_df.reset_index()
# runtime and error_text are carried along in show_cols but never printed as scores.
metric_cols = [col for col in show_cols if col not in ("error_text", "runtime")]

for err_type in err_types:
    table_rows = []

    # Tools / datasets whose err_type column is truthy
    # (assumes both tables expose the same boolean flag columns — confirm).
    tool_names_err_type = tool_stats[tool_stats[err_type]]["name"].tolist()
    dataset_names_err_type = dataset_stats[dataset_stats[err_type]]["name"].tolist()
    in_scope = results_flat["tool_name"].isin(tool_names_err_type) & results_flat["dataset"].isin(dataset_names_err_type)
    results_df_filtered = results_flat[in_scope].set_index(["dataset", "tool_name"])

    for tool_name in tool_names_err_type:
        row = {}
        for dataset_name in dataset_names_err_type:
            # Only the lookups may legitimately miss (no run for this pair);
            # everything else is kept outside the try so real errors surface.
            try:
                values = results_df_filtered.loc[(dataset_name, tool_name)]
                dataset_scores = results_df_filtered.loc[dataset_name]
            except KeyError:
                row[dataset_name] = ""
                continue

            error_text = values["error_text"]
            if error_text != "":
                # A failed run shows its timeout message verbatim, anything else generically.
                row[dataset_name] = error_text if "Timeout" in error_text else "Other error"
                continue

            result_string = ""
            for show_col in metric_cols:
                # Label-based access: integer keys on a label-indexed Series rely on
                # a deprecated positional fallback, and the KeyError raised once it
                # is gone would have been swallowed, blanking the whole table.
                score = values[show_col]
                is_max = dataset_scores[show_col].max() == score
                if is_max:
                    # Raw string: in a normal literal "\t" would be a tab character.
                    result_string += r"\textbf{" + "{:.2f}".format(score) + "}"
                else:
                    result_string += "{:.2f} ".format(score)

                result_string += " "
            row[dataset_name] = result_string
        table_rows.append(row)

    output_df_datacols[err_type] = pd.DataFrame(table_rows, columns = dataset_names_err_type, index = tool_names_err_type)

In [7]:
# Copy of every per-error-type table with an extra first row that labels the
# three numbers found in each cell.
output_df_datacols_headers = {}

# Raw string: in a normal literal "\t" is a tab and "\s" is an invalid escape,
# so the LaTeX commands would not survive intact.
prf_header = r"\textbf{\space\space\space P \space\space\space\space R \space\space\space F1}"

for err_type in output_df_datacols:
    print(err_type)
    output_df_datacols_header = output_df_datacols[err_type].copy()
    # Row labels before the header row is appended.
    first_index = output_df_datacols_header.index
    # The header row is stored under the empty label.
    output_df_datacols_header.loc["", :] = prf_header
    # Header first, then rows and columns in sorted order ("" sorts before any name).
    output_df_datacols_header = output_df_datacols_header.reindex([""] + first_index.tolist()).sort_index().sort_index(axis=1)
    display(output_df_datacols_header)
    output_df_datacols_headers[err_type] = output_df_datacols_header

err_pattern


Unnamed: 0,beers,flights,hospital,movies,rayyan,restaurant,restaurants
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
Raha,\textbf{1.00} \textbf{0.55} \textbf{0.71},0.94 \textbf{0.73} \textbf{0.82},\textbf{1.00} \textbf{0.06} \textbf{0.11},\textbf{0.72} \textbf{0.76} \textbf{0.74},\textbf{0.92} 0.70 \textbf{0.79},\textbf{1.00} 0.00 0.00,0.00 \textbf{1.00} \textbf{0.00}
dBoost,0.83 0.24 0.37,\textbf{0.95} 0.55 0.70,0.03 0.00 0.01,0.01 0.09 0.03,0.22 \textbf{0.77} 0.34,0.03 \textbf{0.03} \textbf{0.03},\textbf{0.00} 0.08 0.00


err_rules


Unnamed: 0,airbnb,beers,flights,marketing,movie,movies,rayyan,restaurant,toy,university,uscensus
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
ActiveClean,0.15 \textbf{1.00} \textbf{0.26},0.16 \textbf{1.00} 0.28,0.30 \textbf{0.98} 0.46,0.25 \textbf{0.36} 0.30,0.37 \textbf{1.00} \textbf{0.54},0.02 0.00 0.01,0.09 \textbf{1.00} 0.16,0.01 \textbf{0.83} 0.02,Other error,0.03 0.09 0.04,0.02 0.00 0.00
FAHES,0.54 0.01 0.02,0.83 0.02 0.04,0.23 0.01 0.02,0.24 0.01 0.01,0.00 0.00 0.00,0.22 0.00 0.00,0.07 0.04 0.05,0.00 0.00 0.00,0.00 0.00 0.00,0.00 0.00 0.00,0.01 0.18 0.02
ForbiddenItemSets,0.13 0.29 0.18,0.34 0.30 0.32,0.81 0.00 0.01,0.28 0.01 0.02,0.50 0.00 0.00,0.01 0.06 0.01,Other error,0.01 0.02 0.01,0.00 0.00 0.00,Other error,0.02 0.26 0.04
KATARA,Other error,0.14 0.26 0.18,0.09 0.09 0.09,0.22 0.19 0.20,0.45 0.41 0.43,0.02 0.16 0.03,0.01 0.02 0.01,0.00 0.13 0.01,0.21 0.75 0.33,0.06 0.29 0.10,0.00 0.00 0.00
Raha,0.49 0.09 0.16,\textbf{1.00} 0.55 \textbf{0.71},0.94 0.73 \textbf{0.82},0.71 0.21 \textbf{0.32},\textbf{0.71} 0.23 0.34,\textbf{0.72} \textbf{0.76} \textbf{0.74},\textbf{0.92} 0.70 \textbf{0.79},\textbf{1.00} 0.00 0.00,0.22 \textbf{1.00} \textbf{0.36},\textbf{0.99} \textbf{0.91} \textbf{0.95},\textbf{1.00} 0.86 \textbf{0.93}
dBoost,\textbf{0.77} 0.04 0.08,0.83 0.24 0.37,\textbf{0.95} 0.55 0.70,\textbf{0.98} 0.08 0.14,0.55 0.29 0.38,0.01 0.09 0.03,0.22 0.77 0.34,0.03 0.03 \textbf{0.03},\textbf{0.50} 0.25 0.33,0.50 0.04 0.07,0.41 \textbf{1.00} 0.58


err_outliers


Unnamed: 0,airbnb,eeg
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
Raha,0.49 \textbf{0.09} \textbf{0.16},0.56 \textbf{0.71} \textbf{0.63}
dBoost,\textbf{0.77} 0.04 0.08,\textbf{0.57} 0.01 0.01


err_duplicates


Unnamed: 0,airbnb,movie
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
ActiveClean,0.15 \textbf{1.00} \textbf{0.26},0.37 \textbf{1.00} \textbf{0.54}
Raha,\textbf{0.49} 0.09 0.16,\textbf{0.71} 0.23 0.34


In [8]:
# Transposed variant of the per-error-type tables (rows = datasets,
# columns = tools), again with a first row labelling the numbers in each cell.
output_df_datacols_headers_T = {}

# Raw string: in a normal literal "\t" is a tab and "\s" is an invalid escape,
# so the LaTeX commands would not survive intact.
prf_header = r"\textbf{\space\space\space P \space\space\space\space R \space\space\space F1}"

for err_type in output_df_datacols:
    print(err_type)
    output_df_datacols_header_T = output_df_datacols[err_type].T.copy()
    # Row labels before the header row is appended.
    first_index = output_df_datacols_header_T.index
    # The header row is stored under the empty label.
    output_df_datacols_header_T.loc["", :] = prf_header
    # Header first, then rows and columns in sorted order ("" sorts before any name).
    output_df_datacols_header_T = output_df_datacols_header_T.reindex([""] + first_index.tolist()).sort_index().sort_index(axis=1)
    display(output_df_datacols_header_T)

    output_df_datacols_headers_T[err_type] = output_df_datacols_header_T

err_pattern


Unnamed: 0,Raha,dBoost
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
beers,\textbf{1.00} \textbf{0.55} \textbf{0.71},0.83 0.24 0.37
flights,0.94 \textbf{0.73} \textbf{0.82},\textbf{0.95} 0.55 0.70
hospital,\textbf{1.00} \textbf{0.06} \textbf{0.11},0.03 0.00 0.01
movies,\textbf{0.72} \textbf{0.76} \textbf{0.74},0.01 0.09 0.03
rayyan,\textbf{0.92} 0.70 \textbf{0.79},0.22 \textbf{0.77} 0.34
restaurant,\textbf{1.00} 0.00 0.00,0.03 \textbf{0.03} \textbf{0.03}
restaurants,0.00 \textbf{1.00} \textbf{0.00},\textbf{0.00} 0.08 0.00


err_rules


Unnamed: 0,ActiveClean,FAHES,ForbiddenItemSets,KATARA,Raha,dBoost
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
airbnb,0.15 \textbf{1.00} \textbf{0.26},0.54 0.01 0.02,0.13 0.29 0.18,Other error,0.49 0.09 0.16,\textbf{0.77} 0.04 0.08
beers,0.16 \textbf{1.00} 0.28,0.83 0.02 0.04,0.34 0.30 0.32,0.14 0.26 0.18,\textbf{1.00} 0.55 \textbf{0.71},0.83 0.24 0.37
flights,0.30 \textbf{0.98} 0.46,0.23 0.01 0.02,0.81 0.00 0.01,0.09 0.09 0.09,0.94 0.73 \textbf{0.82},\textbf{0.95} 0.55 0.70
marketing,0.25 \textbf{0.36} 0.30,0.24 0.01 0.01,0.28 0.01 0.02,0.22 0.19 0.20,0.71 0.21 \textbf{0.32},\textbf{0.98} 0.08 0.14
movie,0.37 \textbf{1.00} \textbf{0.54},0.00 0.00 0.00,0.50 0.00 0.00,0.45 0.41 0.43,\textbf{0.71} 0.23 0.34,0.55 0.29 0.38
movies,0.02 0.00 0.01,0.22 0.00 0.00,0.01 0.06 0.01,0.02 0.16 0.03,\textbf{0.72} \textbf{0.76} \textbf{0.74},0.01 0.09 0.03
rayyan,0.09 \textbf{1.00} 0.16,0.07 0.04 0.05,Other error,0.01 0.02 0.01,\textbf{0.92} 0.70 \textbf{0.79},0.22 0.77 0.34
restaurant,0.01 \textbf{0.83} 0.02,0.00 0.00 0.00,0.01 0.02 0.01,0.00 0.13 0.01,\textbf{1.00} 0.00 0.00,0.03 0.03 \textbf{0.03}
toy,Other error,0.00 0.00 0.00,0.00 0.00 0.00,0.21 0.75 0.33,0.22 \textbf{1.00} \textbf{0.36},\textbf{0.50} 0.25 0.33


err_outliers


Unnamed: 0,Raha,dBoost
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
airbnb,0.49 \textbf{0.09} \textbf{0.16},\textbf{0.77} 0.04 0.08
eeg,0.56 \textbf{0.71} \textbf{0.63},\textbf{0.57} 0.01 0.01


err_duplicates


Unnamed: 0,ActiveClean,Raha
,\textbf{\space\space\space P \space\space\spac...,\textbf{\space\space\space P \space\space\spac...
airbnb,0.15 \textbf{1.00} \textbf{0.26},\textbf{0.49} 0.09 0.16
movie,0.37 \textbf{1.00} \textbf{0.54},\textbf{0.71} 0.23 0.34


In [9]:
# Print each table (datasets as columns, tools as rows) as a LaTeX table.
# Raw string: "\&" is not a valid Python escape sequence (SyntaxWarning on
# recent interpreters); LaTeX needs the backslash to escape the ampersand.
captionstr1 = r"|Precision Recall F1-score| for dataset as columns \& tool as row"

# Iterate the dict that is actually indexed below.
for err_type in output_df_datacols_headers:
    banner = "-=" * 20
    print(banner)
    print(err_type)
    print(banner)
    print()
    # A wide max_colwidth keeps the long cell strings from being truncated.
    with pd.option_context("max_colwidth", 1000):
        # escape=False keeps the \textbf{...} markup in the cells as LaTeX.
        print(output_df_datacols_headers[err_type].sort_index().sort_index(axis=1).to_latex(escape=False, caption=captionstr1))

    print()
    print()
    print()

-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
err_pattern
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

\begin{table}
\centering
\caption{|Precision Recall F1-score| for dataset as columns \& tool as row}
\begin{tabular}{llllllll}
\toprule
{} &                                                                           beers &                                                                         flights &                                                                        hospital &                                                                          movies &                                                                          rayyan &                                                                      restaurant &                                                                     restaurants \\
\midrule
       &  \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} &  \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} &  \textb

In [10]:
# Print each transposed table (tools as columns, datasets as rows) as LaTeX.
# Raw string: "\&" is not a valid Python escape sequence (SyntaxWarning on
# recent interpreters); LaTeX needs the backslash to escape the ampersand.
captionstr2 = r"|Precision Recall F1-score| for tool as column \& dataset as row"

for err_type in output_df_datacols_headers_T:
    banner = "-=" * 20
    print(banner)
    print(err_type)
    print(banner)
    print()

    # A wide max_colwidth keeps the long cell strings from being truncated.
    with pd.option_context("max_colwidth", 1000):
        # escape=False keeps the \textbf{...} markup in the cells as LaTeX.
        latex_table = output_df_datacols_headers_T[err_type].sort_index().sort_index(axis=1).to_latex(escape=False, caption=captionstr2)
    # Collapse the runs of padding spaces that to_latex uses to align columns.
    print(re.sub(' +', ' ', latex_table))

-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
err_pattern
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

\begin{table}
\centering
\caption{|Precision Recall F1-score| for tool as column \& dataset as row}
\begin{tabular}{lll}
\toprule
{} & Raha & dBoost \\
\midrule
 & \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} & \textbf{\space\space\space P \space\space\space\space R \space\space\space F1} \\
beers & \textbf{1.00} \textbf{0.55} \textbf{0.71} & 0.83 0.24 0.37 \\
flights & 0.94 \textbf{0.73} \textbf{0.82} & \textbf{0.95} 0.55 0.70 \\
hospital & \textbf{1.00} \textbf{0.06} \textbf{0.11} & 0.03 0.00 0.01 \\
movies & \textbf{0.72} \textbf{0.76} \textbf{0.74} & 0.01 0.09 0.03 \\
rayyan & \textbf{0.92} 0.70 \textbf{0.79} & 0.22 \textbf{0.77} 0.34 \\
restaurant & \textbf{1.00} 0.00 0.00 & 0.03 \textbf{0.03} \textbf{0.03} \\
restaurants & 0.00 \textbf{1.00} \textbf{0.00} & \textbf{0.00} 0.08 0.00 \\
\bottomrule
\end{tabular}
\end{table}

-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-