In [71]:
import math
import operator
import os
import pickle
import re
import string
from typing import Type

import ipywidgets as widgets
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import sklearn
import sklearn.ensemble
import sklearn.neural_network
from IPython.display import clear_output
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sqlalchemy import create_engine

import errorAPI
from errorAPI.dataset import Dataset

In [None]:
# Either re-profile every known dataset or load previously pickled profiles.
if input("Calculate new (y)?: ") == "y":
    all_datasets = Dataset.list_datasets()
    profile_rows = []

    for d_name in all_datasets:
        try:
            dataset = Dataset({"name": d_name})
            profile = errorAPI.Profiler.dataset_profiler(dataset)
            profile["name"] = d_name
            profile_rows.append(profile)
        except Exception as exc:
            # Best-effort: skip datasets that cannot be profiled, but report
            # which one failed and why. Catching Exception (rather than a bare
            # except) keeps Ctrl-C / kernel interrupt working during the loop.
            print("Error profiling dataset {}: {!r}".format(d_name, exc))
    dataset_profiles = pd.DataFrame(profile_rows)
else:
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load 'dataset_profiles.p' if it was produced by this notebook.
    with open('dataset_profiles.p', 'rb') as handle:
        dataset_profiles = pickle.load(handle)

Calculate new (y)?: y
Profiling dataset beers...
Profiling dataset company...
Profiling dataset eeg_major...
Profiling dataset eeg_minor...
Profiling dataset eeg_uniform...
Profiling dataset flights...
Profiling dataset hospital...
Profiling dataset kdd_major...


In [None]:
# Optionally persist the profiles so later runs can skip the profiling step.
save_answer = input("Save the results (y)?: ")
if save_answer == "y":
    with open('dataset_profiles.p', 'wb') as profile_file:
        pickle.dump(dataset_profiles, profile_file)

## Feature normalization & PCA

In [None]:
# Feature columns: every profile column whose label does not contain "name".
# NOTE(review): this is a substring test, so a feature column such as
# "filename_x" would also be excluded - confirm that is intended.
feat_columns = [col for col in dataset_profiles.columns if not ("name" in col)]
X_feat = dataset_profiles[feat_columns]

# NOTE(review): sklearn's normalize() with default arguments rescales each
# row (sample), not each column (feature) - confirm this is the intent.
X_feat_norm = normalize(X_feat)

# Project the normalised profiles onto their first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X_feat_norm)
principalDf = pd.DataFrame(principalComponents, columns=['pc 1', 'pc 2'])

# Default figure size for the plots that follow.
plt.rcParams["figure.figsize"] = (20, 10)

## Plot 2d PCA of the dataset profiles

In [None]:
# Scatter the two principal components and label each point with its dataset name.
fig, ax = plt.subplots()
pc1_values = principalDf["pc 1"]
pc2_values = principalDf["pc 2"]
ax.scatter(pc1_values, pc2_values)

for label, x_pos, y_pos in zip(dataset_profiles["name"], pc1_values, pc2_values):
    ax.annotate(label, (x_pos, y_pos))

## Compare some close datasets

In [None]:
# Show the first rows of two datasets for a side-by-side visual comparison.
show_answer = input("Display? (y): ")
if show_answer == "y":
    for close_dataset_name in ("rayyan", "movies"):
        display(Dataset(close_dataset_name, False).dataframe.head())

## Load the performance results

In [None]:
# Connection URL of the results database. Set the ERROR_DETECTION_DB_URL
# environment variable to override it (and to keep real credentials out of
# the notebook); the fallback is the local development database.
sql_string = os.environ.get(
    "ERROR_DETECTION_DB_URL",
    'postgresql://postgres:postgres@localhost:5432/error_detection',
)

# Load the whole "results" table into a DataFrame.
performance_results = pd.read_sql_table("results", create_engine(sql_string))

In [None]:
# A strategy is one distinct (tool_name, tool_configuration) pair.
strategy_groups = performance_results.groupby(["tool_name", "tool_configuration"])
number_of_strategies = strategy_groups.ngroups
print("Number of strategies:", number_of_strategies)

In [None]:
f1_threshold = 0.05
strategy_keys = ["tool_name", "tool_configuration"]


def _mean_f1_below_threshold(strategy_rows):
    """Return True when the strategy's mean cell F1 is below f1_threshold."""
    return strategy_rows['cell_f1'].mean() < f1_threshold


# Keep only the rows of strategies whose mean F1 is below the threshold,
# then regroup them to count how many such strategies there are.
group = performance_results.groupby(strategy_keys)
low_f1_rows = group.filter(_mean_f1_below_threshold)
new_group = low_f1_rows.groupby(strategy_keys)
number_of_filtered_strategies = new_group.ngroups
print("Number of filtered strategies:", number_of_filtered_strategies)

## Max performance per dataset

In [None]:
# Human-cost budget: a text box and a slider kept in sync in the front end.
maxbudget_a = widgets.FloatText(description="Human cost")
maxbudget_b = widgets.FloatSlider(description="Human cost")
maxbudget_link = widgets.jslink((maxbudget_a, 'value'), (maxbudget_b, 'value'))
maxbudget_a.value = 50

# Upper bound on the runtime of a result row.
maxruntime = widgets.FloatText(description="Max runtime")
maxruntime.value = 100000

# Minimum-quality thresholds. BoundedFloatText is used because plain FloatText
# has no min/max traits, so the [0, 1] bounds passed to it were never enforced.
min_prec = widgets.BoundedFloatText(description="Min precision", min=0, max=1.0, step=0.05)
min_rec = widgets.BoundedFloatText(description="Min recall", min=0, max=1.0, step=0.05)
min_f1 = widgets.BoundedFloatText(description="Min F1", min=0, max=1.0, step=0.05)


def display_result(obj):
    """Redraw the filter widgets and the best result per dataset.

    Reads the current values of the module-level widgets (maxbudget_a,
    maxruntime, min_prec, min_rec, min_f1), filters ``performance_results``
    accordingly and displays, for every dataset, one row with the highest
    ``cell_f1`` among the remaining rows.

    Parameters
    ----------
    obj
        Change notification passed by the widget observer; unused.
    """
    clear_output(wait=True)
    display(maxbudget_a, maxbudget_b)
    display(maxruntime)

    display(min_prec)
    display(min_rec)
    display(min_f1)

    # A missing human cost is treated as zero cost; rows with a missing value
    # in any other filtered column fail the comparison and are dropped.
    within_limits = (
        (performance_results["human_cost"].fillna(0) <= maxbudget_a.value)
        & (performance_results["runtime"] <= maxruntime.value)
        & (performance_results["cell_prec"] >= min_prec.value)
        & (performance_results["cell_rec"] >= min_rec.value)
        & (performance_results["cell_f1"] >= min_f1.value)
    )
    performance_results_filtered = performance_results[within_limits]

    # Use the string alias "max": passing the builtin max to transform is
    # deprecated in recent pandas versions.
    best_f1_per_dataset = performance_results_filtered.groupby(['dataset'])['cell_f1'].transform("max")
    is_best = performance_results_filtered['cell_f1'] == best_f1_per_dataset

    # Several rows can tie for the best F1; show only the first per dataset.
    display(performance_results_filtered[is_best].drop_duplicates("dataset"))

    
# Initial render.
display_result(None)

# Re-render whenever one of the controls changes its value. Restricting the
# observer to the "value" trait avoids redundant redraws that would otherwise
# be triggered by every internal trait change of the widgets.
for control in (maxbudget_a, maxbudget_b, maxruntime, min_prec, min_rec, min_f1):
    control.observe(display_result, names="value")

# Estimation of the performance

## Choosing the regressor

In [None]:
# Every distinct (tool_name, tool_configuration) pair in the results table.
all_configs = performance_results.groupby(["tool_name", "tool_configuration"]).groups.keys()

# Cached MSE per regressor from an earlier run.
# Replace this with `results = {}` to re-test the regressors.
results = {'LR': 82.17617899805329, 'KNR': 999999999, 'RR': 70.30382126855443, 'BRR': 59.598055857113444, 'DTR': 24.10530237808723, 'SVR': 21.521600320234903, 'GBR': 17.057838379461177, 'ABR': 22.205547043776853, 'MLR': 160.0334744185594}

# Profiler settings used for the comparison. These are deliberately NOT named
# `normalize` / `pca`: those names are already bound to the sklearn normalize
# function and the fitted PCA object, and rebinding them breaks a re-run of
# the PCA cell.
use_normalization = True
pca_components = -1  # presumably -1 disables PCA inside the Profiler - TODO confirm

if not results:
    for regressor in errorAPI.Profiler.available_regressors:
        profiler = errorAPI.Profiler(regressor, use_normalization, pca_components)
        profiler.train_all_configs(all_configs, dataset_profiles, performance_results)
        print("Regressor:", regressor)
        MSE = profiler.get_MSE()
        print("MSE:", MSE)
        results[regressor] = MSE
        print()
        print("-="*10)
        print()

print(results)

In [None]:
# The regressor with the lowest recorded MSE wins (first one on ties).
best_regressor = min(results, key=lambda regressor_name: results[regressor_name])
print("The best regressor to estimate the performance is:", best_regressor)

In [None]:
# Train the final profiler with the regressor selected above.
# NOTE(review): the regressor comparison passes explicit normalisation / PCA
# arguments to errorAPI.Profiler, while this call relies on the constructor
# defaults - confirm that both use the same configuration.
profiler = errorAPI.Profiler(best_regressor)
profiler.train_all_configs(all_configs, dataset_profiles, performance_results)


## Leave-one-out results of the regressor

In [None]:
# Top 10 entries returned by the profiler's "real" ranking for the "beers" dataset
# (presumably based on the measured performance results - verify in errorAPI).
profiler.get_top_n_real("beers", 10)

In [None]:
# Top 10 entries returned by the profiler's "estimated" ranking for the same dataset,
# for comparison with the real ranking.
profiler.get_top_n_estimated("beers", 10)

## Now estimating a "new" dataset profile

In [None]:
# Construct a Dataset object for "beers" and ask the profiler for its estimated top 10.
# NOTE(review): presumably this path profiles the dataset from scratch rather than
# reusing the stored profile - confirm in errorAPI.Profiler.new_estimated_top.
new_set = Dataset("beers")
profiler.new_estimated_top(new_set, 10)

## Scoring the ranking

In [None]:
# Size N of the top-N lists (estimated and real) requested when scoring the ranking.
number_of_results = 5

In [None]:
# Score, per dataset, how well the estimated top-N ranking agrees with the real
# top-N ranking using a DCG-style measure:
#   score      = (2**rel_i      - 1) / log2(real_rank + 1)
#   best_score = (2**best_rel_i - 1) / log2(real_rank + 1)
# where rel_i is the relevance implied by the estimated ranking and best_rel_i
# the relevance a perfect estimate would give. nDCG = sum(score) / sum(best_score).

# Defined up front so the final display works even if no dataset can be scored.
ranking_df = pd.DataFrame()

for dataset_name in Dataset.list_datasets():
    print("-"*5, dataset_name, "-"*5)
    try:
        estimated_performance_top = profiler.get_top_n_estimated(dataset_name, number_of_results)
        real_performance_top = profiler.get_top_n_real(dataset_name, number_of_results)

        estimated_configs = list(estimated_performance_top.index)
        n_estimated = len(estimated_configs)
        n_real = len(real_performance_top)

        # Relevance from the estimate: the first estimated config gets 1.0,
        # the last gets 1/n; configs missing from the estimate get 0.
        estimated_relevance = {
            config_key: (n_estimated - position) / n_estimated
            for position, config_key in enumerate(estimated_configs)
        }

        ranking_results = []
        for real_rank, config_key in enumerate(real_performance_top.index, start=1):
            rel_i = estimated_relevance.get(config_key, 0)
            best_rel_i = (n_real - real_rank + 1) / n_real

            discount = math.log2(real_rank + 1)
            score = (2**rel_i - 1) / discount
            best_score = (2**best_rel_i - 1) / discount

            ranking_results.append({"config": config_key, "rel_i": rel_i, "best_rel": best_rel_i, "real_rank": real_rank, "score": score, "best_score": best_score})

        if not ranking_results:
            # No real results for this dataset: there is nothing to score.
            print("Not calculated")
            continue

        ranking_df = pd.DataFrame(ranking_results)
        dcg_rank = ranking_df["score"].sum()
        idcg_rank = ranking_df["best_score"].sum()
        ndcg_rank = dcg_rank / idcg_rank
        print("DCG:", dcg_rank)
        print("nDCG:", ndcg_rank)
    except Exception as exc:
        # Best-effort per dataset, but surface the reason instead of hiding it;
        # catching Exception (not a bare except) keeps kernel interrupts working.
        print("Not calculated:", repr(exc))

# Detailed score table of the last dataset that produced a ranking.
ranking_df