In [62]:
import sqlite3
import ast
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import numpy as np

In [26]:
# Notebook configuration: values a reader might tune before re-running.
db_path = "../results.db"  # SQLite database file passed to sqlite3.connect
font_size = 14  # not read in the visible cells; presumably a plot font size, verify against later cells
font_size_large = 16  # not read in the visible cells; presumably for titles/headings, verify
METRIC = "r2_score"  # column of the `trials` table that the load query selects as `metric`
optimization = "max"  # "max": track the running maximum of the metric; any other value: running minimum

In [17]:
# Load every trial joined with the result row of its search, plus the dataset table.
# METRIC is interpolated as a column identifier (identifiers cannot be bound as SQL
# parameters); it comes from the config cell, not from user input.
trials_query = (
    "SELECT trials.model as model, "
    f"trials.{METRIC} as metric, "
    "results.data_config_hash as data_config_hash, "
    "results.data_id as data_id, "
    "fold, "
    "trials.timestamp as timestamp "
    "FROM trials inner join results "
    "on results.search_id = trials.search_id"
)

conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql(trials_query, conn)
    df_datasets = pd.read_sql("SELECT * FROM datasets", conn)
finally:
    # Close the connection even when one of the queries fails, so the database
    # handle is not leaked into the kernel session.
    conn.close()
df.head(10)

Unnamed: 0,model,metric,data_config_hash,data_id,fold,timestamp
0,AdaBoost,-37.583585,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,30,9,1762442000.0
1,AdaBoost,-0.162924,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,30,9,1762442000.0
2,AdaBoost,-218.126485,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,30,9,1762442000.0
3,AdaBoost,-0.082301,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,30,9,1762442000.0
4,AdaBoost,-252.362393,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,30,9,1762442000.0


In [75]:
# Convergence analysis.
# For every search run, i.e. every (model, data_id, fold, data_config_hash) group,
# order the trials by time, track the best metric seen so far, and count how many
# trials passed before the run got close to its own final best value. The per-run
# counts are then summarised per model.
CONVERGENCE_THRESHOLD = 0.975  # share of the run's best metric that counts as "converged"
FINAL_THRESHOLD = 0.99  # past this point, decisions between 2 configs come down to luck
thr = 0.90  # not read in this cell; kept because it is part of the notebook namespace


def _reached_share_of_best(best_so_far, best, share):
    """Return a boolean mask of trials whose running best is within `share` of `best`.

    Parameters
    ----------
    best_so_far : pd.Series
        Running maximum of the metric, in trial order.
    best : float
        Final (overall) maximum of the metric for the run.
    share : float
        Required fraction of `best`, e.g. 0.975.

    Notes
    -----
    For a positive `best` this is the plain ratio test `best_so_far / best >= share`.
    The ratio is meaningless when `best` is zero or negative: a negative best makes
    every *worse* value produce a ratio above 1, and a zero best divides by zero.
    In those cases the same relative tolerance is applied as an absolute gap,
    `best - (1 - share) * |best|`, so worse values are never counted as converged.
    """
    if best > 0:
        return best_so_far / best >= share
    return best_so_far >= best - (1 - share) * abs(best)


groups = df.groupby(by=["model", "data_id", "fold", "data_config_hash"])
new_groups = []
convergence_at = defaultdict(list)
failure_to_converge = defaultdict(list)
finished_at = defaultdict(list)

for (model, data_id, fold, data_config_hash), group_df in groups:
    # `.assign` builds a new frame instead of writing into the groupby slice.
    # A stable sort keeps trials with identical timestamps in their loaded order,
    # which matters because the running best depends on trial order.
    # The second reset_index exposes the trial position as an "index" column.
    group_df = (
        group_df
        .assign(delta_time=lambda g: g["timestamp"] - g["timestamp"].min())
        .sort_values(by="delta_time", ascending=True, kind="mergesort")
        .reset_index(drop=True)
        .reset_index(drop=False)
    )
    n_trials = len(group_df)

    if optimization == "max":
        maximum = group_df["metric"].max()
        group_df["best_so_far"] = group_df["metric"].cummax()
        # Kept as the plain ratio for any later cell that reads this column;
        # it is only interpretable when `maximum` is positive.
        group_df["best_so_far_perc"] = group_df["best_so_far"] / maximum

        converged_mask = _reached_share_of_best(
            group_df["best_so_far"], maximum, CONVERGENCE_THRESHOLD
        )
        finished_mask = _reached_share_of_best(
            group_df["best_so_far"], maximum, FINAL_THRESHOLD
        )
        # The running best never decreases, so each mask is a suffix of the run;
        # the number of trials before that suffix is the point where the threshold was met.
        convergence_point = n_trials - int(converged_mask.sum())
        final_performance_point = n_trials - int(finished_mask.sum())
        # A run that needs more than half of its trials to converge counts as a failure.
        not_converged = convergence_point > n_trials / 2

        convergence_at[model].append(convergence_point)
        failure_to_converge[model].append(not_converged)
        finished_at[model].append(final_performance_point)
    else:
        # Minimisation: only the running best and its absolute gap to the final
        # minimum are recorded; no convergence statistics are collected here.
        minimum = group_df["metric"].min()
        group_df["best_so_far"] = group_df["metric"].cummin()
        group_df["best_so_far_perc"] = group_df["best_so_far"] - minimum

    new_groups.append(group_df)
df_ = pd.concat(new_groups, ignore_index=True)

result_columns = [
    "Model",
    "Finished After (Avg)",
    "Finished After (Med)",
    "Finished After (P90)",
    "Failed to Converge",
]
results = []
for model in convergence_at:
    results.append({
        "Model": model,
        "Finished After (Avg)": float(np.mean(finished_at[model])),
        "Finished After (Med)": float(np.median(finished_at[model])),
        # Linear interpolation is np.percentile's default; the old `interpolation=`
        # keyword is deprecated, so it is simply omitted.
        "Finished After (P90)": float(np.percentile(finished_at[model], 90)),
        "Failed to Converge": float(np.mean(failure_to_converge[model])),
    })
# Explicit columns keep the sort below valid even when no statistics were
# collected (e.g. when `optimization` is not "max").
results = pd.DataFrame(results, columns=result_columns)
results = results.sort_values(by=["Failed to Converge", "Finished After (Avg)"])
results

Unnamed: 0,Model,Finished After (Avg),Finished After (Med),Finished After (P90),Failed to Converge
5,Dummy,0.0,0.0,0.0,0.0
12,LARS,1.088235,0.0,4.0,0.0
17,PartialLeastSquares,8.470588,1.0,34.4,0.0
8,GaussianProcess,17.645161,4.0,48.0,0.0
21,RidgeRegressor,18.735294,4.0,55.9,0.0
19,PolynomialRegression,27.666667,25.0,51.0,0.0
14,LassoLars,47.1875,32.0,101.0,0.0
23,SupportVectorRegression,53.0,2.0,172.6,0.0
7,ExtraTrees,58.612903,11.0,198.0,0.0
2,BayesianRidge,14.428571,0.0,2.6,0.028571


In [None]:
# Data I need for the plot:
# - ID of the fold (i.e. was it the first, second, third fold, ...) -> to get the median curve
# - Model name, for the facets
# - Time difference since the start (derived from the timestamp), for the x-axis
# - All the metric values, for the y-axis

In [None]:
# Peek at the raw `trials` table to see which columns are available.
# The connection opened for the initial load has already been closed, so this cell
# opens its own short-lived one. The result gets its own name so the joined `df`
# used by the analysis cells is not overwritten.
conn = sqlite3.connect(db_path)
try:
    trials_preview = pd.read_sql("SELECT * FROM trials LIMIT 5", conn)
finally:
    conn.close()
trials_preview

In [18]:
# Show the current `df`; pandas elides the middle rows of a long frame in the rendered output.
df

Unnamed: 0,model,metric,data_config_hash,data_id,fold,timestamp
0,AdaBoost,-37.583585,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,30,9,1.762442e+09
1,AdaBoost,-0.162924,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,30,9,1.762442e+09
2,AdaBoost,-218.126485,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,30,9,1.762442e+09
3,AdaBoost,-0.082301,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,30,9,1.762442e+09
4,AdaBoost,-252.362393,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,30,9,1.762442e+09
...,...,...,...,...,...,...
383204,ExtraTrees,0.998421,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,29,9,1.763261e+09
383205,ExtraTrees,0.997769,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,29,9,1.763261e+09
383206,ExtraTrees,0.997204,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,29,9,1.763261e+09
383207,ExtraTrees,0.998283,2a5bb20661bcb64796e2f4f2eb847266e7feb6cedf57ab...,29,9,1.763261e+09
