# Auto-Sklearn Output Analysis

## Overview<a id=ov>
1. [Functions](#funcs)<br>
2. [Auto-Sklearn 1.0](#ask1)<br>
    2.1 [kc1](#1_kc1)<br>
    2.2 [electricity](#2_elec)<br>

In [1]:
import sys
sys.path.append("../")

In [2]:
import os
import pickle
import itertools
from os.path import join as pjoin
import graphviz
import json

import pandas as pd
import pandas
import sklearn.metrics

from utils.helpers import get_project_root
import pickle
import numpy as np 


In [3]:
# Base locations: project root as reported by get_project_root() and the
# directory below it that holds the AutoML outputs.
ROOT = get_project_root()
OUTPUTS = pjoin(ROOT, "automl_outputs")
# Seed substituted into the "{seed}" placeholder of the path templates below.
SEED = 662873

# Per-run output directory template; fill in with
# .format(d_name=<dataset name>, seed=<seed>).
DIR_TEMPLATE = pjoin(
    OUTPUTS,
    "autosklearn1-ds_{d_name}-seed_{seed}"
)

# Template for the run-history JSON file inside a run directory.
RUN_HISTORY = pjoin(DIR_TEMPLATE, "smac3-output/run_{seed}/runhistory.json")

# Template for the pickled AutoML object inside a run directory.
AUTOML_OBJ = pjoin(DIR_TEMPLATE, "autosklearn_obj.pkl")

# Template for the directory that receives the rendered pipeline graphs.
VISUALS = pjoin(DIR_TEMPLATE, "visuals")

## 1. Functions<a id=funcs>

[back to overview](#ov)

In [4]:
def load_run_history(path):
    """Load a run-history JSON file and return the parsed content.

    Args:
        path: Location of the JSON file.

    Returns:
        The deserialized JSON document.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as infile:
        return json.load(infile)


In [5]:
def create_run_df(rh: dict) -> pandas.DataFrame:
    """Flatten the entries of ``rh["data"]`` into one row per run.

    Each entry is read as a ``[key, value]`` pair: ``key[0]`` is the
    configuration id, ``key[1]`` a JSON string holding ``task_id``,
    ``value[2]["__enum__"]`` the status name and ``value[5]`` a dict with
    additional run information.

    Args:
        rh: Parsed run history containing a ``"data"`` list.

    Returns:
        DataFrame with the columns ``id``, ``task_id``, ``status``,
        ``num_run`` and ``config_origin``.
    """
    # For these statuses the run number is recorded as None.
    statuses_without_num_run = {
        "StatusType.TIMEOUT",
        "StatusType.CRASHED",
        "StatusType.STOP",
        "StatusType.MEMOUT",
    }

    collected = {
        "id": [],
        "task_id": [],
        "status": [],
        "num_run": [],
        "config_origin": [],
    }

    for entry in rh["data"]:
        run_key, run_value = entry[0], entry[1]

        collected["id"].append(run_key[0])
        # The instance information is stored as a JSON-encoded string.
        collected["task_id"].append(json.loads(run_key[1])["task_id"])

        state = run_value[2]["__enum__"]
        collected["status"].append(state)

        if state in statuses_without_num_run:
            collected["num_run"].append(None)
        else:
            collected["num_run"].append(int(run_value[5]["num_run"]))

        # Only stopped runs get no configuration origin.
        if state == "StatusType.STOP":
            collected["config_origin"].append(None)
        else:
            collected["config_origin"].append(run_value[5]["configuration_origin"])

    run_df = pd.DataFrame()
    for column, values in collected.items():
        run_df[column] = values

    return run_df


In [6]:
# Lookup tables translating the component keys found in a configuration
# into the display names used for pipeline steps.

# Classifier choice -> display name.
# NOTE(review): scikit-learn names the class "AdaBoostClassifier"; the label
# below is kept unchanged because it appears in generated outputs.
clfs_names_map = {
    "adaboost": "AdaBoostingClassifier",
    "bernoulli_nb": "BernoulliNB",
    "decision_tree": "DecisionTreeClassifier",
    "extra_trees": "ExtraTreesClassifier",
    "gaussian_nb": "GaussianNB",
    "gradient_boosting": "GradientBoostingClassifier",
    "k_nearest_neighbors": "KNeighborsClassifier",
    "lda": "LinearDiscriminantAnalysis",
    "liblinear_svc": "LinearSVC",
    "libsvm_svc": "SVC",
    "mlp": "MLPClassifier",
    "multinomial_nb": "MultinomialNB",
    "passive_aggressive": "PassiveAggressiveClassifier",
    "qda": "QuadraticDiscriminantAnalysis",
    "random_forest": "RandomForestClassifier",
    "sgd": "SGDClassifier",
}

# Feature preprocessor choice -> display name.
feat_preproc_name_map = {
    "extra_trees_preproc_for_classification": "SelectFeatsFromExtraTrees",
    "fast_ica": "FastICA",
    "feature_agglomeration": "FeatureAgglomeration",
    "kernel_pca": "KernelPCA",
    "kitchen_sinks": "RBFSampler",
    "liblinear_svc_preprocessor": "SelectFeatsFromLinearSVC",
    "no_preprocessing": "no_preprocessing",
    "nystroem_sampler": "Nystroem",
    "pca": "PCA",
    "polynomial": "PolynomialFeatures",
    "random_trees_embedding": "RandomTreesEmbedding",
    "select_percentile_classification": "SelectPercentile",
    "select_rates_classification": "SelectRate",
}

# Categorical data preprocessing step -> display name.
cat_data_preproc_name_map = {
    "category_coalescence": "CategoryCoalescence",
    "one_hot_encoding": "OneHotEncoder",
    "encoding": "OrdinalEncoder",
}

# Text data preprocessing step -> display name.
text_data_preproc_name_map = {
    "text_feature_reduction": "TruncatedSVD",
    "tfidf_encoding": "TfidfVectorizer",
}

# Numerical data preprocessing step -> display name.
# (The variable name is spelled "mum_..." and is referenced under that name
# elsewhere in this file.)
mum_data_preproc_name_map = {
    "impute": "Imputation",
    "minmax": "MinMaxScaler",
    "normalize": "Normalizer",
    "power_transformer": "PowerTransformer",
    "quantile_transformer": "QuantileTransformer",
    "robust_scaler": "RobustScaler",
    "standardize": "StandardScaler",
}


In [7]:
def substitute_component_name(name : str, mapping_dict: dict )-> str:
    """Return the entry stored for *name* in *mapping_dict*.

    Raises:
        KeyError: If *name* is not a key of *mapping_dict*.
    """
    display_name = mapping_dict[name]
    return display_name

In [8]:
def get_classifier(conf:dict, names_map: dict = clfs_names_map) -> str:
    """Return the display name of the classifier chosen in *conf*.

    Args:
        conf: Configuration dict; must contain ``"classifier:__choice__"``.
        names_map: Mapping from classifier choice to display name.
    """
    chosen = conf["classifier:__choice__"]
    return substitute_component_name(chosen, names_map)

def get_feat_preproc(conf:dict, f_preproc_names_map: dict = feat_preproc_name_map)-> str:
    """Return the display name of the feature preprocessor chosen in *conf*.

    Args:
        conf: Configuration dict; must contain
            ``"feature_preprocessor:__choice__"``.
        f_preproc_names_map: Mapping from preprocessor choice to display name.

    Returns:
        The display name, or ``None`` when the choice is
        ``"no_preprocessing"``.
    """
    chosen = conf["feature_preprocessor:__choice__"]
    # Guard clause: "no_preprocessing" contributes no pipeline step.
    if chosen == "no_preprocessing":
        return None
    return substitute_component_name(chosen, f_preproc_names_map)


def get_cat_data_preprocs(conf:dict, cat_preproc_name_map : dict = cat_data_preproc_name_map)-> list:
    """Collect the display names of the categorical preprocessing steps in *conf*.

    Args:
        conf: Configuration dict; the categorical transformer keys are
            optional and read with ``dict.get``.
        cat_preproc_name_map: Mapping from step key to display name.

    Returns:
        List of display names: category coalescence first (if enabled),
        then the categorical encoding (if enabled).
    """
    prefix = "data_preprocessor:feature_type:categorical_transformer:"
    steps = []

    category_coalescence = conf.get(prefix + "category_coalescence:__choice__")
    # Auto-Sklearn names the disabled choice "no_coalescense" (sic); the
    # correctly spelled variant is accepted as well. Checking only the
    # correct spelling would report a coalescence step for every
    # configuration that has this key.
    if category_coalescence not in (None, "no_coalescence", "no_coalescense"):
        steps.append(substitute_component_name("category_coalescence", cat_preproc_name_map))

    encoding = conf.get(prefix + "categorical_encoding:__choice__")
    if encoding not in (None, "no_encoding"):
        steps.append(substitute_component_name(encoding, cat_preproc_name_map))

    return steps

def get_text_data_preprocs(conf:dict, text_preproc_name_map: dict = text_data_preproc_name_map)-> list:
    """Collect the display names of the text preprocessing steps in *conf*.

    Args:
        conf: Configuration dict; the text transformer keys are optional.
        text_preproc_name_map: Mapping from step key to display name.

    Returns:
        List with the text encoding (if a truthy choice is set) followed by
        the feature-reduction step (if a truthy ``n_components`` is set).

    Raises:
        KeyError: If the text encoding choice is not a key of
            *text_preproc_name_map*.
    """
    collected = []

    encoding_choice = conf.get("data_preprocessor:feature_type:text_transformer:text_encoding:__choice__")
    if encoding_choice:
        encoder_name = substitute_component_name(encoding_choice, text_preproc_name_map)
        collected.append(encoder_name)

    # The reduction step is detected through its n_components setting.
    reduction_components = conf.get("data_preprocessor:feature_type:text_transformer:text_feature_reduction:n_components")
    if reduction_components:
        reducer_name = substitute_component_name("text_feature_reduction", text_preproc_name_map)
        collected.append(reducer_name)

    return collected


def get_num_data_preprocs(conf:dict, num_preproc_name_map : dict = mum_data_preproc_name_map)-> list:
    """Collect the display names of the numerical preprocessing steps in *conf*.

    Args:
        conf: Configuration dict; the numerical transformer keys are optional.
        num_preproc_name_map: Mapping from step key to display name.

    Returns:
        List with the imputation step (if a truthy strategy is set) followed
        by the rescaling step (if a choice other than ``"none"`` is set).
    """
    collected = []

    # Any truthy imputation strategy is reported under the "impute" entry.
    if conf.get("data_preprocessor:feature_type:numerical_transformer:imputation:strategy"):
        collected.append(substitute_component_name("impute", num_preproc_name_map))

    rescaling_choice = conf.get("data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__")
    if rescaling_choice not in (None, "none"):
        collected.append(substitute_component_name(rescaling_choice, num_preproc_name_map))

    return collected




def get_pipeline(conf:dict)-> list:
    """Build the ordered list of step display names for configuration *conf*.

    Order: categorical preprocessing, numerical preprocessing, text
    preprocessing, the feature preprocessor (if any) and finally the
    classifier.
    """
    steps = []

    # Data preprocessing, in the order categorical -> numerical -> text.
    data_preproc_getters = (
        get_cat_data_preprocs,
        get_num_data_preprocs,
        get_text_data_preprocs,
    )
    for getter in data_preproc_getters:
        steps.extend(step for step in getter(conf) if step is not None)

    feature_step = get_feat_preproc(conf)
    if feature_step is not None:
        steps.append(feature_step)

    # The classifier always closes the pipeline.
    steps.append(get_classifier(conf))

    return steps

In [9]:
def get_pipeline_from_run_history(row : pandas.Series, rh : dict)-> list:
    """Return the pipeline steps of the configuration referenced by *row*.

    Args:
        row: Row providing the configuration id under ``"id"``.
        rh: Run history whose ``"configs"`` dict is keyed by the id as string.
    """
    config_key = str(row["id"])
    return get_pipeline(rh["configs"][config_key])


In [10]:
def add_component_types_cols(df: pandas.DataFrame) -> pandas.DataFrame:
    """Add per-component-type columns derived from ``df["pipeline"]``.

    For each component type (classifiers, feature, categorical, text and
    numerical preprocessors) one column lists the pipeline steps of that
    type and a matching ``n_...`` column holds their count. *df* is modified
    in place and returned.
    """
    def keep_known(row: pandas.Series, c_names: list) -> list:
        # Pipeline steps whose display name belongs to the given type.
        return [step for step in row["pipeline"] if step in c_names]

    type_columns = (
        ("clfs", clfs_names_map),
        ("feat_preproc", feat_preproc_name_map),
        ("cat_preproc", cat_data_preproc_name_map),
        ("text_preproc", text_data_preproc_name_map),
        ("num_preproc", mum_data_preproc_name_map),
    )

    for column, names_map in type_columns:
        known_names = list(names_map.values())
        df[column] = df.apply(keep_known, axis=1, args=(known_names,))
        df["n_" + column] = df[column].map(len)

    return df


def build_edges(df: pandas.DataFrame) -> pandas.DataFrame:
    """Add an ``"edges"`` column describing each pipeline as a chain of edges.

    Every pipeline becomes a list of ``[source, target]`` pairs that starts
    with ``["Input Data", <first step>]`` and then links each step to its
    successor. *df* is modified in place and returned.
    """
    def edges_builder(pipeline):
        # Indexing the first step keeps the failure on an empty pipeline.
        chain = [["Input Data", pipeline[0]]]
        chain.extend([src, dst] for src, dst in zip(pipeline, pipeline[1:]))
        return chain

    df["edges"] = df["pipeline"].map(edges_builder)

    return df

In [11]:
def get_automl_obj(path: str):
    """Unpickle and return the object stored in the file at *path*.

    NOTE: unpickling executes code from the file; only load files you trust.
    """
    with open(path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded


def get_leaderboard(automl_obj) -> pandas.DataFrame:
    """Return ``automl_obj.leaderboard()`` sorted by descending ensemble weight."""
    return automl_obj.leaderboard().sort_values("ensemble_weight", ascending=False)


def add_test_score_to_leaderboard(lb: pandas.DataFrame, test_data: pandas.DataFrame, automl_obj) -> pandas.DataFrame:
    """Add a ``"test_accuracy"`` column with each model's accuracy on *test_data*.

    Args:
        lb: Leaderboard to extend; modified in place and returned.
        test_data: Test set with the label in the ``"target"`` column.
        automl_obj: Object providing ``get_models_with_weights()``, whose
            items are unpacked as ``(weight, model)`` pairs.

    NOTE(review): the scores are assigned positionally, so this assumes
    ``get_models_with_weights()`` yields the models in the same order as the
    rows of *lb* -- confirm, in particular for tied ensemble weights.
    """
    features = test_data.drop("target", axis=1)
    labels = test_data["target"].values.ravel()

    lb["test_accuracy"] = [
        sklearn.metrics.accuracy_score(labels, model.predict(features))
        for _, model in automl_obj.get_models_with_weights()
    ]
    return lb

def get_test_data(d_name: str) -> pandas.DataFrame:
    """Read ``datasets/<d_name>/test.csv`` below ``ROOT`` into a DataFrame."""
    return pd.read_csv(pjoin(ROOT, f"datasets/{d_name}/test.csv"))




In [12]:
def visualize_pipeline(edges: list, out_name: str ,
              save_dir: str = None,
              graph_attrs: dict=None,
              view: bool = False,
              format="pdf") -> graphviz.Digraph:
    """Render a pipeline, given as a list of edges, with graphviz.

    Args:
        edges: List of ``[source, target]`` node-name pairs.
        out_name: Name passed to ``graphviz.Digraph``.
        save_dir: Directory passed to ``Digraph.render``.
        graph_attrs: Graph attributes; defaults to a left-to-right layout
            (``{"rankdir": "LR"}``).
        view: Forwarded to ``Digraph.render``.
        format: Output format assigned to the graph.

    Returns:
        The rendered ``graphviz.Digraph``.
    """
    # Previously the default was stored under a different name than the
    # parameter, so passing graph_attrs explicitly raised a NameError.
    if graph_attrs is None:
        graph_attrs = {'rankdir': 'LR'}

    dot = graphviz.Digraph(out_name, graph_attr=graph_attrs)
    dot.format = format
    dot.edge_attr.update(arrowhead='vee', arrowsize='1.4')

    # Declare every node mentioned in the edges; the input node gets a
    # cylinder shape to set it apart from the pipeline steps.
    for node in itertools.chain.from_iterable(edges):
        if "Input Data" in node:
            dot.node(node, shape="cylinder", height="1.1")
        else:
            dot.node(node, height="1.1")

    dot.edges(edges)

    dot.render(directory=save_dir, view=view)

    # Return the graph as promised by the annotation.
    return dot
    
    
    
def visualize(run_df, name):
    """Render one pipeline graph per row of *run_df* as a PDF.

    The output name is the row's run number, or ``"no_run_<id>"`` when the
    run number is missing; rows flagged as ensemble members get an
    ``"ens_"`` prefix. Graphs are written to the ``VISUALS`` directory of
    dataset *name*.

    Args:
        run_df: Frame with the columns ``edges``, ``num_run``, ``id`` and
            ``in_ensemble``.
        name: Dataset name used to resolve the output directory.
    """
    vis_dir = VISUALS.format(d_name=name, seed=SEED)

    for _, row in run_df.iterrows():
        num_run = row["num_run"]

        # Test for a missing run number up front (covers both NaN and None)
        # instead of relying on int() raising ValueError, which left the
        # name unbound or stale for other failure modes.
        if pd.isna(num_run):
            out_name = "no_run_" + str(row["id"])
        else:
            out_name = str(int(num_run))

        # Accept Python and NumPy booleans; a missing value is not a member.
        in_ensemble = row["in_ensemble"]
        if isinstance(in_ensemble, (bool, np.bool_)) and bool(in_ensemble):
            out_name = "ens_" + out_name

        visualize_pipeline(edges=row["edges"], out_name=out_name,
                           save_dir=vis_dir, format="pdf")

In [13]:
# Display names of every known component, in the order numerical, text and
# categorical preprocessors, feature preprocessors, classifiers.
all_components = list(
    itertools.chain.from_iterable(
        names_map.values()
        for names_map in (
            mum_data_preproc_name_map,
            text_data_preproc_name_map,
            cat_data_preproc_name_map,
            feat_preproc_name_map,
            clfs_names_map,
        )
    )
)

# Empty frame with one column per component; used as a template.
dummy_components = pd.DataFrame(columns=all_components)


def get_sum_components(runs: pandas.DataFrame, 
                       dummy_components: pandas.DataFrame = dummy_components) -> pandas.DataFrame:
    """Count in how many pipelines of *runs* each component occurs.

    Args:
        runs: Frame with a ``"pipeline"`` column of step-name lists.
        dummy_components: Template frame whose columns are the component
            names to count; it is copied, not modified.

    Returns:
        Frame with one row per component: the component name (from the
        reset index) and the number of pipelines containing it.
    """
    indicators = dummy_components.copy()

    # One 0/1 indicator column per component.
    for component in indicators:
        indicators[component] = runs["pipeline"].map(
            lambda steps: 1 if component in steps else 0
        )

    return indicators.sum().to_frame().reset_index(drop=False)

# 2. Auto-Sklearn 1.0 <a id=ask1>

[back to overview](#ov)

##  kc1<a id=1_kc1>
    
[back to overview](#ov)

In [14]:
def summerize_run(name):
    """Load and summarise the Auto-Sklearn outputs of dataset *name*.

    Args:
        name: Dataset name used to fill the path templates.

    Returns:
        Tuple ``(run_df, lb, run_stats, summary)``:
        run_df -- one row per run, with pipeline, component-type, edge and
        status-indicator columns plus the ``in_ensemble`` flag;
        lb -- leaderboard extended with ``test_accuracy`` and
        ``n_components``;
        run_stats -- content of ``run_stats.csv`` in the run directory;
        summary -- number of runs per status, grouped by pipeline length.
    """
    rh_path = RUN_HISTORY.format(d_name=name, seed=SEED)

    rh = load_run_history(rh_path)
    run_df = create_run_df(rh)

    run_df["pipeline"] = run_df.apply(get_pipeline_from_run_history, axis=1, args=(rh,))
    run_df["n_components"] = run_df["pipeline"].map(len)
    run_df = add_component_types_cols(run_df)
    run_df = build_edges(run_df)

    # One 0/1 indicator column per status.
    status_flags = (
        ("success", "StatusType.SUCCESS"),
        ("timeout", "StatusType.TIMEOUT"),
        ("crashed", "StatusType.CRASHED"),
        ("stop", "StatusType.STOP"),
        ("memout", "StatusType.MEMOUT"),
    )
    for column, status in status_flags:
        run_df[column] = (run_df["status"] == status).astype(int)

    # The column name "n_sucess" (sic) is kept because it is part of the
    # returned frame.
    summary = run_df.groupby("n_components").agg(
        n_sucess=("success", "sum"),
        n_timeout=("timeout", "sum"),
        n_crashed=("crashed", "sum"),
        n_stop=("stop", "sum"),
        n_memout=("memout", "sum"),
    )
    summary = summary.reset_index(drop=False)

    automl_obj = get_automl_obj(AUTOML_OBJ.format(d_name=name, seed=SEED))

    lb = get_leaderboard(automl_obj)
    lb["type"] = lb["type"].map(clfs_names_map)
    test = get_test_data(name)
    lb = add_test_score_to_leaderboard(lb, test, automl_obj)

    # The leaderboard's model_id is matched against num_run.
    lb = lb.join(run_df[["num_run", "n_components"]].set_index("num_run"), on="model_id")

    run_stats = pd.read_csv(
        pjoin(DIR_TEMPLATE.format(d_name=name, seed=SEED), "run_stats.csv")
    )

    # Ensemble membership must be looked up by num_run: the leaderboard
    # index holds model ids, which the join above equates with num_run, not
    # with the configuration id that was compared here before.
    run_df["in_ensemble"] = run_df["num_run"].isin(lb.index)

    return run_df, lb, run_stats, summary
    
    

kc1_run_df, kc1_lb, kc1_run_stats, kc1_summary = summerize_run("kc1")    

In [15]:
kc1_run_df.head()

Unnamed: 0,id,task_id,status,num_run,config_origin,pipeline,n_components,clfs,n_clfs,feat_preproc,...,n_text_preproc,num_preproc,n_num_preproc,edges,success,timeout,crashed,stop,memout,in_ensemble
0,1,badd0698-f174-11ed-b475-651cb4a03031,StatusType.SUCCESS,2.0,Initial design,"[Imputation, StandardScaler, RandomForestClass...",3,[RandomForestClassifier],1,[],...,0,"[Imputation, StandardScaler]",2,"[[Input Data, Imputation], [Imputation, Standa...",1,0,0,0,0,False
1,2,badd0698-f174-11ed-b475-651cb4a03031,StatusType.SUCCESS,3.0,Initial design,"[Imputation, PolynomialFeatures, RandomForestC...",3,[RandomForestClassifier],1,[PolynomialFeatures],...,0,[Imputation],1,"[[Input Data, Imputation], [Imputation, Polyno...",1,0,0,0,0,False
2,3,badd0698-f174-11ed-b475-651cb4a03031,StatusType.SUCCESS,4.0,Initial design,"[Imputation, PolynomialFeatures, RandomForestC...",3,[RandomForestClassifier],1,[PolynomialFeatures],...,0,[Imputation],1,"[[Input Data, Imputation], [Imputation, Polyno...",1,0,0,0,0,False
3,4,badd0698-f174-11ed-b475-651cb4a03031,StatusType.TIMEOUT,,Initial design,"[Imputation, RobustScaler, SelectPercentile, SVC]",4,[SVC],1,[SelectPercentile],...,0,"[Imputation, RobustScaler]",2,"[[Input Data, Imputation], [Imputation, Robust...",0,1,0,0,0,False
4,5,badd0698-f174-11ed-b475-651cb4a03031,StatusType.SUCCESS,6.0,Initial design,"[Imputation, StandardScaler, SGDClassifier]",3,[SGDClassifier],1,[],...,0,"[Imputation, StandardScaler]",2,"[[Input Data, Imputation], [Imputation, Standa...",1,0,0,0,0,False


In [16]:
kc1_lb

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration,test_accuracy,n_components
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
509,10,0.2,RandomForestClassifier,0.129187,2.785563,0.862559,3
17,1,0.14,ExtraTreesClassifier,0.125997,7.624503,0.862559,4
1004,19,0.08,PassiveAggressiveClassifier,0.132376,1.021175,0.872038,4
1478,16,0.04,LinearSVC,0.130781,3.914483,0.867299,4
119,25,0.04,RandomForestClassifier,0.133971,2.369914,0.862559,3
1458,3,0.04,LinearSVC,0.125997,3.256222,0.85782,4
63,26,0.04,RandomForestClassifier,0.133971,4.90098,0.867299,4
191,21,0.04,RandomForestClassifier,0.132376,4.622224,0.867299,4
1366,14,0.04,LinearSVC,0.130781,3.224513,0.867299,4
1379,23,0.02,LinearSVC,0.132376,2.568855,0.848341,4


In [17]:
kc1_run_stats

Unnamed: 0,metric,best_valid_score,num_runs,num_success,num_crash,num_timeout,num_memout,total_time,test_score,n_pipelines_in_ensemble
0,accuracy,0.874003,1491,1392,97,2,0,14688.954838,0.867299,26


In [18]:
kc1_summary

Unnamed: 0,n_components,n_sucess,n_timeout,n_crashed,n_stop,n_memout
0,2,8,1,0,0,0
1,3,205,0,1,0,0
2,4,1179,1,96,1,0


In [19]:

visualize(kc1_run_df, "kc1")    

##  electricity <a id=2_elec>
[back to overview](#ov)


In [20]:
# NOTE(review): elec_vis_dir is not referenced again in the visible cells;
# visualize() derives the same directory from the dataset name on its own.
elec_vis_dir = VISUALS.format(d_name="electricity", seed = SEED)

In [21]:
elec_run_df, elec_lb, elec_run_stats, elec_summary = summerize_run("electricity")    

In [22]:
elec_run_stats

Unnamed: 0,metric,best_valid_score,num_runs,num_success,num_crash,num_timeout,num_memout,total_time,test_score,n_pipelines_in_ensemble
0,accuracy,0.942191,236,228,0,8,0,14907.704016,0.938438,19


In [23]:
elec_summary

Unnamed: 0,n_components,n_sucess,n_timeout,n_crashed,n_stop,n_memout
0,2,2,0,0,0,0
1,3,44,1,0,0,0
2,4,182,7,0,0,0


In [24]:
visualize(elec_run_df, "electricity")    

##  splice <a id=3_splice>
[back to overview](#ov)


In [25]:
splice_run_df, splice_lb, splice_run_stats, splice_summary = summerize_run("splice")    

In [26]:
splice_run_stats

Unnamed: 0,metric,best_valid_score,num_runs,num_success,num_crash,num_timeout,num_memout,total_time,test_score,n_pipelines_in_ensemble
0,accuracy,0.969409,276,256,5,11,4,14478.243268,0.968652,11


In [27]:
splice_summary

Unnamed: 0,n_components,n_sucess,n_timeout,n_crashed,n_stop,n_memout
0,2,11,0,0,0,0
1,3,81,3,1,0,0
2,4,164,8,4,0,4


In [28]:
visualize(splice_run_df, "splice")    

##  4. APSFailure <a id=aps>
[back to overview](#ov)


In [29]:
aps_run_df, aps_lb, aps_run_stats, aps_summary = summerize_run("APSFailure")    

In [30]:
aps_run_stats

Unnamed: 0,metric,best_valid_score,num_runs,num_success,num_crash,num_timeout,num_memout,total_time,test_score,n_pipelines_in_ensemble
0,accuracy,0.994108,55,41,0,10,4,14407.813382,0.992632,11


In [31]:
visualize(aps_run_df, "APSFailure")

##  5. volkert <a id=volker>
[back to overview](#ov)


In [32]:
# NOTE(review): the volkert results are stored under the aps_* names, which
# overwrites the APSFailure results assigned to those names earlier.
aps_run_df, aps_lb, aps_run_stats, aps_summary = summerize_run("volkert")    

In [33]:
visualize(aps_run_df, "volkert")

##  6. gas-drift <a id=gas-drift>
[back to overview](#ov)


In [36]:
gas_run_df, gas_lb, gas_run_stats, gas_summary = summerize_run("gas-drift")    

In [37]:
visualize(gas_run_df, "gas-drift")