In [None]:
import itertools
import os
import graphviz
import json

import pandas as pd

## Functions

In [None]:
def load_and_deserialize_json(run_dir: str, file_name:str)-> list:
    """
    Loads a JSON-lines file (one JSON document per line) from a run directory.

    :param run_dir: str run directory
    :param file_name: str json file name without the ".json" extension
    :return: list with one deserialized object per non-blank line, in file order
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    path = os.path.join(run_dir, f"{file_name}.json")
    # The context manager closes the handle even when a line fails to parse.
    with open(path, "r") as infile:
        # Blank lines (e.g. a trailing newline) carry no record and are skipped.
        return [json.loads(line) for line in infile if line.strip()]

In [None]:
def create_pipelines(pipelines_list: list) -> list:
    """
    Extracts the component class path of every step of every pipeline.

    :param pipelines_list: list of deserialized structure dicts; the steps are read
        from ["pipeline"]["args"]["steps"] and each step's class from step[1]["clazz"]
    :return: list of pipelines, each a list of "clazz" values in step order
    """
    return [
        [step[1]["clazz"] for step in structure["pipeline"]["args"]["steps"]]
        for structure in pipelines_list
    ]

In [None]:
# Lookup tables: component class name (last segment of the dotted "clazz" path)
# -> short display name used for graph nodes and statistics.

# Data-preprocessing components.
data_preproc = {
    "ImputationComponent": "Imputation",
    "KNNImputerComponent": "KNNImputation",
    "MaxAbsScalerComponent": "MaxAbsScaler",
    "MinMaxScalerComponent": "MinMaxScaler",
    "NormalizerComponent": "Normalizer",
    "QuantileTransformerComponent": "QuantileTransformer",
    "RobustScalerComponent": "RobustScaler",
    "StandardScalerComponent": "StandardScaler",
}

# Feature-preprocessing components.
feat_preproc = {
    "BernoulliRBM": "BernoulliRBM",
    "BinarizerComponent": "Binarizer",
    "FactorAnalysisComponent": "FactorAnalysis",
    "FastICAComponent": "FastICA",
    "FeatureAgglomerationComponent": "FeatureAgglomeration",
    "GenericUnivariateSelectComponent": "GenericUnivariateSelect",
    "KBinsDiscretizer": "KBinsDiscretizer",
    "KernelPCAComponent": "KernelPCA",
    "MissingIndicatorComponent": "MissingIndicator",
    "OneHotEncoderComponent": "OneHotEncoder",
    "OrdinalEncoderComponent": "OrdinalEncoder",
    "PCAComponent": "PCA",
    "PolynomialFeaturesComponent": "PolynomialFeatures",
    "RandomTreesEmbeddingComponent": "RandomTreesEmbedding",
    "SelectKBestComponent": "SelectKBest",
    "SelectPercentileClassification": "SelectPercentile",
    "TruncatedSVDComponent": "TruncatedSVD",
    "VarianceThresholdComponent": "VarianceThreshold",
}

# Classifiers.
clfs = {
    "AdaBoostingClassifier": "AdaBoostingClassifier",
    "BernoulliNB": "BernoulliNB",
    "DecisionTree": "DecisionTreeClassifier",
    "GradientBoostingClassifier": "GradientBoostingClassifier",
    "LibSVM_SVC": "SVC",
    "LinearDiscriminantAnalysis": "LinearDiscriminantAnalysis",
    "MultinomialNB": "MultinomialNB",
    "RandomForest": "RandomForestClassifier",
    "SGDClassifier": "SGDClassifier",
}




In [None]:
def map_component_names(pipelines: list,
                        data_preproc: dict = data_preproc,
                        feat_preproc: dict = feat_preproc,
                        clfs: dict = clfs) -> list:

    """
    Replaces each component's dotted class path with a short display name.

    The category is chosen by which of "data_preprocessing", "feature_preprocessing"
    or "classification" appears as a segment of the dotted path (checked in that
    order); the last segment is then looked up in the matching dict.

    :param pipelines: list of pipelines (each is a list of dotted class paths)
    :param data_preproc: dict for mapping data preprocessing component names
    :param feat_preproc: dict for mapping feature preprocessing component names
    :param clfs: dict for mapping classifier names
    :return: list of pipelines with mapped names
    :raises KeyError: if a component of a recognised category is missing from its dict
    """
    category_maps = (
        ("data_preprocessing", data_preproc),
        ("feature_preprocessing", feat_preproc),
        ("classification", clfs),
    )

    def component_mapper(component):
        segments = component.split(".")
        class_name = segments[-1]
        for category, names in category_maps:
            if category in segments:
                return names[class_name]
        # No known category segment: keep the bare class name so downstream code
        # always receives a string instead of an implicit None.
        return class_name

    return [[component_mapper(component) for component in pipeline]
            for pipeline in pipelines]


In [None]:
def build_edges(pipelines: list) -> list:
    """
    Builds the edge list of every pipeline.

    Each pipeline becomes a chain "Input Data" -> first component -> ... -> last
    component, expressed as [tail, head] pairs.

    :param pipelines: list of pipelines (each is a list of component names)
    :return: list with one edge list per pipeline; an empty pipeline yields []
    """

    def get_edges(pipeline):
        # An empty pipeline has no first component to attach "Input Data" to.
        if not pipeline:
            return []
        p_edges = [["Input Data", pipeline[0]]]
        # Pair every component with its successor.
        p_edges.extend([tail, head] for tail, head in zip(pipeline, pipeline[1:]))
        return p_edges

    return [get_edges(pipeline) for pipeline in pipelines]

In [None]:
def get_components_by_type(pipeline: list,
                           data_preproc: list = list(data_preproc.values()),
                           feat_preproc: list = list(feat_preproc.values()),
                           clfs: list = list(clfs.values())
                           ) -> (list, list, list):
    """
    Splits the components of a pipeline by type: classifier, data preprocessor,
    feature preprocessor. A component is kept in a group when its name is contained
    in the corresponding name list; pipeline order is preserved within each group.

    :param pipeline: list of pipeline components (mapped display names)
    :param data_preproc: list of data preprocessor names
    :param feat_preproc: list of feature preprocessor names
    :param clfs: list of classifier names
    :return: list of classifiers, list of data preprocessors, list of feature preprocessors
    """
    classifiers = [component for component in pipeline if component in clfs]
    data_preproc_components = [component for component in pipeline
                               if component in data_preproc]
    feature_preproc_components = [component for component in pipeline
                                  if component in feat_preproc]

    return classifiers, data_preproc_components, feature_preproc_components



In [None]:
def visualize(edges: list, out_name: str ,
              save_dir: str = None,
              graph_attrs: dict=None,
              view: bool = False,
              format="pdf") -> graphviz.Digraph:
    """
    Draws one pipeline as a directed graph and renders it to disk.

    :param edges: list of [tail, head] node-name pairs
    :param out_name: name given to the graphviz.Digraph
    :param save_dir: directory passed to Digraph.render
    :param graph_attrs: graph-level attributes; defaults to a left-to-right layout
    :param view: passed to Digraph.render
    :param format: output format assigned to the graph (e.g. "pdf")
    :return: the rendered graphviz.Digraph
    """
    # Bind the default to the parameter itself so caller-supplied attributes are
    # actually used instead of leaving the name passed to Digraph undefined.
    if graph_attrs is None:
        graph_attrs = {'rankdir': 'LR'}

    dot = graphviz.Digraph(out_name,
                           graph_attr=graph_attrs)
    dot.format = format
    dot.edge_attr.update(arrowhead='vee', arrowsize='1.4')

    # Every inner node appears in two edges; dict.fromkeys keeps the first-seen
    # order while declaring each node only once.
    for n in dict.fromkeys(itertools.chain(*edges)):
        if "Input Data" in n:
            dot.node(n, shape="cylinder", height="1.1")
        else:
            dot.node(n, height="1.1")

    dot.edges(edges)

    dot.render(directory=save_dir, view=view)
    return dot

In [None]:
def get_results_dict(results:list) -> dict:
    """
    Indexes result records by a "<a>-<b>" key, where a and b are the elements at
    positions 1 and 2 of each record's first entry.

    :param results: list of records; record[0] is the identifier sequence and
        record[1:] is stored as the value
    :return: dict mapping "<a>-<b>" to record[1:]; a later record with the same
        key replaces an earlier one
    """
    return {
        "-".join((str(record[0][1]), str(record[0][2]))): record[1:]
        for record in results
    }


def get_runs_status(results: dict, n_pipelines: int)-> list:
    """
    Collects the status of every run of every pipeline.

    Runs are looked up under the keys "<pipeline index>-<run index>". For each
    pipeline the run index starts at 0 and the scan stops at the first key that
    is missing or maps to an empty value.

    :param results: dict mapping "<pipeline index>-<run index>" to a sequence whose
        first element is a dict with a "status" entry
    :param n_pipelines: number of pipelines to scan (indices 0 .. n_pipelines - 1)
    :return: list with one list of status values per pipeline
    """
    all_runs = []
    for i in range(n_pipelines):
        runs_status = []
        n = 0
        run = results.get(f"{i}-{n}")
        while run:
            runs_status.append(run[0]["status"])
            n += 1
            run = results.get(f"{i}-{n}")
        all_runs.append(runs_status)

    return all_runs


def get_n_sucess_timeout_and_crashed_runs(run_Status:list) -> (list, list, list):
    """
    Counts, per pipeline, how many runs ended as "SUCCESS", "TIMEOUT" and "CRASHED".

    :param run_Status: list with one list of status values per pipeline
    :return: three lists (success counts, timeout counts, crashed counts), each
        with one entry per pipeline
    """
    def tally(statuses, wanted):
        return sum(1 for status in statuses if status == wanted)

    success, timeout, crashed = [], [], []
    for statuses in run_Status:
        success.append(tally(statuses, "SUCCESS"))
        timeout.append(tally(statuses, "TIMEOUT"))
        crashed.append(tally(statuses, "CRASHED"))

    return success, timeout, crashed



In [None]:
def summerize_and_visualize_run(run_dir, save_graph_as="pdf"):
    """
    Summarises one run directory and renders every pipeline as a graph.

    Reads "structures.json" and "results.json" from run_dir, writes one graph per
    pipeline into "<run_dir>/pipeline_vis" and prints the number of pipelines.

    :param run_dir: str run directory
    :param save_graph_as: output format of the rendered graphs
    :return: pandas DataFrame with one row per pipeline: its components, the
        components grouped by type, their counts and the number of successful,
        crashed and timed-out runs
    """
    stats = {
        "n_comp":[],
        # "default_train_score":[],
        # "best_optimized_train_score":[],
        "pipeline":[],
        "clfs": [],
        "d_preproc": [],
        "f_preproc":[],
    }

    structs = load_and_deserialize_json(run_dir, "structures")
    pipelines = create_pipelines(structs)
    pipelines = map_component_names(pipelines)
    edges = build_edges(pipelines)

    # Same output directory for every pipeline, so compute it once.
    save_dir = os.path.join(run_dir, "pipeline_vis")

    print(len(pipelines))
    for ix, p in enumerate(pipelines):
        stats["n_comp"].append(len(p))
        stats["pipeline"].append(p)

        classifiers, data_preproc_components, feature_preproc_components = get_components_by_type(p)
        stats["clfs"].append(classifiers)
        stats["d_preproc"].append(data_preproc_components)
        # Feature preprocessors go here (not the data preprocessors again).
        stats["f_preproc"].append(feature_preproc_components)

        visualize(edges=edges[ix], out_name=f"{ix}", save_dir=save_dir, format=save_graph_as)

    # Results are read from the directory passed in, not from a notebook global.
    results = load_and_deserialize_json(run_dir, "results")
    results_dict = get_results_dict(results)
    run_status = get_runs_status(results_dict, len(pipelines))
    success, timeout, crashed = get_n_sucess_timeout_and_crashed_runs(run_status)

    stats_df = pd.DataFrame.from_dict(stats)
    stats_df["n_d_preproc"] = stats_df["d_preproc"].map(len)
    stats_df["n_f_preproc"] = stats_df["f_preproc"].map(len)
    # Every classifier beyond the first counts as a stacking estimator; pipelines
    # without any classifier get None.
    stats_df["n_stacking_est"] = stats_df["clfs"].map(lambda x: len(x) - 1 if len(x) > 0 else None)

    stats_df["n_success_runs"] = success
    stats_df["n_crashed_runs"] = crashed
    stats_df["n_timeout_runs"] = timeout

    return stats_df




## APSFailure

In [None]:
# Run directory to summarise (absolute path on the author's machine).
# NOTE(review): the section header says "APSFailure" but this path names a
# "ds_electricity" run - confirm which dataset is intended.
aps = "/home/hadi/PycharmProjects/Master-Thesis/automl_outputs/gpuserver/dswizard/dswizard-ds_electricity-seed_662873"
# Build the per-pipeline summary table (also renders the pipeline graphs).
aps_states = summerize_and_visualize_run(aps)
aps_states.shape

In [None]:
# Runs per pipeline = crashed + successful + timed-out runs (adds a column in place).
aps_states["total_runs"] = aps_states["n_crashed_runs"] + aps_states["n_success_runs"] + aps_states["n_timeout_runs"]

In [None]:
# Total number of runs over all pipelines.
aps_states["total_runs"].sum()

In [None]:
# Number of crashed runs over all pipelines.
aps_states["n_crashed_runs"].sum()

In [None]:
# Number of successful runs over all pipelines.
aps_states["n_success_runs"].sum()



In [None]:
# Number of timed-out runs over all pipelines.
aps_states["n_timeout_runs"].sum()

In [None]:
# Load a run-history JSON document (absolute path on the author's machine).
# NOTE(review): this path names a "ds_volkert" run, a different directory from the
# one summarised above - confirm this is intended.
with open("/home/hadi/PycharmProjects/Master-Thesis/automl_outputs/gpuserver/dswizard/dswizard-ds_volkert-seed_662873/runhistory_123.json", "r") as infile:
    a = json.load(infile)
# Show the top-level keys of the loaded document.
a.keys()


In [None]:
# Inspect the "meta" entry of the run history.
a["meta"]

In [None]:
# Count the "configs" entries summed over every item of a["structures"].
nn = 0

for s in a["structures"]:
    nn+= len(s["configs"])

nn

In [None]:
# Number of items in a["structures"].
len(a["structures"])

In [None]:
# Keys available under a["explanations"]["structures"].
a["explanations"]["structures"].keys()

In [None]:
# "children" is iterated below as a sequence of dicts that each carry their own
# "details"; "details" is the dict stored next to it at the same level.
childreen_dicts = a["explanations"]["structures"]["children"]
details = a["explanations"]["structures"]["details"]


In [None]:
# Display the top-level "details" dict.
details

In [None]:

def _collect_failure_messages(details: dict) -> list:
    """
    Collects the "failure_message" of consecutive detail entries.

    Keys "00:00", "00:01", ... are probed first, then "00:10", "00:11", ...; each
    of the two scans stops at the first key that is missing or maps to a falsy value.

    :param details: dict of detail entries keyed by strings such as "00:00"
    :return: list of failure messages in probing order
    """
    messages = []
    for prefix in ("00:0", "00:1"):
        n = 0
        entry = details.get(f"{prefix}{n}")
        while entry:
            messages.append(entry["failure_message"])
            n += 1
            entry = details.get(f"{prefix}{n}")
    return messages


f_msgs = []

# One message list per child ...
for child in childreen_dicts:
    child_details = child["details"]
    print(child_details.get("00:00"))
    f_msgs.append(_collect_failure_messages(child_details))

# ... followed by the messages of the top-level details. `msgs` stays a top-level
# name because a later cell displays it.
msgs = _collect_failure_messages(details)
f_msgs.append(msgs)





In [None]:
# Failure messages collected from the top-level "details" dict.
msgs

In [None]:
# All collected message lists: one per child, then the top-level one.
f_msgs

In [None]:
# Total number of collected failure messages (all lists flattened).
len(list(itertools.chain(*f_msgs)))



In [None]:
# Distinct failure messages that occur.
set(list(itertools.chain(*f_msgs)))


In [None]:
# Number of entries whose failure message is None.
len(list(filter(lambda x: x is None, list(itertools.chain(*f_msgs)))))

In [None]:
# Number of entries whose failure message is "Ineffective".
len(list(filter(lambda x: x == "Ineffective", list(itertools.chain(*f_msgs)))))


In [None]:
# Number of entries whose failure message is "Unvisited".
len(list(filter(lambda x: x == "Unvisited", list(itertools.chain(*f_msgs)))))


In [None]:
# Number of entries whose failure message is "Missing MF".
len(list(filter(lambda x: x == "Missing MF", list(itertools.chain(*f_msgs)))))


In [None]:
# Evaluates to 0, i.e. the three counts add up to 308.
# NOTE(review): presumably these are hand-copied outputs of the counting cells
# above checked against the total - confirm which counts they are; they will go
# stale when the data changes.
192 + 99  + 17 - 308

## Electricity