In [None]:
import itertools
import os
import pickle
import graphviz

import pandas as pd

In [None]:
# TPOT run directories analysed in this notebook, one per dataset.
# Each directory is expected to contain a "pops" sub-folder (see get_pops).
# NOTE(review): these are absolute paths on one machine — adjust them to your
# own copy of the run outputs before executing the notebook.
kc1 = "/home/hadi/PycharmProjects/Master-Thesis/automl_outputs/gpuserver/tpot/tpot-ds_kc1-seed_662873"
# aps = "/home/hadi/PycharmProjects/Master-Thesis/automl_outputs/gpuserver/tpot/tpot-ds_APSFailure-seed_662873"
gas = "/home/hadi/PycharmProjects/Master-Thesis/automl_outputs/gpuserver/tpot/tpot-ds_gas-drift-seed_662873"
electricity = "/home/hadi/PycharmProjects/Master-Thesis/automl_outputs/gpuserver/tpot/tpot-ds_electricity-seed_662873"


In [None]:
def get_pops(run_dir: str) -> list:
    """Load every pickled population stored in a TPOT run directory.

    Populations are read from ``<run_dir>/pops``; sub-directories inside that
    folder are skipped. Files are loaded in natural filename order (numeric
    chunks compare as numbers, so ``pop_2`` comes before ``pop_10``). This
    keeps the position of a population in the returned list reproducible
    across machines, unlike the arbitrary order of ``os.listdir``.

    :param run_dir: string path of a tpot run dir containing a ``pops`` folder
    :return: list with one unpickled object per file in ``<run_dir>/pops``
    :raises FileNotFoundError: if ``<run_dir>/pops`` does not exist
    """
    import re  # local import: only needed for the natural-sort key below

    def natural_key(file_name: str) -> list:
        # re.split with a capturing group alternates text / digit-run, so the
        # odd positions are always the numeric chunks and types line up when
        # two keys are compared element by element.
        chunks = re.split(r"(\d+)", file_name)
        return [int(chunk) if pos % 2 else chunk for pos, chunk in enumerate(chunks)]

    pops_dir = os.path.join(run_dir, "pops")
    pops = []

    for file_name in sorted(os.listdir(pops_dir), key=natural_key):
        file_path = os.path.join(pops_dir, file_name)
        if os.path.isdir(file_path):
            continue

        # NOTE: pickle.load can execute arbitrary code — only point this at
        # run directories you trust.
        with open(file_path, "rb") as in_file:
            pops.append(pickle.load(in_file))

    return pops

In [None]:
def get_components_only(indiv_dict: dict, hp_sep: str ="__")-> (dict, int) :
    """
    Drops the hyperparameter entries of a pipeline dictionary and keeps the components.

    An entry is treated as a hyperparameter when its value contains ``hp_sep``.
    When the same component value occurs more than once, the first occurrence
    keeps its value and every later one becomes ``"<value> <occurrence number>"``
    (e.g. ``"CombineDFs 2"``). The input dictionary is not modified.

    :param indiv_dict: dict of a single pipeline (key -> label string)
    :param hp_sep: str; marker that identifies hyperparameter labels
    :return: (dict of components only, number of component labels containing "CombineDFs")
    """
    components = {}
    occurrences = {}

    for node_key, label in indiv_dict.items():
        if hp_sep in label:
            # hyperparameter entry: leave it out of the result
            continue

        occurrences[label] = occurrences.get(label, 0) + 1
        seen_so_far = occurrences[label]
        # only repeats get a numeric suffix, the first occurrence stays untouched
        components[node_key] = label if seen_so_far == 1 else f"{label} {seen_so_far}"

    n_combine_dfs = sum(1 for label in components.values() if "CombineDFs" in label)

    return components, n_combine_dfs



In [None]:
def get_named_component_edges(components_dict: dict, all_edges: list) -> list:
    """
    Replaces the component keys in the pipeline edges with the component names.

    Edges with an endpoint that is not a key of ``components_dict`` (i.e. edges
    touching a hyperparameter entry) are dropped. Each remaining edge is
    emitted with its endpoints swapped, ``[name of edge[1], name of edge[0]]``,
    and the resulting list is returned in reverse order of ``all_edges``.

    :param components_dict: filtered dict from get_components_only()
    :param all_edges: list of tuples of pipeline edges
    :return: list of lists of named edges
    """
    known_keys = list(components_dict)

    named_edges = []
    for edge in all_edges:
        # keep only component-to-component edges
        if edge[0] in known_keys and edge[1] in known_keys:
            named_edges.append([components_dict[edge[1]], components_dict[edge[0]]])

    named_edges.reverse()
    return named_edges

In [None]:
def rename_input_matrix(component_edges: list) -> list:
    """
    Renames every input_matrix endpoint to "Input Data" or "Input Data <n>".

    An endpoint whose name contains "input_matrix" becomes "Input Data"; when
    the name ends in a number (e.g. "input_matrix 12") the whole number is
    kept, giving "Input Data 12". Other endpoints are returned unchanged.
    The input edges are not modified; new edge lists are returned.

    :param component_edges: list of lists of named edges from get_named_component_edges()
    :return: list of lists of edges with input_matrix changed to "Input Data" or "Input Data <n>"
    """

    def rename_single(name):
        if "input_matrix" not in name:
            return name

        # Take the full trailing digit run, not just the last character, so
        # copies numbered 10 and above keep their real number.
        stem = name.rstrip("0123456789")
        number = name[len(stem):]
        return f"Input Data {number}" if number else "Input Data"

    return [
        [rename_single(edge[0]), rename_single(edge[1]), *edge[2:]]
        for edge in component_edges
    ]



In [None]:
def get_raw_components(component_edges: list) -> list:
    """
    Collects the distinct names found at position 1 of the edges, in order of first appearance.

    Only the second endpoint of each edge is considered, so a name that only
    ever appears as a first endpoint is not part of the result.
    Per the original author: only valid when the pipeline has one CombineDFs.

    :param component_edges: list of lists of named edges rename_input_matrix()
    :return: list of unique component names
    """
    # dict keys keep insertion order, which de-duplicates while preserving order
    return list(dict.fromkeys(edge[1] for edge in component_edges))


In [None]:
def visualize(edges: list, out_name: str , save_dir: str = None, graph_attrs: dict=None, view: bool = False, format="pdf") -> graphviz.Digraph:
    """
    Draws the pipeline edges as a Graphviz directed graph, renders it to disk and returns it.

    Nodes whose name contains "Input Data" are drawn as cylinders; all other
    nodes use the default shape. Every node is declared once, even if it
    appears in several edges.

    :param edges: list of two-element [tail, head] edges of node names
    :param out_name: name of the graph, passed to graphviz.Digraph
    :param save_dir: directory passed to ``render``; None uses graphviz's default
    :param graph_attrs: graph-level attributes; defaults to a left-to-right layout
    :param view: whether ``render`` should open the result in a viewer
    :param format: output format of the rendered file (e.g. "pdf", "png")
    :return: the graphviz.Digraph that was rendered
    """
    # Bind the default to the parameter itself; the previous code bound a
    # different local name and crashed whenever graph_attrs was supplied.
    if graph_attrs is None:
        graph_attrs = {'rankdir': 'LR'}

    dot = graphviz.Digraph(out_name, graph_attr=graph_attrs)
    dot.format = format
    dot.edge_attr.update(arrowhead='vee', arrowsize='1.4')

    # dict.fromkeys de-duplicates the node names while keeping their order
    for node_name in dict.fromkeys(itertools.chain.from_iterable(edges)):
        if "Input Data" in node_name:
            dot.node(node_name, shape="cylinder", height="1.1")
        else:
            dot.node(node_name, height="1.1")

    dot.edges(edges)

    dot.render(directory=save_dir, view=view)

    return dot


In [None]:
def get_component_type_counts(raw_coponents: list) -> tuple:
    """
    Returns the components of a pipeline grouped by type: classifier, preprocessor, selector.

    Components are matched by name. A trailing " <number>" suffix (the form
    get_components_only() gives to repeated components, e.g. "PCA 2") is
    ignored for matching, but the returned lists contain the names exactly as
    they were passed in. Names that belong to none of the three groups
    (e.g. "CombineDFs") are left out.

    :param raw_coponents: list of pipeline components
    :return: list of classifiers, list of preprocessors, list of selectors
    """
    cls_list = (
        'GaussianNB',
        'BernoulliNB',
        'MultinomialNB',
        'DecisionTreeClassifier',
        'ExtraTreesClassifier',
        'RandomForestClassifier',
        'GradientBoostingClassifier',
        'KNeighborsClassifier',
        'LinearSVC',
        'LogisticRegression',
        'XGBClassifier',
        'SGDClassifier',
        'MLPClassifier',
    )

    preproc_list = (
        'Binarizer',
        'FastICA',
        'FeatureAgglomeration',
        'MaxAbsScaler',
        'MinMaxScaler',  # was misspelled 'MiMaxScaler', so it never matched
        'Normalizer',
        'Nystroem',
        'PCA',
        'PolynomialFeatures',
        'RBFSampler',
        'RobustScaler',
        'StandardScaler',
        'ZeroCount',
        'OneHotEncoder',
    )

    selectors_list = (
        'SelectFwe',
        'SelectPercentile',
        'VarianceThreshold',
        'RFE',
        'SelectFromModel',
    )

    def base_name(component):
        # Strip the " <n>" duplicate marker so repeated components still count.
        if isinstance(component, str):
            head, sep, tail = component.rpartition(" ")
            if sep and tail.isdigit():
                return head
        return component

    cls = [c for c in raw_coponents if base_name(c) in cls_list]
    preproc = [c for c in raw_coponents if base_name(c) in preproc_list]
    selectors = [c for c in raw_coponents if base_name(c) in selectors_list]

    return cls, preproc, selectors



In [None]:
def combinedfs_with_input_matrix_count(edges: list) -> int:
    """
    Counts the edges that connect an input-data node to a CombineDFs node.

    An edge is counted when its first element contains "Input Data" and its
    second element contains "CombineDFs".

    :param edges: list of lists of named edges from rename_input_matrix()
    :return: number of matching edges
    """
    return sum(
        1 for edge in edges
        if "Input Data" in edge[0] and "CombineDFs" in edge[1]
    )


In [None]:
def summerize_and_visualize_run(run_dir, save_graph_as="pdf"):
    """
    Builds a per-individual summary table for a TPOT run and renders every pipeline graph.

    For each saved population of the run (see get_pops) and each individual in
    it, the pipeline graph is rendered into ``<run_dir>/pops/<population index>``
    and one row of statistics is collected.

    NOTE(review): each individual is indexed as ``indv[0][2]`` (pipeline dict),
    ``indv[0][1]`` (edge list) and ``indv[1][1]`` (score) — this layout comes
    from the pickled files and should be confirmed against the code that
    wrote them.

    :param run_dir: string path of a tpot run dir
    :param save_graph_as: output format of the rendered graphs (e.g. "pdf", "png")
    :return: pandas DataFrame with one row per individual and the columns
        pop, indv, n_comp, n_combinedfs, score, pipeline, clfs, preproc,
        selectors, n_combine_df_with_input_matrix, n_preproc, n_selectors,
        n_stacking_est
    """
    pops = get_pops(run_dir)

    stats = {
        "pop": [],
        "indv": [],
        "n_comp": [],
        "n_combinedfs": [],
        "score": [],
        "pipeline": [],
        "clfs": [],
        "preproc": [],
        "selectors": [],
        "n_combine_df_with_input_matrix": [],
    }

    for ix_p, p in enumerate(pops):
        # Graphs belong to the run being summarised; the previous code always
        # wrote into the kc1 run's folder regardless of run_dir.
        save_dir = os.path.join(run_dir, "pops", str(ix_p))

        for ix_indv, indv in enumerate(p):
            components_dict, combine_dfs = get_components_only(indv[0][2])
            component_edges = get_named_component_edges(components_dict, indv[0][1])
            component_edges = rename_input_matrix(component_edges)
            raw_components = get_raw_components(component_edges)
            visualize(edges=component_edges, out_name=f"{ix_p}_{ix_indv}", save_dir=save_dir, format=save_graph_as)

            cls, preproc, selectors = get_component_type_counts(raw_components)

            stats["pop"].append(ix_p)
            stats["indv"].append(ix_indv)
            stats["n_combine_df_with_input_matrix"].append(combinedfs_with_input_matrix_count(component_edges))
            stats["n_comp"].append(len(raw_components))
            stats["n_combinedfs"].append(combine_dfs)
            stats["score"].append(indv[1][1])
            stats["pipeline"].append(raw_components)
            stats["clfs"].append(cls)
            stats["preproc"].append(preproc)
            stats["selectors"].append(selectors)

    stats_df = pd.DataFrame.from_dict(stats)

    stats_df["n_preproc"] = stats_df["preproc"].map(len)
    stats_df["n_selectors"] = stats_df["selectors"].map(len)
    # One classifier is the final estimator, the rest are counted as stacking
    # estimators; no classifier at all yields a missing value.
    stats_df["n_stacking_est"] = stats_df["clfs"].map(lambda x: len(x) - 1 if len(x) > 0 else None)

    return stats_df

In [None]:
# Summarise the kc1 run and render each individual's pipeline graph as PNG.
kc1_stats = summerize_and_visualize_run(kc1, "png")

# Individuals with at least one CombineDFs but no edge from an input-data node into a CombineDFs.
kc1_stats[(kc1_stats["n_combinedfs"] > 0) & (kc1_stats["n_combine_df_with_input_matrix"] == 0)]

In [None]:
# Shape of the subset with at least one CombineDFs and exactly one input-data -> CombineDFs edge.
kc1_stats[(kc1_stats["n_combinedfs"] > 0) & (kc1_stats["n_combine_df_with_input_matrix"] == 1)].shape

In [None]:
# Shape of the subset with at least one CombineDFs and exactly two input-data -> CombineDFs edges.
kc1_stats[(kc1_stats["n_combinedfs"] > 0) & (kc1_stats["n_combine_df_with_input_matrix"] == 2)].shape

In [None]:
# Shape of the subset with at least one CombineDFs and exactly three input-data -> CombineDFs edges.
kc1_stats[(kc1_stats["n_combinedfs"] > 0) & (kc1_stats["n_combine_df_with_input_matrix"] == 3)].shape

In [None]:
# All kc1 individuals with at least one input-data -> CombineDFs edge.
kc1_stats[kc1_stats["n_combine_df_with_input_matrix"] > 0]

In [None]:
electricity_stats[electricity_stats["n_combinedfs"] > 1]

In [None]:
gas_stats = summerize_and_visualize_run(gas, "png")
electricity_stats = summerize_and_visualize_run(electricity, "png")

In [None]:
# Descriptive statistics of the numeric columns of the gas run summary.
gas_stats.describe()

In [None]:
# Display the electricity run summary.
# NOTE(review): this shows the whole frame; consider .head() if the table is large.
electricity_stats

In [None]:
# Descriptive statistics of the numeric columns of the electricity run summary.
electricity_stats.describe()