In [2]:
#
#  This "preamble" code gets executed automatically when the main Notebook is run:
#

import os
import sys
import time
import json
import pickle
import threading

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn as skl 
from sklearn                 import metrics
from sklearn.preprocessing   import StandardScaler
from sklearn.tree            import DecisionTreeClassifier
from sklearn.tree            import DecisionTreeRegressor
from sklearn.pipeline        import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm             import SVC 


#import ipywidgets as widgets
#from IPython.display import display, HTML

#
# NOTE: Commented out until ipywidgets works on Quartz
#
# Set up the UI:
#trace_progress = widgets.IntProgress(
#    value=0, min=0, max=10, step=1,
#    description='Processing:', orientation='horizontal',
#    bar_style='') # 'success', 'info', 'warning', 'danger' or ''
#trace_summary = widgets.Output()
#trace_detail  = widgets.Output()
#trace_accordion = widgets.Accordion(children=[trace_summary, trace_detail])
#trace_accordion.set_title(0, 'Summary')
#trace_accordion.set_title(1, 'Detail')
#trace_box = widgets.Box(children=[trace_progress, trace_accordion])
#
#plot_progress = widgets.IntProgress(
#    value=0, min=0, max=10, step=1,
#    description='Processing:', orientation='horizontal',
#    bar_style='') # 'success', 'info', 'warning', 'danger' or ''
#plot_summary = widgets.Output()
#plot_detail  = widgets.Output()
#plot_accordion = widgets.Accordion(children=[plot_summary, plot_detail])
#plot_accordion.set_title(0, 'Summary')
#plot_accordion.set_title(1, 'Detail')
#plot_box = widgets.Box(children=[plot_progress, plot_accordion])
#
#tab_nest = widgets.Tab(children=[trace_box, plot_box])
#tab_nest.set_title(0, 'Trace')
#tab_nest.set_title(1, 'Plot')
#tab_nest.selected_index = None
#
#display(tab_nest)

#trace_thread = threading.Thread(
#    target=project_model_over_trace,
#    args=(data, trace_progress, trace_summary, trace_detail))

#plot_thread = threading.Thread(
#    target=plot_apollo_vs_normal,
#    args=(data,))

#trace_thread.start()
#plot_thread.start()





# Startup banner.  The ASCII art contains backslashes that are not valid
# escape sequences ("\ ", "\_"), so a raw string is used: the printed text is
# unchanged, but newer Python versions no longer warn about invalid escapes.
print(r"""
                \     _ \   _ \  |     |      _ \  
               _ \   |   | |   | |     |     |   | 
              ___ \  ___/  |   | |     |     |   | 
            _/    _\_|    \___/ _____|_____|\___/  
                                                   
           -  -  -  --  --  ---  --= --== ==*# ###>
      """)

def tree_to_data(decision_tree, feature_names=None, name_swap=None, y=None):
    """Convert a fitted scikit-learn decision tree into a nested dict.

    Args:
        decision_tree: A fitted estimator exposing ``tree_`` and ``criterion``
            (e.g. ``DecisionTreeClassifier``), or a raw
            ``sklearn.tree._tree.Tree``.
        feature_names: Optional sequence indexed by feature number.  When
            omitted, the integer feature index is used in the rule text.
        name_swap: Unused; accepted so existing callers keep working.
        y: Unused; accepted so existing callers keep working.

    Returns:
        dict describing the root node.  Leaves carry ``id``, ``criterion``,
        ``impurity``, ``samples``, ``value`` and ``class``.  Internal nodes
        carry ``id``, ``rule``, ``samples``, the node impurity stored under
        the criterion's name (e.g. ``"gini"``), and ``left`` / ``right``
        sub-dicts built recursively.
    """
    # A raw Tree has no ``classes_``; in that case the leaf "class" falls back
    # to the argmax index instead of raising AttributeError.
    class_labels = getattr(decision_tree, "classes_", None)

    def node_to_data(tree, node_id, criterion):
        # Non-string criteria (e.g. a criterion object) are reported generically.
        if not isinstance(criterion, str):
            criterion = "impurity"

        value = tree.value[node_id]
        if tree.n_outputs == 1:
            value = value[0, :]

        if tree.children_left[node_id] == skl.tree._tree.TREE_LEAF:
            winner = np.argmax(value)
            return {
                "id": node_id,
                "criterion": criterion,
                "impurity": tree.impurity[node_id],
                "samples": tree.n_node_samples[node_id],
                "value": list(value),
                "class": class_labels[winner] if class_labels is not None else winner,
            }

        feature = tree.feature[node_id]
        if feature_names is not None:
            feature = feature_names[feature]
        # Without feature_names the feature is an integer index; stringify it
        # so the "=" membership test below cannot raise TypeError.
        feature = str(feature)

        # NOTE(review): names containing "=" are presumably one-hot style
        # columns, rendered as "<name> = false" -- confirm with the consumer.
        if "=" in feature:
            rule_type = "="
            rule_value = "false"
        else:
            rule_type = "<="
            rule_value = "%.4f" % tree.threshold[node_id]

        return {
            "id": node_id,
            "rule": "%s %s %s" % (feature, rule_type, rule_value),
            criterion: tree.impurity[node_id],
            "samples": tree.n_node_samples[node_id],
        }

    def recurse(tree, node_id, criterion):
        left_child = tree.children_left[node_id]
        right_child = tree.children_right[node_id]

        node = node_to_data(tree, node_id, criterion)

        if left_child != skl.tree._tree.TREE_LEAF:
            node["left"] = recurse(tree, left_child, criterion)
            node["right"] = recurse(tree, right_child, criterion)

        return node

    # ``skl.tree._tree`` is the module this file already uses for TREE_LEAF.
    if isinstance(decision_tree, skl.tree._tree.Tree):
        return recurse(decision_tree, 0, criterion="impurity")
    return recurse(decision_tree.tree_, 0, criterion=decision_tree.criterion)

def tree_to_simple_str(decision_tree, feature_names=None, name_swap=None, y=None):
    """Build a minimal nested-dict view of a fitted decision tree.

    Despite the name, the result is a dict, not a string: each leaf is
    ``{"class": ...}`` and each internal node is ``{"rule": ...}`` plus
    ``left`` / ``right`` sub-dicts.

    Args:
        decision_tree: A fitted estimator exposing ``tree_`` (e.g.
            ``DecisionTreeClassifier``), or a raw ``sklearn.tree._tree.Tree``.
        feature_names: Optional sequence indexed by feature number.  When
            omitted, the integer feature index is used in the rule text.
        name_swap: Unused; accepted so existing callers keep working.
        y: Unused; accepted so existing callers keep working.

    Returns:
        dict describing the root node, with children nested recursively.
    """
    # ``skl.tree._tree`` is the module this file already uses for TREE_LEAF.
    if isinstance(decision_tree, skl.tree._tree.Tree):
        tree = decision_tree
    else:
        tree = decision_tree.tree_

    # A raw Tree has no ``classes_``; in that case the leaf "class" falls back
    # to the argmax index instead of raising AttributeError.
    class_labels = getattr(decision_tree, "classes_", None)

    def node_to_data(node_id):
        if tree.children_left[node_id] == skl.tree._tree.TREE_LEAF:
            value = tree.value[node_id]
            if tree.n_outputs == 1:
                value = value[0, :]
            winner = np.argmax(value)
            return {
                "class": class_labels[winner] if class_labels is not None else winner,
            }

        feature = tree.feature[node_id]
        if feature_names is not None:
            feature = feature_names[feature]
        # Without feature_names the feature is an integer index; stringify it
        # so the "=" membership test below cannot raise TypeError.
        feature = str(feature)

        # NOTE(review): names containing "=" are presumably one-hot style
        # columns, rendered as "<name> = false" -- confirm with the consumer.
        if "=" in feature:
            rule_type = "="
            rule_value = "false"
        else:
            rule_type = "<="
            rule_value = "%.4f" % tree.threshold[node_id]

        return {
            "rule": "%s %s %s" % (feature, rule_type, rule_value),
        }

    def recurse(node_id):
        left_child = tree.children_left[node_id]
        right_child = tree.children_right[node_id]

        node = node_to_data(node_id)

        if left_child != skl.tree._tree.TREE_LEAF:
            node["left"] = recurse(left_child)
            node["right"] = recurse(right_child)

        return node

    return recurse(0)


def construct_model_from_flush(data, flush_key):
    """Train one depth-2 decision tree per region from a flush table.

    Args:
        data: Mapping of loaded tables.  ``data[flush_key]`` must be a
            DataFrame with at least the columns ``region_name``,
            ``num_elements``, ``policy_index`` and ``time_avg``.
        flush_key: Key of the table in ``data`` to train from.

    Returns:
        dict mapping each region name to its fitted
        ``DecisionTreeClassifier``.

    Side effects:
        * ``data[flush_key]`` is modified in place: ``region_name`` is
          converted to a Categorical and a ``region_name_id`` column is added.
        * Table shapes, the region count and the policy list are printed.
    """
    any_region = "__ANY_REGION__"

    # Grab the table
    td = data[flush_key]

    # Make the data like the query for online learning
    # TODO

    # Filter the refined data
    td['region_name'] = pd.Categorical(td['region_name'])
    td['region_name_id'] = td['region_name'].cat.codes
    #
    name_swap = td[['region_name', 'region_name_id']]\
            .groupby(['region_name', 'region_name_id'], as_index=False, sort=True)\
            .first()

    # NOTE(review): time_avg is aggregated here but then dropped from the
    # features and never used to choose the label, so every observed
    # policy_index becomes a training target -- confirm this is intended.
    grp_td = td.groupby(by=['region_name', 'region_name_id', 'num_elements', 'policy_index'],
                            as_index=False).agg({
                                'time_avg':'min'
                            })

    region_names = td['region_name'].unique().tolist()
    unique_policies = grp_td['policy_index'].unique().tolist()

    print("    td.shape = %s" % str(td.shape))
    print("grp_td.shape = %s" % str(grp_td.shape))
    print("len(region_names) = %s" % len(region_names))
    print("\nunique_policies = %s\n" % str(unique_policies))

    drop_fields = ['region_name', 'region_name_id', 'policy_index', 'time_avg']
    feature_names = [f for f in grp_td.columns if f not in drop_fields]

    # Per-region results, keyed by region name.
    all_skl_models = {}
    all_types_rule = {}
    all_rules_json = {}
    all_least_json = {}
    all_timed_json = {}
    all_sizes_data = {}

    # When True, a single tree is trained on all regions together.
    one_big_tree = False

    print("Training...")
    for region in region_names:
        if one_big_tree:
            rd = grp_td
            region = any_region
        else:
            rd = grp_td[grp_td['region_name'] == region]

        if rd.shape[0] < 1:
            continue

        y = rd['policy_index'].astype(int)
        x = rd.drop(drop_fields, axis="columns").values.astype(float)

        clf = DecisionTreeClassifier(
                 class_weight=None, criterion='gini', max_depth=2,
                 min_samples_leaf=1, min_samples_split=2)

        # The former hold-out split was dropped: its outputs were never used,
        # the model is fitted on the full x/y regardless, and
        # train_test_split raises ValueError for a single-row region, which
        # aborted training for every region.
        model = Pipeline([('estimator', clf)])
        model.fit(x, y)

        trained_model = model.named_steps['estimator']

        all_types_rule[region] = "DecisionTree"
        all_rules_json[region] = tree_to_data(trained_model, feature_names, name_swap, y)
        all_least_json[region] = -1
        all_timed_json[region] = True
        all_sizes_data[region] = str(x.shape)
        all_skl_models[region] = trained_model

        if one_big_tree:
            break

    #
    # Now we're done building models.
    #
    # model_def is assembled for the JSON export below, which is currently
    # disabled; only the fitted estimators are returned.
    model_def = {
            "guid": 0,
            "driver": {
                "rules": all_rules_json,
                "least": all_least_json,
                "timed": all_timed_json,
                },
            "region_names": any_region if one_big_tree else list(region_names),
            "region_sizes": all_sizes_data,
            "region_types": all_types_rule,
            "features": {
                "count": len(feature_names),
                "names": feature_names,
                },
            }

    # Add in a default model (Static, OMP defaults) for any unnamed region:
    if not one_big_tree:
        model_def["region_names"].append(any_region)
        model_def["region_sizes"][any_region] = "(0, 0)"
        model_def["region_types"][any_region] = "Static"
        model_def["driver"]["rules"][any_region] = "0"
        model_def["driver"]["least"][any_region] = "-1"
        model_def["driver"]["timed"][any_region] = True

    #model_as_json = json.dumps(model_def, sort_keys=False, indent=4, ensure_ascii=True) + "\n"

    return all_skl_models


def hide_traceback(exc_tuple=None, filename=None, tb_offset=None,
                   exception_only=False, running_compiled_code=False):
    """Show only the exception type and message for the current error.

    Intended to be assigned to ``ipython.showtraceback``.  All parameters are
    accepted and ignored; the exception being handled is read from
    ``sys.exc_info()``.

    Returns:
        Whatever the shell's ``_showtraceback`` returns.
    """
    import sys
    shell = get_ipython()
    exc_type, exc_value, _ = sys.exc_info()
    # Exception-only text, i.e. without the stack frames.
    summary = shell.InteractiveTB.get_exception_only(exc_type, exc_value)
    return shell._showtraceback(exc_type, exc_value, summary)

def format_bytes(size):
    """Format a byte count using binary (1024-based) unit prefixes.

    Args:
        size: Number of bytes (int or float).

    Returns:
        str such as ``"1.500 kB"``: the scaled value with three decimals, a
        space, then one of the units B, kB, MB, GB or TB.  Values beyond the
        TB range stay in TB rather than raising.
    """
    # 2**10 = 1024
    power = 2**10
    power_labels = ('', 'k', 'M', 'G', 'T')
    n = 0
    # ``>=`` so exact multiples roll over (1024 -> "1.000 kB"); the bound on n
    # keeps the lookup below in range for very large sizes.
    while size >= power and n < len(power_labels) - 1:
        size /= power
        n += 1
    return "%3.3f %sB" % (size, power_labels[n])

def load_csv_data(data):
    """Load the four CSV tables named in ``data`` and report their sizes.

    Args:
        data: Mapping with a ``'path'`` entry (directory of the CSV files) and
            the file names under ``'apollo.tracefile'``, ``'apollo.flushfile'``,
            ``'apollo.stepsfile'`` and ``'normal.stepsfile'``.

    Returns:
        The same ``data`` mapping, with the DataFrames stored under
        ``'apollo.trace'``, ``'apollo.flush'``, ``'apollo.steps'`` and
        ``'normal.steps'``.  One line per table is printed with its in-memory
        size and source file name.
    """
    def load_and_report(table_key, csv_name):
        # Read one CSV into data[table_key] and print its deep memory usage.
        data[table_key] = pd.read_csv(data['path'] + '/' + csv_name)
        n_bytes = data[table_key].memory_usage(index=False, deep=True).sum()
        print("       data[%s]   %10s   %s" % (table_key, format_bytes(n_bytes), csv_name))

    print("Data source:\n\t%s\n" % data['path'])
    print("Loading:")
    sources = (
        ('apollo.trace', 'apollo.tracefile'),
        ('apollo.flush', 'apollo.flushfile'),
        ('apollo.steps', 'apollo.stepsfile'),
        ('normal.steps', 'normal.stepsfile'),
    )
    for table_key, file_key in sources:
        load_and_report(table_key, data[file_key])
    print("")
    return data

#
# To suppress traceback output on errors, add this to main:
#
#ipython = get_ipython()
#ipython.showtraceback = hide_traceback
#


                \     _ \   _ \  |     |      _ \  
               _ \   |   | |   | |     |     |   | 
              ___ \  ___/  |   | |     |     |   | 
            _/    _\_|    \___/ _____|_____|\___/  
                                                   
           -  -  -  --  --  ---  --= --== ==*# ###>
      
