In [1]:
import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from modt.modt import MoDT
from modt._initialization import *
from modt.visualization import *
from modt.utility import *

import pickle
from timeit import default_timer as timer

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import normalize
from sklearn.model_selection import RepeatedKFold

In [2]:
datasets = [
    ["breast_cancer_input.np","breast_cancer_target.np"],
    ["iris_input.pd","iris_target.pd"],
    ["steel_input.pd","steel_target.pd"],
    ["abalone_input.pd","abalone_target.pd"], 
    ["contraceptive_input.pd","contraceptive_target.pd"], 
    ["cars_input.pd","cars_target.pd"], 
    ["students_input.pd","students_target.pd"],
    ["adult_input.pd","adult_target.pd"], # Large
    ["bank_input.pd","bank_target.pd"], # Large
    ["hrss_input.pd","hrss_target.pd"], # Large
    ["occupancy_input.pd","occupancy_target.pd"], # Easy
    ["pdm6_input.pd","pdm6_target.pd"], # Easy
    ["banknote_input.pd","banknote_target.pd"], # Easy
    #["sensorless_input.pd","sensorless_target.pd"], # Very Large dataset
]

In [3]:
parameters = {
    "X": None,
    "y": None,
    "n_experts": 3,
    "iterations": 100,
    "max_depth": 1,
    "init_learning_rate": 50,
    "learning_rate_decay": 0.995,
    "initialization_method": "to be overwritten",
    "feature_names": None,
    "class_names": None,
    "use_2_dim_gate_based_on": "to be overwritten",
    "use_2_dim_clustering": False,
    "black_box_algorithm": None,
    }
parameters_fit = {
    "optimization_method": "least_squares_linear_regression",
    "early_stopping": False,
    "use_posterior": False,
    }

In [4]:
gate_reduction_methods = [
    "feature_importance_lr_max",
    None
]

In [5]:
initialization_methods = [
    Random_init(seed=None),
    Kmeans_init(),
    KDTmeans_init(alpha=2.5, beta=0.25, gamma=0.3),
    BGM_init(n_components=parameters["n_experts"],
             mean_precision_prior=0.35,
             weight_concentration_prior_type="dirichlet_process",
             weight_concentration_prior=0.5,
             weight_cutoff=0.0)    
]

In [6]:
start = timer()
runs = 1
rows = []
for dataset in datasets:
    print("Starting",dataset[0],"...")
    data_input = pickle.load(open("../datasets/" + dataset[0], "rb"))
    data_target = pickle.load(open("../datasets/" + dataset[1], "rb"))
    
    use_dataframe = False
    if isinstance(data_input, pd.core.frame.DataFrame):
        use_dataframe = True
        
    dict_results = {
        "dataset" : dataset[0],
        "n_features" : data_input.shape[1]
    } 
    
    for method in gate_reduction_methods:
        for init_method in initialization_methods:
            init_method_name = init_method.__class__.__name__
            
            print("Starting",dataset[0],method,init_method_name,"...")
            parameters["use_2_dim_gate_based_on"] = method
            if parameters["use_2_dim_gate_based_on"] is not None:
                parameters["use_2_dim_clustering"] = True
            else:
                parameters["use_2_dim_clustering"] = False
            parameters["initialization_method"] = init_method

            train_accuracies = []
            val_accuracies = []
            train_accuracies_i = [[] for i in range(parameters["iterations"])]
            
            rkf = RepeatedKFold(n_splits=4, n_repeats=runs)
            for train_idx, val_idx in rkf.split(data_input):
                if use_dataframe:
                    X_temp = data_input.iloc[train_idx]
                    y_temp = data_target.iloc[train_idx]
                    X_temp.reset_index(inplace=True, drop=True)
                    y_temp.reset_index(inplace=True, drop=True)
                else:
                    X_temp = data_input[train_idx]
                    y_temp = data_target[train_idx]

                parameters["X"] = X_temp
                parameters["y"] = y_temp

                modt = MoDT(**parameters)
                modt.fit(**parameters_fit)
                train_accuracies.append(modt.score_internal_disjoint())

                if use_dataframe:
                    X_temp = data_input.iloc[val_idx]
                    y_temp = data_target.iloc[val_idx]
                    X_temp.reset_index(inplace=True, drop=True)
                    y_temp.reset_index(inplace=True, drop=True)
                else:
                    X_temp = data_input[val_idx]
                    y_temp = data_target[val_idx]
                val_accuracies.append(modt.score(X_temp, y_temp))
                for i in range(parameters["iterations"]):
                    train_accuracies_i[i].append(modt.all_accuracies[i])

            train_accuracy = np.mean(train_accuracies)
            val_accuracy = np.mean(val_accuracies)
            train_std = np.std(train_accuracies)
            val_std = np.std(val_accuracies)
            dict_results[str(method) + "_" + str(init_method_name) + "_train"] = train_accuracy
            dict_results[str(method) + "_" + str(init_method_name) + "_test"] = val_accuracy
            dict_results[str(method) + "_" + str(init_method_name) + "_train_std"] = train_std
            dict_results[str(method) + "_" + str(init_method_name) + "_test_std"] = val_std                        

            for i in range(parameters["iterations"]):
                train_i_acc = np.mean(train_accuracies_i[i])
                dict_results[str(method) + "_" + str(init_method_name) + "_train_i_" + str(i)] = modt.all_accuracies[i]
        
    rows.append(dict_results)
    
print("Duration", timer() - start)
df_performance = pd.DataFrame(rows)

Starting breast_cancer_input.np ...
Starting breast_cancer_input.np feature_importance_lr_max Random_init ...
Starting breast_cancer_input.np feature_importance_lr_max Kmeans_init ...
Starting breast_cancer_input.np feature_importance_lr_max KDTmeans_init ...
Starting breast_cancer_input.np feature_importance_lr_max BGM_init ...
Starting breast_cancer_input.np None Random_init ...
Starting breast_cancer_input.np None Kmeans_init ...
Starting breast_cancer_input.np None KDTmeans_init ...
Starting breast_cancer_input.np None BGM_init ...
Starting iris_input.pd ...
Starting iris_input.pd feature_importance_lr_max Random_init ...
Starting iris_input.pd feature_importance_lr_max Kmeans_init ...
Starting iris_input.pd feature_importance_lr_max KDTmeans_init ...
Starting iris_input.pd feature_importance_lr_max BGM_init ...
Separation unsuccessful. Gate initialized randomly.
Starting iris_input.pd None Random_init ...
Starting iris_input.pd None Kmeans_init ...
Starting iris_input.pd None KDTm

  kmeans = KMeans(n_clusters=self_modt.n_experts).fit(X)
  kmeans = KMeans(n_clusters=self_modt.n_experts).fit(X)


Separation unsuccessful. Gate initialized randomly.


  kmeans = KMeans(n_clusters=self_modt.n_experts).fit(X)


Separation unsuccessful. Gate initialized randomly.


  kmeans = KMeans(n_clusters=self_modt.n_experts).fit(X)


Separation unsuccessful. Gate initialized randomly.
Starting cars_input.pd feature_importance_lr_max KDTmeans_init ...


  kmeans = KMeans(n_clusters=self_modt.n_experts).fit(X)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rco

Separation unsuccessful. Gate initialized randomly.


  kmeans = KMeans(n_clusters=self_modt.n_experts).fit(X)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rco

Separation unsuccessful. Gate initialized randomly.


  kmeans = KMeans(n_clusters=self_modt.n_experts).fit(X)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rco

Separation unsuccessful. Gate initialized randomly.


  kmeans = KMeans(n_clusters=self_modt.n_experts).fit(X)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  ret, rco

Separation unsuccessful. Gate initialized randomly.
Starting cars_input.pd feature_importance_lr_max BGM_init ...
Separation unsuccessful. Gate initialized randomly.
Separation unsuccessful. Gate initialized randomly.
Separation unsuccessful. Gate initialized randomly.
Separation unsuccessful. Gate initialized randomly.
Starting cars_input.pd None Random_init ...
Starting cars_input.pd None Kmeans_init ...
Starting cars_input.pd None KDTmeans_init ...
Starting cars_input.pd None BGM_init ...
Starting students_input.pd ...
Starting students_input.pd feature_importance_lr_max Random_init ...
Starting students_input.pd feature_importance_lr_max Kmeans_init ...
Starting students_input.pd feature_importance_lr_max KDTmeans_init ...
Starting students_input.pd feature_importance_lr_max BGM_init ...
Separation unsuccessful. Gate initialized randomly.
Separation unsuccessful. Gate initialized randomly.
Separation unsuccessful. Gate initialized randomly.
Separation unsuccessful. Gate initialized



Starting pdm6_input.pd None Random_init ...
Starting pdm6_input.pd None Kmeans_init ...
Starting pdm6_input.pd None KDTmeans_init ...
Starting pdm6_input.pd None BGM_init ...
Starting banknote_input.pd ...
Starting banknote_input.pd feature_importance_lr_max Random_init ...
Starting banknote_input.pd feature_importance_lr_max Kmeans_init ...
Starting banknote_input.pd feature_importance_lr_max KDTmeans_init ...
Starting banknote_input.pd feature_importance_lr_max BGM_init ...
Starting banknote_input.pd None Random_init ...
Starting banknote_input.pd None Kmeans_init ...
Starting banknote_input.pd None KDTmeans_init ...
Starting banknote_input.pd None BGM_init ...
Duration 2379.5506726


In [7]:
df_performance

Unnamed: 0,dataset,n_features,feature_importance_lr_max_Random_init_train,feature_importance_lr_max_Random_init_test,feature_importance_lr_max_Random_init_train_std,feature_importance_lr_max_Random_init_test_std,feature_importance_lr_max_Random_init_train_i_0,feature_importance_lr_max_Random_init_train_i_1,feature_importance_lr_max_Random_init_train_i_2,feature_importance_lr_max_Random_init_train_i_3,...,None_BGM_init_train_i_90,None_BGM_init_train_i_91,None_BGM_init_train_i_92,None_BGM_init_train_i_93,None_BGM_init_train_i_94,None_BGM_init_train_i_95,None_BGM_init_train_i_96,None_BGM_init_train_i_97,None_BGM_init_train_i_98,None_BGM_init_train_i_99
0,breast_cancer_input.np,10,0.957825,0.933283,0.004939,0.022229,0.915691,0.920375,0.920375,0.920375,...,0.957845,0.957845,0.957845,0.957845,0.957845,0.957845,0.957845,0.957845,0.957845,0.957845
1,iris_input.pd,4,0.977738,0.9399,0.013426,0.06102,0.955752,0.955752,0.955752,0.955752,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,steel_input.pd,27,0.57393,0.553308,0.024238,0.027048,0.497253,0.528159,0.532967,0.534341,...,0.635989,0.635989,0.635989,0.635989,0.635989,0.635989,0.635989,0.635989,0.635989,0.635989
3,abalone_input.pd,8,0.736174,0.729236,0.004882,0.014387,0.667092,0.667092,0.667092,0.668369,...,0.747526,0.747526,0.747526,0.747526,0.747526,0.747846,0.747526,0.747526,0.747526,0.747846
4,contraceptive_input.pd,9,0.538818,0.533583,0.033227,0.032696,0.429864,0.497738,0.493213,0.498643,...,0.424434,0.424434,0.424434,0.424434,0.424434,0.424434,0.424434,0.424434,0.424434,0.424434
5,cars_input.pd,6,0.777778,0.777778,0.001444,0.004331,0.698302,0.698302,0.777778,0.777778,...,0.861111,0.858025,0.857253,0.853395,0.86034,0.854167,0.858025,0.857253,0.850309,0.858796
6,students_input.pd,11,0.515993,0.48035,0.024939,0.053674,0.466,0.51,0.51,0.51,...,0.556,0.562,0.55,0.56,0.566,0.558,0.554,0.554,0.562,0.562
7,adult_input.pd,14,0.820249,0.820502,0.016055,0.010974,0.761957,0.799045,0.810627,0.816418,...,0.762178,0.762178,0.762178,0.762178,0.762178,0.762178,0.762178,0.762178,0.762178,0.762178
8,bank_input.pd,20,0.894986,0.894411,0.002009,0.002121,0.886731,0.886731,0.886731,0.886731,...,0.902237,0.902269,0.902237,0.902237,0.902237,0.902269,0.902237,0.902237,0.902237,0.902269
9,hrss_input.pd,18,0.763586,0.763756,0.001017,0.002734,0.760629,0.760629,0.760629,0.760629,...,0.76514,0.76514,0.76514,0.76514,0.76514,0.76514,0.76514,0.76514,0.76514,0.76514


In [8]:
pickle.dump(df_performance, open("dataframes/df_initialization_methods.pd", "wb"))