In [1]:
import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from modt.modt import MoDT
from modt._initialization import *
from modt.visualization import *
from modt.utility import *

import pickle
from timeit import default_timer as timer

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import normalize
from sklearn.model_selection import RepeatedKFold

In [2]:
# Initial keyword arguments for the MoDT constructor (2 experts, 1 iteration,
# constant learning rate). NOTE: `parameters` is rebound by a later cell
# before the experiment loop reads it.
parameters = dict(
    X=None,
    y=None,
    n_experts=2,
    iterations=1,
    max_depth=2,
    init_learning_rate=100,
    learning_rate_decay=1,
    initialization_method=None,
    feature_names=None,
    class_names=None,
    use_2_dim_gate_based_on=None,
    use_2_dim_clustering=False,
    black_box_algorithm=None,
)

In [3]:
# [input file, target file] name pairs of the pickled benchmark datasets.
# The experiment loop prefixes each name with "../datasets/".
datasets = (
    # Entries carrying no size/difficulty remark.
    [
        ["breast_cancer_input.np", "breast_cancer_target.np"],
        ["iris_input.pd", "iris_target.pd"],
        ["steel_input.pd", "steel_target.pd"],
        ["abalone_input.pd", "abalone_target.pd"],
        ["contraceptive_input.pd", "contraceptive_target.pd"],
        ["cars_input.pd", "cars_target.pd"],
        ["students_input.pd", "students_target.pd"],
    ]
    # Marked "Large".
    + [
        ["adult_input.pd", "adult_target.pd"],
        ["bank_input.pd", "bank_target.pd"],
        ["hrss_input.pd", "hrss_target.pd"],
    ]
    # Marked "Easy".
    + [
        ["occupancy_input.pd", "occupancy_target.pd"],
        ["pdm6_input.pd", "pdm6_target.pd"],
        ["banknote_input.pd", "banknote_target.pd"],
    ]
    # Disabled: very large dataset.
    # + [["sensorless_input.pd", "sensorless_target.pd"]]
)

In [4]:
# Values tried for MoDT's `use_2_dim_gate_based_on` argument, in evaluation order.
gate_reduction_methods = (
    # Feature-importance based variants.
    [
        "feature_importance",
        "feature_importance_lda",
        "feature_importance_lda_max",
        "feature_importance_lr",
        "feature_importance_lr_max",
        "feature_importance_xgb",
        "feature_importance_pca_loadings",
    ]
    # PCA, and finally None (no method name passed).
    + ["PCA", None]
)

In [5]:
# Keyword arguments forwarded to `MoDT(**parameters)` by the experiment loop.
# The loop overwrites "X", "y", "use_2_dim_gate_based_on" and
# "initialization_method" on every fold, so the values given here for those
# keys are placeholders.
parameters = dict(
    X=None,
    y=None,
    n_experts=3,
    iterations=100,
    max_depth=2,
    init_learning_rate=100,
    learning_rate_decay=0.995,
    initialization_method="random",
    feature_names=None,
    class_names=None,
    use_2_dim_gate_based_on=None,
    use_2_dim_clustering=False,
    black_box_algorithm=None,
)

# Keyword arguments forwarded to `modt.fit(**parameters_fit)`.
parameters_fit = dict(
    optimization_method="least_squares_linear_regression",
    add_noise=False,
    use_posterior=False,
)

In [6]:
# Benchmark every gate reduction method on every dataset with repeated 5-fold
# cross-validation, and collect mean/std train and validation accuracies into
# `df_performance` (one row per dataset).


def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _select_rows(data, indices, as_dataframe):
    """Return the rows of `data` at the positional `indices`.

    When `as_dataframe` is true, `data` is sliced with `.iloc` and the result
    gets a fresh 0..n-1 index (non-inplace, so no slice is mutated).
    Otherwise `data` is indexed directly with `data[indices]`.
    """
    if as_dataframe:
        return data.iloc[indices].reset_index(drop=True)
    return data[indices]


start = timer()
runs = 5
# Seed for the cross-validation splitter. None keeps the folds unseeded
# (a fresh random split per method and per run); set an int to make the folds
# reproducible and identical across all methods.
cv_random_state = None
rows = []
for dataset in datasets:
    print("Starting",dataset[0],"...")
    data_input = _load_pickle("../datasets/" + dataset[0])
    data_target = _load_pickle("../datasets/" + dataset[1])

    # The target is sliced the same way as the input (positional .iloc for pandas).
    use_dataframe = isinstance(data_input, pd.DataFrame)

    dimensionality_reduction = gate_reduction_methods
    dict_results = {
        "dataset" : dataset[0],
        "n_features" : data_input.shape[1]
    }

    for method in dimensionality_reduction:
        print("Starting",method,"...")
        parameters["use_2_dim_gate_based_on"] = method

        train_accuracies = []
        val_accuracies = []
        rkf = RepeatedKFold(n_splits=5, n_repeats=runs, random_state=cv_random_state)
        for train_idx, val_idx in rkf.split(data_input):
            X_train = _select_rows(data_input, train_idx, use_dataframe)
            y_train = _select_rows(data_target, train_idx, use_dataframe)

            parameters["X"] = X_train
            parameters["y"] = y_train
            # A new unseeded initializer per fold; this replaces the
            # "initialization_method" value from the configuration cell.
            parameters["initialization_method"] = Random_init(seed=None)
            modt = MoDT(**parameters)
            modt.fit(**parameters_fit)
            train_accuracies.append(modt.score_internal_disjoint())

            X_val = _select_rows(data_input, val_idx, use_dataframe)
            y_val = _select_rows(data_target, val_idx, use_dataframe)
            val_accuracies.append(modt.score(X_val, y_val))

        # Aggregate over all n_splits * n_repeats folds of this method.
        # str(method) turns the None entry into the column prefix "None".
        prefix = str(method)
        dict_results[prefix + "_train"] = np.mean(train_accuracies)
        dict_results[prefix + "_test"] = np.mean(val_accuracies)
        dict_results[prefix + "_train_std"] = np.std(train_accuracies)
        dict_results[prefix + "_test_std"] = np.std(val_accuracies)

    rows.append(dict_results)

print("Duration", timer() - start)
df_performance = pd.DataFrame(rows)

Starting breast_cancer_input.np ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...
































































































Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting iris_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...
































































































Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting steel_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...
































































































Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting abalone_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...
































































































Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting contraceptive_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...
































































































Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting cars_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...
































































































Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting students_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...
































































































Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting adult_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...




































































































Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting bank_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...




































































































Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting hrss_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...




































































































Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting occupancy_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...
































































































Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting pdm6_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...
































































































Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting banknote_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...
































































































Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Duration 19131.908994999998


In [7]:
# Display the summary table built by the experiment loop: one row per dataset,
# with mean and std of train/test accuracy for each gate reduction method.
df_performance

Unnamed: 0,dataset,n_features,feature_importance_train,feature_importance_test,feature_importance_train_std,feature_importance_test_std,feature_importance_lda_train,feature_importance_lda_test,feature_importance_lda_train_std,feature_importance_lda_test_std,...,feature_importance_pca_loadings_train_std,feature_importance_pca_loadings_test_std,PCA_train,PCA_test,PCA_train_std,PCA_test_std,None_train,None_test,None_train_std,None_test_std
0,breast_cancer_input.np,10,0.96863,0.923372,0.006036,0.019843,0.961509,0.911405,0.007899,0.025404,...,0.006086,0.018309,0.960369,0.915616,0.006352,0.024205,0.984797,0.924782,0.003287,0.026429
1,iris_input.pd,4,0.989,0.954667,0.007717,0.039866,0.99,0.953333,0.007071,0.037712,...,0.009638,0.033307,0.989,0.94,0.006549,0.045216,0.996,0.948,0.004163,0.041183
2,steel_input.pd,27,0.65041,0.625977,0.022587,0.02608,0.614323,0.588662,0.014126,0.022501,...,0.020422,0.036981,0.629235,0.60711,0.0209,0.020661,0.718959,0.669451,0.019072,0.02862
3,abalone_input.pd,8,0.729399,0.714771,0.008493,0.01937,0.736234,0.725261,0.005833,0.014677,...,0.008828,0.014018,0.705004,0.6836,0.006808,0.015481,0.754812,0.736705,0.005301,0.0132
4,contraceptive_input.pd,9,0.588256,0.55912,0.010739,0.030638,0.588731,0.553973,0.009665,0.030842,...,0.018568,0.029537,0.544433,0.512969,0.012233,0.022512,0.585779,0.512692,0.011777,0.024392
5,cars_input.pd,6,0.807753,0.79954,0.010381,0.020234,0.812762,0.798964,0.010612,0.018924,...,0.015722,0.028717,0.806451,0.797108,0.016857,0.029606,0.906569,0.889819,0.018991,0.020058
6,students_input.pd,11,0.526726,0.44294,0.015762,0.044709,0.542864,0.470214,0.016213,0.053004,...,0.014247,0.049396,0.521398,0.459495,0.012225,0.038333,0.616966,0.4085,0.026309,0.057937
7,adult_input.pd,14,0.832427,0.830854,0.007232,0.012971,0.838988,0.836152,0.002242,0.010774,...,0.007203,0.022606,0.841751,0.829912,0.002819,0.01331,0.850786,0.785212,0.002895,0.033562
8,bank_input.pd,20,0.908623,0.905618,0.001798,0.005544,0.909147,0.905317,0.001873,0.010726,...,0.001518,0.003303,0.907231,0.904953,0.000725,0.002733,0.912359,0.895946,0.001736,0.019571
9,hrss_input.pd,18,0.770057,0.768974,0.00286,0.004254,0.769736,0.76938,0.003011,0.006642,...,0.002579,0.006561,0.77096,0.770133,0.001901,0.006597,0.776041,0.774794,0.004632,0.005286


In [8]:
# Persist the results table. Create the output directory first so the save
# cannot fail on a missing folder, and use a context manager so the file is
# flushed and closed deterministically.
os.makedirs("dataframes", exist_ok=True)
with open("dataframes/df_fi_performance.pd", "wb") as results_file:
    pickle.dump(df_performance, results_file)