In [1]:
import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from modt.modt import MoDT
from modt._initialization import *
from modt.visualization import *
from modt.utility import *

import pickle
from timeit import default_timer as timer

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import normalize
from sklearn.model_selection import RepeatedKFold

In [2]:
# Benchmark datasets as (file stem, file extension) pairs. Each pair expands to
# the two pickled files "<stem>_input.<ext>" and "<stem>_target.<ext>".
# NOTE(review): ".pd" / ".np" presumably mark pandas vs. numpy payloads - confirm.
_dataset_specs = [
    ("abalone", "pd"),
    ("adult", "pd"),  # Large
    ("banknote", "pd"),  # Easy
    ("bank", "pd"),  # Large
    ("breast_cancer", "np"),
    ("cars", "pd"),
    ("contraceptive", "pd"),
    ("generated6", "np"),
    ("hrss", "pd"),  # Large
    ("iris", "pd"),
    ("steel", "pd"),
    ("students", "pd"),
    # ("sensorless", "pd"),  # Very Large dataset
]

# [input_file, target_file] name pairs, in the order the experiment processes them.
datasets = [
    [stem + "_input." + ext, stem + "_target." + ext]
    for stem, ext in _dataset_specs
]

In [3]:
# Values tried in turn for the MoDT "use_2_dim_gate_based_on" parameter.
# The feature-importance variants share one prefix and differ only in suffix.
_feature_importance_suffixes = (
    "",
    "_lda",
    "_lda_max",
    "_lr",
    "_lr_max",
    "_xgb",
    "_pca_loadings",
)
gate_reduction_methods = [
    "feature_importance" + suffix for suffix in _feature_importance_suffixes
]
# NOTE(review): None presumably means "no 2-dim gate reduction" - confirm in MoDT.
gate_reduction_methods.extend(["PCA", None])

In [4]:
# Keyword arguments for the MoDT constructor. "X", "y", "initialization_method"
# and "use_2_dim_gate_based_on" are placeholders here; the experiment loop
# overwrites them for every method / cross-validation fold.
parameters = dict(
    X=None,
    y=None,
    n_experts=3,
    iterations=100,
    max_depth=1,
    init_learning_rate=110,
    learning_rate_decay=0.985,
    initialization_method="random",
    feature_names=None,
    class_names=None,
    use_2_dim_gate_based_on="overwritten",
    use_2_dim_clustering=False,
    black_box_algorithm=None,
)

# Keyword arguments passed unchanged to MoDT.fit() on every fold.
parameters_fit = dict(
    optimization_method="least_squares_linear_regression",
    early_stopping=True,
    use_posterior=False,
)

In [5]:
def _select_rows(data, row_idx, as_frame):
    """Return the rows of ``data`` at the integer positions ``row_idx``.

    When ``as_frame`` is true, ``data`` is indexed with ``.iloc`` and the result
    is given a fresh 0..n-1 index; otherwise ``data`` is indexed directly.
    """
    if as_frame:
        # reset_index() returns a new object, so the .iloc slice is never
        # mutated in place (avoids SettingWithCopyWarning).
        return data.iloc[row_idx].reset_index(drop=True)
    return data[row_idx]


# Cross-validate MoDT on every dataset for every gate reduction method and
# collect one result row per dataset (mean / std of train and test accuracy).
start = timer()
runs = 1  # number of repetitions of the 4-fold split
rows = []
for dataset in datasets:
    print("Starting",dataset[0],"...")
    # NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
    # Context managers close the handles that `pickle.load(open(...))` used to leak.
    with open("../datasets/" + dataset[0], "rb") as input_file:
        data_input = pickle.load(input_file)
    with open("../datasets/" + dataset[1], "rb") as target_file:
        data_target = pickle.load(target_file)

    # The indexing style for input AND target is decided by the input's type.
    use_dataframe = isinstance(data_input, pd.DataFrame)

    dict_results = {
        "dataset" : dataset[0],
        "n_features" : data_input.shape[1]
    }

    for method in gate_reduction_methods:
        print("Starting",method,"...")
        parameters["use_2_dim_gate_based_on"] = method

        train_accuracies = []
        val_accuracies = []
        # NOTE(review): no random_state is set here and Random_init gets seed=None,
        # so results differ between runs; consider fixing seeds for reproducibility.
        rkf = RepeatedKFold(n_splits=4, n_repeats=runs)
        for train_idx, val_idx in rkf.split(data_input):
            X_train = _select_rows(data_input, train_idx, use_dataframe)
            y_train = _select_rows(data_target, train_idx, use_dataframe)

            # The shared `parameters` dict is updated in place, as before, so it
            # keeps the data of the most recent fold after this cell has run.
            parameters["X"] = X_train
            parameters["y"] = y_train
            parameters["initialization_method"] = Random_init(seed=None)
            modt = MoDT(**parameters)
            modt.fit(**parameters_fit)
            train_accuracies.append(modt.score_internal_disjoint())

            X_val = _select_rows(data_input, val_idx, use_dataframe)
            y_val = _select_rows(data_target, val_idx, use_dataframe)
            val_accuracies.append(modt.score(X_val, y_val))

        # Aggregate the per-fold scores into four columns named after the method.
        dict_results[str(method) + "_train"] = np.mean(train_accuracies)
        dict_results[str(method) + "_test"] = np.mean(val_accuracies)
        dict_results[str(method) + "_train_std"] = np.std(train_accuracies)
        dict_results[str(method) + "_test_std"] = np.std(val_accuracies)

    rows.append(dict_results)

print("Duration", timer() - start)
df_performance = pd.DataFrame(rows)

Starting abalone_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...
Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting adult_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...
Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting banknote_input.pd ...
Starting feature_importance ...
Starting feature_importance_lda ...
Starting feature_importance_lda_max ...
Starting feature_importance_lr ...
Starting feature_importance_lr_max ...
Starting feature_importance_xgb ...
Starting feature_importance_pca_loadings ...
Starting PCA ...
Starting None ...
Starting bank_inp

In [6]:
# Show the results table: one row per dataset, with train/test accuracy mean
# and standard deviation columns for each gate reduction method.
df_performance

Unnamed: 0,dataset,n_features,feature_importance_train,feature_importance_test,feature_importance_train_std,feature_importance_test_std,feature_importance_lda_train,feature_importance_lda_test,feature_importance_lda_train_std,feature_importance_lda_test_std,...,feature_importance_pca_loadings_train_std,feature_importance_pca_loadings_test_std,PCA_train,PCA_test,PCA_train_std,PCA_test_std,None_train,None_test,None_train_std,None_test_std
0,abalone_input.pd,8,0.714867,0.700018,0.014226,0.01846,0.735137,0.72444,0.003488,0.013203,...,0.006819,0.01083,0.684621,0.67537,0.006333,0.017711,0.744075,0.728513,0.012101,0.008293
1,adult_input.pd,14,0.784033,0.783072,0.031897,0.028268,0.83246,0.821663,0.003727,0.017976,...,0.019347,0.020775,0.827786,0.812346,0.010425,0.010567,0.852806,0.787381,0.005065,0.051489
2,banknote_input.pd,4,0.949223,0.937318,0.026909,0.023908,0.998299,0.996356,0.000421,0.002417,...,0.004152,0.012196,0.955053,0.94898,0.015885,0.01041,0.999757,0.998542,0.000421,0.001458
3,bank_input.pd,20,0.904819,0.903006,0.002668,0.004091,0.897284,0.896305,0.005371,0.008244,...,0.001099,0.003576,0.894589,0.891498,0.006976,0.006387,0.913397,0.890429,0.002121,0.024863
4,breast_cancer_input.np,10,0.961926,0.922683,0.005561,0.008507,0.932046,0.899808,0.008595,0.03246,...,0.006888,0.020268,0.941411,0.91734,0.007285,0.020896,0.974222,0.922695,0.008609,0.013006
5,cars_input.pd,6,0.786651,0.778935,0.011054,0.022828,0.777778,0.777778,0.003038,0.009113,...,0.027446,0.029051,0.760031,0.760995,0.019127,0.032174,0.914352,0.902778,0.003407,0.010481
6,contraceptive_input.pd,9,0.56687,0.564823,0.005242,0.017989,0.570265,0.549916,0.007026,0.021532,...,0.005454,0.017406,0.507584,0.476578,0.01491,0.016001,0.56937,0.50238,0.038616,0.018237
7,generated6_input.np,2,0.8292,0.819,0.060382,0.062835,0.8466,0.8394,0.048763,0.0508,...,0.08047,0.075388,0.8222,0.8174,0.024225,0.027372,0.907733,0.9082,0.040524,0.03303
8,hrss_input.pd,18,0.764108,0.763629,0.002205,0.002812,0.764291,0.764516,0.00334,0.007672,...,0.002694,0.005028,0.765334,0.76532,0.001983,0.002497,0.767914,0.767012,0.001801,0.002934
9,iris_input.pd,4,0.986666,0.946657,0.004445,0.038229,0.984454,0.959993,0.013085,0.040014,...,0.007445,0.03209,0.973313,0.940078,0.00637,0.011,0.993323,0.953236,0.003855,0.022471


In [7]:
# Persist the results table. The context manager guarantees the file is flushed
# and closed, which the bare `open(...)` passed to pickle.dump did not.
with open("dataframes/ex4b_df_selection_performances.pd", "wb") as results_file:
    pickle.dump(df_performance, results_file)