# In [4]:
import numpy as np
import pandas as pd
import os
import time
import collections
import itertools as it
from scipy import stats

from sklearn import cross_validation
from sklearn import neighbors, preprocessing, decomposition, ensemble, metrics, svm, gaussian_process
from sklearn import naive_bayes, linear_model, neural_network, model_selection
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV



def run_pre_processing(X_train, X_test, preprocess_type = 'std'):
    
    '''
    Transform the training and test data with the requested pre-processing.

    'std' fits a StandardScaler and 'pca' fits a PCA on X_train only; the
    fitted transformer is then applied to both X_train and X_test.
    'raw' and any unrecognised value print a message and hand the data
    back untouched.

    Returns a two-element list [X_train, X_test].
    '''
    
    # cases that leave the data as it is
    if preprocess_type == 'raw':
        print("Dataset is already raw.")
        return [X_train, X_test]
    
    # pick the transformer; both share the same fit/transform protocol
    if preprocess_type == 'std':
        transformer = preprocessing.StandardScaler()
    elif preprocess_type == 'pca':
        transformer = decomposition.PCA()
    else:
        print("The preprocessing type is unknown.")
        return [X_train, X_test]
    
    # fit on the training data only, then transform both sets
    fitted = transformer.fit(X_train)
    train_transformed = fitted.transform(X_train)
    test_transformed = fitted.transform(X_test)
    
    return [train_transformed, test_transformed]



def perform_feature_selection_across_folds(X, Y, cv, clf, localmin_selection = False, verbose = False):
    
    '''
    Compute incremental forward feature selection

    One feature is added per round: every column of X that is not selected
    yet is tried together with the already selected columns, the model is
    fitted and evaluated on every fold of `cv`, and the candidate with the
    lowest mean misclassification error is kept.

    Parameters
    ----------
    X : 2-D array, indexed as X[rows, columns]
    Y : 1-D array of labels, indexed with the fold indices
    cv : iterable of (train_index, test_index) pairs. It is materialised
         once, so one-shot iterables (generators) are supported as well.
    clf : estimator offering fit(X, y) and predict(X). The very same
          object is refitted over and over (no copy is made).
    localmin_selection : if True, stop as soon as the best candidate of a
          round gives a strictly larger error than the previous round and
          return only the features selected before that round.
    verbose : if True, print progress information.

    Returns
    -------
    (best_features, best_error) where best_features holds the selected
    column indices in order of selection and best_error is the mean error
    of the last accepted round. Without local-minimum stopping all columns
    are returned and best_error is the error obtained with all of them.

    Raises
    ------
    ValueError if X has no columns or cv yields no split.
    '''

    max_features = X.shape[1] 
    if max_features == 0:
        raise ValueError("X must contain at least one feature column")

    # cache the splits: they are reused for every candidate feature set
    folds = list(cv)
    if not folds:
        raise ValueError("cv must yield at least one (train, test) split")

    # -1 marks a slot that has not been filled yet
    best_features = np.full(max_features, -1, dtype=int)
    best_errors = np.zeros(max_features)
    
    for i in range(max_features):
        
        if verbose:
            print("%i" % i + " out of %i features" % max_features + " are now chosen")

        # inf (rather than 1) so that a candidate is always accepted, even
        # when every candidate misclassifies all test observations
        tmp_error = np.inf
        tmp_feature = -1
        selected = best_features[:i]
    
        for feature_i in range(max_features):
            if verbose:
                print("this feature is: %i" % feature_i)

            if feature_i in selected: # the current feature is already chosen in earlier stage
                if verbose:
                    print("Skipping this feature as it is already selected previously")
                continue # skip this iteration    

            # candidate set = features chosen so far + the feature under test
            current_features = np.append(selected, feature_i)

            if verbose:
                print("current_features are: ",current_features)

            error_rate = np.zeros(len(folds))
            
            # loop through inner folds and compute mean performance across folds directly
            # for each set of features evaluated by the incremental process
            for counter, (train_index, test_index) in enumerate(folds):
                
                if verbose:
                    print("Inner fold inside feature selection: ", counter)
                    
                # reduce the data to the rows of this fold and the candidate columns
                Xtrain = X[train_index, :][:, current_features]
                Xtest = X[test_index, :][:, current_features]
                ytrain, ytest = Y[train_index], Y[test_index]

                if verbose:
                    print("Loaded training data: n=%i, d=%i" % (Xtrain.shape[0], Xtrain.shape[1]))
                    print("Fitting model ...")

                # fit on the training part of the fold
                clf.fit(Xtrain, ytrain)
                
                if verbose:
                    print("Model fitted!")
                    print("Loaded testing data: n=%i, d=%i" % (Xtest.shape[0], Xtest.shape[1]))
                    print("Applying model ...")

                # compute predictions and store the misclassification error
                preds = clf.predict(Xtest)
                error_rate[counter] = 1 - metrics.accuracy_score(ytest,preds)
            
            # compute mean across inner folds
            error_mean = np.mean(error_rate)   

            if verbose:
                print("Misclassification error across folds: %.4f" % error_mean)
            
            # keep the candidate if it is strictly better than the best so far
            # (on ties the candidate with the lowest column index wins)
            if tmp_error > error_mean:
                tmp_error = error_mean
                tmp_feature = feature_i
                if verbose:
                    print("New best error mean %.4f" % error_mean)

        # if the incremental forward feature selection is set to stop when we reach a local error minimum            
        if localmin_selection and i > 0 and best_errors[i-1] < tmp_error:
            best_features = best_features[0:i]
            best_errors = best_errors[0:i]
            
            if verbose:
                print("\n The localmin selection has found the best features and the corresponding errors \n")
                print("\n", best_features, "\n")
                
            return best_features, best_errors[-1]

        # The iterations of all available features are done and so we save the best feature.
        best_features[i] = tmp_feature
        best_errors[i] = tmp_error
        
    if verbose:
        print("\nFinal features and errors without localmin", best_features, best_errors, "\n")    
        
    return best_features, best_errors[-1]



def build_combinations(param_grid):
    
    '''
    Create all possible hyperparameter combinations for a given model 

    Parameters
    ----------
    param_grid : dict mapping a parameter name to an iterable of candidate
                 values, e.g. {'C': [1, 10], 'gamma': [0.1]}

    Returns
    -------
    List with one dict per combination (the Cartesian product of all value
    lists). Parameter names are processed in sorted order, and the values
    of the last name vary fastest. An empty grid gives [{}], i.e. a single
    combination that sets no parameter.
    '''
  
    names = sorted(param_grid)
    value_lists = (param_grid[name] for name in names)

    return [dict(zip(names, combi)) for combi in it.product(*value_lists)]



def feature_and_model_selection_across_folds(X, Y, X_outer_test, Y_outer_test, model_id, 
                                             model_class, param_grid, cv, localmin_selection, 
                                             FS_verbose = False):
    
    '''
    Perform model selection and feature selection simultaneously, i.e. perform feature selection 
    on each model parameter combination and return the best model and the corresponding feature set

    Parameters
    ----------
    X, Y : data and labels handed to the forward feature selection
    X_outer_test, Y_outer_test : accepted for interface compatibility; not used here
    model_id : key of the model inside param_grid
    model_class : estimator instance; its parameters are overwritten in place
                  with set_params for every combination that is tried
    param_grid : dict of per-model grids; only param_grid[model_id] is read
    cv : folds forwarded to perform_feature_selection_across_folds
    localmin_selection, FS_verbose : forwarded to the feature selection

    Returns
    -------
    (best_score_, best_params_, best_features_): lowest error found, the
    parameter combination that produced it and the selected features.
    On equal errors the combination tried first is kept.

    Raises
    ------
    ValueError if no combination could be evaluated (e.g. a parameter with
    an empty list of candidate values).
    '''
    
    # create all possible hyperparameter combinations for the current model
    combinations = build_combinations(param_grid[model_id])

    # inf (rather than 1) so that the first evaluated combination is always
    # accepted, even if it misclassifies every observation
    best_score_ = np.inf
    best_params_ = None
    best_features_ = None

    for combination in combinations:

        # load the current parameter values into the model and perform feature selection
        model_class.set_params(**combination)
        features, error = perform_feature_selection_across_folds(X, Y, cv, model_class,
                                                                 localmin_selection, FS_verbose)

        # keep the pair of features and hyperparameters with the lowest error
        if error < best_score_:
            best_score_ = error
            best_params_ = combination
            best_features_ = features

    if best_params_ is None:
        raise ValueError("No hyperparameter combination could be evaluated for model '%s'" % model_id)

    return best_score_, best_params_, best_features_



def ensemble_learner(predictions, ensemble_type = 'class'):
    
    '''
    Convert base learner predictions into ensemble predictions 
    using either hard (class) or soft (probability) voting scheme

    Parameters
    ----------
    predictions : 2-D array with one row per observation and one column
                  per base learner. For 'class' any non-zero entry counts
                  as a vote for class 1; for 'probability' the entries are
                  averaged per row.
    ensemble_type : 'class' (majority vote) or 'probability' (mean above
                  0.5). Any other value leaves the result at all zeros.

    Returns
    -------
    1-D float array of 0/1 predictions, one per row. Ties (exactly half of
    the votes, or a mean of exactly 0.5) are broken at random through the
    global numpy random state, so seed it for reproducible results.
    '''
    
    res = np.zeros(predictions.shape[0])
    
    # majority vote / hard voting
    if ensemble_type == 'class':
        
        # number of votes for class 1 that corresponds to a tie
        tie_votes = predictions.shape[1]*0.5
        
        # if majority says 1, the ensemble prediction is 1; otherwise it stays 0
        votes = np.count_nonzero(predictions, axis=1)
        res[votes > tie_votes] = 1 
        
        # if tie, randomly select a class prediction
        ties = votes == tie_votes
        res[ties] = np.random.randint(0, 2, size=np.count_nonzero(ties))
        
    # equal-weighted average of probabilities / soft voting    
    elif ensemble_type == 'probability':
        
        # if average is above 50%, the ensemble prediction is 1; otherwise it stays 0
        mean_proba = np.mean(predictions, axis=1)
        res[mean_proba > 0.5] = 1
        
        # if tie, randomly select a class prediction. The tie has to be
        # detected on the averaged probabilities: res itself only holds 0/1
        ties = mean_proba == 0.5
        res[ties] = np.random.randint(0, 2, size=np.count_nonzero(ties))
        
    return res




def stacked_ensemble_learner(model_class, param_grid, inner_fold_predictions, inner_fold_labels, 
                             outer_fold_predictions, ensemble_type = 'class', cv = 5, n_jobs = 1):
    
    '''
    Fit a stacked (meta) learner on the base learner predictions collected
    on the inner folds and use it to predict the outer fold.

    The meta learner is a grid search over `param_grid` for `model_class`,
    fitted on (inner_fold_predictions, inner_fold_labels) with the given
    `cv` and `n_jobs`. The returned value is its prediction for
    outer_fold_predictions. `ensemble_type` is accepted but not used here;
    the caller decides whether class or probability predictions are passed.
    '''    
    
    # tune and fit the meta learner on the inner fold base-learner predictions
    meta_learner = GridSearchCV(model_class, param_grid, cv = cv, n_jobs = n_jobs)
    meta_learner.fit(inner_fold_predictions, inner_fold_labels)
    
    # apply the tuned meta learner to the outer fold base-learner predictions
    return meta_learner.predict(outer_fold_predictions)



def compute_all(X, Y, preprocess_type = 'raw', feature_selection = True, 
                     selection_type = 'rf', cv_folds = 5, models = [],
                     param_grid = [], verbose=False, localmin_selection = False,
                     FS_verbose = False, outputdir = None, write_output = False, 
                     n_jobs = 1, seed = 0
                    ):
    
    '''
    Build ensembles using base learners while incorporating pre-processing, 
    model selection, and feature selection.

    The evaluation is a nested cross-validation. For every outer fold the
    data is pre-processed, optionally reduced by feature selection, every
    base learner is tuned with a grid search, and four ensembles are scored
    on the outer test set: soft voting, hard voting, and a stacked learner
    (models['logr']) trained on inner-fold probabilities and on inner-fold
    class predictions.

    Parameters
    ----------
    X, Y : pandas objects (their .values are used); X is cast to float32
           and Y to int. The code keeps the probability of class 1 only,
           i.e. it is written for binary labels.
    preprocess_type : forwarded to run_pre_processing
    feature_selection : whether to select features inside each outer fold
    selection_type : 'rf' (random forest importances above their mean) or
           'forward' (forward selection driven by models['knn'])
    cv_folds : number of folds for both the outer and the inner split
    models : mapping model id -> estimator instance; must provide .items()
           and the keys 'logr' (stacking) and, depending on
           selection_type, 'rf' or 'knn'. The estimators are refitted and
           re-parameterised in place.
    param_grid : mapping model id -> parameter grid
    verbose, FS_verbose : print progress of this function / of the forward selection
    localmin_selection : forwarded to the forward feature selection
    outputdir : directory for the result files; required if write_output is True
    write_output : write per-fold predictions and the result summaries
    n_jobs : forwarded to the grid searches
    seed : random state of the outer and inner stratified splits

    Returns
    -------
    None. Results are printed (verbose) and/or written to outputdir.
    '''
    
    # ensure expected data type (builtin int: the np.int alias no longer exists in recent numpy)
    X = X.values.astype(np.float32)
    Y = Y.values.astype(int)
    
    # nested cross-validation, this is the outer fold initialization
    # NOTE: cross_validation is the legacy scikit-learn splitter API; it is kept
    # here so that the composition of the folds does not change
    kf = cross_validation.StratifiedKFold(Y, n_folds=cv_folds, shuffle=True,
                                random_state=seed)    
    
    # store training and test accuracies, best parameters, 
    # selected features, ensemble accuracies
    total_train_acc = np.zeros(shape=(cv_folds, len(models))) 
    total_test_acc = np.zeros(shape=(cv_folds, len(models))) 
    total_best_params = np.zeros(shape=(cv_folds, len(models)),dtype=object)
    total_features = np.zeros(shape=(cv_folds, len(models)),dtype=object)
    total_ens_test_acc = np.zeros(shape=(cv_folds, 4)) # 4 different ensembles
    
    for counter, (train_index, test_index) in enumerate(kf):
        if verbose:
            print("Outer CV fold {}".format(counter))
        
        # create directory for output, if write_output is active
        if write_output:
            odir = os.path.join(outputdir, str(counter))
            if not os.path.exists(odir):
                os.makedirs(odir)

        # get training/test splits
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        # store predictions and errors within each outer fold
        total_pred_proba = np.zeros(shape=(len(test_index), len(models)))
        total_Y_pred = np.zeros(shape=(len(test_index), len(models)))
        total_Y_error_proba = np.zeros(shape=(len(test_index), len(models)))
        total_Y_error_class = np.zeros(shape=(len(test_index), len(models)))
        
        # store predictions on inner folds for training stacked ensembles
        total_inner_pred = np.zeros(shape=(len(train_index), len(models))) 
        total_inner_pred_proba = np.zeros(shape=(len(train_index), len(models))) 
        
        # initialize inner kf
        inner_kf = cross_validation.StratifiedKFold(Y_train, n_folds=cv_folds,
                                          shuffle=True, random_state = seed)
       
        # perform pre-processing on current outer training and test set
        X_train, X_test = run_pre_processing(X_train, X_test, preprocess_type = preprocess_type)    
        
        if feature_selection:

            if selection_type == "rf":
               
                # identify the best hyperparameters for the random forest feature selection model
                grid_search = GridSearchCV(models['rf'], param_grid['rf'], cv = cv_folds, n_jobs = n_jobs)
                grid_search.fit(X_train,Y_train)
                
                # store the best hyperparameters and initialize a separate random forest with those parameters
                rf_params = grid_search.best_params_
                rf = ensemble.RandomForestClassifier(**rf_params)
                
                rf_features = np.zeros(shape=(cv_folds,X_train.shape[1]))
                
                # refit the random forest using only the correctly accessible training data
                # and return feature importances. This is not the perfect solution but it is 
                # better than extracting feature importances from the grid search above which 
                # is refit on all data (also test set) of the inner folds
                for inner_counter, (train_idx, test_idx) in enumerate(inner_kf):
                    rf.fit(X_train[train_idx, :], Y_train[train_idx])
                    rf_features[inner_counter,:] = rf.feature_importances_
                
                # compute mean of importances across folds and select those above the mean
                rf_features = np.mean(rf_features, axis=0)            
                threshold_value = np.mean(rf_features) 
                rf_features = np.where(rf_features > threshold_value)[0]
                
                if verbose:
                    print("RF Feature selection final best features: " + str(rf_features))
                
                # reduce data sets using selected features
                X_train = X_train[:, rf_features]
                X_test = X_test[:, rf_features]
                
                # store selected features (the same set applies to every model)
                total_features[counter, :] = str(rf_features)
                
            elif selection_type == "forward":
                
                if verbose:
                    print("Simultaneous feature selection and model selection starting ...")
                
                # call forward feature selection on only k-NN and then 
                # reduce feature set prior to model selection
                model_id = 'knn'
                best_score_, best_params_, best_features_ = feature_and_model_selection_across_folds(
                    X_train, Y_train, 
                    X_test, Y_test, 
                    model_id = model_id,
                    model_class = models[model_id],
                    param_grid = param_grid,
                    cv = inner_kf,
                    localmin_selection = localmin_selection,
                    FS_verbose = FS_verbose)
                
                print("KNN Feature selection final best features: " + str(best_features_))
                
                # reduce data sets using selected features
                X_train = X_train[:, best_features_]
                X_test = X_test[:, best_features_]
                
                # store selected features (the same set applies to every model)
                total_features[counter, :] = str(best_features_)
                
            else: 
                
                print("Feature selection method is not recognized...")
        
        
        # model selection     
        for idx, (model_id, model_class) in enumerate(models.items()):    

            # identify the best hyperparameters for each model
            grid_search = GridSearchCV(model_class, param_grid[model_id], cv = cv_folds, n_jobs = n_jobs)
            grid_search.fit(X_train,Y_train)

            # loop through inner folds manually to train base learner only on the correctly accessible training 
            # data set before prediction on the inner fold test set. Save these predictions and form a full 
            # dataset of predictions from all test observations (every observation is used once). Do this for 
            # all models and feed into the ensemble.
            inner_pred = np.zeros(X_train.shape[0])
            inner_pred_proba = np.zeros(X_train.shape[0])

            # the best parameters do not change between inner folds, so set them once
            clf = model_class.set_params(**grid_search.best_params_)

            # dedicated names: do not rebind the outer fold's train_index/test_index
            for inner_train_idx, inner_test_idx in inner_kf: 

                clf.fit(X_train[inner_train_idx,:], Y_train[inner_train_idx])

                inner_pred[inner_test_idx] = clf.predict(X_train[inner_test_idx,:])
                inner_pred_proba[inner_test_idx] = clf.predict_proba(X_train[inner_test_idx,:])[:,1] # we only keep probabilities for class=1

            # store predictions on inner test sets for training stacked ensembles   
            total_inner_pred[:, idx] = inner_pred
            total_inner_pred_proba[:, idx] = inner_pred_proba

            # store class predictions and probability predictions for basic/naive ensembles
            # and compute accuracy
            Y_test_pred = grid_search.predict(X_test)
            test_acc = metrics.accuracy_score(Y_test, Y_test_pred)
            pred_proba = grid_search.predict_proba(X_test)[:,1] # we only keep probabilities for class=1

            if verbose:
                print(model_id, "Mean CV accuracy from inner folds on current outer"  
                    "fold:\t{:.4f}".format(grid_search.best_score_))                    
                print(model_id, "test accuracy on current outer"
                    "fold:\t\t\t\t{:.4f}".format(test_acc)) 
                print(model_id, "best parameters on current outer"
                    "fold:\t\t\t{}".format(grid_search.best_params_)) 

            # store results
            total_train_acc[counter, idx] = grid_search.best_score_
            total_test_acc[counter, idx] = test_acc
            total_best_params[counter, idx] = grid_search.best_params_
            total_pred_proba[:, idx] = pred_proba
            total_Y_pred[:,idx] = Y_test_pred

            # signed errors: thresholded probability (resp. predicted class) minus true label
            total_Y_error_proba[:, idx] = 1*(pred_proba > 0.5) - Y_test
            total_Y_error_class[:, idx] = Y_test_pred - Y_test


        # save results
        if write_output:
            np.savetxt(os.path.join(odir, "total_pred_proba.csv"), total_pred_proba,
                       delimiter=",")
            np.savetxt(os.path.join(odir, "total_Y_pred.csv"), total_Y_pred,
                       delimiter=",")
            np.savetxt(os.path.join(odir, "total_Y_error_proba.csv"), total_Y_error_proba,
                        delimiter=",")
            np.savetxt(os.path.join(odir, "total_Y_error_class.csv"), total_Y_error_class,
                        delimiter=",")
            np.savetxt(os.path.join(odir, "Y_test.csv"), Y_test,
                           delimiter=",")

        # basic/naive ensembles require no training, so we essentially compute predictions directly
        ens_pred_prob = ensemble_learner(total_pred_proba, ensemble_type='probability')  
        ens_pred_class = ensemble_learner(total_Y_pred, ensemble_type='class') 

        # train on (total_inner_pred_proba, Y_train). Predict on total_pred_proba
        lr_ens_pred_prob = stacked_ensemble_learner(models['logr'], param_grid['logr'], total_inner_pred_proba, Y_train,
                                                    total_pred_proba, ensemble_type='probability',
                                                    cv=inner_kf, n_jobs = n_jobs)
        
        # train on (total_inner_pred, Y_train). Predict on total_Y_pred
        lr_ens_pred_class = stacked_ensemble_learner(models['logr'], param_grid['logr'], total_inner_pred, Y_train, 
                                                     total_Y_pred, ensemble_type='class', 
                                                     cv=inner_kf, n_jobs = n_jobs )
        
        if verbose:
            print("ENSEMBLE prob: test accuracy on current outer"
                "fold:\t{:.4f}".format(metrics.accuracy_score(Y_test, ens_pred_prob)))
            print("ENSEMBLE class: test accuracy on current outer"
                "fold:\t{:.4f}".format(metrics.accuracy_score(Y_test, ens_pred_class)))
            print("ENSEMBLE LR prob: test accuracy on current outer"
                "fold:\t{:.4f}".format(metrics.accuracy_score(Y_test, lr_ens_pred_prob)))
            print("ENSEMBLE LR class: test accuracy on current outer"
                "fold:\t{:.4f}".format(metrics.accuracy_score(Y_test, lr_ens_pred_class)))
                   
        # compute test accuracies on the current outer fold test set
        # column order: 0 = soft voting, 1 = hard voting, 2 = stacked (proba), 3 = stacked (class)
        total_ens_test_acc[counter,0] = metrics.accuracy_score(Y_test, ens_pred_prob)
        total_ens_test_acc[counter,1] = metrics.accuracy_score(Y_test, ens_pred_class)
        total_ens_test_acc[counter,2] = metrics.accuracy_score(Y_test, lr_ens_pred_prob)
        total_ens_test_acc[counter,3] = metrics.accuracy_score(Y_test, lr_ens_pred_class)
    
    # compute means and standard deviations of all outer fold accuracies
    mean_train = np.mean(total_train_acc, axis=0)
    std_train = np.std(total_train_acc, axis=0)
    
    mean_test = np.mean(total_test_acc, axis=0)
    std_test = np.std(total_test_acc, axis=0)
    
    mean_test_ens = np.mean(total_ens_test_acc, axis=0)
    std_test_ens = np.std(total_ens_test_acc, axis=0)

    if verbose:
        
        print(total_train_acc)
        print(total_test_acc)
        print(total_ens_test_acc)
        
        for i in range(len(models)):
            print("#####",list(models.keys())[i],"#####")
            
            print(("Average train accuracy on all folds: {mean} +/- {std}\n"
                   ).format(mean = mean_train[i],
                            std = std_train[i]))
            
            print(("Average test accuracy on all folds: {mean} +/- {std}\n"
                   ).format(mean = mean_test[i],
                            std = std_test[i]))    
        print(("\n----- Ensemble Classifier results -----\n"))    
        print(("Average test accuracy for ensemble class classifier on all folds: {mean} +/- {std}\n"
                       ).format(mean = mean_test_ens[1],
                                std = std_test_ens[1]))
        print(("Average test accuracy for ensemble probability classifier on all folds: {mean} +/- {std}\n"
                   ).format(mean = mean_test_ens[0],
                            std = std_test_ens[0]))
        print(("Average test accuracy for LR ensemble class classifier on all folds: {mean} +/- {std}\n"
                       ).format(mean = mean_test_ens[3],
                                std = std_test_ens[3]))
        print(("Average test accuracy for LR ensemble probability classifier on all folds: {mean} +/- {std}\n"
                   ).format(mean = mean_test_ens[2],
                            std = std_test_ens[2]))
    
    # write to file - both .txt for overview and .csv for earlier data analysis
    if write_output:
        with open(os.path.join(outputdir, "results.txt"), 'w') as ofile:
            for idx in range(len(models)):
                ofile.write(("\n----- {} -----\n").format(list(models.keys())[idx]))
        
                ofile.write(("Average train accuracy on all folds: {mean} +/- {std}\n"
                       ).format(mean = mean_train[idx],
                                std = std_train[idx]))
            
                ofile.write(("Average test accuracy on all folds: {mean} +/- {std}\n"
                       ).format(mean = mean_test[idx],
                                std = std_test[idx]))   
                
                ofile.write(("Best params on all folds: {}\n"
                       ).format(total_best_params[:,idx]))   
                                        
                if feature_selection: 
                    ofile.write(("Best features on all folds: {}\n"
                       ).format(total_features[:,idx])) 
                    
                else:    
                    ofile.write(("Best features on all folds: NO FEATURE SELECTION \n"))
            
            ofile.write(("\n----- Ensemble Classifier results -----\n"))    
            ofile.write(("Average test accuracy for ensemble class classifier on all folds: {mean} +/- {std}\n"
                       ).format(mean = mean_test_ens[1],
                                std = std_test_ens[1]))
            ofile.write(("Average test accuracy for ensemble probability classifier on all folds: {mean} +/- {std}\n"
                       ).format(mean = mean_test_ens[0],
                                std = std_test_ens[0])) 
            ofile.write(("Average test accuracy for LR ensemble class classifier on all folds: {mean} +/- {std}\n"
                        ).format(mean = mean_test_ens[3],
                                std = std_test_ens[3]))
            ofile.write(("Average test accuracy for LR ensemble probability classifier on all folds: {mean} +/- {std}\n"
                       ).format(mean = mean_test_ens[2],
                                std = std_test_ens[2]))
                
                
                
        # identical fields for the .csv file (semicolon separated)
        with open(os.path.join(outputdir, "results.csv"), 'w') as ofile:
            for idx in range(len(models)):
                ofile.write(("\n----- {} -----\n").format(list(models.keys())[idx]))
        
                ofile.write(("Average train accuracy on all folds:; {mean} ;+/-; {std}\n"
                               ).format(mean = mean_train[idx],
                                std = std_train[idx]))
            
                ofile.write(("Average test accuracy on all folds:; {mean} ;+/-; {std}\n"
                               ).format(mean = mean_test[idx],
                                std = std_test[idx]))   
                
                ofile.write(("Best params on all folds:; {}\n"
                               ).format(total_best_params[:,idx]))   
                                      
                if feature_selection: 
                    ofile.write(("Best features on all folds:; {}\n"
                                   ).format(total_features[:,idx]))  
                else:    
                    ofile.write(("Best features on all folds: NO FEATURE SELECTION \n"))                    
            
            ofile.write(("\n----- Ensemble Classifier results -----\n"))    
            ofile.write(("Average test accuracy for ensemble class classifier on all folds:; {mean} ;+/-; {std}\n"
                           ).format(mean = mean_test_ens[1],
                                std = std_test_ens[1]))
            ofile.write(("Average test accuracy for ensemble probability classifier on all folds:; {mean} ;+/-; {std}\n"
                           ).format(mean = mean_test_ens[0],
                                std = std_test_ens[0]))    
            ofile.write(("Average test accuracy for LR ensemble class classifier on all folds:; {mean} ;+/-; {std}\n"
                            ).format(mean = mean_test_ens[3],
                                std = std_test_ens[3]))
            ofile.write(("Average test accuracy for LR ensemble probability classifier on all folds:; {mean} ;+/-; {std}\n"
                           ).format(mean = mean_test_ens[2],
                                std = std_test_ens[2]))
            
        # all the essential outer fold test performances gathered in a smaller .csv file    
        # row order: one row per model, then hard voting, soft voting, stacked (class), stacked (proba)
        with open(os.path.join(outputdir, "result_table.csv"), 'w') as ofile:
            for idx in range(len(models)):
                ofile.write(("{mean};{std}\n").format(mean = mean_test[idx], std = std_test[idx])) 
                
            ofile.write(("{mean};{std}\n").format(mean = mean_test_ens[1], std = std_test_ens[1]))
            ofile.write(("{mean};{std}\n").format(mean = mean_test_ens[0], std = std_test_ens[0]))
            ofile.write(("{mean};{std}\n").format(mean = mean_test_ens[3], std = std_test_ens[3]))
            ofile.write(("{mean};{std}\n").format(mean = mean_test_ens[2], std = std_test_ens[2]))    
            

    return



if __name__ == "__main__":

    models = collections.OrderedDict({'knn': neighbors.KNeighborsClassifier(),
                                     'rf': ensemble.RandomForestClassifier(),
                                     'kersvm': svm.SVC(kernel = 'rbf', probability=True),
                                     'linsvm': svm.SVC(kernel = 'linear', probability=True),
                                     'gpc': gaussian_process.GaussianProcessClassifier(),
                                     'gnb': naive_bayes.GaussianNB(),
                                     'gbm': ensemble.GradientBoostingClassifier(),
                                     'logr': linear_model.LogisticRegression(),
                                     'mlp_nn': neural_network.MLPClassifier(max_iter = 2000)        
                                      })

    param_grid = {'knn': {'n_neighbors': [1,3,5,7,9,11,13,15,17,19]},
                  'rf': {'n_estimators': [50,100,200], 'max_features': ['auto', None], 'min_samples_leaf': [1, 5, 10]}, 
                  'kersvm': {'C': [1/32, 1/8, 1/2, 2, 8, 32], 'gamma':[1/32, 1/8, 1/2, 2, 8, 32]},
                  'linsvm': {'C': [1/32, 1/8, 1/2, 2, 8, 32]},
                  'gpc': {'n_restarts_optimizer': [0]},
                  'gnb': {'priors': [None]},
                  'gbm': {'n_estimators': [100, 300, 500], 'learning_rate': [0.01, 0.1, 0.5], 'max_depth': [3,6]},
                  'logr': {'C': [0.001, 0.1, 1, 10 , 100, 1e10]},
                  'mlp_nn': {'solver': ['lbfgs'], 'hidden_layer_sizes': [(5,), (10,), (20, ), (10, 10,)], 
                             'alpha': [0.001, 0.1, 1, 10]
                            }     
                  }

    settings = {'verbose': True,
                'cv_folds': 5, 
                'localmin_selection': True, 
                'FS_verbose': False, 
                'write_output': True,            
                }


    seed = 0
    np.random.seed(seed)
    
    preprocess_list = ['raw', 'std', 'pca']
    FS_list = [{'feature_selection': False},  
                {'feature_selection': True, 'selection_type': 'rf'},
                {'feature_selection': True, 'selection_type': 'forward'}
               ]
    
    ######## CLIMATE DATA ########
    # Load the climate dataset. Every column but the last is handed to
    # compute_all as the first argument and the last column as the second.
    data_clim = pd.read_csv("Datasets/Climate Model Simulation Crashes/Climate_data.csv", sep=";")
    print("=============== CLIMATE DATA ANALYSIS STARTING ===============")

    # Timestamped folder that collects the wall-clock timings of this run.
    timedir = os.path.join("output/data_clim/time", time.strftime("%Y_%m_%d_%H_%M_%S"))
    if not os.path.exists(timedir):
        os.makedirs(timedir)

    # Visit every (pre-processing, feature-selection) combination, with the
    # pre-processing mode varying slowest.
    for preprocess, FS in it.product(preprocess_list, FS_list):
        # Short tag naming the feature-selection setup in paths and logs.
        if FS == FS_list[0]:
            fs_label = 'no_FS'
        elif FS == FS_list[1]:
            fs_label = 'FS_rf'
        else:
            fs_label = 'FS_forward'

        experiment_dir = "".join(["output/data_clim/", preprocess, '-', fs_label])
        print(" == data_clim experiment %s, %s ==" % (preprocess, fs_label))
        outputdir = os.path.join(experiment_dir, time.strftime("%Y_%m_%d_%H_%M_%S"))

        # Re-seed before each experiment so all of them start from the same
        # NumPy RNG state.
        np.random.seed(seed)

        t_start = time.time()
        compute_all(
            data_clim.iloc[:, :-1],
            data_clim.iloc[:, -1],
            preprocess_type=preprocess,
            models=models,
            param_grid=param_grid,
            outputdir=outputdir,
            seed=seed,
            **settings,
            **FS
        )

        # Append one "preprocess;FS tag;seconds" row per experiment.
        with open(os.path.join(timedir, "time.csv"), 'a') as timing_file:
            timing_file.write(";".join([preprocess, fs_label, str(time.time() - t_start)]) + "\n")


    ######## BREAST CANCER DATA ########
    # The file has no header row, so columns are addressed by position.
    data_bc = pd.read_csv("Datasets/Breast cancer/Breast_cancer_data.csv", sep=";", header=None)
    # Recode the labels in column 30: 'M' -> 0, 'B' -> 1.
    # NOTE(review): the column presumably keeps object dtype after these
    # assignments -- confirm that compute_all casts the labels as needed.
    for diagnosis, label in (('M', 0), ('B', 1)):
        data_bc.loc[data_bc[30] == diagnosis, 30] = label
    print("=============== BREAST CANCER DATA ANALYSIS STARTING ===============")

    # Timestamped folder that collects the wall-clock timings of this run.
    timedir = os.path.join("output/data_breast/time", time.strftime("%Y_%m_%d_%H_%M_%S"))
    if not os.path.exists(timedir):
        os.makedirs(timedir)

    # Visit every (pre-processing, feature-selection) combination, with the
    # pre-processing mode varying slowest.
    for preprocess, FS in it.product(preprocess_list, FS_list):
        # Short tag naming the feature-selection setup in paths and logs.
        if FS == FS_list[0]:
            fs_label = 'no_FS'
        elif FS == FS_list[1]:
            fs_label = 'FS_rf'
        else:
            fs_label = 'FS_forward'

        experiment_dir = "".join(["output/data_breast/", preprocess, '-', fs_label])
        print(" == data_breast experiment %s, %s ==" % (preprocess, fs_label))
        outputdir = os.path.join(experiment_dir, time.strftime("%Y_%m_%d_%H_%M_%S"))

        # Re-seed before each experiment so all of them start from the same
        # NumPy RNG state.
        np.random.seed(seed)

        t_start = time.time()
        compute_all(
            data_bc.iloc[:, :-1],
            data_bc.iloc[:, -1],
            preprocess_type=preprocess,
            models=models,
            param_grid=param_grid,
            outputdir=outputdir,
            seed=seed,
            **settings,
            **FS
        )

        # Append one "preprocess;FS tag;seconds" row per experiment.
        with open(os.path.join(timedir, "time.csv"), 'a') as timing_file:
            timing_file.write(";".join([preprocess, fs_label, str(time.time() - t_start)]) + "\n")



    ######## WINE DATA ########
    data_wine = pd.read_csv("Datasets/Wine Quality/winequality-red.csv", sep=";")
    # Binarize the target: keep only rows with quality 5 or 6, then recode
    # 5 -> 0 and 6 -> 1. The explicit .copy() makes the filtered frame an
    # independent object, so the .loc assignments below act on it directly
    # and do not raise pandas' SettingWithCopyWarning.
    data_wine = data_wine[data_wine['quality'].isin([5, 6])].copy()
    data_wine.loc[data_wine['quality'] == 5, 'quality'] = 0
    data_wine.loc[data_wine['quality'] == 6, 'quality'] = 1
    print("=============== WINE DATA ANALYSIS STARTING ===============")

    # Timestamped folder that collects the wall-clock timings of this run.
    timedir = os.path.join("output/data_wine/time", time.strftime("%Y_%m_%d_%H_%M_%S"))
    if not os.path.exists(timedir):
        os.makedirs(timedir)

    # Visit every (pre-processing, feature-selection) combination.
    for preprocess in preprocess_list:
        for FS in FS_list:
            # Short tag naming the feature-selection setup in paths and logs.
            tmp_FS = ('no_FS' if FS == FS_list[0] else 'FS_rf' if FS == FS_list[1] else 'FS_forward')
            outdir = "output/data_wine/" + preprocess + '-' + tmp_FS
            print(" == data_wine experiment %s, %s ==" % (preprocess, tmp_FS))
            outputdir = os.path.join(outdir, time.strftime("%Y_%m_%d_%H_%M_%S"))

            # Re-seed before each experiment so all of them start from the
            # same NumPy RNG state.
            np.random.seed(seed)

            start = time.time()
            # All columns but the last are passed first, the last column second.
            compute_all(data_wine.iloc[:, :-1], data_wine.iloc[:, -1], preprocess_type=preprocess,
                        models=models, param_grid=param_grid, outputdir=outputdir, seed=seed,
                        **settings, **FS)

            # Append one "preprocess;FS tag;seconds" row per experiment.
            with open(os.path.join(timedir, "time.csv"), 'a') as ofile:
                ofile.write(preprocess + ";" + tmp_FS + ";" + str(time.time() - start) + "\n")



 == data_clim experiment raw, no_FS ==
Outer CV fold 0
Dataset is already raw.
knn Mean CV accuracy from inner folds on current outerfold:	0.9397
knn test accuracy on current outerfold:				0.9174
knn best parameters on current outerfold:			{'n_neighbors': 5}
rf Mean CV accuracy from inner folds on current outerfold:	0.9420
rf test accuracy on current outerfold:				0.9266
rf best parameters on current outerfold:			{'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 100}
kersvm Mean CV accuracy from inner folds on current outerfold:	0.9629
kersvm test accuracy on current outerfold:				0.9174
kersvm best parameters on current outerfold:			{'C': 32, 'gamma': 0.125}
linsvm Mean CV accuracy from inner folds on current outerfold:	0.9536
linsvm test accuracy on current outerfold:				0.9633
linsvm best parameters on current outerfold:			{'C': 2}
gpc Mean CV accuracy from inner folds on current outerfold:	0.9165
gpc test accuracy on current outerfold:				0.9083
gpc best parameters on c

knn Mean CV accuracy from inner folds on current outerfold:	0.9215
knn test accuracy on current outerfold:				0.9346
knn best parameters on current outerfold:			{'n_neighbors': 5}
rf Mean CV accuracy from inner folds on current outerfold:	0.9330
rf test accuracy on current outerfold:				0.9439
rf best parameters on current outerfold:			{'max_features': None, 'min_samples_leaf': 10, 'n_estimators': 100}
kersvm Mean CV accuracy from inner folds on current outerfold:	0.9538
kersvm test accuracy on current outerfold:				0.9720
kersvm best parameters on current outerfold:			{'C': 8, 'gamma': 0.125}
linsvm Mean CV accuracy from inner folds on current outerfold:	0.9584
linsvm test accuracy on current outerfold:				0.9626
linsvm best parameters on current outerfold:			{'C': 8}
gpc Mean CV accuracy from inner folds on current outerfold:	0.9145
gpc test accuracy on current outerfold:				0.9159
gpc best parameters on current outerfold:			{'n_restarts_optimizer': 0}
gnb Mean CV accuracy from inner

gpc Mean CV accuracy from inner folds on current outerfold:	0.9329
gpc test accuracy on current outerfold:				0.9352
gpc best parameters on current outerfold:			{'n_restarts_optimizer': 0}
gnb Mean CV accuracy from inner folds on current outerfold:	0.9491
gnb test accuracy on current outerfold:				0.9444
gnb best parameters on current outerfold:			{'priors': None}
gbm Mean CV accuracy from inner folds on current outerfold:	0.9537
gbm test accuracy on current outerfold:				0.9630
gbm best parameters on current outerfold:			{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
logr Mean CV accuracy from inner folds on current outerfold:	0.9514
logr test accuracy on current outerfold:				0.9630
logr best parameters on current outerfold:			{'C': 100}
mlp_nn Mean CV accuracy from inner folds on current outerfold:	0.9537
mlp_nn test accuracy on current outerfold:				0.9537
mlp_nn best parameters on current outerfold:			{'alpha': 1, 'hidden_layer_sizes': (10,), 'solver': 'lbfgs'}
ENSEMB

KNN Feature selection final best features: [ 1  0 13  4]
knn Mean CV accuracy from inner folds on current outerfold:	0.9466
knn test accuracy on current outerfold:				0.8899
knn best parameters on current outerfold:			{'n_neighbors': 1}
rf Mean CV accuracy from inner folds on current outerfold:	0.9466
rf test accuracy on current outerfold:				0.8991
rf best parameters on current outerfold:			{'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100}
kersvm Mean CV accuracy from inner folds on current outerfold:	0.9466
kersvm test accuracy on current outerfold:				0.9174
kersvm best parameters on current outerfold:			{'C': 8, 'gamma': 32}
linsvm Mean CV accuracy from inner folds on current outerfold:	0.9350
linsvm test accuracy on current outerfold:				0.9174
linsvm best parameters on current outerfold:			{'C': 8}
gpc Mean CV accuracy from inner folds on current outerfold:	0.9211
gpc test accuracy on current outerfold:				0.9266
gpc best parameters on current outerfold:			{'n_

KNN Feature selection final best features: [ 0  1 12 13 16 15]
knn Mean CV accuracy from inner folds on current outerfold:	0.9446
knn test accuracy on current outerfold:				0.9813
knn best parameters on current outerfold:			{'n_neighbors': 3}
rf Mean CV accuracy from inner folds on current outerfold:	0.9469
rf test accuracy on current outerfold:				0.9533
rf best parameters on current outerfold:			{'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 200}
kersvm Mean CV accuracy from inner folds on current outerfold:	0.9607
kersvm test accuracy on current outerfold:				0.9252
kersvm best parameters on current outerfold:			{'C': 8, 'gamma': 2}
linsvm Mean CV accuracy from inner folds on current outerfold:	0.9584
linsvm test accuracy on current outerfold:				0.9813
linsvm best parameters on current outerfold:			{'C': 8}
gpc Mean CV accuracy from inner folds on current outerfold:	0.9261
gpc test accuracy on current outerfold:				0.9252
gpc best parameters on current outerfold:		

linsvm Mean CV accuracy from inner folds on current outerfold:	0.9606
linsvm test accuracy on current outerfold:				0.9444
linsvm best parameters on current outerfold:			{'C': 0.5}
gpc Mean CV accuracy from inner folds on current outerfold:	0.9167
gpc test accuracy on current outerfold:				0.9074
gpc best parameters on current outerfold:			{'n_restarts_optimizer': 0}
gnb Mean CV accuracy from inner folds on current outerfold:	0.9514
gnb test accuracy on current outerfold:				0.9444
gnb best parameters on current outerfold:			{'priors': None}
gbm Mean CV accuracy from inner folds on current outerfold:	0.9444
gbm test accuracy on current outerfold:				0.9537
gbm best parameters on current outerfold:			{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}
logr Mean CV accuracy from inner folds on current outerfold:	0.9630
logr test accuracy on current outerfold:				0.9444
logr best parameters on current outerfold:			{'C': 1}
mlp_nn Mean CV accuracy from inner folds on current outer

RF Feature selection final best features: [ 0  1 12 13]
knn Mean CV accuracy from inner folds on current outerfold:	0.9490
knn test accuracy on current outerfold:				0.9358
knn best parameters on current outerfold:			{'n_neighbors': 5}
rf Mean CV accuracy from inner folds on current outerfold:	0.9513
rf test accuracy on current outerfold:				0.9358
rf best parameters on current outerfold:			{'max_features': 'auto', 'min_samples_leaf': 5, 'n_estimators': 200}
kersvm Mean CV accuracy from inner folds on current outerfold:	0.9513
kersvm test accuracy on current outerfold:				0.9358
kersvm best parameters on current outerfold:			{'C': 8, 'gamma': 0.125}
linsvm Mean CV accuracy from inner folds on current outerfold:	0.9513
linsvm test accuracy on current outerfold:				0.9450
linsvm best parameters on current outerfold:			{'C': 0.5}
gpc Mean CV accuracy from inner folds on current outerfold:	0.9513
gpc test accuracy on current outerfold:				0.9541
gpc best parameters on current outerfold:			

RF Feature selection final best features: [ 0  1 12 13 14]
knn Mean CV accuracy from inner folds on current outerfold:	0.9423
knn test accuracy on current outerfold:				0.9533
knn best parameters on current outerfold:			{'n_neighbors': 3}
rf Mean CV accuracy from inner folds on current outerfold:	0.9423
rf test accuracy on current outerfold:				0.9533
rf best parameters on current outerfold:			{'max_features': None, 'min_samples_leaf': 5, 'n_estimators': 100}
kersvm Mean CV accuracy from inner folds on current outerfold:	0.9561
kersvm test accuracy on current outerfold:				0.9626
kersvm best parameters on current outerfold:			{'C': 32, 'gamma': 0.03125}
linsvm Mean CV accuracy from inner folds on current outerfold:	0.9561
linsvm test accuracy on current outerfold:				0.9720
linsvm best parameters on current outerfold:			{'C': 32}
gpc Mean CV accuracy from inner folds on current outerfold:	0.9492
gpc test accuracy on current outerfold:				0.9720
gpc best parameters on current outerfold:

kersvm Mean CV accuracy from inner folds on current outerfold:	0.9630
kersvm test accuracy on current outerfold:				0.9444
kersvm best parameters on current outerfold:			{'C': 2, 'gamma': 0.5}
linsvm Mean CV accuracy from inner folds on current outerfold:	0.9514
linsvm test accuracy on current outerfold:				0.9630
linsvm best parameters on current outerfold:			{'C': 2}
gpc Mean CV accuracy from inner folds on current outerfold:	0.9583
gpc test accuracy on current outerfold:				0.9352
gpc best parameters on current outerfold:			{'n_restarts_optimizer': 0}
gnb Mean CV accuracy from inner folds on current outerfold:	0.9491
gnb test accuracy on current outerfold:				0.9444
gnb best parameters on current outerfold:			{'priors': None}
gbm Mean CV accuracy from inner folds on current outerfold:	0.9444
gbm test accuracy on current outerfold:				0.9630
gbm best parameters on current outerfold:			{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
logr Mean CV accuracy from inner folds o

knn Mean CV accuracy from inner folds on current outerfold:	0.9397
knn test accuracy on current outerfold:				0.9174
knn best parameters on current outerfold:			{'n_neighbors': 5}
rf Mean CV accuracy from inner folds on current outerfold:	0.9281
rf test accuracy on current outerfold:				0.9083
rf best parameters on current outerfold:			{'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100}
kersvm Mean CV accuracy from inner folds on current outerfold:	0.9629
kersvm test accuracy on current outerfold:				0.9174
kersvm best parameters on current outerfold:			{'C': 32, 'gamma': 0.125}
linsvm Mean CV accuracy from inner folds on current outerfold:	0.9536
linsvm test accuracy on current outerfold:				0.9633
linsvm best parameters on current outerfold:			{'C': 2}
gpc Mean CV accuracy from inner folds on current outerfold:	0.9165
gpc test accuracy on current outerfold:				0.9083
gpc best parameters on current outerfold:			{'n_restarts_optimizer': 0}
gnb Mean CV accuracy from inn

rf Mean CV accuracy from inner folds on current outerfold:	0.9215
rf test accuracy on current outerfold:				0.9065
rf best parameters on current outerfold:			{'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 100}
kersvm Mean CV accuracy from inner folds on current outerfold:	0.9538
kersvm test accuracy on current outerfold:				0.9720
kersvm best parameters on current outerfold:			{'C': 8, 'gamma': 0.125}
linsvm Mean CV accuracy from inner folds on current outerfold:	0.9584
linsvm test accuracy on current outerfold:				0.9626
linsvm best parameters on current outerfold:			{'C': 8}
gpc Mean CV accuracy from inner folds on current outerfold:	0.9145
gpc test accuracy on current outerfold:				0.9159
gpc best parameters on current outerfold:			{'n_restarts_optimizer': 0}
gnb Mean CV accuracy from inner folds on current outerfold:	0.9330
gnb test accuracy on current outerfold:				0.9439
gnb best parameters on current outerfold:			{'priors': None}
gbm Mean CV accuracy from inner fo

gpc Mean CV accuracy from inner folds on current outerfold:	0.9144
gpc test accuracy on current outerfold:				0.9074
gpc best parameters on current outerfold:			{'n_restarts_optimizer': 0}
gnb Mean CV accuracy from inner folds on current outerfold:	0.9144
gnb test accuracy on current outerfold:				0.8981
gnb best parameters on current outerfold:			{'priors': None}
gbm Mean CV accuracy from inner folds on current outerfold:	0.9051
gbm test accuracy on current outerfold:				0.9167
gbm best parameters on current outerfold:			{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
logr Mean CV accuracy from inner folds on current outerfold:	0.9259
logr test accuracy on current outerfold:				0.8889
logr best parameters on current outerfold:			{'C': 10}
mlp_nn Mean CV accuracy from inner folds on current outerfold:	0.9236
mlp_nn test accuracy on current outerfold:				0.8704
mlp_nn best parameters on current outerfold:			{'alpha': 1, 'hidden_layer_sizes': (10,), 'solver': 'lbfgs'}
ENSEMB

KNN Feature selection final best features: [ 0  4  5 11]
knn Mean CV accuracy from inner folds on current outerfold:	0.9397
knn test accuracy on current outerfold:				0.9083
knn best parameters on current outerfold:			{'n_neighbors': 5}
rf Mean CV accuracy from inner folds on current outerfold:	0.9258
rf test accuracy on current outerfold:				0.9083
rf best parameters on current outerfold:			{'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 50}
kersvm Mean CV accuracy from inner folds on current outerfold:	0.9350
kersvm test accuracy on current outerfold:				0.9174
kersvm best parameters on current outerfold:			{'C': 2, 'gamma': 2}
linsvm Mean CV accuracy from inner folds on current outerfold:	0.9165
linsvm test accuracy on current outerfold:				0.9083
linsvm best parameters on current outerfold:			{'C': 0.03125}
gpc Mean CV accuracy from inner folds on current outerfold:	0.9188
gpc test accuracy on current outerfold:				0.9083
gpc best parameters on current outerfold:			

KNN Feature selection final best features: [ 3  7  1 10  4 15 14  2]
knn Mean CV accuracy from inner folds on current outerfold:	0.9307
knn test accuracy on current outerfold:				0.9252
knn best parameters on current outerfold:			{'n_neighbors': 7}
rf Mean CV accuracy from inner folds on current outerfold:	0.9238
rf test accuracy on current outerfold:				0.9159
rf best parameters on current outerfold:			{'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 100}
kersvm Mean CV accuracy from inner folds on current outerfold:	0.9446
kersvm test accuracy on current outerfold:				0.9252
kersvm best parameters on current outerfold:			{'C': 32, 'gamma': 0.5}
linsvm Mean CV accuracy from inner folds on current outerfold:	0.9330
linsvm test accuracy on current outerfold:				0.9159
linsvm best parameters on current outerfold:			{'C': 32}
gpc Mean CV accuracy from inner folds on current outerfold:	0.9192
gpc test accuracy on current outerfold:				0.9159
gpc best parameters on current oute

kersvm Mean CV accuracy from inner folds on current outerfold:	0.6278
kersvm test accuracy on current outerfold:				0.6261
kersvm best parameters on current outerfold:			{'C': 0.03125, 'gamma': 0.03125}


KeyboardInterrupt: 