In [None]:
import os
import pandas as pd
import numpy as np
from collections import Counter
import copy

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

# Blanket filter: every warning raised after this point is hidden.
# NOTE(review): this also hides deprecation notices from the libraries imported
# above; consider narrowing the filter to specific categories/modules.
import warnings
warnings.filterwarnings("ignore")

# Nested cross-validation for hyperparameter tuning

In [None]:
# Default destination of the per-classifier CSV summaries (kept from the original notebook).
_TUNE_OUT_DIR = 'P:\\project\\Guy\\coLive\\results\\openSmile\\classification\\type1\\binary_follow_up_0_m4a_5kf_ini6473_rfe\\var200_step30'


def _build_search_space(n_jobs, random_state):
    """Return ``(classifiers, grids)``: candidate estimators and their search grids.

    Both dicts share the keys 'rf', 'svm', 'bagging_tree' and 'bagging_svm'.
    Grid keys carry the ``clf__`` prefix because each estimator is tuned as the
    ``clf`` step of a pipeline.
    """
    classifiers = {}
    grids = {}

    ############## RF ####################################
    classifiers['rf'] = RandomForestClassifier(
        n_jobs=n_jobs, random_state=random_state, class_weight='balanced')
    grids['rf'] = {
        # Number of trees in the forest
        'clf__n_estimators': [int(x) for x in np.linspace(start=100, stop=400, num=7)],
        # Number of features to consider at every split
        'clf__max_features': ['sqrt', 'log2', 0.2, 0.4],
        # Function measuring the quality of a split
        'clf__criterion': ['gini', 'entropy'],
        # Minimum number of samples required at each leaf node
        'clf__min_samples_leaf': [1, 2, 4],
    }

    ############## SVM ####################################
    classifiers['svm'] = SVC(random_state=random_state, class_weight='balanced')
    grids['svm'] = {
        # Regularization parameter C
        'clf__C': [100, 10, 1, 0.1, 0.001],
        # Kernel type
        'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
        'clf__gamma': ['scale', 'auto', 0.001, 0.1, 1, 10],
    }

    ############## Bagging tree ####################################
    classifiers['bagging_tree'] = BaggingClassifier(random_state=random_state, n_jobs=n_jobs)
    grids['bagging_tree'] = {
        # Number of base estimators in the ensemble
        'clf__n_estimators': [10, 100, 500, 1000],
        # Number of features drawn from X to train each base estimator
        'clf__max_features': [1, 10],
    }

    ############## Bagging svm ####################################
    classifiers['bagging_svm'] = BaggingClassifier(
        SVC(class_weight='balanced', random_state=random_state),
        random_state=random_state, n_jobs=n_jobs)
    grids['bagging_svm'] = {
        # Number of base estimators in the ensemble
        'clf__n_estimators': [10, 100, 500, 1000],
        # Number of features drawn from X to train each base estimator
        'clf__max_features': [1, 10, 20],
    }

    return classifiers, grids


def tune(X, y, n_jobs=1, random_state=1, cv=None, out_dir=_TUNE_OUT_DIR):
    """Tune four classifiers with nested cross-validation and save the results.

    For every classifier (random forest, SVM, bagged trees, bagged SVMs) a
    pipeline of StandardScaler -> SMOTE -> classifier is built. The outer loop
    iterates over the folds of ``cv``; on each outer training split a
    RandomizedSearchCV (scoring: balanced accuracy, inner splitter: the same
    ``cv``) selects hyperparameters, and the refitted best pipeline is scored
    on the held-out outer fold.

    Parameters
    ----------
    X : array-like
        Data of the selected features (converted with ``np.asarray``).
    y : array-like
        Outcome labels (converted with ``np.asarray``).
    n_jobs : int
        Parallelism passed to the estimators, SMOTE and the search.
    random_state : int
        Seed passed to the estimators, SMOTE and the search.
    cv : cross-validation splitter, optional
        Object exposing ``split(X, y)``. When omitted, the notebook-level
        variable ``kf`` is used, as the original implementation did.
    out_dir : str
        Directory receiving one ``params_tune_<classifier>.csv`` per classifier.

    Returns
    -------
    dict
        Maps classifier key to a DataFrame with one row per outer fold: the
        best hyperparameters (``clf__`` prefix removed), ``val_score`` (best
        inner-CV score) and ``test_score`` (score on the outer test fold).
        Picking the most frequently selected hyperparameters across folds is
        left to the caller.

    Raises
    ------
    NameError
        If ``cv`` is not given and no notebook-level ``kf`` exists.
    """
    if cv is None:
        try:
            cv = kf  # legacy behaviour: splitter defined in another notebook cell
        except NameError:
            raise NameError(
                "tune() needs a cross-validation splitter: pass `cv=` or define `kf`"
            ) from None

    # Positional indexing below requires arrays, not DataFrames/Series.
    X = np.asarray(X)
    y = np.asarray(y)

    # Fail fast on an unusable destination instead of after the whole search.
    os.makedirs(out_dir, exist_ok=True)

    classifiers, grids = _build_search_space(n_jobs, random_state)
    results = {}

    for key, clf in classifiers.items():
        print(key)
        # NOTE(review): newer imbalanced-learn releases deprecate SMOTE's
        # `n_jobs` argument - confirm against the installed version.
        pipeline = Pipeline([('scale', StandardScaler()),
                             ("smote", SMOTE(random_state=random_state, n_jobs=n_jobs)),
                             ("clf", clf)])

        fold_records = []
        for train_index, test_index in cv.split(X, y):
            # Outer split: index the function arguments, not notebook globals.
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]

            # Inner search; scaling and SMOTE are refit inside every inner fold
            # because they are pipeline steps.
            search = RandomizedSearchCV(
                pipeline, grids[key], scoring="balanced_accuracy", n_iter=100,
                n_jobs=n_jobs, cv=cv, random_state=random_state
            ).fit(X_train, y_train)

            best_score = search.best_score_
            print(f"Best Tuning balanced accuracy: {best_score}")

            y_pred = search.best_estimator_.predict(X_test)
            score_test = metrics.balanced_accuracy_score(y_test, y_pred)
            print(f"Best estimator balanced accuracy in test dataset: {score_test}")

            record = {
                name.replace("clf__", ""): value
                for name, value in search.best_params_.items()
            }
            record['val_score'] = best_score
            record['test_score'] = score_test
            fold_records.append(record)

        # Build the frame once; DataFrame.append no longer exists in pandas >= 2.0.
        df_best_params = pd.DataFrame(fold_records)

        file_out = 'params_tune_' + key + '.csv'
        df_best_params.to_csv(os.path.join(out_dir, file_out))
        results[key] = df_best_params

    return results