In [None]:
import os
import pandas as pd
import numpy as np
from collections import Counter
import copy

from imblearn.pipeline import Pipeline

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

import warnings
# Globally silence every warning category. NOTE(review): this also hides
# warnings raised during model fitting and hyperparameter search, so narrow
# the filter if those messages are needed while debugging.
warnings.filterwarnings("ignore")

In [None]:
def _build_search_space(n_jobs, random_state):
    """Return the candidate classifiers and their randomized-search grids.

    Both returned dicts share the same keys ('rf', 'svm', 'bagging_tree',
    'mlp'). Grid keys carry the ``clf__`` prefix because each classifier is
    placed in a pipeline step named ``clf``.
    """
    # Local import: the file-level import block does not provide MLPClassifier.
    from sklearn.neural_network import MLPClassifier

    lst_clf = {}
    lst_grid = {}

    ##### Random forest #####
    lst_clf['rf'] = RandomForestClassifier(
        n_jobs=n_jobs, random_state=random_state, class_weight='balanced')
    lst_grid['rf'] = {
        # Number of trees in the forest: 100, 150, ..., 400
        'clf__n_estimators': [int(x) for x in np.linspace(start=100, stop=400, num=7)],
        # Number of features to consider at every split
        'clf__max_features': ['sqrt', 'log2', 0.2, 0.4],
        # The function to measure the quality of a split
        'clf__criterion': ['gini', 'entropy'],
        # Minimum number of samples required at each leaf node
        'clf__min_samples_leaf': [1, 2, 4],
    }

    ##### Support vector machine #####
    lst_clf['svm'] = SVC(random_state=random_state, class_weight='balanced')
    lst_grid['svm'] = {
        # Regularization parameter C
        'clf__C': [100, 10, 1, 0.1, 0.001],
        # Kernel type to be used
        'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
        'clf__gamma': ['scale', 'auto', 0.001, 0.1, 1, 10],
    }

    ##### Bagging tree #####
    lst_clf['bagging_tree'] = BaggingClassifier(random_state=random_state, n_jobs=n_jobs)
    lst_grid['bagging_tree'] = {
        # The number of base estimators in the ensemble
        'clf__n_estimators': [10, 100, 500, 1000],
        # The number of features to draw from X to train each base estimator.
        # NOTE(review): the value 10 presumably requires X to have at least
        # 10 columns -- confirm against the datasets passed in.
        'clf__max_features': [1, 10],
    }

    ##### MLP #####
    lst_clf['mlp'] = MLPClassifier(random_state=random_state, max_iter=3000)
    lst_grid['mlp'] = {
        'clf__hidden_layer_sizes': [(10,), (20,)],
        'clf__activation': ['tanh', 'relu'],
        'clf__solver': ['sgd', 'adam'],
        'clf__alpha': [0.0001, 0.001, 0.01],
        'clf__learning_rate': ['constant', 'adaptive'],
    }

    return lst_clf, lst_grid


def tune(X_train, y_train, X_test, y_test, folder_out, n_jobs=1, random_state=1, cv=None):
    """Tune several classifiers with a randomized search and save the results.

    For each classifier (random forest, SVM, bagging, MLP) a pipeline of
    ``StandardScaler`` followed by the classifier is tuned on the training
    data with ``RandomizedSearchCV``, scored by the Matthews correlation
    coefficient (MCC). The best estimator is then evaluated on the test data
    and one CSV per classifier (``<name>.csv``) is written to ``folder_out``
    containing the best parameters, the cross-validated score (``val_score``)
    and the test score (``test_score``).

    Parameters
    ----------
    X_train : array
        Training dataset which was used to select the features.
    y_train : array
        Binary outcome with 0 and 1.
    X_test : array
        Test dataset.
    y_test : array
        Binary outcome with 0 and 1.
    folder_out : str
        Folder to save the results; created if it does not exist.
    n_jobs : int, default 1
        Passed to the classifiers that accept it and to the search.
    random_state : int, default 1
        Seed for the classifiers, the search and the fallback splitter.
    cv : cross-validation splitter or int, optional
        Forwarded to ``RandomizedSearchCV``. When omitted, a module-level
        ``kf`` is used if one exists (the previous implicit behaviour);
        otherwise a shuffled 5-fold ``StratifiedKFold`` seeded with
        ``random_state`` is created.
    """
    if cv is None:
        try:
            # Earlier versions silently relied on a notebook-level global.
            cv = kf
        except NameError:
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    # Make sure the CSVs have somewhere to go before spending time on tuning.
    os.makedirs(folder_out, exist_ok=True)

    lst_clf, lst_grid = _build_search_space(n_jobs, random_state)

    # The scorer does not depend on the classifier, so build it once.
    mcc_scorer = metrics.make_scorer(metrics.matthews_corrcoef)

    ##### hyperparameter tuning on training dataset #####
    for name, clf in lst_clf.items():
        print(name)
        # Scaling lives inside the pipeline so it is re-fitted on each CV
        # training fold rather than on the whole training set.
        pipeline = Pipeline([('scale', StandardScaler()),
                             ('clf', clf)])

        search = RandomizedSearchCV(
            pipeline, lst_grid[name], scoring=mcc_scorer, n_iter=100,
            random_state=random_state, n_jobs=n_jobs, cv=cv,
            return_train_score=True,
        ).fit(X_train, y_train)

        best_score = search.best_score_
        print(f"Best Tuning MCC: {best_score}")

        best_params = dict(search.best_params_)
        print(best_params)

        # best_estimator_ is the pipeline refitted on the full training set.
        y_pred = search.best_estimator_.predict(X_test)
        score_test = metrics.matthews_corrcoef(y_test, y_pred)
        print(f"Best estimator MCC in test dataset: {score_test}")

        # One-row frame built from a single record: tuple-valued parameters
        # (e.g. hidden_layer_sizes) stay in one cell instead of being
        # interpreted as a row of values.
        df_ = pd.DataFrame([best_params])
        df_['val_score'] = best_score
        df_['test_score'] = score_test

        path_out = os.path.join(folder_out, name + '.csv')
        df_.to_csv(path_out, index=False)