In [None]:
# Name of the dataset; the cells below read from and write to 'Data/<dataset_name>/'.
dataset_name = "kidney_disease"
# Column of the cleaned dataset that is split off as the prediction target.
target_column_name = "classification"

# Hyperparameter Tuning
This step consists of finding the best hyperparameters for each model.
## Select Data and Target

In [None]:
import pandas as pd

# Load the cleaned dataset for the configured `dataset_name`.
clean_df = pd.read_csv('Data/'+dataset_name+'/clean_dataset.csv')

# Every column except the target is used as a feature.
# Named `feature_columns` (not `all`) so the built-in `all()` stays usable
# in every later cell of the kernel session.
feature_columns = clean_df.columns.drop(target_column_name)
data_df = clean_df[feature_columns]
target_df = clean_df[target_column_name]

## Tuner
Several ready-made solutions already exist to tune these parameters: random search, grid search, halving search, etc. Instead of using one of these, we decided to build our own solution. Starting from a random parameter set, it changes one parameter at a time and keeps any change that does not lower the score, so it finds a local optimum without evaluating the whole grid.

In [None]:
import random
from statistics import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import warnings
import time

def localOptimumSearchCV(model_class, param_grid, X, y, cv_scoring=True, cv=5, test_size=0.2, n_iter_max=100, depth_rate=0.25, max_time=120, random_state=None):
    """Find a locally optimal set of parameters for ``model_class``.

    The search starts from a randomly drawn parameter set. At each iteration
    one parameter is picked at random and shifted (cyclically) further away
    from its current best value. Whenever the resulting set scores at least as
    well as the best score so far, it becomes the new starting point and the
    exploration depth of every parameter is reset.

    Parameters
    --------
    model_class : class of the model to tune
        Instantiated as ``model_class(**params)``; the instance must provide
        a ``fit`` and a ``score`` function.

    param_grid : dict
        Dictionary with parameters names (string) as keys and indexable,
        non-empty sequences of parameter settings to try as values.

    X : array-like of shape (n_samples, n_features)
        Training vector, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    y : array-like of shape (n_samples, n_output) or (n_samples,)
        Target relative to X for classification or regression.

    cv_scoring : boolean, default=True
        Use cross validation for scoring. If false, a single train-test split
        is made once and reused to score every candidate.

    cv : int, default=5
        Number of folds passed to ``cross_val_score``. Only used when
        ``cv_scoring`` is True.

    test_size : float, default=0.2
        ``test_size`` passed to ``train_test_split``. Only used when
        ``cv_scoring`` is False.

    n_iter_max : int, default=100
        Maximum number of iterations. If a local optimum is found earlier,
        the search is stopped. When the whole grid holds fewer than
        ``n_iter_max`` combinations, every parameter is explored over its
        full range of values.

    depth_rate : float, default=0.25
        Fraction of a parameter's values explored from the current best set
        before that parameter is considered exhausted: each parameter is
        explored to a depth of max(1, int(depth_rate*nb_parameter_values)).

    max_time : int, default=120
        Max execution time (seconds). It is checked between iterations, so a
        single long fit can exceed it.

    random_state : int or None, default=None
        Seed for the search and for the train-test split. With None, the
        module-level ``random`` generator is used (so a global
        ``random.seed`` call is still honoured) and the split is unseeded.

    Returns
    --------
    (dict, float) : dictionary of the best parameter set found and score of
        the model with this set. If no candidate could be scored, the initial
        random set is returned with a score of 0.

    Raises
    --------
    ValueError
        If a parameter of ``param_grid`` has no candidate value.
    """
    # A list (not a set) keeps the iteration order stable, which is required
    # for the search to be reproducible when `random_state` is given.
    param_names = list(param_grid)
    n_values = {name: len(param_grid[name]) for name in param_names}
    for name in param_names:
        if n_values[name] == 0:
            raise ValueError("param_grid[%r] must contain at least one value" % (name,))

    rng = random if random_state is None else random.Random(random_state)

    max_sets = 1
    max_depth = {}
    best_param_indexes = {}
    for name in param_names:
        max_sets *= n_values[name]
        max_depth[name] = max(1, int(depth_rate * n_values[name]))
        best_param_indexes[name] = rng.randint(0, n_values[name] - 1)

    # The grid is small enough for the iteration budget: explore full ranges.
    if max_sets < n_iter_max:
        max_depth = dict(n_values)

    def indexes_to_params(param_indexes: dict):
        """Translate a {name: index} mapping into a {name: value} mapping."""
        return {name: param_grid[name][param_indexes[name]] for name in param_names}

    current_depth = dict.fromkeys(param_names, 0)
    tried_params = set()
    computed_param_sets = set()  # index tuples that were already evaluated
    # -inf (rather than 0) so that scorers returning negative values can
    # still improve on the starting point.
    best_score = float("-inf")
    n_iter = 0

    start_time = time.monotonic()

    if not cv_scoring:
        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)

    while (n_iter < n_iter_max
           and len(tried_params) < len(param_names)
           and best_score != 1
           and time.monotonic() - start_time < max_time):
        n_iter += 1
        param = rng.choice([name for name in param_names if name not in tried_params])
        current_depth[param] += 1
        if current_depth[param] >= max_depth[param]:
            tried_params.add(param)

        # Shift only the chosen parameter, wrapping around its value list.
        param_indexes = dict(best_param_indexes)
        param_indexes[param] = (param_indexes[param] + current_depth[param]) % n_values[param]

        set_key = tuple(param_indexes[name] for name in param_names)
        if set_key in computed_param_sets:
            continue
        computed_param_sets.add(set_key)

        model = model_class(**indexes_to_params(param_indexes))
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                if cv_scoring:
                    score = mean(cross_val_score(model, X, y, cv=cv))
                else:
                    model.fit(x_train, y_train)
                    score = model.score(x_test, y_test)
            # A NaN score compares False here and is therefore ignored.
            if score >= best_score:
                best_score = score
                best_param_indexes = param_indexes
                current_depth = dict.fromkeys(param_names, 0)
                tried_params = set()
        except Exception:
            # Invalid parameter combinations are expected; widen the range of
            # the parameter that was just changed. KeyboardInterrupt and
            # SystemExit are deliberately not caught so the search can be
            # interrupted.
            max_depth[param] = n_values[param]

    if best_score == float("-inf"):
        # Nothing could be scored: keep the historical return value.
        best_score = 0
    return indexes_to_params(best_param_indexes), best_score


## Model tuning application
### Model imports

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

### Parameter grids definition

In [None]:
import numpy as np

# One entry per model to tune: the estimator class, a display name, and the
# grid of candidate values for each constructor parameter.
model_parameters = [
    {
        'model_class': KNeighborsClassifier,
        'name': 'KNN',
        'param_grid': {
            'n_neighbors': range(2, 20),
            'weights': ['uniform', 'distance'],
            'metric': ['minkowski', 'euclidean', 'manhattan'],
        },
    },
    {
        'model_class': RandomForestClassifier,
        'name': 'Random Forest',
        'param_grid': {
            'n_estimators': [5, 10, 50, 100, 500, 1000],
            'criterion': ['gini', 'entropy'],
            'max_depth': list(range(5, 55, 5)) + [None],
            'max_features': ['sqrt', 'log2', None],
            'min_samples_split': range(2, 10),
            'min_impurity_decrease': np.arange(0, 0.1, 0.01),
            'bootstrap': [True, False],
        },
    },
    {
        'model_class': DecisionTreeClassifier,
        'name': 'Decision Tree',
        'param_grid': {
            'criterion': ['gini', 'entropy'],
            'max_depth': list(range(5, 55)) + [None],
            'max_features': ['sqrt', 'log2', None],
            'min_samples_split': range(2, 10),
            'min_impurity_decrease': np.arange(0, 0.1, 0.01),
        },
    },
    {
        'model_class': LogisticRegression,
        'name': 'Logistic Regression',
        'param_grid': {
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            'penalty': ['l1', 'l2', 'elasticnet', None],
            'C': [100, 10, 1.0, 0.1, 0.01],
            'max_iter': [1000],
        },
    },
    {
        'model_class': GaussianNB,
        'name': 'Naive Bayes',
        'param_grid': {
            'var_smoothing': [1e-12, 1e-11, 1e-10, 1e-9, 1e-8],
        },
    },
    {
        'model_class': LinearDiscriminantAnalysis,
        'name': 'LDA',
        'param_grid': {
            'solver': ['svd', 'lsqr', 'eigen'],
            'shrinkage': [None, 'auto', 0, 0.5, 1],
        },
    },
]

### Optimized parameters computation

In [None]:
import json

# Tune every model declared in `model_parameters` and collect the results.
optimized_parameters = []
for model_spec in model_parameters:
    # cv_scoring=False: each candidate is scored on a single train-test split.
    best_params, best_score = localOptimumSearchCV(
        model_spec['model_class'], model_spec['param_grid'], data_df, target_df,
        cv_scoring=False, n_iter_max=500, cv=5)

    print(model_spec['name'].ljust(25), best_score)
    optimized_parameters.append({
        'model_class_name'  : model_spec['model_class'].__name__,
        'best_params'       : best_params,
        'best_score'        : best_score,
        'name'              : model_spec['name']
    })

# The context manager closes the file even if serialisation fails.
with open('Data/'+dataset_name+'/tuned_hyperparameters.json', 'w') as output_file:
    json.dump(optimized_parameters, output_file)