In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import random
from sklearn.datasets import load_iris

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

import os
# Render every float in displayed DataFrames with three decimal places.
pd.set_option('display.float_format', '{:.3f}'.format)

import tqdm
import time

In [2]:
# Load the Iris dataset shipped with scikit-learn. No machine-specific file
# path is involved, so this cell runs on a fresh kernel anywhere.
df = load_iris()

# Feature matrix: one named column per entry of df.feature_names.
X = pd.DataFrame(data=df.data, columns=df.feature_names)
# Target labels, kept as a single-column DataFrame.
y = pd.DataFrame(df.target)

In [3]:
# Hyperparameter search space: each key is an estimator constructor argument
# and each value is the list of candidate settings to sample from.
# NOTE(review): 'auto' for max_features appears to have been removed in newer
# scikit-learn releases -- confirm against the installed version.
param_grid = dict(
    bootstrap=[True, False],
    max_depth=[depth for depth in range(2, 10)],
    max_features=['auto', 'sqrt', 'log2'],
    min_samples_leaf=[leaf_size for leaf_size in range(5, 20)],
    n_estimators=[n_trees for n_trees in range(30, 300, 5)],
)

In [4]:
# Estimator class, deliberately left uninstantiated: random_objective builds a
# fresh instance from it for every fold.
algo = RandomForestClassifier
# Metric callables used to score each fold's predictions; their __name__
# becomes a column of the results table.
metric_list = [accuracy_score]

In [5]:
def random_objective(params,
                     algo,
                     metric_list,
                     X, y,
                     iterations = 100,
                     n_splits = 10,
                     random_state = None):
    """Random search over a hyperparameter space, scored by K-fold cross-validation.

    Each round draws one value per hyperparameter uniformly at random from
    ``params``, trains a fresh ``algo`` instance on every training fold, and
    averages each metric over the folds.

    Parameters
    ----------
    params : dict
        Search space. Maps each keyword argument of ``algo`` to a non-empty
        collection of candidate values.
    algo : callable
        Estimator class (not an instance). It is called as ``algo(**sampled)``
        and the result must provide ``fit`` and ``predict``.
    metric_list : list of callables
        Metrics called as ``metric(y_true, y_pred)``. Each metric's
        ``__name__`` is used as its column name in the output.
    X : pandas.DataFrame
        Feature matrix, one row per observation.
    y : pandas.DataFrame or pandas.Series
        Target aligned row-by-row with ``X``. It is flattened to 1-D before
        being passed to ``fit`` and to the metrics.
    iterations : int, default 100
        Number of random hyperparameter draws (rows of the output).
    n_splits : int, default 10
        Number of cross-validation folds.
    random_state : int or None, default None
        When given, seeds both the hyperparameter sampling and the fold
        shuffling, so the same candidates and the same folds are produced on
        every call. It is NOT forwarded to ``algo``; to also fix the
        estimator's own randomness, add its seed argument to ``params``.
        When None, the module-level ``random`` generator is used for sampling
        and folds are shuffled without a fixed seed.

    Returns
    -------
    pandas.DataFrame
        One row per round: the sampled hyperparameter values followed by the
        fold-averaged value of each metric.
    """
    metric_names = [metric.__name__ for metric in metric_list]

    # Output columns: the search-space keys followed by the metric names.
    param_col = list(params) + metric_names

    # A private generator is created only when a seed is requested, so that
    # the unseeded path keeps drawing from the shared `random` module state.
    sampler = random if random_state is None else random.Random(random_state)

    # The splitter does not depend on the sampled hyperparameters, so it is
    # built once instead of once per round.
    kfolds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    lots_vals = []
    for _ in tqdm.tqdm_notebook(range(iterations)):
        # Randomly pick one candidate value for every hyperparameter.
        sampled = {key: sampler.choice(list(values)) for key, values in params.items()}

        fold_scores = []
        for train_idx, test_idx in kfolds.split(X):
            # KFold yields positional indices, so rows are selected with
            # .iloc; .loc would look them up as labels and break on any
            # index that is not the default 0..n-1 range.
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train = np.ravel(y.iloc[train_idx])
            y_test = np.ravel(y.iloc[test_idx])

            model = algo(**sampled)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            fold_scores.append([metric(y_test, y_pred) for metric in metric_list])

        # One row per fold, one column per metric; average over the folds.
        mean_scores = pd.DataFrame(fold_scores, columns=metric_names).mean().tolist()

        lots_vals.append(list(sampled.values()) + mean_scores)

    return pd.DataFrame(lots_vals, columns=param_col)

In [7]:
# Echo the estimator class currently bound to `algo`.
algo

sklearn.ensemble.forest.RandomForestClassifier

In [6]:
# Run the random search with the default number of iterations and display
# the per-round results table.
random_objective(
    params=param_grid,
    algo=algo,
    metric_list=metric_list,
    X=X,
    y=y,
)

HBox(children=(IntProgress(value=0), HTML(value='')))




Unnamed: 0,bootstrap,max_depth,max_features,min_samples_leaf,n_estimators,accuracy_score
0,True,4,auto,11,215,0.947
1,False,9,log2,18,165,0.927
2,False,8,log2,11,180,0.947
3,False,4,sqrt,8,145,0.940
4,True,2,log2,7,110,0.920
5,False,6,sqrt,13,70,0.947
6,False,6,auto,19,255,0.933
7,True,3,log2,8,50,0.953
8,True,5,auto,18,160,0.933
9,True,4,auto,17,240,0.933
