In [1]:
"""
Problem: Search for an optimal set of hyperparameters for a learning algorithm
Solution: Find the set of hyperparameters that results in an optimal model
An optimal model yields an optimal score
Score: In sklearn, defaults to accuracy (classification) and R^2 (regression)
Cross-validation is used to estimate the generalization performance

"""

'\nProblem :  Search for a optimal set of hyperparameters for a learning algorithm\nSolution: Find a set of optimal hyperparameters that result in an optimal model\nOptimal model uyields an optimal score\nScore: In Sklearn defaults to accuracy (Classification) and R2 (Regression)\nCrossValidation is required to estimate the generalization performance\n\n'

In [2]:
"""
Why tune hyperparameters?
The default hyperparameters in sklearn are not optimal for every problem

Approaches to Hyperparameter tuning:
GridSearch
RandomSearch
Bayesian Optimization
Genetic Algorithms

"""


'\nWhy to tune hyperparameter ?\nDefault parameter in sklearn are not optimal\n\nApproaches to Hyperparameter tuning:\nGridSearch\nRandomSearch\nBayesian Optimization\nGeneric Algorithms\n\n'

In [11]:
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_diabetes
import pandas as pd

#import models and utility functions
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

# import GridSearchCV
from sklearn.model_selection import GridSearchCV

In [5]:
# Load the breast cancer dataset that ships with scikit-learn
breast = load_breast_cancer()

# Quick look at the bunch: its fields, three sample labels, and the label names
print(breast.keys())
print(breast.target[[10, 50, 85]])
print(breast.target_names)

# Feature matrix and target vector used by the rest of the notebook
X, y = breast.data, breast.target

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
[0 1 0]
['malignant' 'benign']


In [6]:
# Single seed reused by the train/test split and the decision tree for reproducibility
SEED = 123

In [7]:
# Hold out 20% of the samples as a test set; stratifying on y keeps the
# class proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

In [8]:
# Base classification tree handed to the grid search.
# Starting configuration: max_depth=4, min_samples_leaf=0.15 (a float is
# interpreted by scikit-learn as a fraction of the training samples).
dt = DecisionTreeClassifier(
    max_depth=4,
    min_samples_leaf=0.15,
    random_state=SEED,
)

In [10]:
# Print every hyperparameter of the decision tree dt: the values set explicitly
# above plus the scikit-learn defaults for everything else
print(dt.get_params())

{'class_weight': None, 'criterion': 'gini', 'max_depth': 4, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 0.15, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': 123, 'splitter': 'best'}


In [15]:
# Hyperparameter grid passed to GridSearchCV.
# 7 depths x 4 leaf fractions x 6 feature fractions = 168 candidate combinations.
params_dt = dict(
    max_depth=list(range(2, 9)),  # 2 through 8 inclusive
    min_samples_leaf=[0.2, 0.3, 0.4, 0.5],
    max_features=[0.2, 0.3, 0.4, 0.5, 0.6, 0.8],
)

In [16]:
# Instantiate a GridSearchCV object over params_dt: 10-fold cross-validation,
# accuracy as the selection metric, n_jobs=-1 to use all available processors
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=params_dt,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

In [17]:
# Run the grid search on the training split only; the test split stays untouched
grid_dt.fit(X=X_train, y=y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=4,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=0.15,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=123,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4, 5, 6, 7, 8],
                         'max_features': [0.2, 0.3, 0.4, 0.5, 0.6, 0.8],
    

In [24]:
# Print the hyperparameter combination that achieved the best
# cross-validation score during the grid search

print(grid_dt.best_params_)

{'max_depth': 2, 'max_features': 0.2, 'min_samples_leaf': 0.3}


In [19]:
# Predict labels for the held-out test set; calling predict on the search
# object delegates to the best estimator it found
y_pred = grid_dt.predict(X=X_test)

In [25]:
# Print the mean cross-validated accuracy of the best hyperparameter combination
print(grid_dt.best_score_)

0.9164835164835164


In [29]:
# Keep a handle on the best estimator found by the search and display its configuration
best_model = grid_dt.best_estimator_
print(best_model)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=0.2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')


In [30]:
# Accuracy of the best model on the held-out test set
test_acc = best_model.score(X=X_test, y=y_test)

In [31]:
# Report the test-set accuracy computed from best_model.score
print(test_acc)

0.9473684210526315
