# Hyperparameter tuning
Types:
1. Grid Search  >> evaluates every combination of the parameter values given to it
2. Random Search  >> evaluates a random subset (not all) of the parameter combinations given to it
3. Bayesian Optimization
4. Gradient-based Optimization


# Cross Validation
Evaluates the model on different combinations of train/test splits.
Cross validation and hyperparameter tuning go side by side.

# GridSearchCV

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the Iris dataset.
# load_iris() does not hand back a DataFrame, so .head() is not available on it.
from sklearn.datasets import load_iris

iris = load_iris()

# The loaded object already keeps features and target apart; unpack both at once.
X, y = iris.data, iris.target

In [5]:
%%time
#define the model
model= RandomForestClassifier()

#create the parameter grid
param_grid = {
    "n_estimators": [50, 100, 200, 300, 400, 500],
    #"max_features": ['auto', 'sqrt', 'log2'],
    "max_depth": [4,5,6,7,8,9,10],
    #'criterion': ['gini', 'entropy']
}

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,                      #cross validation
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

#fit the model
grid.fit(X,y)

#print the best parameters
print(f"Best parameters: {grid.best_params_}")



Fitting 5 folds for each of 42 candidates, totalling 210 fits
Best parameters: {'max_depth': 4, 'n_estimators': 50}


In [4]:
%%time
#define the model
model= RandomForestClassifier()

#create the parameter grid
param_grid = {
    "n_estimators": [50, 100, 200, 300, 400, 500],
    #"max_features": ['auto', 'sqrt', 'log2'],
    "max_depth": [4,5,6,7,8,9,10],
    'criterion': ['gini', 'entropy']
}

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

#fit the model
grid.fit(X,y)

#print the best parameters
print(f"Best parameters: {grid.best_params_}")

Fitting 5 folds for each of 84 candidates, totalling 420 fits
Best parameters: {'criterion': 'gini', 'max_depth': 4, 'n_estimators': 100}
CPU times: total: 1.89 s
Wall time: 2min 49s


In [6]:
%%time
#define the model
model= RandomForestClassifier()

#create the parameter grid
param_grid = {
    "n_estimators": [50, 100, 200, 300, 400, 500],
    #"max_features": ['auto', 'sqrt', 'log2'],
    "max_depth": [4,5,6,7,8,9,10],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

#fit the model
grid.fit(X,y)

#print the best parameters
print(f"Best parameters: {grid.best_params_}")

Fitting 5 folds for each of 168 candidates, totalling 840 fits
Best parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 4, 'n_estimators': 50}
CPU times: total: 3.64 s
Wall time: 5min 22s


# Random Search

In [4]:
%%time
#define the model
model= RandomForestClassifier()

#create the parameter grid
param_grid = {
    "n_estimators": [50, 100, 200, 300, 400, 500],
    #"max_features": ['auto', 'sqrt', 'log2'],
    "max_depth": [4,5,6,7,8,9,10],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

grid = RandomizedSearchCV(    #used RandomizedSearchCV
    estimator=model,
    param_distributions=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    n_iter=20
)

#fit the model
grid.fit(X,y)

#print the best parameters
print(f"Best parameters: {grid.best_params_}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters: {'n_estimators': 400, 'max_depth': 7, 'criterion': 'gini', 'bootstrap': True}
CPU times: total: 1.31 s
Wall time: 56.9 s
