### CROSS VALIDATION RECAP

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold,StratifiedKFold,train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,classification_report
import pandas as pd
import numpy as np

In [2]:
X = load_breast_cancer(as_frame=True)['data']
y = load_breast_cancer()['target']

In [19]:
# initialize kfold object

kfold = KFold(n_splits= 5, random_state= 23, shuffle= True)

# init stratified kfold object
s_fold = StratifiedKFold(n_splits=5, random_state= 23, shuffle=True)

In [21]:
classifier = RandomForestClassifier(random_state= 23)
score = cross_val_score(estimator=classifier,X=X, y=y,
                        scoring='f1',n_jobs=1, verbose = 3,cv=s_fold.split(X, y))

[CV] END ................................ score: (test=0.978) total time=   0.6s
[CV] END ................................ score: (test=0.966) total time=   0.7s
[CV] END ................................ score: (test=0.959) total time=   0.5s
[CV] END ................................ score: (test=0.979) total time=   0.9s
[CV] END ................................ score: (test=0.972) total time=   0.8s


In [10]:
score

array([0.93617021, 0.95890411, 0.99300699, 0.97931034, 0.98571429])

### HYPER-PARAMETER TUNING/OPTIMIZATION

1. GridSearch 
2. Randomized Search
3. Bayesian Optimization

In [22]:
# create a function

def train_evaluate(X, y, **params):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,
                                                        random_state=23, stratify=y)
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    test_preds = model.predict(X_test)
    test_score = f1_score(y_test, test_preds)
    train_preds = model.predict(X_train)
    train_score = f1_score(y_train, train_preds)
    
    return test_score, train_score


train_evaluate(X=X, y=y, n_estimators= 20, max_depth = 5, random_state = 23)
    

(0.950354609929078, 0.9982486865148862)

In [30]:
n_estimators = [10,15,20,25,30]
max_depth = [2,4,6,8]

score_params = {}
for estimator in n_estimators:
    for depth in max_depth:
        test_score, train_score = train_evaluate(X=X, y=y, 
                                                 n_estimators = estimator,
                                                 max_depth = depth)
        score_params.update({test_score:f'n_estimator:{estimator}, max_depth:{depth}'})
        print(f'for estimator:{estimator}, max_depth: {depth}...train:{train_score},test_score:{test_score}')

score_params[max(list(score_params.keys()))]

for estimator:10, max_depth: 2...train:0.9651567944250871,test_score:0.9444444444444444
for estimator:10, max_depth: 4...train:0.9912739965095986,test_score:0.9726027397260274
for estimator:10, max_depth: 6...train:0.9982425307557118,test_score:0.951048951048951
for estimator:10, max_depth: 8...train:0.9982425307557118,test_score:0.9361702127659575
for estimator:15, max_depth: 2...train:0.96875,test_score:0.9523809523809523
for estimator:15, max_depth: 4...train:0.9895104895104895,test_score:0.951048951048951
for estimator:15, max_depth: 6...train:1.0,test_score:0.9655172413793104
for estimator:15, max_depth: 8...train:1.0,test_score:0.9655172413793104
for estimator:20, max_depth: 2...train:0.9722222222222222,test_score:0.9655172413793104
for estimator:20, max_depth: 4...train:0.993006993006993,test_score:0.951048951048951
for estimator:20, max_depth: 6...train:0.9982425307557118,test_score:0.9863013698630136
for estimator:20, max_depth: 8...train:0.9982486865148862,test_score:0.965986

'n_estimator:20, max_depth:6'

In [29]:
score_params[0.9659863945578231]

'n_estimator:25, max_depth:8'

### GRID SEARCH

In [31]:
# init the model 
classifier = RandomForestClassifier(random_state=23)

# init the kfold object
s_fold = StratifiedKFold(n_splits=5, random_state= 23, shuffle=True)

params = {
    'n_estimators': list(range(10,100, 10)),
    'max_depth': list(range(2,11, 2)),
    'criterion': ['gini','entropy']
}

model = GridSearchCV(estimator=classifier, param_grid=params,
                     scoring='f1',n_jobs=1,cv = s_fold.split(X,y),verbose=3,
                     return_train_score=True)

model.fit(X=X, y=y)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END criterion=gini, max_depth=2, n_estimators=10;, score=(train=0.962, test=0.964) total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=2, n_estimators=10;, score=(train=0.964, test=0.945) total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=2, n_estimators=10;, score=(train=0.965, test=0.940) total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=2, n_estimators=10;, score=(train=0.964, test=0.951) total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=2, n_estimators=10;, score=(train=0.964, test=0.958) total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=2, n_estimators=20;, score=(train=0.961, test=0.978) total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=2, n_estimators=20;, score=(train=0.971, test=0.952) total time=   0.2s
[CV 3/5] END criterion=gini, max_depth=2, n_estimators=20;, score=(train=0.976, test=0.947) total time=   0.3s
[CV 4/5] END criterion=gini, max_depth=2, n_estima

In [32]:
model.best_params_

{'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 90}

In [33]:
model.best_score_

0.9725173015538034

### Random Search

In [4]:
# init the model 
classifier = RandomForestClassifier(random_state=23)

# init the kfold object
s_fold = StratifiedKFold(n_splits=5, random_state= 23, shuffle=True)

params = {
    'n_estimators': list(range(10,100, 10)),
    'max_depth': list(range(2,11, 2)),
    'criterion': ['gini','entropy']
}

model = RandomizedSearchCV(estimator=classifier,param_distributions=params,
                           n_iter=50,scoring='f1',n_jobs=1,random_state=23,
                           verbose=3, cv = s_fold.split(X,y))

model.fit(X=X, y=y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END criterion=gini, max_depth=6, n_estimators=90;, score=0.986 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=6, n_estimators=90;, score=0.959 total time=   0.7s
[CV 3/5] END criterion=gini, max_depth=6, n_estimators=90;, score=0.952 total time=   0.5s
[CV 4/5] END criterion=gini, max_depth=6, n_estimators=90;, score=0.979 total time=   0.5s
[CV 5/5] END criterion=gini, max_depth=6, n_estimators=90;, score=0.972 total time=   0.6s
[CV 1/5] END criterion=entropy, max_depth=2, n_estimators=20;, score=0.978 total time=   0.1s
[CV 2/5] END criterion=entropy, max_depth=2, n_estimators=20;, score=0.952 total time=   0.1s
[CV 3/5] END criterion=entropy, max_depth=2, n_estimators=20;, score=0.940 total time=   0.1s
[CV 4/5] END criterion=entropy, max_depth=2, n_estimators=20;, score=0.946 total time=   0.1s
[CV 5/5] END criterion=entropy, max_depth=2, n_estimators=20;, score=0.958 total time=   0.0s
[CV 1/5] END 