### CROSS VALIDATION RECAP

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold,StratifiedKFold,train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,classification_report
import pandas as pd
import numpy as np

In [2]:
X = load_breast_cancer(as_frame=True)['data']
y = load_breast_cancer()['target']

In [19]:
# initialize kfold object

kfold = KFold(n_splits= 5, random_state= 23, shuffle= True)

# init stratified kfold object
s_fold = StratifiedKFold(n_splits=5, random_state= 23, shuffle=True)

In [21]:
classifier = RandomForestClassifier(random_state= 23)
score = cross_val_score(estimator=classifier,X=X, y=y,
                        scoring='f1',n_jobs=1, verbose = 3,cv=s_fold.split(X, y))

[CV] END ................................ score: (test=0.978) total time=   0.6s
[CV] END ................................ score: (test=0.966) total time=   0.7s
[CV] END ................................ score: (test=0.959) total time=   0.5s
[CV] END ................................ score: (test=0.979) total time=   0.9s
[CV] END ................................ score: (test=0.972) total time=   0.8s


In [10]:
score

array([0.93617021, 0.95890411, 0.99300699, 0.97931034, 0.98571429])

### HYPER-PARAMETER TUNING/OPTIMIZATION

1. GridSearch 
2. Randomized Search
3. Bayesian Optimization

In [22]:
# create a function

def train_evaluate(X, y, **params):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,
                                                        random_state=23, stratify=y)
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    test_preds = model.predict(X_test)
    test_score = f1_score(y_test, test_preds)
    train_preds = model.predict(X_train)
    train_score = f1_score(y_train, train_preds)
    
    return test_score, train_score


train_evaluate(X=X, y=y, n_estimators= 20, max_depth = 5, random_state = 23)
    

(0.950354609929078, 0.9982486865148862)

In [30]:
n_estimators = [10,15,20,25,30]
max_depth = [2,4,6,8]

score_params = {}
for estimator in n_estimators:
    for depth in max_depth:
        test_score, train_score = train_evaluate(X=X, y=y, 
                                                 n_estimators = estimator,
                                                 max_depth = depth)
        score_params.update({test_score:f'n_estimator:{estimator}, max_depth:{depth}'})
        print(f'for estimator:{estimator}, max_depth: {depth}...train:{train_score},test_score:{test_score}')

score_params[max(list(score_params.keys()))]

for estimator:10, max_depth: 2...train:0.9651567944250871,test_score:0.9444444444444444
for estimator:10, max_depth: 4...train:0.9912739965095986,test_score:0.9726027397260274
for estimator:10, max_depth: 6...train:0.9982425307557118,test_score:0.951048951048951
for estimator:10, max_depth: 8...train:0.9982425307557118,test_score:0.9361702127659575
for estimator:15, max_depth: 2...train:0.96875,test_score:0.9523809523809523
for estimator:15, max_depth: 4...train:0.9895104895104895,test_score:0.951048951048951
for estimator:15, max_depth: 6...train:1.0,test_score:0.9655172413793104
for estimator:15, max_depth: 8...train:1.0,test_score:0.9655172413793104
for estimator:20, max_depth: 2...train:0.9722222222222222,test_score:0.9655172413793104
for estimator:20, max_depth: 4...train:0.993006993006993,test_score:0.951048951048951
for estimator:20, max_depth: 6...train:0.9982425307557118,test_score:0.9863013698630136
for estimator:20, max_depth: 8...train:0.9982486865148862,test_score:0.965986

'n_estimator:20, max_depth:6'

In [29]:
score_params[0.9659863945578231]

'n_estimator:25, max_depth:8'

### GRID SEARCH

In [31]:
# init the model 
classifier = RandomForestClassifier(random_state=23)

# init the kfold object
s_fold = StratifiedKFold(n_splits=5, random_state= 23, shuffle=True)

params = {
    'n_estimators': list(range(10,100, 10)),
    'max_depth': list(range(2,11, 2)),
    'criterion': ['gini','entropy']
}

model = GridSearchCV(estimator=classifier, param_grid=params,
                     scoring='f1',n_jobs=1,cv = s_fold.split(X,y),verbose=3,
                     return_train_score=True)

model.fit(X=X, y=y)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END criterion=gini, max_depth=2, n_estimators=10;, score=(train=0.962, test=0.964) total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=2, n_estimators=10;, score=(train=0.964, test=0.945) total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=2, n_estimators=10;, score=(train=0.965, test=0.940) total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=2, n_estimators=10;, score=(train=0.964, test=0.951) total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=2, n_estimators=10;, score=(train=0.964, test=0.958) total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=2, n_estimators=20;, score=(train=0.961, test=0.978) total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=2, n_estimators=20;, score=(train=0.971, test=0.952) total time=   0.2s
[CV 3/5] END criterion=gini, max_depth=2, n_estimators=20;, score=(train=0.976, test=0.947) total time=   0.3s
[CV 4/5] END criterion=gini, max_depth=2, n_estima

In [32]:
model.best_params_

{'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 90}

In [33]:
model.best_score_

0.9725173015538034

### Random Search

In [4]:
# init the model 
classifier = RandomForestClassifier(random_state=23)

# init the kfold object
s_fold = StratifiedKFold(n_splits=5, random_state= 23, shuffle=True)

params = {
    'n_estimators': list(range(10,100, 10)),
    'max_depth': list(range(2,11, 2)),
    'criterion': ['gini','entropy']
}

model = RandomizedSearchCV(estimator=classifier,param_distributions=params,
                           n_iter=50,scoring='f1',n_jobs=1,random_state=23,
                           verbose=3, cv = s_fold.split(X,y))

model.fit(X=X, y=y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END criterion=gini, max_depth=6, n_estimators=90;, score=0.986 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=6, n_estimators=90;, score=0.959 total time=   0.7s
[CV 3/5] END criterion=gini, max_depth=6, n_estimators=90;, score=0.952 total time=   0.5s
[CV 4/5] END criterion=gini, max_depth=6, n_estimators=90;, score=0.979 total time=   0.5s
[CV 5/5] END criterion=gini, max_depth=6, n_estimators=90;, score=0.972 total time=   0.6s
[CV 1/5] END criterion=entropy, max_depth=2, n_estimators=20;, score=0.978 total time=   0.1s
[CV 2/5] END criterion=entropy, max_depth=2, n_estimators=20;, score=0.952 total time=   0.1s
[CV 3/5] END criterion=entropy, max_depth=2, n_estimators=20;, score=0.940 total time=   0.1s
[CV 4/5] END criterion=entropy, max_depth=2, n_estimators=20;, score=0.946 total time=   0.1s
[CV 5/5] END criterion=entropy, max_depth=2, n_estimators=20;, score=0.958 total time=   0.0s
[CV 1/5] END 

### HYPERPARAMETER TUNING (OCT COHORT)

In [2]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import KFold,train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.datasets import load_iris

### MANUAL APPROACH TO HYPERPARAMETER TUNING

In [32]:
#load the dataset

X = load_iris(as_frame=True)['data']
y = load_iris()['target']

# split the dataset
X_train,X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, stratify=y,
                                                   random_state=0)

parameters = {
    'n_estimators': 10,
    'max_depth': 3,
    'criterion': 'gini',
    'random_state': 23
}

def predict_and_score(**params):
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    train_score = f1_score(y_train, train_pred, average='weighted')
    test_score = f1_score(y_test, test_pred, average='weighted')
    print(f'train_score: {train_score}....test score {test_score}')
    
    return model

model = predict_and_score(**parameters)

train_score: 0.974996093139553....test score 0.9326599326599326


In [30]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 5,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 20,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 23,
 'verbose': 0,
 'warm_start': False}

### GRID SEARCH

In [1]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import KFold,GridSearchCV,StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.datasets import load_breast_cancer

In [2]:
#load the dataset
X = load_breast_cancer(as_frame=True)['data']
y = load_breast_cancer()['target']

# init the cross val object
sk_fold = StratifiedKFold(n_splits=5, random_state=23, shuffle=True)

parameters = {
    'n_estimators': list(range(10,51,10)),
    'max_depth': list(range(1,6)),
    'criterion': ['gini','entropy','log_loss']
}

# init the model
classifier = RandomForestClassifier(random_state=23)

# set up the grid
model = GridSearchCV(estimator=classifier, param_grid=parameters,
                     scoring='f1',n_jobs=1, cv = sk_fold.split(X,y),
                     verbose=3)
#fit the model on the dataset
model.fit(X, y)


Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV 1/5] END criterion=gini, max_depth=1, n_estimators=10;, score=0.972 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1, n_estimators=10;, score=0.927 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1, n_estimators=10;, score=0.934 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=1, n_estimators=10;, score=0.920 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=1, n_estimators=10;, score=0.939 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=1, n_estimators=20;, score=0.979 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1, n_estimators=20;, score=0.933 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1, n_estimators=20;, score=0.934 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=1, n_estimators=20;, score=0.940 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=1, n_estimators=20;, score=0.939 total time=   0.0s
[CV 1/5] END criterion=gini,

In [3]:
model.best_score_

0.971250973577171

In [4]:
model.best_params_

{'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 30}

### RANDOMIZED SEARCH



In [7]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import KFold,RandomizedSearchCV,StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.datasets import load_diabetes

In [9]:
#load the dataset
X = load_diabetes(as_frame=True)['data']
y = load_diabetes()['target']

# init the cross val object
k_fold = KFold(n_splits=5, random_state=23, shuffle=True)

parameters = {
    'n_estimators': list(range(10,100,10)),
    'max_depth': list(range(1,10)),
    'criterion': ['poisson', 'absolute_error', 'friedman_mse', 'squared_error']
}

# init the model
regressor = RandomForestRegressor(random_state=23)

# set up the grid
model = RandomizedSearchCV(estimator=regressor,param_distributions=parameters,
                           n_iter=50, scoring='neg_root_mean_squared_error',cv=k_fold.split(X),
                           random_state=23,verbose=3,n_jobs=1)
#fit the model on the dataset
model.fit(X, y)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END criterion=friedman_mse, max_depth=3, n_estimators=40;, score=-56.862 total time=   0.2s
[CV 2/5] END criterion=friedman_mse, max_depth=3, n_estimators=40;, score=-52.645 total time=   0.1s
[CV 3/5] END criterion=friedman_mse, max_depth=3, n_estimators=40;, score=-60.237 total time=   0.1s
[CV 4/5] END criterion=friedman_mse, max_depth=3, n_estimators=40;, score=-55.626 total time=   0.4s
[CV 5/5] END criterion=friedman_mse, max_depth=3, n_estimators=40;, score=-59.085 total time=   0.3s
[CV 1/5] END criterion=poisson, max_depth=1, n_estimators=30;, score=-64.273 total time=   0.0s
[CV 2/5] END criterion=poisson, max_depth=1, n_estimators=30;, score=-57.725 total time=   0.0s
[CV 3/5] END criterion=poisson, max_depth=1, n_estimators=30;, score=-63.473 total time=   0.0s
[CV 4/5] END criterion=poisson, max_depth=1, n_estimators=30;, score=-61.194 total time=   0.0s
[CV 5/5] END criterion=poisson, max_depth=1, n_es

In [None]:
name = {
    'wale': 'adebayo',
    'tola': 'balogun'
}


def greet(**args):
    return args

greet()

In [18]:
def average():
    list_number = []
    while True:
        number = float(input('enter number or 0 to stop: '))
        if number != 0:
            list_number.append(number)
        else:
            break
    final_average = sum(list_number)/len(list_number)   
    return final_average

In [23]:
def real_average(*nums):
    fina_average = sum(nums)/len(nums)
    return fina_average

real_average(2,3,4)

3.0

In [24]:
def real_average(**nums):
    return nums

real_average()

{'a': 1, 'b': 2, 'c': 3}

In [25]:
test_dict = {
    'a':1, 'b':2, 'c':3}

In [28]:
real_average(**test_dict)

{'a': 1, 'b': 2, 'c': 3}