### CROSS VALIDATION

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_diabetes
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score, KFold

In [2]:
# Load the diabetes dataset once and take features and target from the same
# object, instead of calling the loader twice.
diabetes = load_diabetes(as_frame=True)
X = diabetes['data']                # feature matrix as a pandas DataFrame
y = diabetes['target'].to_numpy()   # regression target as a plain NumPy array

## Simple `cross_val_score` (regression)

In [11]:
# Random forest regressor with a fixed seed so the fold scores are repeatable.
regressor = RandomForestRegressor(random_state=23)

# The scorer returns *negated* RMSE (scikit-learn scorers are "greater is
# better"), so the sign is flipped after averaging over the 10 folds.
neg_rmse_per_fold = cross_val_score(
    estimator=regressor,
    X=X,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=10,
    n_jobs=1,
    verbose=5,
)
final_rmse = -neg_rmse_per_fold.mean()
final_rmse

[CV] END .............................. score: (test=-61.336) total time=   0.7s
[CV] END .............................. score: (test=-54.310) total time=   0.6s
[CV] END .............................. score: (test=-56.690) total time=   0.3s
[CV] END .............................. score: (test=-56.586) total time=   0.3s
[CV] END .............................. score: (test=-60.699) total time=   0.4s
[CV] END .............................. score: (test=-62.237) total time=   0.4s
[CV] END .............................. score: (test=-56.959) total time=   0.4s
[CV] END .............................. score: (test=-53.505) total time=   0.3s
[CV] END .............................. score: (test=-70.570) total time=   0.3s
[CV] END .............................. score: (test=-50.917) total time=   0.4s


58.381004649030636

### Classification with `cross_val_score`

In [21]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score, KFold

In [22]:
# Load the breast-cancer dataset once and take features and target from the
# same object, instead of calling the loader twice.
breast_cancer = load_breast_cancer(as_frame=True)
X = breast_cancer['data']                # feature matrix as a pandas DataFrame
y = breast_cancer['target'].to_numpy()   # class labels as a plain NumPy array

In [None]:
# Random forest classifier with a fixed seed so the fold scores are repeatable.
# (Named `classifier`: this is a classification model, not a regressor.)
classifier = RandomForestClassifier(random_state=23)

# Score each of the 10 folds with F1, then average the per-fold scores.
f1_per_fold = cross_val_score(
    estimator=classifier,
    X=X,
    y=y,
    scoring="f1",
    cv=10,
    n_jobs=1,
    verbose=3,
)
final_score = f1_per_fold.mean()
final_score

[CV] END ................................ score: (test=0.986) total time=   0.4s
[CV] END ................................ score: (test=0.901) total time=   0.3s
[CV] END ................................ score: (test=0.959) total time=   0.4s
[CV] END ................................ score: (test=0.960) total time=   0.4s
[CV] END ................................ score: (test=1.000) total time=   0.6s
[CV] END ................................ score: (test=0.986) total time=   0.2s
[CV] END ................................ score: (test=0.986) total time=   0.2s
[CV] END ................................ score: (test=0.986) total time=   0.2s
[CV] END ................................ score: (test=0.971) total time=   0.2s
[CV] END ................................ score: (test=0.986) total time=   0.2s


0.9721273857159236

### KFOLD AND STRATIFIED KFOLD FROM SCRATCH

In [43]:
from sklearn.metrics import f1_score

# Manual K-fold cross-validation: split the rows, fit on the training part,
# score on the held-out part, and average the held-out scores at the end.
folds = 8
rfr_kfold = KFold(n_splits=folds, shuffle=True, random_state=23)
scores = []  # held-out (test) F1 score of every fold

# Small forest (2 trees) with a fixed seed.
classifier = RandomForestClassifier(random_state=23, n_estimators=2)

# enumerate(..., start=1) supplies the 1-based fold number for the progress line.
# y is passed alongside X so the loop keeps working if the splitter is swapped
# for one that needs the labels (e.g. StratifiedKFold); KFold does not use it.
for fold, (train_index, test_index) in enumerate(rfr_kfold.split(X, y), start=1):
    # X is a DataFrame -> positional .iloc; y is a NumPy array -> plain indexing.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Refit on this fold's training rows, then predict both partitions.
    classifier.fit(X_train, y_train)
    test_preds = classifier.predict(X_test)
    train_preds = classifier.predict(X_train)

    score = f1_score(y_test, test_preds)           # F1 on the held-out rows
    train_score = f1_score(y_train, train_preds)   # F1 on the rows the model was fit on
    scores.append(score)  # only the held-out score feeds the final average
    print(f'Completed Fold: {fold}/{folds}.... test score: {score}, train score: {train_score}')


print(f'scores: {scores}')
print('=' * 60)
print(f'final_score: {sum(scores)/len(scores)}')

Completed Fold: 1/8.... test score: 0.946236559139785, train score: 0.9818780889621087
Completed Fold: 2/8.... test score: 0.9438202247191011, train score: 0.9701986754966887
Completed Fold: 3/8.... test score: 0.8888888888888888, train score: 0.9873817034700315
Completed Fold: 4/8.... test score: 0.9195402298850575, train score: 0.9820554649265906
Completed Fold: 5/8.... test score: 0.9397590361445783, train score: 0.9787928221859706
Completed Fold: 6/8.... test score: 0.9111111111111111, train score: 0.9768211920529801
Completed Fold: 7/8.... test score: 0.9382716049382716, train score: 0.9585406301824212
Completed Fold: 8/8.... test score: 0.8941176470588236, train score: 0.9855072463768116
scores: [0.946236559139785, 0.9438202247191011, 0.8888888888888888, 0.9195402298850575, 0.9397590361445783, 0.9111111111111111, 0.9382716049382716, 0.8941176470588236]
final_score: 0.9227181627357022


In [None]:
### COMBINING KFOLD WITH CROSS VALIDATION WITHOUT SHUFFLE

# KFold with default settings: 9 consecutive folds, rows taken in their
# existing order (no shuffling).
rf_kfold = KFold(n_splits=9)

# Random forest classifier with a fixed seed so the fold scores are repeatable.
# (Named `classifier`: this is a classification model, not a regressor.)
classifier = RandomForestClassifier(random_state=23)

# Pass the splitter object itself as `cv` rather than the one-shot generator
# returned by rf_kfold.split(X); cross_val_score calls split() internally and
# produces the same folds.
f1_per_fold = cross_val_score(
    estimator=classifier,
    X=X,
    y=y,
    scoring="f1",
    cv=rf_kfold,
    n_jobs=1,
    verbose=3,
)
final_score = f1_per_fold.mean()
final_score

[CV] END ................................ score: (test=0.889) total time=   0.6s
[CV] END ................................ score: (test=0.921) total time=   0.3s
[CV] END ................................ score: (test=0.949) total time=   0.2s
[CV] END ................................ score: (test=0.986) total time=   0.2s
[CV] END ................................ score: (test=0.966) total time=   0.2s
[CV] END ................................ score: (test=0.988) total time=   0.2s
[CV] END ................................ score: (test=0.970) total time=   0.2s
[CV] END ................................ score: (test=0.979) total time=   0.4s
[CV] END ................................ score: (test=1.000) total time=   0.3s


0.9607395775699012

In [39]:
# Inspect the container type of X (the cell output shows a pandas DataFrame).
type(X)

pandas.core.frame.DataFrame

In [37]:
# Select rows by integer position with a list of positions (4 through 8).
nums = list(range(4, 9))
X.iloc[nums]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072


In [33]:
def greet(name):
    """Build a greeting string for ``name``.

    Args:
        name: Value interpolated into the greeting.

    Returns:
        The text ``'HI <name> how are you?'``.
    """
    message = f'HI {name} how are you?'
    return message


# Keep the returned string so later cells can show it.
first_greeting = greet('bola')

In [34]:
# print() writes the string's text to stdout, so the output has no quotes.
print(first_greeting)

HI bola how are you?


In [35]:
# A bare last expression is displayed as its repr, so the string is shown quoted.
first_greeting

'HI bola how are you?'