## CROSS VALIDATION

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,cross_val_score
from sklearn.metrics import f1_score, root_mean_squared_error
from sklearn.datasets import load_diabetes, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

## Regression with KFOLD

#### Naive train test split

In [23]:
# load and split the dataset

X = load_diabetes(as_frame=True)['data']
y = load_diabetes()['target']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1)

In [18]:
# model training

model = RandomForestRegressor(random_state=23, n_estimators=2)
model.fit(X_train, y_train)
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

In [19]:
# evaluate the score

print(f"train rmse: {root_mean_squared_error(y_train, train_preds)}")
print(f"test rmse: {root_mean_squared_error(y_test, test_preds)}")

train rmse: 36.82683811659267
test rmse: 73.57685349881794


#### Kfold

In [53]:
diabetes_kfold = KFold(n_splits=8, shuffle=True, random_state=23)
split = 1
train_scores = []
test_scores = []

for train_idx, test_idx in diabetes_kfold.split(X=X):

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    model = RandomForestRegressor(n_estimators=3, random_state=23, max_depth=4)
    model.fit(X_train, y_train)
    train_preds, test_preds = model.predict(X_train), model.predict(X_test)
    train_rmse = root_mean_squared_error(y_train, train_preds)
    test_rmse = root_mean_squared_error(y_test, test_preds)

    print(f"split: {split}: train rmse: {train_rmse}, test rmse: {test_rmse}")
    train_scores.append(train_rmse)
    test_scores.append(test_rmse)
    split +=1

print('-'* 70)
overall_train = sum(train_scores)/len(train_scores)
overall_test = sum(test_scores)/len(test_scores)
print(f"Training completed with overall score of: train = {overall_train} and test = {overall_test}")


split: 1: train rmse: 0.1491072528512155, test rmse: 0.21897448762423852
split: 2: train rmse: 0.12585390547794284, test rmse: 0.19944626645756813
split: 3: train rmse: 0.10994405682664558, test rmse: 0.28351168715622993
split: 4: train rmse: 0.13405634459617588, test rmse: 0.20250831704809355
split: 5: train rmse: 0.1338891490747675, test rmse: 0.2108154065705727
split: 6: train rmse: 0.15190630526815788, test rmse: 0.24033650131525347
split: 7: train rmse: 0.132129447514845, test rmse: 0.12595065662027577
split: 8: train rmse: 0.12937892332228623, test rmse: 0.25638798235598803
----------------------------------------------------------------------
Training completed with overall score of: train = 0.13328317311650456 and test = 0.21724141314352752


### CROSS-VAL SCORE

In [57]:
regressor = RandomForestRegressor(n_estimators=3, random_state=23, max_depth=4)
score = cross_val_score(estimator=regressor, X=X, y=y, scoring= 'neg_root_mean_squared_error',
                        cv=6,n_jobs=1, verbose=4)

[CV] END ............................... score: (test=-0.313) total time=   0.0s
[CV] END ............................... score: (test=-0.276) total time=   0.0s
[CV] END ............................... score: (test=-0.170) total time=   0.0s
[CV] END ............................... score: (test=-0.182) total time=   0.0s
[CV] END ............................... score: (test=-0.211) total time=   0.0s
[CV] END ............................... score: (test=-0.170) total time=   0.0s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s finished


In [59]:
-score.mean()

np.float64(0.22028148632798408)

In [61]:
score.std()

np.float64(0.055209480075507834)

### STRATIFIED KFOLD

In [42]:
# load and split the dataset

X = load_breast_cancer(as_frame=True)['data']
y = load_breast_cancer()['target']

In [44]:
breast_fold = StratifiedKFold(n_splits=8, shuffle=True, random_state=23)
split = 1
train_scores = []
test_scores = []

for train_idx, test_idx in breast_fold.split(X=X, y=y):

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    model = RandomForestClassifier(n_estimators=3, random_state=23, max_depth=4)
    model.fit(X_train, y_train)
    train_preds, test_preds = model.predict(X_train), model.predict(X_test)
    train_f1 = f1_score(y_train, train_preds)
    test_f1 = f1_score(y_test, test_preds)

    print(f"split: {split}: train f1: {train_f1}, test f1: {test_f1}")
    train_scores.append(train_f1)
    test_scores.append(test_f1)
    split +=1

print('*'* 70)
overall_train = sum(train_scores)/len(train_scores)
overall_test = sum(test_scores)/len(test_scores)
print(f"Training completed with overall score of: train = {overall_train} and test = {overall_test}")


split: 1: train f1: 0.9730586370839936, test f1: 0.9777777777777777
split: 2: train f1: 0.987220447284345, test f1: 0.9545454545454546
split: 3: train f1: 0.9840764331210191, test f1: 0.967032967032967
split: 4: train f1: 0.9857369255150554, test f1: 0.9565217391304348
split: 5: train f1: 0.9794628751974723, test f1: 0.9361702127659575
split: 6: train f1: 0.9841269841269841, test f1: 0.9473684210526315
split: 7: train f1: 0.9841269841269841, test f1: 0.9565217391304348
split: 8: train f1: 0.9741100323624595, test f1: 0.9545454545454546
**********************************************************************
Training completed with overall score of: train = 0.9814899148522891 and test = 0.9563104707476391


In [45]:
classifier = RandomForestClassifier(n_estimators=3, random_state=23, max_depth=4)
score = cross_val_score(estimator=classifier, X=X, y=y, scoring= 'f1',
                        cv=6,n_jobs=1, verbose=3)

[CV] END ................................ score: (test=0.974) total time=   0.0s
[CV] END ................................ score: (test=0.928) total time=   0.0s
[CV] END ................................ score: (test=0.960) total time=   0.0s
[CV] END ................................ score: (test=0.974) total time=   0.0s
[CV] END ................................ score: (test=0.967) total time=   0.0s
[CV] END ................................ score: (test=0.947) total time=   0.0s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s finished


In [52]:
score.mean().item()

0.9584588394062079