## CROSS VALIDATION

#### REGRESSION WITH KFOLD

In [8]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [12]:
# Set up the dataset.
# Load the diabetes bunch a single time. With as_frame=True the bunch exposes
# the features as a DataFrame ('data'), the target as a Series ('target') and
# both combined in one DataFrame ('frame'), so there is no need to reload the
# dataset or to append a 'target' column only to drop it again.
diabetes = load_diabetes(as_frame=True)
data = diabetes['frame']  # feature columns plus the 'target' column

# split X and y
X = diabetes['data']
y = diabetes['target']

In [13]:

regressor = RandomForestRegressor(random_state=23)  # model that is refit on every fold
k_fold = KFold(n_splits=7, shuffle=True, random_state=23)  # 7 shuffled folds, fixed seed

train_scores = []
test_scores = []
split_no = 1
for train_idx, test_idx in k_fold.split(X):
    # Select the rows of this fold positionally from the features and the target.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Fit on the training part, then predict on both parts.
    regressor.fit(X_train, y_train)
    train_preds, test_preds = regressor.predict(X_train), regressor.predict(X_test)

    # Score both parts and keep the per-fold RMSE values.
    train_rmse = root_mean_squared_error(y_train, train_preds)
    test_rmse = root_mean_squared_error(y_test, test_preds)
    train_scores.append(train_rmse)
    test_scores.append(test_rmse)

    print(f'split: {split_no}, train rmse: {train_rmse}, test rmse: {test_rmse}')
    split_no += 1

# Report the unweighted mean of the per-fold scores.
print(f'Overall train rmse: {sum(train_scores)/len(train_scores)}')
print(f'Overall test rmse: {sum(test_scores)/len(test_scores)}')

split: 1, train rmse: 22.059752213587252, test rmse: 57.1071868698459
split: 2, train rmse: 21.97390492437177, test rmse: 58.34249560698268
split: 3, train rmse: 22.216241782637113, test rmse: 51.31566869844194
split: 4, train rmse: 21.39282823135239, test rmse: 60.47571318896773
split: 5, train rmse: 21.89140287393474, test rmse: 58.01496101513214
split: 6, train rmse: 21.450596310080694, test rmse: 62.798415585108515
split: 7, train rmse: 21.3992017087808, test rmse: 58.960464773856785
Overall train rmse: 21.769132577820677
Overall test rmse: 58.14498653404795


#### CLASSIFICATION WITH STRATIFIED KFOLD

In [9]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Set up the dataset.
# Load the breast-cancer bunch a single time. With as_frame=True the bunch
# exposes the features as a DataFrame ('data'), the target as a Series
# ('target') and both combined in one DataFrame ('frame'), so there is no need
# to reload the dataset or to append a 'target' column only to drop it again.
breast_cancer = load_breast_cancer(as_frame=True)
data = breast_cancer['frame']  # feature columns plus the 'target' column

# split X and y
X = breast_cancer['data']
y = breast_cancer['target']

In [15]:

classifier = RandomForestClassifier(random_state=23)  # model that is refit on every fold
st_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)  # 5 shuffled folds, fixed seed

train_scores = []
test_scores = []
split_no = 1
# y is passed to split() because the stratified splitter needs the labels.
for train_idx, test_idx in st_kfold.split(X, y):
    # Select the rows of this fold positionally from the features and the target.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Fit on the training part, then predict on both parts.
    classifier.fit(X_train, y_train)
    train_preds, test_preds = classifier.predict(X_train), classifier.predict(X_test)

    # Score both parts and keep the per-fold F1 values.
    train_f1 = f1_score(y_train, train_preds)
    test_f1 = f1_score(y_test, test_preds)
    train_scores.append(train_f1)
    test_scores.append(test_f1)

    print(f'split: {split_no}, train f1: {train_f1}, test f1: {test_f1}')
    split_no += 1

# Report the unweighted mean of the per-fold scores.
print(f'Overall train f1: {sum(train_scores)/len(train_scores)}')
print(f'Overall test f1: {sum(test_scores)/len(test_scores)}')

split: 1, train f1: 1.0, test f1: 0.9784172661870504
split: 2, train f1: 1.0, test f1: 0.9655172413793104
split: 3, train f1: 1.0, test f1: 0.9594594594594594
split: 4, train f1: 1.0, test f1: 0.9793103448275862
split: 5, train f1: 1.0, test f1: 0.971830985915493
Overall train f1: 1.0
Overall test f1: 0.9709070595537799


#### QUICK CROSS VALIDATION WITH CROSS VAL SCORE

In [16]:
from sklearn.model_selection import cross_val_score

In [25]:
# Score the classifier with 7-fold cross validation using the F1 metric;
# verbose=4 prints one line per fold.
score = cross_val_score(
    classifier,
    X,
    y,
    cv=7,
    scoring='f1',
    n_jobs=1,
    verbose=4,
)
# Mean of the per-fold scores, shown as the cell output.
score.mean()

[CV] END ................................ score: (test=0.980) total time=   0.5s
[CV] END ................................ score: (test=0.925) total time=   0.4s
[CV] END ................................ score: (test=0.981) total time=   0.3s
[CV] END ................................ score: (test=0.980) total time=   0.5s
[CV] END ................................ score: (test=0.980) total time=   0.5s
[CV] END ................................ score: (test=0.961) total time=   0.3s
[CV] END ................................ score: (test=1.000) total time=   0.3s


0.9724094514438576