In [None]:
# loading the libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.metrics import f1_score, classification_report,root_mean_squared_error
from sklearn.datasets import load_breast_cancer, load_diabetes
from xgboost import XGBClassifier, XGBRegressor

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

### Classification


In [None]:
# Breast-cancer data: features as a DataFrame, labels loaded separately as an array.

X = load_breast_cancer(as_frame=True).data
y = load_breast_cancer().target

In [None]:
# experiment with train test split

def base_model(X: pd.DataFrame, y: np.ndarray, test_size: float = 0.2,
               split_seed: int = 10, model_seed: int = 23) -> pd.DataFrame:
  """Fit a default XGBClassifier on one stratified hold-out split.

  Prints the F1 score on the training and on the test partition.

  Args:
    X: feature matrix.
    y: class labels aligned with the rows of ``X``.
    test_size: fraction of rows held out for testing.
    split_seed: ``random_state`` forwarded to ``train_test_split``.
    model_seed: ``random_state`` forwarded to ``XGBClassifier``.

  Returns:
    The training partition of ``X`` (unchanged from the original behaviour).
  """
  # stratify on y so both partitions keep the class ratio of the full data
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=split_seed, stratify=y)

  # init and fit the model
  model = XGBClassifier(random_state=model_seed)
  model.fit(X_train, y_train)

  train_preds = model.predict(X_train)
  test_preds = model.predict(X_test)

  print(f'train_f1: {f1_score(y_train, train_preds)}')
  print(f'test_f1: {f1_score(y_test, test_preds)}')

  return X_train


# Prints train/test F1; the returned training frame is what the cell displays.
base_model(X, y)

train_f1: 1.0
test_f1: 0.993006993006993


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
149,13.740,17.91,88.12,585.0,0.07944,0.06376,0.028810,0.013290,0.1473,0.05580,...,15.34,22.46,97.19,725.9,0.09711,0.18240,0.156400,0.06019,0.2350,0.07014
130,12.190,13.29,79.08,455.8,0.10660,0.09509,0.028550,0.028820,0.1880,0.06471,...,13.34,17.81,91.38,545.2,0.14270,0.25850,0.099150,0.08187,0.3469,0.09241
88,12.360,21.80,79.78,466.1,0.08772,0.09445,0.060150,0.037450,0.1930,0.06404,...,13.83,30.50,91.46,574.7,0.13040,0.24630,0.243400,0.12050,0.2972,0.09261
131,15.460,19.48,101.70,748.9,0.10920,0.12230,0.146600,0.080870,0.1931,0.05796,...,19.26,26.00,124.90,1156.0,0.15460,0.23940,0.379100,0.15140,0.2837,0.08019
254,19.450,19.33,126.50,1169.0,0.10350,0.11880,0.137900,0.085910,0.1776,0.05647,...,25.70,24.57,163.10,1972.0,0.14970,0.31610,0.431700,0.19990,0.3379,0.08950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526,13.460,18.75,87.44,551.1,0.10750,0.11380,0.042010,0.031520,0.1723,0.06317,...,15.35,25.16,101.90,719.8,0.16240,0.31240,0.265400,0.14270,0.3518,0.08665
66,9.465,21.01,60.11,269.4,0.10440,0.07773,0.021720,0.015040,0.1717,0.06899,...,10.41,31.56,67.03,330.7,0.15480,0.16640,0.094120,0.06517,0.2878,0.09211
58,13.050,19.31,82.61,527.2,0.08060,0.03789,0.000692,0.004167,0.1819,0.05501,...,14.23,22.25,90.24,624.1,0.10210,0.06191,0.001845,0.01111,0.2439,0.06289
204,12.470,18.60,81.09,481.9,0.09965,0.10580,0.080050,0.038210,0.1925,0.06373,...,14.97,24.64,96.05,677.9,0.14260,0.23780,0.267100,0.10150,0.3014,0.08750


### Simple Cross Validation

In [None]:
# 5-fold cross-validated F1 for a default XGBoost classifier

model = XGBClassifier(random_state=23)
score_array = cross_val_score(model, X, y, scoring='f1', cv=5, verbose=3)
np.mean(score_array)

[CV] END ................................ score: (test=0.972) total time=   0.1s
[CV] END ................................ score: (test=0.966) total time=   0.1s
[CV] END ................................ score: (test=0.993) total time=   0.1s
[CV] END ................................ score: (test=0.973) total time=   0.1s
[CV] END ................................ score: (test=0.978) total time=   0.1s


0.9762943362967487

### Regression Problem

In [None]:
# Diabetes data: features as a DataFrame, target loaded separately as an array.
X_diabetes = load_diabetes(as_frame=True).data
y_diabetes = load_diabetes().target

regressor = XGBRegressor(random_state=23)

# The scorer reports negated RMSE, so flip the sign back before averaging.
score_array = -cross_val_score(regressor, X_diabetes, y_diabetes, cv=5,
                               scoring='neg_root_mean_squared_error', verbose=3)
np.mean(score_array)

[CV] END .............................. score: (test=-55.685) total time=   0.1s
[CV] END .............................. score: (test=-58.185) total time=   0.1s
[CV] END .............................. score: (test=-68.623) total time=   0.1s
[CV] END .............................. score: (test=-64.153) total time=   0.1s
[CV] END .............................. score: (test=-68.483) total time=   0.1s


63.02569539868239

### KFOLD

In [None]:
# Shuffled 10-fold splitter; `regressor` and the diabetes data come from the previous cell.

reg_kfold = KFold(n_splits=10, shuffle=True, random_state=23)

# negate because the scorer returns negative RMSE
score_array = -cross_val_score(regressor, X_diabetes, y_diabetes, cv=reg_kfold,
                               scoring='neg_root_mean_squared_error', verbose=3)
np.mean(score_array)

[CV] END .............................. score: (test=-65.640) total time=   0.3s
[CV] END .............................. score: (test=-69.617) total time=   0.1s
[CV] END .............................. score: (test=-62.051) total time=   0.1s
[CV] END .............................. score: (test=-54.074) total time=   0.1s
[CV] END .............................. score: (test=-66.653) total time=   0.1s
[CV] END .............................. score: (test=-64.405) total time=   0.1s
[CV] END .............................. score: (test=-56.873) total time=   0.2s
[CV] END .............................. score: (test=-65.889) total time=   0.1s
[CV] END .............................. score: (test=-73.538) total time=   0.1s
[CV] END .............................. score: (test=-63.313) total time=   0.1s


64.20522487343786

In [None]:
# Manual K-fold loop over the splits produced by `reg_kfold`.
# Reuses `reg_kfold` and `regressor` defined in the earlier cells.

X_diabetes = load_diabetes().data      # no as_frame here, so rows are indexed positionally below
y_diabetes = load_diabetes().target

train_scores, test_scores = [], []     # per-fold RMSE

for train_index, test_index in reg_kfold.split(X_diabetes):

  X_train, y_train = X_diabetes[train_index], y_diabetes[train_index]
  X_test, y_test = X_diabetes[test_index], y_diabetes[test_index]

  # refit on this fold's training rows
  regressor.fit(X_train, y_train)

  # score the training rows
  train_preds = regressor.predict(X_train)
  train_rmse = root_mean_squared_error(y_train, train_preds)
  train_scores.append(train_rmse)

  # score the held-out rows
  test_preds = regressor.predict(X_test)
  test_rmse = root_mean_squared_error(y_test, test_preds)
  test_scores.append(test_rmse)

  print(f'train score: {train_rmse}.......test_score: {test_rmse}')

# mean test RMSE across the folds
print(sum(test_scores)/len(test_scores))

train score: 0.2518646247890971.......test_score: 65.64005057860217
train score: 0.22870373420318255.......test_score: 69.61728523515806
train score: 0.24443841356542212.......test_score: 62.05080793348758
train score: 0.20319816183900494.......test_score: 54.07362371288405
train score: 0.25551314284218146.......test_score: 66.65255112423816
train score: 0.21141320801453264.......test_score: 64.40490203975659
train score: 0.1723150027869488.......test_score: 56.873280876271636
train score: 0.22607588154589134.......test_score: 65.88906712535284
train score: 0.25395757385100876.......test_score: 73.53801448302482
train score: 0.2151946363912345.......test_score: 63.31266562560267
64.20522487343786


# Assignment


Perform K-fold cross-validation with a random forest classifier on the Iris dataset.

### Cross validation (oct cohort)

In [23]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,cross_val_score
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import f1_score,root_mean_squared_error
from sklearn.datasets import load_breast_cancer,load_diabetes

In [17]:
# create a function that splits the data

def split_data(data, target, seed: int, stratify: bool = True) -> tuple:
    """Split features and target into train and test partitions.

    Args:
        data: feature matrix.
        target: labels/values aligned with the rows of ``data``.
        seed: ``random_state`` forwarded to ``train_test_split``.
        stratify: when True (the default, matching the original behaviour)
            the split is stratified on ``target``; pass False for targets
            that are not class labels.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=seed,
        stratify=target if stratify else None)
    return X_train, X_test, y_train, y_test


def model_score_fuction(X_train, X_test, y_train, y_test):
    """Train a RandomForestRegressor and return its RMSE on the test split."""
    forest = RandomForestRegressor(random_state=23)
    forest.fit(X_train, y_train)
    return root_mean_squared_error(y_test, forest.predict(X_test))


# breast-cancer features (DataFrame) and labels (loaded separately)
X = load_breast_cancer(as_frame=True).data
y = load_breast_cancer().target



In [6]:
# split, fit and score
# The breast-cancer target is a binary class label, so score it with a
# classifier and F1 (as the other classification cells in this notebook do)
# rather than with model_score_fuction, which fits a regressor and returns RMSE.
X_train, X_test, y_train, y_test = split_data(X, y, seed=5)
clf = RandomForestClassifier(random_state=23)
clf.fit(X_train, y_train)
f1_score(y_test, clf.predict(X_test))

0.9830508474576272

### IMPLEMENTING KFOLD FROM SCRATCH

In [21]:
# 10-fold CV on the diabetes data, written out by hand

kfold = KFold(n_splits=10, shuffle=True, random_state=23)

# features as a DataFrame, target loaded separately
X = load_diabetes(as_frame=True).data
y = load_diabetes().target

# one RMSE per fold
overall_score = []
for counter, (train_index, test_index) in enumerate(kfold.split(X), start=1):
    # positional row selection for the frame, plain indexing for the target
    X_train, y_train = X.iloc[train_index], y[train_index]
    X_test, y_test = X.iloc[test_index], y[test_index]
    score = model_score_fuction(X_train, X_test, y_train, y_test)
    overall_score.append(score)
    print(f'Performance for iteration {counter}: {score}')


average_score = sum(overall_score)/len(overall_score)
print(f'Average score: {average_score}')

Performance for iteration 1: 60.42447932934318
Performance for iteration 2: 58.2259683570064
Performance for iteration 3: 48.995165647420585
Performance for iteration 4: 51.48864938905555
Performance for iteration 5: 61.72637330480089
Performance for iteration 6: 62.224396578395634
Performance for iteration 7: 55.66823976184103
Performance for iteration 8: 58.973930430780555
Performance for iteration 9: 69.33267881939027
Performance for iteration 10: 53.68017695320381
Average score: 58.074005857123794


In [22]:
### STRATIFIED KFOLD
# Same manual loop as before, but the splitter is given the labels as well.

st_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=23)

# features as a DataFrame, labels loaded separately
X = load_breast_cancer(as_frame=True).data
y = load_breast_cancer().target

# one F1 score per fold
overall_score = []
for counter, (train_index, test_index) in enumerate(st_kfold.split(X, y), start=1):
    X_train, y_train = X.iloc[train_index], y[train_index]
    X_test, y_test = X.iloc[test_index], y[test_index]

    # a fresh forest is trained for every fold
    model = RandomForestClassifier(random_state=23)
    model.fit(X_train, y_train)

    test_preds = model.predict(X_test)
    score = f1_score(y_test, test_preds)
    overall_score.append(score)
    print(f'Performance for iteration {counter}: {score}')


average_score = sum(overall_score)/len(overall_score)
print(f'Average score: {average_score}')

Performance for iteration 1: 0.9855072463768116
Performance for iteration 2: 0.9552238805970149
Performance for iteration 3: 0.972972972972973
Performance for iteration 4: 0.958904109589041
Performance for iteration 5: 0.9863013698630136
Performance for iteration 6: 0.972972972972973
Performance for iteration 7: 0.972972972972973
Performance for iteration 8: 0.9577464788732394
Performance for iteration 9: 0.9577464788732394
Performance for iteration 10: 0.9859154929577465
Average score: 0.9706263976049024


### USING CROSS VAL SCORE 

In [32]:
# load the dataset
X = load_breast_cancer(as_frame=True)['data']
y = load_breast_cancer()['target']

# initialize the kfold object
kfold = KFold(n_splits=10, shuffle=True, random_state=23)

# init the model
model = RandomForestClassifier(random_state=23)

# set up the cross val
# Pass the splitter itself rather than `kfold.split(X, y)`: that generator is
# exhausted after one pass, whereas the splitter object can be reused. The
# folds are the same because random_state is fixed.
score = cross_val_score(estimator=model, X=X, y=y, cv=kfold,
                        scoring='f1', n_jobs=1, verbose=3)

[CV] END ................................ score: (test=0.974) total time=   0.6s
[CV] END ................................ score: (test=0.972) total time=   1.2s
[CV] END ................................ score: (test=1.000) total time=   0.7s
[CV] END ................................ score: (test=0.870) total time=   0.7s
[CV] END ................................ score: (test=1.000) total time=   0.6s
[CV] END ................................ score: (test=1.000) total time=   0.5s
[CV] END ................................ score: (test=0.947) total time=   0.6s
[CV] END ................................ score: (test=1.000) total time=   0.6s
[CV] END ................................ score: (test=0.986) total time=   0.6s
[CV] END ................................ score: (test=0.972) total time=   0.6s


In [31]:
# average F1 over the folds
np.mean(score)

0.9721273857159236