# K-Fold
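K-fold cross-validation is a technique that lets us use all of our data for both training and validation: the data is split into k folds, and each fold takes one turn as the validation set while the model is trained on the remaining k-1 folds.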

In [1]:
import pandas as pd

In [2]:
# Read the advertising data into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Advertising.csv')

In [3]:
from sklearn.model_selection import train_test_split

# Separate the target column ('sales') from the remaining feature columns.
y = df['sales']
X = df.drop(columns='sales')

# Hold out 30% of the rows as the test set; random_state pins the split so it is reproducible.
# Only two portions (train/test) are needed instead of three: k-fold cross-validation is run on
# the training portion, so the validation step happens inside k-fold automatically.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [4]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only and scale them in the same step,
# then apply those same training statistics to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# First candidate: Ridge regression with a strong penalty (alpha=100).
model = Ridge(alpha=100)

# 5-fold cross-validation on the training data: the rows are divided into 5 portions and, in each
# of 5 rounds, the model is trained on 4 portions and scored on the remaining one.
# Each round contributes one entry to `scores`. The metric is the *negated* mean squared error,
# which is why every value is negative.
# NOTE(review): X_train was already scaled with statistics computed from the whole training set,
# so each validation fold has been seen by the scaler. Wrapping scaler + model in a Pipeline and
# cross-validating that on unscaled data would avoid this -- worth confirming whether it matters here.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
scores

array([ -9.32552967,  -4.9449624 , -11.39665242,  -7.0242106 ,
        -8.38562723])

In [7]:
# Overall cross-validation error: average the per-fold scores, then drop the sign
# (the scores are negated MSEs) to report a positive mean squared error.
mean_score = scores.mean()
abs(mean_score)

np.float64(8.215396464543607)

In [8]:
# Tuning the model: repeat the same 5-fold evaluation with a much weaker penalty (alpha=1)
# and compare its cross-validation error against the alpha=100 model.
model2 = Ridge(alpha=1)

scores2 = cross_val_score(model2, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
# Take the mean first and the absolute value second -- the same formula used for the
# alpha=100 model (abs(scores.mean())) -- so the two reported errors are computed identically.
abs(scores2.mean())

np.float64(3.344839296530695)

In [9]:
from sklearn.metrics import mean_squared_error

# Final evaluation of the tuned model on the held-out test set.
# The model has to be fitted on the training data first; fit() returns the estimator itself,
# so the prediction on X_test can be chained directly onto it.
y_final = model2.fit(X_train, y_train).predict(X_test)

# Mean squared error between the true test targets and the predictions.
mean_squared_error(y_true=y_test, y_pred=y_final)

2.319021579428752

# Using several scoring metrics at once with `cross_validate`

In [21]:
from sklearn.model_selection import cross_validate

# Start again from a fresh Ridge model with alpha=1.
model = Ridge(alpha=1)

# cross_validate takes a list of scorers and returns a dict of per-fold arrays:
# fit/score timings plus one 'test_<scorer>' entry for each requested metric.
# 15 folds are used here, so every array has 15 entries.
scores = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'],
    cv=15,
)
scores

{'fit_time': array([0.00172043, 0.00522566, 0.00077367, 0.00080252, 0.00064802,
        0.00077271, 0.00062561, 0.00057292, 0.00053072, 0.00053334,
        0.00057268, 0.00052428, 0.0005846 , 0.00055575, 0.00052166]),
 'score_time': array([0.00260735, 0.00135469, 0.00074053, 0.00067616, 0.00059152,
        0.00055957, 0.00055432, 0.00055838, 0.00053287, 0.00051379,
        0.0005219 , 0.00051618, 0.00054026, 0.00052404, 0.00051355]),
 'test_neg_mean_squared_error': array([ -1.93238105,  -4.61031075,  -1.90316345,  -2.69103093,
         -1.08859315,  -1.65221882,  -3.26213661, -12.6807873 ,
         -0.77379545,  -1.61436564,  -1.36887608,  -4.30298333,
         -2.74585322,  -6.833069  ,  -2.28920924]),
 'test_neg_mean_absolute_error': array([-1.13939928, -1.96517653, -1.19659645, -1.4228595 , -0.92451424,
        -0.99534887, -1.32136674, -2.17159688, -0.65041865, -1.06441188,
        -0.81701199, -1.69802359, -1.08321978, -2.10061901, -0.94654938])}

In [18]:
# Tabulate the cross-validation results: one row per fold (15 rows, because cv was 15)
# and one column per timing / metric array.
scores = pd.DataFrame(data=scores)
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.001376,0.000729,-1.932381,-1.139399
1,0.000649,0.000571,-4.610311,-1.965177
2,0.000876,0.000579,-1.903163,-1.196596
3,0.000687,0.000551,-2.691031,-1.422859
4,0.000622,0.0006,-1.088593,-0.924514
5,0.000776,0.000688,-1.652219,-0.995349
6,0.000707,0.000617,-3.262137,-1.321367
7,0.000673,0.000559,-12.680787,-2.171597
8,0.00058,0.000528,-0.773795,-0.650419
9,0.000605,0.000619,-1.614366,-1.064412


In [19]:
# Column-wise average across the folds: mean timings and mean (negated) MSE / MAE.
scores.mean(axis=0)

fit_time                        0.000698
score_time                      0.000583
test_neg_mean_squared_error    -3.316585
test_neg_mean_absolute_error   -1.299808
dtype: float64

In [20]:
# Fit the alpha=1 model on the full training set and predict the held-out test set
# (fit() returns the estimator, so predict() is chained onto it).
y_final = model.fit(X_train, y_train).predict(X_test)

# Test-set mean squared error; mean_squared_error was imported in an earlier cell.
mean_squared_error(y_true=y_test, y_pred=y_final)

2.319021579428752