In [202]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [203]:
# Load the hours-studied / exam-score dataset (path is relative to the notebook's
# working directory) and preview the first rows.
dataframe = pd.read_csv("student_scores.csv")
dataframe.head()

Unnamed: 0,Hours,Scores
0,2.5,21
1,5.1,47
2,3.2,27
3,8.5,75
4,3.5,30


In [204]:
# Sanity-check the size of the loaded frame as (rows, columns).
dataframe.shape

(25, 2)

In [205]:
# Double brackets keep the single feature as a 2-D DataFrame (n_samples, 1),
# the layout scikit-learn estimators expect for the feature matrix.
X = dataframe[["Hours"]]
# Single brackets select the target as a 1-D Series.
y = dataframe["Scores"]

In [206]:
# Confirm the feature matrix is two-dimensional: (n_samples, n_features).
X.shape

(25, 1)

In [207]:
# Fixed seed so the shuffled split is reproducible from run to run.
SEED = 25

# Hold out 20% of the rows as the test set; the remainder is used for fitting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

In [208]:
# Fit a linear regression on the training split only; the test split stays
# untouched until evaluation.
LinReg = LinearRegression()
LinReg.fit(X_train, y_train)

In [209]:
# For a regressor, `score` reports the coefficient of determination (R^2).
# Compare the value on the data the model was fit on with the held-out value.
r2_train = LinReg.score(X_train, y_train)
r2_test = LinReg.score(X_test, y_test)

print(f"train_score: {r2_train}")
print(f"test_score: {r2_test}")

train_score: 0.9526743156185514
test_score: 0.9434635261365243


In [210]:
# Predict scores for the held-out rows (once; the original repeated this call)
# and place them next to the true values. `y_test` keeps its original row
# labels, so the table index identifies which source rows were held out.
y_pred = LinReg.predict(X_test)
compare = pd.DataFrame({"y_test": y_test, "y_pred": y_pred})
compare

Unnamed: 0,y_test,y_pred
2,27,34.039712
9,25,29.26891
17,24,21.635627
10,85,76.97693
21,54,49.306278


In [211]:
# Error metrics for the held-out test split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# Deliberately not named `rmse`: this notebook also defines a function called
# `rmse`, and binding a float to that name would break the function whenever
# this cell is re-run after the function's cell.
rmse_value = np.sqrt(mse)
print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse_value:.2f}')

Mean absolute error: 5.28
Mean squared error: 31.95
Root mean squared error: 5.65


In [212]:
# Shuffled 10-fold splitter; the fixed random_state keeps fold membership
# reproducible so the cross-validation scores can be compared between runs.
kf = KFold(n_splits=10, shuffle=True, random_state=15)

# List which row positions fall into the train and test partition of each fold.
# `enumerate(..., start=1)` numbers the folds from 1 without a manual counter.
for fold_no, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{fold_no}, Train set: {train_index}, Test set:{test_index}')

Fold:1, Train set: [ 0  1  3  4  5  6  7  8  9 10 11 12 13 14 15 17 18 19 20 21 23 24], Test set:[ 2 16 22]
Fold:2, Train set: [ 0  2  3  4  5  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23], Test set:[ 1  6 24]
Fold:3, Train set: [ 0  1  2  4  5  6  7  8  9 10 11 12 13 14 15 16 17 19 20 21 22 24], Test set:[ 3 18 23]
Fold:4, Train set: [ 0  1  2  3  5  6  7  8 10 11 12 13 15 16 17 18 19 20 21 22 23 24], Test set:[ 4  9 14]
Fold:5, Train set: [ 0  1  2  3  4  5  6  7  8  9 11 12 13 14 15 16 17 18 21 22 23 24], Test set:[10 19 20]
Fold:6, Train set: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 14 15 16 18 19 20 21 22 23 24], Test set:[13 17]
Fold:7, Train set: [ 0  1  2  3  4  5  6  7  8  9 10 12 13 14 16 17 18 19 20 21 22 23 24], Test set:[11 15]
Fold:8, Train set: [ 1  2  3  4  5  6  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24], Test set:[0 7]
Fold:9, Train set: [ 0  1  2  3  4  6  7  8  9 10 11 13 14 15 16 17 18 19 20 21 22 23 24], Test set:[ 5 12]
Fold:10, Train set: [ 0  1  2 

In [213]:
def rmse(score):
    rmse = np.sqrt(-score)
    print(f'rmse= {"{:.2f}".format(rmse)}')

In [214]:
# Cross-validate a fresh LinearRegression on the full dataset using the K-Fold
# splitter. scikit-learn scorers follow a "higher is better" convention, so the
# mean squared error is returned negated (hence the negative values).
score = cross_val_score(LinearRegression(), X, y, cv=kf, scoring="neg_mean_squared_error")
print(f'Scores for each fold: {score}')
# Report the square root of the mean per-fold MSE.
rmse(score.mean())

Scores for each fold: [-28.69253662 -33.81587744 -59.64336163 -26.55920572 -29.48537279
 -40.01760273 -24.36622322 -26.80787077 -19.68637311 -14.78218729]
rmse= 5.51


**The K-Fold method brings both simplicity and quantity to the table, while still using all of the data. It also seems to be the fastest way to check the model. The deciding factors in choosing this method were the quantity of data and its type. The best result was obtained with 10 splits and random state 15 (rmse = 5.51), compared with 5 splits and random state 15 (rmse = 5.83), 10 splits and random state 25 (rmse = 5.82), and 5 splits and random state 25 (rmse = 5.88).**