# hold-out

In [31]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the 'tips' example dataset through seaborn; `df` is the source frame
# that the later sections of this notebook read from.
df = sns.load_dataset('tips')

In [2]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [32]:
# Target column; every remaining column is treated as a feature.
y_col = 'tip'
y = df[y_col]
features = df.drop(columns=y_col)

# Record the numeric feature columns *before* one-hot encoding so that only
# they are standardized later (the dummy columns created below are integers too).
numeric_cols = list(features.select_dtypes(include='number').columns)

# One-hot encode the categorical features, dropping the first level of each.
X = pd.get_dummies(features, drop_first=True, dtype=int)

# Hold out 30% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [36]:
# Report the number of rows in the training and test splits, one per line.
print(len(X_train), len(X_test), sep="\n")

170
74


In [45]:
# Standardization
# Standardization is performed after the train/test split.
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only.
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])

# Work on copies so the unscaled splits stay available, and standardize only
# the numeric columns of each split with the training-set parameters.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
for scaled_frame, raw_frame in ((X_train_scaled, X_train), (X_test_scaled, X_test)):
    scaled_frame[numeric_cols] = scaler.transform(raw_frame[numeric_cols])

In [48]:
# Train a linear regression on the standardized training features,
# then predict the target for the standardized test features.
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)


In [52]:
# Model accuracy on the hold-out test set, measured as mean squared error.
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred) # same quantity as np.mean(np.square(y_test - y_pred))

0.9550808988617148

# LOOCV

In [58]:
# Data preparation: a single feature (total_bill) as an (n_samples, 1) array,
# with tip as the target Series.
X = df[['total_bill']].to_numpy()
y = df['tip']

In [59]:
from sklearn.model_selection import LeaveOneOut
# Splitter object used by the manual leave-one-out loop below.
loo = LeaveOneOut()

In [63]:
# Manual leave-one-out CV: for every split, fit on the training rows, predict
# the held-out row, and collect the per-split MSE in mse_list.
mse_list = []
model = LinearRegression()
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    # The splitter yields positional indices, so select from the Series by
    # position (.iloc). Plain y[...] would look the values up as index labels,
    # which only matches while y keeps its default 0..n-1 index.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the model
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Evaluate (MSE)
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [66]:
# Average the per-split MSE values collected in mse_list into one LOOCV estimate.
print(f"MSE(LOOCV)：{np.mean(mse_list)}")

MSE(LOOCV)：1.0675673489857438


In [70]:
from sklearn.model_selection import cross_val_score
# Same leave-one-out evaluation, delegated to cross_val_score. The
# 'neg_mean_squared_error' scorer returns negated MSE values, so the sign is
# flipped before averaging.
cv = LeaveOneOut()
scores = cross_val_score(
    estimator=model, X=X, y=y, scoring='neg_mean_squared_error', cv=cv
)

loocv_mse = np.mean(-scores)
print(f"MSE(LOOCV)：{loocv_mse}")

MSE(LOOCV)：1.0675673489857438


## K-FoldCV

In [98]:
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
# Manual repeated k-fold CV: k folds, repeated n_repeats times, collecting the
# MSE of every fold in mse_list (k * n_repeats values in total).
k = 5
n_repeats = 3

mse_list = []

model = LinearRegression()
# For a single (non-repeated) pass, KFold(n_splits=k, shuffle=True) could be
# used here instead.
cv = RepeatedKFold(n_splits=k, n_repeats=n_repeats, random_state=0)

for train_index, test_index in cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    # The splitter yields positional indices, so select from the Series by
    # position (.iloc). Plain y[...] would look the values up as index labels,
    # which only matches while y keeps its default 0..n-1 index.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the model
    model.fit(X_train, y_train)

    # Predict on the held-out fold
    y_pred = model.predict(X_test)

    # MSE
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [99]:
# Mean and standard deviation of the per-fold MSE values in mse_list.
# The closing parenthesis now follows the label ("MSE(5FoldCV): ...") instead
# of trailing the number.
print(f"MSE({k}FoldCV): {np.mean(mse_list)}")
print(f"std: {np.std(mse_list)}")

MSE(5FoldCV : 1.0746387233165982)
std: 0.2651717854089844


In [100]:
# Show the individual per-fold MSE values behind the summary above.
mse_list

[0.8213090642766285,
 1.0745842125927976,
 1.0880123892600388,
 1.3323867714930204,
 1.084763004349474,
 1.1587839131131425,
 1.6042084002514578,
 1.0307086207441924,
 0.7120290668798743,
 0.8472985410140897,
 0.8856103319481908,
 1.5248521639391936,
 0.6332659028150582,
 1.200354200262607,
 1.121414266809207]

In [101]:
# Cross-validated negated-MSE scores via cross_val_score, reusing the `cv`
# splitter and `model` already in scope; n_jobs=-1 runs the fits in parallel
# and verbose=True prints the progress log.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=True,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    2.7s finished


In [102]:
# Flip the sign of the negated-MSE scores to read them as plain per-fold MSE.
-scores

array([0.82130906, 1.07458421, 1.08801239, 1.33238677, 1.084763  ,
       1.15878391, 1.6042084 , 1.03070862, 0.71202907, 0.84729854,
       0.88561033, 1.52485216, 0.6332659 , 1.2003542 , 1.12141427])

## Pipeline + KFold

In [109]:
from sklearn.pipeline import Pipeline
# Chain scaling and regression into one estimator so that cross_val_score
# refits the scaler together with the model on each training split.
# Step names are ('scaler', 'model'); the regression step was previously
# misspelled, which would break lookups such as named_steps['model'].
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('model', LinearRegression())])

# Shuffled 5-fold CV with a fixed seed; scores are negated MSE values.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=cv)
scores

array([-0.82130906, -1.07458421, -1.08801239, -1.33238677, -1.084763  ])

## Pipelineなし

In [116]:
# Hold-out split, then scale and fit by hand (no Pipeline).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Learn the scaling parameters from the training split only and apply them
# to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit on the scaled training data and predict the scaled test data.
model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
y_pred

array([2.71486884, 2.78639251, 2.90900452, 1.65836207, 2.57999564,
       1.50509707, 2.74858715, 3.30136293, 2.77208778, 4.45800284,
       3.50060744, 3.49345507, 2.35520697, 2.24587793, 2.28879213,
       4.02375199, 1.77075641, 2.3480546 , 2.83645908, 3.2778623 ,
       3.98901192, 3.05511716, 2.55240794, 2.45431834, 2.29798803,
       2.59327861, 2.16004953, 3.96244599, 3.50162921, 2.5289073 ,
       2.42264357, 2.19274606, 2.49314547, 1.99963215, 2.78639251,
       2.28572683, 2.64743224, 1.97306622, 5.85577969, 2.55036441,
       1.79425705, 2.18763723, 2.52073317, 3.96755482, 2.22135553,
       2.65151931, 2.78128368, 3.12255376, 2.66173698, 3.66409011,
       4.2567148 , 2.74552185, 3.01118119, 5.83943142, 1.89847725,
       2.14676656, 3.97572896, 3.03161652, 2.37462053, 2.21113786,
       3.70496078, 2.53299437, 3.07963956, 3.47199797, 3.99718606,
       2.5043849 , 2.60043097, 4.2720413 , 1.97306622, 3.87763935,
       2.4890584 , 1.99145802, 3.43010554, 2.37972937])

## Pipelineあり

In [117]:
# Same hold-out split as the manual version (same test_size and seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Scaler and regressor bundled into one estimator: fit() trains both steps on
# the training split, and predict() applies the fitted scaler before the
# regressor.
pipeline = Pipeline([('scaler', StandardScaler()), ('model', LinearRegression())])
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
y_pred

array([2.71486884, 2.78639251, 2.90900452, 1.65836207, 2.57999564,
       1.50509707, 2.74858715, 3.30136293, 2.77208778, 4.45800284,
       3.50060744, 3.49345507, 2.35520697, 2.24587793, 2.28879213,
       4.02375199, 1.77075641, 2.3480546 , 2.83645908, 3.2778623 ,
       3.98901192, 3.05511716, 2.55240794, 2.45431834, 2.29798803,
       2.59327861, 2.16004953, 3.96244599, 3.50162921, 2.5289073 ,
       2.42264357, 2.19274606, 2.49314547, 1.99963215, 2.78639251,
       2.28572683, 2.64743224, 1.97306622, 5.85577969, 2.55036441,
       1.79425705, 2.18763723, 2.52073317, 3.96755482, 2.22135553,
       2.65151931, 2.78128368, 3.12255376, 2.66173698, 3.66409011,
       4.2567148 , 2.74552185, 3.01118119, 5.83943142, 1.89847725,
       2.14676656, 3.97572896, 3.03161652, 2.37462053, 2.21113786,
       3.70496078, 2.53299437, 3.07963956, 3.47199797, 3.99718606,
       2.5043849 , 2.60043097, 4.2720413 , 1.97306622, 3.87763935,
       2.4890584 , 1.99145802, 3.43010554, 2.37972937])