# hold-out でデータを分割

In [80]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import seaborn as sns
df = sns.load_dataset('tips')

In [81]:
y_col = 'tip'
X = df.drop(columns=y_col)
# 標準化のために数値カラムのリストを取得　
numeric_cols = X.select_dtypes(include=np.number).columns.to_list()
X = pd.get_dummies(X, drop_first=True)
y = df[y_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [82]:
# 標準化は必ずデータをsplitしてから行うこと。なぜなら、X_trainだけを標準化したいから
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
#標準化するのは数値のカラムのみ
X_train_scaled = X_train.copy()
#数値カラムのみ標準化
scaler.fit(X_train[numeric_cols])
X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test_scaled = X_test.copy()
#すでに、trainデータでscalerを学習させているので再度、X_testで学習させる意味はないし、してはいけない
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [83]:
#線形回帰モデル学習
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [84]:
# 精度確認(MSE)  モデルの精度
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred)

0.955080898861715

In [85]:
# 標準化されていないデータで線形回帰モデル学習
model_1 = LinearRegression()
model_1.fit(X_train, y_train)
y_pred_1 = model_1.predict(X_test)

In [86]:
#精度確認
mean_squared_error(y_test, y_pred_1)

0.9550808988617131

# LOOCVで線形回帰を学習・評価する

## LOO Leave One Out

In [87]:
#データ準備
X = df['total_bill'].values.reshape(-1, 1)
y = df['tip']
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()

In [88]:
#cross validation行う
model = LinearRegression()
mse_list = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    #モデル学習
    model.fit(X_train, y_train)
    #テストでーたの予測
    y_pred = model.predict(X_test)
    #MSE
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [89]:
print(np.mean(mse_list))

1.0675673489857438


In [90]:
#上のやり方では、コードを書くのが手間なので一発で処理する方法がある
from sklearn.model_selection import cross_val_score
cv = LeaveOneOut()
scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')
print(-np.mean(scores))

1.0675673489857438


# 一般的に使用される評価方法k-foldCV

In [91]:
from sklearn.model_selection import KFold
k = 5
cv = KFold(n_splits=k, shuffle=True, random_state=0)
mse_list = []
for train_index, test_index in cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    #標準化するならこの段階でやること
    #モデル学習
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    #MSE
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [92]:
print(np.mean(mse_list))

1.0802110883943916


In [93]:
#一発で求める方法
scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)

# Repeated k-Fold CV k-foldCVを複数回ぶん回す

In [94]:
X

array([[16.99],
       [10.34],
       [21.01],
       [23.68],
       [24.59],
       [25.29],
       [ 8.77],
       [26.88],
       [15.04],
       [14.78],
       [10.27],
       [35.26],
       [15.42],
       [18.43],
       [14.83],
       [21.58],
       [10.33],
       [16.29],
       [16.97],
       [20.65],
       [17.92],
       [20.29],
       [15.77],
       [39.42],
       [19.82],
       [17.81],
       [13.37],
       [12.69],
       [21.7 ],
       [19.65],
       [ 9.55],
       [18.35],
       [15.06],
       [20.69],
       [17.78],
       [24.06],
       [16.31],
       [16.93],
       [18.69],
       [31.27],
       [16.04],
       [17.46],
       [13.94],
       [ 9.68],
       [30.4 ],
       [18.29],
       [22.23],
       [32.4 ],
       [28.55],
       [18.04],
       [12.54],
       [10.29],
       [34.81],
       [ 9.94],
       [25.56],
       [19.49],
       [38.01],
       [26.41],
       [11.24],
       [48.27],
       [20.29],
       [13.81],
       [

In [95]:
from sklearn.model_selection import RepeatedKFold
k = 5
n_repeats = 3
cv = RepeatedKFold(n_splits=k,n_repeats=n_repeats, random_state=0)
mse_list = []
for train_index, test_index in cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    #標準化するならこの段階でやること
    #モデル学習
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    #MSE
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [57]:
print(np.mean(mse_list))

1.0746387233165982


# テーブルデータの時は、K-Fold CVやRepeated-K-Foldを使うのが一般的

# PIPELINE

In [61]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('model', LinearRegression())])

In [63]:
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=cv)

# 回帰の評価指標　MSE,　RMSE, MAEを試す

In [69]:
#データの準備
X = df['total_bill'].values.reshape(-1, 1)
y = df['tip'].values

#hold-out
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#train
model = LinearRegression()
model.fit(X_train,y_train)

#predict
y_pred = model.predict(X_test)

In [71]:
#MSE
from sklearn.metrics import mean_squared_error as MSE
MSE(y_test, y_pred)

0.8711845537539947

In [73]:
#RMSE
MSE(y_test, y_pred, squared=False)

0.933372676777071

In [74]:
from sklearn.metrics import mean_absolute_error as MAE
MAE(y_test, y_pred)

0.6903119067790222

 # scikit-learn で決定係数を求める

In [75]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.49515102188632776

In [78]:
number_columns = df.select_dtypes(include=np.number)
number_columns.corr()

Unnamed: 0,total_bill,tip,size
total_bill,1.0,0.675734,0.598315
tip,0.675734,1.0,0.489299
size,0.598315,0.489299,1.0
