## 汎化性能と過学習

In [2]:
import seaborn as sns

In [3]:
# Load the diamonds dataset and keep only rows whose x, y and z are all non-zero.
df = sns.load_dataset('diamonds')
nonzero_dims = df[['x', 'y', 'z']].ne(0).all(axis=1)
df = df[nonzero_dims]

# Single feature (carat) reshaped into one column; the target is price.
X = df['carat'].to_numpy().reshape(-1, 1)
y = df['price'].to_numpy()

In [4]:
# Split the data into a training set and a test set.
from sklearn.model_selection import train_test_split

# shuffle: whether the rows are shuffled before the split is made.
# random_state pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    shuffle=True,
    random_state=0,
)

In [5]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split (fit returns the estimator
# itself), then predict the test split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

## LOOCVについて

In [7]:
import numpy as np

# Switch to the tips dataset: predict the tip from the total bill.
df = sns.load_dataset('tips')
X = df['total_bill'].to_numpy().reshape(-1, 1)  # one feature column
y = df['tip'].to_numpy()

In [9]:
# Use leave-one-out cross-validation (LOOCV).
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
# To inspect the folds, iterate over loo.split(X) and print each
# (train_index, test_index) pair.

In [36]:
# Leave-one-out cross-validation: fit on every row but one, score on the
# held-out row, and collect the per-fold squared error.

# Create a fresh estimator here so this cell does not depend on a model
# object left over from an earlier cell.
model = LinearRegression()

mse_list = []
for train_index, test_index in loo.split(X):
    # One held-out test sample; all remaining rows are the training data.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit on the training rows and predict the held-out sample.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # With a single test sample the mean is just that sample's squared error.
    # np.mean is kept so the same code also works for folds with more than
    # one sample and yields a scalar instead of a length-1 array.
    mse = np.mean((y_pred - y_test) ** 2)
    mse_list.append(mse)

print(f'MSE(LOOCV):{np.mean(mse_list)}')
print(f'std:{np.std(mse_list)}')

MSE(LOOCV):1.0675673489857438
std:2.099794455177631


## k-Fold Cross Validation

In [53]:
from sklearn.model_selection import KFold

# Five-fold splitter; rows are shuffled with a fixed seed so the folds are
# the same on every run.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
model = LinearRegression()
# To inspect the folds, iterate over cv.split(X) and print each
# (train_index, test_index) pair.

In [56]:
# Manual k-fold cross-validation with the `cv` splitter: fit on each
# training fold, score on the matching test fold, and collect the MSEs.
mse_list = []
for train_index, test_index in cv.split(X):
    # Both halves of the training data are rebound on every fold, so
    # X_train and y_train always describe the same rows.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Mean squared error over this fold's test rows.
    mse = np.mean((y_test - y_pred)**2)
    mse_list.append(mse)
print(f"MSE(5FoldCV): {np.mean(mse_list)}")
print(f"std: {np.std(mse_list)}")

MSE(5FoldCV): 1.080211088394392
std: 0.16170100507039514


## cross_val_scoreを使用した方法

In [57]:
from sklearn.model_selection import cross_val_score

# Same cross-validation as the manual loop, done by scikit-learn in one call.
# The scorer is the *negated* MSE, so the returned values are <= 0.
scores = cross_val_score(
    model,
    X,
    y,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=1,
)

In [59]:
# Average of the per-fold scores; negative because the scorer is negated MSE.
scores.mean()

-1.080211088394392

## 回帰モデルの評価指標を一挙に解説

In [63]:
# MSE (Mean Squared Error)

from sklearn.metrics import mean_squared_error

# Reload the tips data and rebuild the feature column / target.
df = sns.load_dataset('tips')
X = df['total_bill'].values.reshape(-1, 1)
y = df['tip'].values

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=0)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. MSE happens to be
# symmetric, but passing the arguments correctly keeps this call consistent
# with the other metric cells.
mean_squared_error(y_test, y_pred)

0.8711845537539947

In [64]:
# RMSE (Root Mean Squared Error)

# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in later releases, so take the square root
# of the MSE explicitly; this works on every scikit-learn version.
np.sqrt(mean_squared_error(y_test, y_pred))

0.933372676777071

In [65]:
# MAE (Mean Absolute Error)
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_true=y_test, y_pred=y_pred)

0.6903119067790223

In [66]:
# Coefficient of determination (R^2) on the test split.
from sklearn.metrics import r2_score

r2_score(y_true=y_test, y_pred=y_pred)

0.49515102188632776

In [71]:
# 2x2 correlation matrix of total_bill and y (the tip); the off-diagonal
# entry is the correlation between the two.
np.corrcoef(df['total_bill'].to_numpy(), y)

array([[1.        , 0.67573411],
       [0.67573411, 1.        ]])

In [74]:
# Adjusted coefficient of determination:
#   1 - (1 - R^2) * (n - 1) / (n - p - 1)
# where n is the number of test rows and p the number of feature columns.
r2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - r2) * (X_test.shape[0] - 1) / (X_test.shape[0] - X_test.shape[1] - 1)
adj_r2

0.48813923052363783

In [75]:
import statsmodels.api as sma

# Fit OLS with statsmodels on the full dataset (X, y) -- note that this is
# NOT restricted to the training split, so its R-squared is not directly
# comparable with the test-split values computed with scikit-learn.
# add_constant is applied first so that the model includes an intercept term.
X2 = sma.add_constant(X)
est = sma.OLS(y, X2)
est_trainded = est.fit()
# Print the regression summary table.
print(est_trainded.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.457
Model:                            OLS   Adj. R-squared:                  0.454
Method:                 Least Squares   F-statistic:                     203.4
Date:                Tue, 17 Oct 2023   Prob (F-statistic):           6.69e-34
Time:                        12:05:53   Log-Likelihood:                -350.54
No. Observations:                 244   AIC:                             705.1
Df Residuals:                     242   BIC:                             712.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9203      0.160      5.761      0.0