In [1]:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score

import pandas_profiling
import seaborn as sns

import warnings
warnings.filterwarnings('ignore') # 경고무시

In [2]:
boston = load_boston()
data = pd.DataFrame(boston.data, columns=boston.feature_names)
data['PRICE'] = boston.target
data.head(2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6


In [None]:
data.profile_report()

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
def ValidateScore(y_test, y_preds):
    mse = mean_squared_error(y_test, y_preds)
    rmse = np.sqrt(mse)

    print('MSE : {0:.3f}, RMSE : {1:.3f}'.format(mse, rmse))

In [5]:
y_data = data['PRICE']
X_data = data.drop(['PRICE'], axis=1, inplace=False)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=156)

In [7]:
lr = LinearRegression()
lr_normalize = LinearRegression(normalize=True)

In [9]:
lr.fit(X_train, y_train)
lr_normalize.fit(X_train, y_train)

In [10]:
y_preds = lr.predict(X_test)
y_preds_normaize = lr_normalize.predict(X_test)

In [12]:
ValidateScore(y_test, y_preds)
ValidateScore(y_test, y_preds_normaize)

MSE : 17.297, RMSE : 4.159
MSE : 17.297, RMSE : 4.159


In [14]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [19]:
def PolyLinearRegressionProcess(X_train, y_train, X_test, y_test):
    for degree in range (2, 5):
        model = Pipeline(
        [('poly', PolynomialFeatures(degree)),
        ('linear', LinearRegression(normalize=True))]
        )
        model.fit(X_train, y_train)
        print("degree : {} score(MSE / RMSE) : {} / {}".format(degree, -1*np.mean(cross_val_score(model, X_test, y_test, scoring='neg_mean_squared_error', cv=10)), np.sqrt(-1*np.mean(cross_val_score(model, X_test, y_test, scoring='neg_mean_squared_error', cv=10)))))

In [20]:
PolyLinearRegressionProcess(X_train, y_train, X_test, y_test)

degree : 2 score(MSE / RMSE) : 148.38427617055913 / 12.18130847530589
degree : 3 score(MSE / RMSE) : 1605.8474334135894 / 40.073026257241786
degree : 4 score(MSE / RMSE) : 1584.722843708967 / 39.80857751426151


In [18]:
from sklearn.linear_model import Ridge

In [25]:
alpha_set = [0, 0.1, 0.5, 1, 2, 100]
for alpha in alpha_set:
    ridge = Ridge(alpha = alpha)
    ridge.fit(X_train, y_train)
    neg_mse_scores = cross_val_score(ridge, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    avg_rmse = np.mean(np.sqrt(-1*neg_mse_scores))
    print('alpha : {} RMSE : {}'.format(alpha, avg_rmse))

alpha : 0 RMSE : 5.1624485712856725
alpha : 0.1 RMSE : 5.165443837807934
alpha : 0.5 RMSE : 5.186135867370069
alpha : 1 RMSE : 5.209166632082775
alpha : 2 RMSE : 5.237370705170855
alpha : 100 RMSE : 5.3790716302191965
