### sklearn.preprocessing.PolynomialFeatures
- class sklearn.preprocessing.PolynomialFeatures(degree=2, *, interaction_only=False, include_bias=True, order='C')

In [3]:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Build the 2x2 matrix [[0, 1], [2, 3]] of degree-1 (monomial) features that
# will later be expanded into polynomial features.
X = np.arange(4).reshape(2, -1)
print('일차 단항식의 계수 feature:\n', X)

일차 단항식의 계수 feature:
 [[0 1]
 [2 3]]


In [5]:
# Expand X into degree-2 polynomial features with PolynomialFeatures.
# fit_transform learns the feature layout from X and transforms it in one call.
poly = PolynomialFeatures(degree=2)
poly_ftr = poly.fit_transform(X)
print('변환된 2차 다항식 계수 feature:\n', poly_ftr)

변환된 2차 다항식 계수 feature:
 [[1. 0. 1. 0. 0. 1.]
 [1. 2. 3. 4. 6. 9.]]


In [10]:
def polynomial_func(X):
    """Evaluate the cubic y = 1 + 2*X + X**2 + X**3.

    Args:
        X: A number or a NumPy array. For arrays the arithmetic is applied
            element-wise.

    Returns:
        The value of the polynomial at X (for a NumPy array, an array of the
        same shape).
    """
    return 1 + 2*X + X**2 + X**3


X = np.arange(4).reshape(2,2)
# `y` has to be assigned in this cell: the print below and the later
# regression cell both read it, and without this line the cell raises
# NameError on a fresh kernel (Restart & Run All).
y = polynomial_func(X)
print('일차 단항식 계수 feature:\n', X)
print('삼차 다항식 결정값:\n', y)

일차 단항식 계수 feature:
 [[0 1]
 [2 3]]
삼차 다항식 결정값:
 [[ 1  5]
 [17 43]]


In [9]:
from sklearn.linear_model import LinearRegression

# Expand X into polynomial features of degree 3.
poly_ftr = PolynomialFeatures(degree=3).fit_transform(X)
print('3차 다항식 계수 feature: \n', poly_ftr)

# Fit a linear regression on the degree-3 features against the cubic targets y,
# then inspect the learned regression coefficients and their shape.
model = LinearRegression().fit(poly_ftr, y)
print('Polynomial 회귀 계수\n', np.round(model.coef_, 3))
print('Polynomial 회귀 Shape', model.coef_.shape)

3차 다항식 계수 feature: 
 [[ 1.  0.  1.  0.  0.  1.  0.  0.  0.  1.]
 [ 1.  2.  3.  4.  6.  9.  8. 12. 18. 27.]]
Polynomial 회귀 계수
 [[0.    0.024 0.024 0.048 0.072 0.096 0.096 0.144 0.216 0.312]
 [0.    0.057 0.057 0.114 0.171 0.228 0.228 0.342 0.514 0.742]]
Polynomial 회귀 Shape (2, 10)


In [13]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np
def polynomial_func(X):
    """Return the cubic 1 + 2*X + X**2 + X**3 evaluated at X.

    X may be a number or a NumPy array; for arrays the operators act
    element-wise.
    """
    return 1 + 2*X + X**2 + X**3
# Chain the polynomial feature transform and the linear regression into one
# streamlined Pipeline estimator. The pipeline is built here but not assigned
# to a name in this cell.
Pipeline(
    memory=None,
    steps=[
        ('poly', PolynomialFeatures(degree=2, include_bias=False,
                                    interaction_only=False, order='C')),
        ('linear', LinearRegression(copy_X=True, fit_intercept=True,
                                    n_jobs=None, positive=False)),
    ],
    verbose=False,
)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd
# The file is read with header=None, so the column labels are supplied here.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'PRICE',
]
# Fields are separated by runs of whitespace, hence the regex separator.
df = pd.read_csv('./datasets/housing.csv', sep=r"\s+", header=None, names=column_names)

In [15]:
# Separate the PRICE column (target) from the remaining feature columns.
X_data = df.drop(columns=['PRICE'])
y_target = df['PRICE']

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_target,
    test_size=0.3,
    random_state=156,
)

In [31]:
# Degree-2 polynomial expansion (no bias column) followed by a linear
# regression, bundled as a single estimator.
p_model = Pipeline(
    memory=None,
    steps=[
        ('poly', PolynomialFeatures(degree=2, include_bias=False,
                                    interaction_only=False, order='C')),
        ('linear', LinearRegression(copy_X=True, fit_intercept=True,
                                    n_jobs=None, positive=False)),
    ],
    verbose=False,
)
p_model

In [32]:
from sklearn.metrics import r2_score, mean_squared_error
# Fit the pipeline on the training split and predict the held-out test split.
y_preds = p_model.fit(X_train, y_train).predict(X_test)

# Mean squared error on the test split and its square root (RMSE).
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)

In [33]:
# Fit on the training split and score on the held-out test split.
p_model.fit(X_train, y_train)
y_preds = p_model.predict(X_test)

# Mean squared error on the test split and its square root (RMSE).
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)
print(f'MSE : {mse}, RMSE : {rmse}')
# The value reported under this label is r2_score, rounded to three decimals.
print('Variance score : {0:.3f}'.format(r2_score(y_test, y_preds)))

MSE : 15.55575230923985, RMSE : 3.9440781317362172
Varanxe score : 0.782


In [43]:
# Report r2_score again with the default 'f' precision (six decimal places).
print('Variance score : {0:f}'.format(r2_score(y_test, y_preds)))

Varanxe score : 0.78166471631
