## 다항회귀 (polynomial regression)
- 선형회귀는 각 속성을 독립적으로 다루기 때문에 여러 변수들 사이의 관계를 최종 예측 결과에 반영하지 못한다.
- 다항회귀를 통해 속성들 사이에 있을 수 있는 복잡한 관계들을 프로그램에 학습시킬 수 있다. 

In [51]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures # 다항 속성을 만들어 주는 툴 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error

In [30]:
# Load the Boston housing dataset through scikit-learn's dataset loader.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned version (or switch datasets) before rerunning.
boston_dataset=load_boston()

In [31]:
# Display the raw feature matrix of the Boston housing dataset.
boston_dataset.data

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [32]:
# The feature matrix has 506 rows and 13 columns.
boston_dataset.data.shape

(506, 13)

In [33]:
# Names of the 13 input features.
boston_dataset.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [34]:
# 다항 속성(가상의 열)을 추가한 뒤 데이터프레임에 넣어줄 것

In [35]:
# Transformer that expands the inputs into degree-2 polynomial features.
polynomial_transformer= PolynomialFeatures(2)

In [36]:
# Fit the transformer on the raw features and build the expanded feature matrix.
polynomial_data= polynomial_transformer.fit_transform(boston_dataset.data)

In [37]:
# Inspect the expanded matrix; its first column is the constant 1 term.
polynomial_data

array([[1.00000000e+00, 6.32000000e-03, 1.80000000e+01, ...,
        1.57529610e+05, 1.97656200e+03, 2.48004000e+01],
       [1.00000000e+00, 2.73100000e-02, 0.00000000e+00, ...,
        1.57529610e+05, 3.62766600e+03, 8.35396000e+01],
       [1.00000000e+00, 2.72900000e-02, 0.00000000e+00, ...,
        1.54315409e+05, 1.58310490e+03, 1.62409000e+01],
       ...,
       [1.00000000e+00, 6.07600000e-02, 0.00000000e+00, ...,
        1.57529610e+05, 2.23851600e+03, 3.18096000e+01],
       [1.00000000e+00, 1.09590000e-01, 0.00000000e+00, ...,
        1.54802902e+05, 2.54955600e+03, 4.19904000e+01],
       [1.00000000e+00, 4.74100000e-02, 0.00000000e+00, ...,
        1.57529610e+05, 3.12757200e+03, 6.20944000e+01]])

In [38]:
# 105 columns = 1 constant column + 13 original features
#             + 13 squared terms + 78 pairwise products.
polynomial_data.shape

(506, 105)

In [46]:
# Column labels for the expanded matrix, derived from the original feature names.
# get_feature_names() is deprecated since scikit-learn 1.0 in favour of
# get_feature_names_out(), so prefer the new method when the installed version
# provides it and fall back to the old one otherwise.
if hasattr(polynomial_transformer, "get_feature_names_out"):
    # get_feature_names_out returns an ndarray; list(...) keeps the result a
    # plain list of strings, matching what the old method returned.
    polynomial_feature_names = list(
        polynomial_transformer.get_feature_names_out(boston_dataset.feature_names)
    )
else:
    polynomial_feature_names = polynomial_transformer.get_feature_names(
        boston_dataset.feature_names
    )

In [47]:
# List the generated term names: the constant '1', each original feature,
# squares such as 'CRIM^2' and pairwise products such as 'CRIM ZN'.
polynomial_feature_names

['1',
 'CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NOX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT',
 'CRIM^2',
 'CRIM ZN',
 'CRIM INDUS',
 'CRIM CHAS',
 'CRIM NOX',
 'CRIM RM',
 'CRIM AGE',
 'CRIM DIS',
 'CRIM RAD',
 'CRIM TAX',
 'CRIM PTRATIO',
 'CRIM B',
 'CRIM LSTAT',
 'ZN^2',
 'ZN INDUS',
 'ZN CHAS',
 'ZN NOX',
 'ZN RM',
 'ZN AGE',
 'ZN DIS',
 'ZN RAD',
 'ZN TAX',
 'ZN PTRATIO',
 'ZN B',
 'ZN LSTAT',
 'INDUS^2',
 'INDUS CHAS',
 'INDUS NOX',
 'INDUS RM',
 'INDUS AGE',
 'INDUS DIS',
 'INDUS RAD',
 'INDUS TAX',
 'INDUS PTRATIO',
 'INDUS B',
 'INDUS LSTAT',
 'CHAS^2',
 'CHAS NOX',
 'CHAS RM',
 'CHAS AGE',
 'CHAS DIS',
 'CHAS RAD',
 'CHAS TAX',
 'CHAS PTRATIO',
 'CHAS B',
 'CHAS LSTAT',
 'NOX^2',
 'NOX RM',
 'NOX AGE',
 'NOX DIS',
 'NOX RAD',
 'NOX TAX',
 'NOX PTRATIO',
 'NOX B',
 'NOX LSTAT',
 'RM^2',
 'RM AGE',
 'RM DIS',
 'RM RAD',
 'RM TAX',
 'RM PTRATIO',
 'RM B',
 'RM LSTAT',
 'AGE^2',
 'AGE DIS',
 'AGE RAD',
 'AGE TAX',
 'AGE PTRATIO',
 'AGE B',
 'AGE LSTAT',
 'DI

In [48]:
# Wrap the expanded matrix in a DataFrame with one labelled column per term.
x=pd.DataFrame(polynomial_data, columns=polynomial_feature_names)

In [49]:
# Preview the input DataFrame.
x

Unnamed: 0,1,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,...,TAX^2,TAX PTRATIO,TAX B,TAX LSTAT,PTRATIO^2,PTRATIO B,PTRATIO LSTAT,B^2,B LSTAT,LSTAT^2
0,1.0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,...,87616.0,4528.8,117482.40,1474.08,234.09,6072.570,76.194,157529.6100,1976.5620,24.8004
1,1.0,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,...,58564.0,4307.6,96049.80,2211.88,316.84,7064.820,162.692,157529.6100,3627.6660,83.5396
2,1.0,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,...,58564.0,4307.6,95064.86,975.26,316.84,6992.374,71.734,154315.4089,1583.1049,16.2409
3,1.0,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,...,49284.0,4151.4,87607.86,652.68,349.69,7379.581,54.978,155732.8369,1160.2122,8.6436
4,1.0,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,...,49284.0,4151.4,88111.80,1183.26,349.69,7422.030,99.671,157529.6100,2115.4770,28.4089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,1.0,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,...,74529.0,5733.0,107013.27,2639.91,441.00,8231.790,203.070,153656.1601,3790.5433,93.5089
502,1.0,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,...,74529.0,5733.0,108353.70,2478.84,441.00,8334.900,190.680,157529.6100,3603.8520,82.4464
503,1.0,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,...,74529.0,5733.0,108353.70,1539.72,441.00,8334.900,118.440,157529.6100,2238.5160,31.8096
504,1.0,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,...,74529.0,5733.0,107411.85,1769.04,441.00,8262.450,136.080,154802.9025,2549.5560,41.9904


In [53]:
# From here on the workflow is identical to multiple linear regression.
# Look at the target values (the model's output variable).
boston_dataset.target

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [55]:
# Put the target values into a single-column DataFrame.
# NOTE(review): the Boston target is conventionally labelled 'MEDV'; 'MEDIV'
# looks like a typo — confirm before renaming, since the label appears in output.
y=pd.DataFrame(boston_dataset.target, columns=['MEDIV'])

In [56]:
# Preview the target DataFrame.
y

Unnamed: 0,MEDIV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
...,...
501,22.4
502,20.6
503,23.9
504,22.0


In [57]:
# 입력(x)과 출력(y)이 모두 준비되었으니 학습 데이터와 테스트 데이터로 나누기

In [58]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test= train_test_split(x,y,test_size=0.2, random_state=5)

In [59]:
# Report the (rows, columns) shape of each piece of the train/test split,
# in the order: train inputs, test inputs, train targets, test targets.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

(404, 105)
(102, 105)
(404, 1)
(102, 1)


In [61]:
# Fit a linear regression model on the polynomial training features.
# The cell must end with the bare fit(...) call so the notebook echoes the
# fitted estimator.
model=LinearRegression()
model.fit(x_train,y_train)

LinearRegression()

In [62]:
# Learned coefficients (the theta values).
model.coef_

array([[-2.55720156e-07, -5.09146958e+00, -1.65753983e-01,
        -5.97358604e+00,  2.43179237e+01,  1.65180559e+02,
         2.19910116e+01,  1.03167123e+00, -5.66895775e+00,
         3.22443249e+00, -1.10055943e-02,  5.35127787e+00,
        -4.81524408e-02,  7.53109325e-01,  2.16774682e-03,
         2.69938772e-01,  5.87901385e-01,  2.41731932e+00,
        -2.52413195e-02,  8.92859572e-02, -5.18832420e-03,
        -5.77807152e-02,  3.55602049e-01, -3.86092281e-02,
         5.43572100e-01, -3.18134358e-04,  2.40035425e-02,
        -7.48850220e-04, -7.16133309e-03, -1.06886010e-01,
        -1.27782609e+00,  2.50137719e-02,  1.14111417e-04,
        -1.25254119e-02, -4.68024813e-03,  6.05725185e-04,
        -8.57873132e-03,  1.85030053e-03, -4.64730601e-03,
         3.08484808e-02, -2.09065897e-01,  1.30035723e+00,
         3.13497405e-01,  6.72540164e-04,  7.51823883e-02,
        -7.38014891e-03,  4.23364348e-04, -6.72155118e-03,
         6.42107774e-03, -5.32275093e-03,  2.43179283e+0

In [63]:
 model.intercept_  # learned intercept term (theta0)

array([-141.89855575])

In [64]:
# 모델 성능 평가

In [67]:
# Predict target values for the held-out test inputs.
y_test_prediction=model.predict(x_test)

In [68]:
# Inspect the predicted values.
y_test_prediction

array([[42.00919669],
       [28.22372856],
       [25.52688254],
       [ 9.05454748],
       [33.5903981 ],
       [10.49656653],
       [23.05971881],
       [30.35330832],
       [24.22461029],
       [22.30153138],
       [33.11791368],
       [20.74987646],
       [20.19664017],
       [32.37012974],
       [27.35813074],
       [20.46264672],
       [13.68533392],
       [12.51890372],
       [15.88730519],
       [12.47719   ],
       [ 3.72827178],
       [20.49819423],
       [44.0935012 ],
       [23.31624023],
       [33.2791559 ],
       [ 9.43400657],
       [24.71325022],
       [21.79459244],
       [24.06481669],
       [27.42603119],
       [15.32893992],
       [ 6.80742071],
       [16.76243454],
       [13.13446141],
       [25.10746984],
       [22.92666537],
       [29.58310464],
       [10.66362648],
       [47.75889196],
       [35.24353036],
       [19.90943076],
       [15.25028015],
       [28.15702648],
       [14.02415347],
       [26.11171521],
       [28

In [69]:
# 예측값이 실제값과 얼마나 괴리가 있는지 보자
# RMSE(평균제곱근오차)로 확인

In [72]:
# RMSE on the test set: the square root of the mean squared error between
# the true targets and the predictions.
test_mse = mean_squared_error(y_test, y_test_prediction)
test_mse ** 0.5

3.1965276513721417

In [73]:
# 다중선형회귀로 했을 때는 RMSE가 4점대였는데, 다항회귀의 오차(약 3.20)가 더 작으므로 이 데이터에서는 다항회귀 모델이 더 적합하다고 볼 수 있다