# 특성 공학과 규제

- 다중 회귀 : 다른 특성 여러가지를 함께 사용(길이, 높이, 두께 etc...)
- 릿지와 라쏘

## 특성 공학

- 기존의 특성을 사용해 새로운 특성을 뽑아내는 작업(길이 x 높이, 두께 x 높이, etc...)

In [4]:
import pandas as pd

# Read the fish table from disk, then keep only the rows whose Species is Perch.
df = pd.read_csv('data/Fish.csv')
perch_df = df[df['Species'] == 'Perch']

In [5]:
# Inputs are the Length2, Height and Width columns; the target is Weight.
# List selectors keep both selections as DataFrames.
perch_full = perch_df.loc[:, ['Length2', 'Height', 'Width']]
perch_weight = perch_df.loc[:, ['Weight']]

In [15]:
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so the train/test split -- and every score computed
# from it -- is identical on each run. Without a seed each execution draws a
# different random split, so results cannot be reproduced or compared.
train_input, test_input, train_target, test_target = train_test_split(
    perch_full, perch_weight, random_state=42
)

In [19]:
from sklearn.preprocessing import PolynomialFeatures

# include_bias=False leaves out the constant 1 column (the term that would
# be multiplied by the intercept), so only real feature terms are generated.
poly = PolynomialFeatures(include_bias=False)

In [20]:
# Fit on a single two-feature sample and expand it in one call.
poly.fit_transform([[3, 5]])

array([[ 3.,  5.,  9., 15., 25.]])

In [22]:
# Build a fresh transformer, learn the feature layout from the training
# inputs, then expand them.
poly = PolynomialFeatures(include_bias=False).fit(train_input)
train_poly = poly.transform(train_input)
train_poly[:5]

array([[  28.        ,    7.8204    ,    4.2042    ,  784.        ,
         218.9712    ,  117.7176    ,   61.15865616,   32.87852568,
          17.67529764],
       [  26.5       ,    7.168     ,    4.144     ,  702.25      ,
         189.952     ,  109.816     ,   51.380224  ,   29.704192  ,
          17.172736  ],
       [  40.        ,   11.73      ,    7.225     , 1600.        ,
         469.2       ,  289.        ,  137.5929    ,   84.74925   ,
          52.200625  ],
       [  30.        ,    7.6156    ,    4.7716    ,  900.        ,
         228.468     ,  143.148     ,   57.99736336,   36.33859696,
          22.76816656],
       [  22.5       ,    5.856     ,    3.624     ,  506.25      ,
         131.76      ,   81.54      ,   34.292736  ,   21.222144  ,
          13.133376  ]])

In [23]:
# Show the names of the generated feature columns.
poly.get_feature_names_out()

array(['Length2', 'Height', 'Width', 'Length2^2', 'Length2 Height',
       'Length2 Width', 'Height^2', 'Height Width', 'Width^2'],
      dtype=object)

In [27]:
from sklearn.linear_model import LinearRegression

# Train on the expanded training features, then report the score on that
# same training data.
lr = LinearRegression().fit(train_poly, train_target)
lr.score(train_poly, train_target)

0.9924338144201501

In [29]:
# Deliberately fails: test_input was never passed through poly.transform, so
# its column count does not match what the model was fitted on.
lr.score(test_input, test_target)



ValueError: X has 3 features, but LinearRegression is expecting 9 features as input.

In [30]:
# Expand the test inputs with the transformer that was fitted on the training inputs.
test_poly = poly.transform(test_input)

In [31]:
# Score the model on the test set, now using the expanded test features.
lr.score(test_poly, test_target)

0.9462838074302193

a * 길이 + b * 높이 + c * 너비에 더해, e * 길이^2, f * (높이 * 길이) 등 여러 가지 특성을 만드는 작업을 했다.

In [32]:
# degree=5 raises the maximum polynomial degree of the generated terms, which
# greatly increases the number of feature columns.
poly = PolynomialFeatures(degree=5, include_bias=False).fit(train_input)
train_poly, test_poly = (poly.transform(part) for part in (train_input, test_input))
train_poly.shape

(42, 55)

In [33]:
# Refit the existing LinearRegression object on the degree-5 training features.
lr.fit(train_poly, train_target)

In [34]:
# Score of the degree-5 model on its own training data.
lr.score(train_poly, train_target)

0.9999999977634051

In [36]:
# Score of the degree-5 model on the test data; a result far below the
# training score indicates overfitting.
lr.score(test_poly, test_target)

-15.812255881658814

- 특성 공학은 컬럼이 단순하게 많아지는 것이 중요한 게 아님
- 실제로 연관성이 있는 컬럼 조합을 추출해내는 것이 중요

## 규제

- 기울기를 줄여 보편적인 패턴의 학습을 유도하는 기능

In [43]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same scaling to both the training and the test features.
ss = StandardScaler().fit(train_poly)
train_scaled, test_scaled = ss.transform(train_poly), ss.transform(test_poly)

In [44]:
# Peek at the first five rows of the scaled training features.
train_scaled[:5]

array([[-0.14578336, -0.17056674, -0.49296615, -0.26728965, -0.28581203,
        -0.44523435, -0.30519851, -0.44832702, -0.5727036 , -0.36854798,
        -0.38349291, -0.48006527, -0.39850208, -0.48760824, -0.56580009,
        -0.41303589, -0.49468214, -0.56565739, -0.62423299, -0.44684737,
        -0.45920498, -0.52275039, -0.47110436, -0.53084961, -0.58318537,
        -0.48216458, -0.53799046, -0.5864668 , -0.62652988, -0.49205211,
        -0.54383572, -0.58829047, -0.62439407, -0.65150464, -0.50293466,
        -0.51334624, -0.55712771, -0.52298137, -0.56489961, -0.60135086,
        -0.5315577 , -0.57148355, -0.60593907, -0.63422945, -0.5388305 ,
        -0.57660909, -0.60888423, -0.63494659, -0.6542904 , -0.54460137,
        -0.58005938, -0.60995187, -0.63356556, -0.65040979, -0.66026588],
       [-0.33245718, -0.41813484, -0.53070453, -0.43340588, -0.47839626,
        -0.53265817, -0.51570922, -0.56206246, -0.6021474 , -0.51026294,
        -0.53876137, -0.57148945, -0.56321406, -0.

### 릿지모델
- ax + b 에서 a^2 을 기준으로 규제 적용

### 라쏘모델
- ax + b 에서 a의 절댓값을 기준으로 규제 적용

In [72]:
from sklearn.linear_model import Ridge

# alpha sets the regularization strength; a good value has to be found by
# trying candidates by hand.
ridge = Ridge(alpha=465).fit(train_scaled, train_target)

# Print the training score, then the test score, one per line.
print(
    ridge.score(train_scaled, train_target),
    ridge.score(test_scaled, test_target),
    sep='\n',
)

0.9602133070579705
0.955402933456237


In [64]:
from sklearn.linear_model import Lasso

# alpha sets the regularization strength; a good value has to be found by
# trying candidates by hand.
lasso = Lasso(alpha=40).fit(train_scaled, train_target)

# Print the training score, then the test score, one per line.
print(
    lasso.score(train_scaled, train_target),
    lasso.score(test_scaled, test_target),
    sep='\n',
)

0.9774784710595474
0.9747033992045373


In [73]:
# Inspect the fitted Lasso coefficients, one per feature column.
lasso.coef_

array([  0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        , 131.15873426,
         3.52408489, 114.16607944,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,  10.45860083,
         0.        ,   0.        ,  39.58834559,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ])

In [75]:
# Inspect the fitted Ridge coefficients for comparison with the Lasso ones.
ridge.coef_

array([5.43380363, 5.2924826 , 5.26854455, 5.42629567, 5.36428159,
       5.36645602, 5.25710247, 5.25331284, 5.18563365, 5.37684429,
       5.33992716, 5.34979461, 5.27408996, 5.28035408, 5.24601462,
       5.18309333, 5.18323159, 5.14363196, 5.06524451, 5.2999429 ,
       5.27856522, 5.29172128, 5.23506287, 5.24725861, 5.22956655,
       5.17037217, 5.17971647, 5.15888361, 5.10758313, 5.0856869 ,
       5.09029748, 5.06449643, 5.00799057, 4.92170678, 5.20643433,
       5.19703667, 5.21100268, 5.16907208, 5.18420632, 5.17583759,
       5.12249851, 5.1373567 , 5.12812637, 5.09426846, 5.0576193 ,
       5.07058191, 5.05889385, 5.02186214, 4.95939672, 4.9751205 ,
       4.98447572, 4.96865012, 4.92685146, 4.85903077, 4.76605913])