파이프 라인, 알고리즘 체인

-파이프 라인 : 코드를 한번 생성 완료해두면 계속 꺼내 쓸수 있음

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# 데이터 적재 및 분할
data = load_breast_cancer()
X = data.data; y = data.target
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y, random_state=42)

In [3]:
# minmax 0 1 범위로, 데이터의 특성간의 범위를 동일하게 조정 각 특성이 동등한 중요도
# K-최근접 이웃 같은 거리 기반은 입력 데이터를 0,1 범위로 조정 성능 향상
# 신경망의 경우.. 가중치 초기화 0~1값에 초기화..
# 동일한 스케일을 가지면 시각화에도 유리..
# 단점.. 이상치에 민감하다..
mms = MinMaxScaler()
X_train_scaled = mms.fit_transform(X_train)
X_test_scaled = mms.fit_transform(X_test)
svm = SVC()
svm.fit(X_train_scaled,y_train)
svm.score(X_train_scaled,y_train), svm.score(X_test_scaled,y_test)

(0.9788732394366197, 0.972027972027972)

In [4]:
# 교차검증 및 매개 변수 튜닝
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C':[0.001, 0.01, 0.1, 1, 10, 100],
    'gamma':[0.001, 0.01, 0.1, 1, 10, 100]
}

grid = GridSearchCV(SVC(),param_grid, cv=5)
grid.fit(X_train_scaled, y_train)
print(f'best param : {grid.best_params_}')
print(f'best score : {grid.best_score_}')
print(f'best model test score : {grid.best_estimator_.score(X_test,y_test)}')

best param : {'C': 10, 'gamma': 0.1}
best score : 0.9765253077975377
best model test score : 0.3706293706293706


파이프 라인 구축

In [5]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('svm',SVC())
])

In [6]:
pipe.fit(X_train,y_train)

In [7]:
pipe.score(X_test, y_test)

0.9790209790209791

In [8]:
# 교차검증 및 매개 변수 튜닝
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

param_grid = {
    'svm__C':[0.001, 0.01, 0.1, 1, 10, 100],
    'svm__gamma':[0.001, 0.01, 0.1, 1, 10, 100]
}

grid = GridSearchCV(pipe, param_grid, cv=5) # 튜닝 대상이 되는 객체
grid.fit(X_train, y_train)
print(f'best param : {grid.best_params_}')
print(f'best score : {grid.best_score_}')
print(f'best model test score : {grid.best_estimator_.score(X_test,y_test)}')

best param : {'svm__C': 100, 'svm__gamma': 0.01}
best score : 0.9765253077975377
best model test score : 0.986013986013986


또 다른 파이프라인 적용

In [9]:
import numpy as np
rnd = np.random.RandomState(seed = 0)
X = rnd.normal(size = (100,1000))
y = rnd.normal(size = (100,))

In [10]:
from sklearn.feature_selection import SelectPercentile,f_regression
# SelevtPercentile 스코어 함수를 기반으로해서 특성 중 일부를 선택 f_regression 점수를 이용해서
# 각 특성과 타켓 변수 간의 선형관계를 측정해서 주요한 특성을 선택
# 백분율 기준 percentile = 5  ->  상위 5퍼센트 특성만 선택
select = SelectPercentile(score_func=f_regression, percentile=5).fit(X, y)
X_selected = select.fit_transform(X, y)
X_selected.shape      # (100,50)

(100, 50)

In [11]:
# 교차 검증 - Rige
# X vs X_selected
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
score = cross_val_score(Ridge(), X_selected,y)
print(score.mean())
score = cross_val_score(Ridge(), X, y)
print(score.mean())

0.3454773082470686
-0.08537816697038632


In [12]:
pipe = Pipeline([
    ('select', SelectPercentile(score_func=f_regression, percentile=5)),
    ('ridge',Ridge())
])

# 교차검증 = pipe
cross_val_score(pipe, X, y).mean()

-0.7849991501567428

파이프 라인 인터페이스
- make_pipeline

In [13]:
from sklearn.pipeline import make_pipeline

# 표준
pipe = Pipeline([
    ('scaler',MinMaxScaler()),
    ('svm',SVC())
])

# 간소화
pipe_s = make_pipeline(MinMaxScaler(), SVC())

In [14]:
pipe.steps

[('scaler', MinMaxScaler()), ('svm', SVC())]

In [15]:
pipe_s.steps

[('minmaxscaler', MinMaxScaler()), ('svc', SVC())]

In [16]:
from sklearn.decomposition import PCA
# 표준화. 특성 조정(PCA), 표준화
pipe = make_pipeline(MinMaxScaler(),PCA(),MinMaxScaler() )
print(f'파이프라인 각 단계 : {pipe.steps}')

파이프라인 각 단계 : [('minmaxscaler-1', MinMaxScaler()), ('pca', PCA()), ('minmaxscaler-2', MinMaxScaler())]


In [17]:
# 파이프라인 각 단계 접근
pipe.fit(data.data)
# PCA주성분을 추출
components = pipe.named_steps['pca'].components_
components.shape

(30, 30)

그리드 서치를 적용한 파이프라인 속성에 접근

In [18]:
# Standard, logisticregression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = make_pipeline(StandardScaler(),LogisticRegression(max_iter=500))

param_grid = {
    'logisticregression__C' : [0.001,0.01,0.1,1,10,100]
}

grid = GridSearchCV(pipe,param_grid=param_grid, cv=5)
grid.fit(data.data,data.target)

In [19]:
# 로지스틱 회귀 단계
grid.best_estimator_.named_steps['logisticregression']

In [20]:
grid.best_estimator_.named_steps['logisticregression'].coef_

array([[-0.36317072, -0.38770299, -0.35111444, -0.43555896, -0.16176493,
         0.56259548, -0.85989759, -0.96222175,  0.07616775,  0.32218585,
        -1.29103479,  0.26892526, -0.65988685, -1.01250087, -0.27717046,
         0.7362824 ,  0.11052647, -0.3335099 ,  0.29590225,  0.68091528,
        -1.02936114, -1.31459312, -0.82331706, -1.01059356, -0.67073135,
         0.04465888, -0.87330064, -0.91195815, -0.88789694, -0.47983137]])

보스턴 집

In [21]:
import pandas as pd
import numpy as np

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

In [22]:
data.shape, target.shape

((506, 13), (506,))

In [23]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge

pipe = make_pipeline(StandardScaler(), PolynomialFeatures(),Ridge() )
pipe.fit(data,target)
pipe.score(data,target)

scores = cross_val_score(pipe,data,target,cv=10)
scores

array([ 0.11685297,  0.72655465, -1.74902176,  0.65977331,  0.83057206,
        0.88349556, -0.08217666, -0.22291166, -2.83524431, -2.13746458])

In [24]:
param_gird = {
    'polynomialfeatures__degree' : [1,2,3],
    'ridge__alpha' : [0.001,0.01,0.1,1,10,100]
}
grid = GridSearchCV(pipe,param_grid=param_gird, cv=5)
grid.fit(data, target)

In [25]:
grid.best_params_, grid.best_score_

({'polynomialfeatures__degree': 2, 'ridge__alpha': 100}, 0.5103790849022662)

모델 선택을 위한 그리드 서치

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

pipe = Pipeline([
 ('preprocession',StandardScaler()),
 ('polynomialfeatures',PolynomialFeatures()),
 ('regressior', LinearRegression())
])
param_grid = [
{
    'regressior':[Ridge()],
    'preprocession':[StandardScaler()],
    'polynomialfeatures':[PolynomialFeatures()],
    'polynomialfeatures__degree' : [1,2,3],
    'regressior__alpha' : [0.001,0.01,0.1,1,10,100]
},
 {
    'regressior':[LinearRegression()],
    'preprocession':[StandardScaler()],
    'polynomialfeatures':[PolynomialFeatures()],
    'polynomialfeatures__degree' : [1,2,3],
} ,
                {
    'regressior':[RandomForestRegressor()],
    'preprocession':[None],
    'polynomialfeatures':[None],
    'regressior__max_depth' : [2,3,5,7,10],
}
]
grid = GridSearchCV(pipe,param_grid=param_grid, cv=5)
grid.fit(data, target)
grid.best_score_

0.6155262448100018

In [27]:
grid.best_score_, grid.best_params_

(0.6155262448100018,
 {'polynomialfeatures': None,
  'preprocession': None,
  'regressior': RandomForestRegressor(max_depth=10),
  'regressior__max_depth': 10})