### **사용자 정의 함수 만들기**

In [2]:
import sklearn.base as base
import sklearn.preprocessing as skpre
import sklearn.pipeline as skpi
import sklearn.impute as skim
import sklearn.compose as skcom

### **Class**

In [21]:
class Myclass:
    """Minimal example class that keeps the outcome of its last computation."""

    def __init__(self) -> None:
        # Nothing has been computed yet, so the stored result starts at zero.
        self.result = 0

    def count(self, x: int, y: int):
        """Store ``x**2 + y**2`` in ``self.result``.

        The numeric value is not returned; the method always returns the
        string ``'okay'`` and the value is read back through ``self.result``.
        """
        x_squared = x ** 2
        y_squared = y ** 2
        self.result = x_squared + y_squared
        return 'okay'

In [27]:
# A subclass inherits from its parent class and can therefore reuse the
# parent's functionality.
class MysecondClass(Myclass):
    """Subclass of ``Myclass`` that adds an accumulating ``square`` method."""

    def __init__(self) -> None:
        # The parent initialiser already sets ``self.result`` to 0, so no
        # further setup is needed here.
        super().__init__()

    def square(self, x):
        """Add ``x`` to ``self.result`` and return the string ``'okay'``.

        Note: despite its name, this method accumulates ``x``; it does not
        square anything.
        """
        self.result += x
        return 'okay'

### **실습**

In [44]:
# Inherit from sklearn's BaseEstimator and TransformerMixin so that this class
# can be used wherever sklearn expects a transformer (pipelines, column
# transformers).
class Passthrough(base.BaseEstimator, base.TransformerMixin):
    """Transformer that returns its input values unchanged.

    While fitting it records the column names of the input so they can be
    reported later through ``get_feature_names_out``. The input is expected
    to expose ``.columns`` and ``.values`` (e.g. a pandas DataFrame).
    """

    def __init__(self) -> None:
        super().__init__()
        # Column names seen during ``fit``; ``None`` until the transformer
        # has been fitted.
        self.column_name = None

    def fit(self, x, y=None):
        """Remember the column names of ``x`` and return ``self``.

        A pipeline does not keep the column names on its own, so they are
        stored here at fit time.

        Parameters
        ----------
        x : object with a ``columns`` attribute
            Data whose column names are recorded.
        y : ignored
            Accepted only for API compatibility: ``fit_transform`` forwards
            the target to ``fit`` whenever one is supplied, which would
            otherwise raise ``TypeError``.
        """
        self.column_name = list(x.columns)
        return self

    def get_feature_names_out(self, x=None):
        """Return the column names stored by ``fit`` (``x`` is unused)."""
        return self.column_name

    def transform(self, x):
        """Return the underlying values of ``x`` without modifying them."""
        # Pass-through: no transformation, only unwrap to the raw values.
        return x.values

In [45]:
import pandas as pd
import numpy as np
# Dependent (target) variable.
target = 'TARGET'
# Column left out of the categorical feature selection below.
drop_cols = 'sessionID'

df = pd.read_csv('./train.csv')

# Numeric columns only, with the target column removed.
num_selector = df.select_dtypes(include=np.number).columns.difference([target])
# Object-dtype (categorical) columns only, with the excluded column removed.
cate_selector = df.select_dtypes(include='object').columns.difference([drop_cols])

In [46]:
# Create an instance of the custom Passthrough transformer.
passthrough = Passthrough()

In [47]:
# Bare expression: as the last line of a notebook cell it displays the instance.
passthrough

In [48]:
# Numeric branch: replace missing values with the median, then standardize.
num_pipe = skpi.Pipeline(steps=[
    ('simpleimputer', skim.SimpleImputer(strategy='median')),
    ('standardscaler', skpre.StandardScaler()),
])

# Categorical branch: replace missing values with the most frequent value,
# then one-hot encode. sparse_output=False yields a dense array instead of a
# sparse matrix.
cate_pipe = skpi.Pipeline(steps=[
    ('simpleimputer', skim.SimpleImputer(strategy='most_frequent')),
    (
        'onehotencoder',
        skpre.OneHotEncoder(
            handle_unknown='ignore',
            drop='first',
            sparse_output=False,
        ),
    ),
])

# The target column is forwarded untouched by the custom transformer.
target_pipe = Passthrough()

# Combine the three branches; columns not listed in any branch are dropped.
# The names are the ones make_column_transformer would generate automatically.
total_pipe = skcom.ColumnTransformer(
    transformers=[
        ('pipeline-1', num_pipe, num_selector),
        ('pipeline-2', cate_pipe, cate_selector),
        ('passthrough', target_pipe, [target]),
    ],
    remainder='drop',
)

In [49]:
# Bare expression: as the last line of a notebook cell it displays the transformer.
total_pipe

In [50]:
# Fit the combined column transformer on the first 500 rows of df only.
total_pipe.fit(df.head(500))

In [51]:
# Output feature names of the fitted transformer, prefixed with each branch's name.
total_pipe.get_feature_names_out()

array(['pipeline-1__bounced', 'pipeline-1__duration', 'pipeline-1__new',
       'pipeline-1__quality', 'pipeline-1__transaction',
       'pipeline-1__transaction_revenue', 'pipeline-2__OS_Android',
       'pipeline-2__OS_BlackBerry', 'pipeline-2__OS_Chrome OS',
       'pipeline-2__OS_Linux', 'pipeline-2__OS_Macintosh',
       'pipeline-2__OS_Tizen', 'pipeline-2__OS_Windows',
       'pipeline-2__OS_Xbox', 'pipeline-2__OS_iOS',
       'pipeline-2__browser_Android Browser',
       'pipeline-2__browser_Android Webview',
       'pipeline-2__browser_BlackBerry', 'pipeline-2__browser_Chrome',
       'pipeline-2__browser_Coc Coc', 'pipeline-2__browser_Edge',
       'pipeline-2__browser_Firefox',
       'pipeline-2__browser_Internet Explorer',
       'pipeline-2__browser_Opera', 'pipeline-2__browser_Opera Mini',
       'pipeline-2__browser_Safari',
       'pipeline-2__browser_Safari (in-app)',
       'pipeline-2__browser_Samsung Internet',
       'pipeline-2__browser_UC Browser', 'pipeline-2__c

In [53]:
# Names under which the fitted sub-transformers can be looked up.
total_pipe.named_transformers_.keys()

dict_keys(['pipeline-1', 'pipeline-2', 'passthrough', 'remainder'])