In [48]:
import sklearn.preprocessing as skpre
import sklearn.impute as skim
import sklearn.pipeline as skpi
import sklearn.compose as skcom
import pandas as pd

In [49]:
df = pd.read_csv('./train.csv')
df.head()

Unnamed: 0,sessionID,userID,TARGET,browser,OS,device,new,quality,duration,bounced,transaction,transaction_revenue,continent,subcontinent,country,traffic_source,traffic_medium,keyword,referral_path
0,SESSION_000000,USER_000000,17.0,Chrome,Macintosh,desktop,0,45.0,839.0,0,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,
1,SESSION_000001,USER_000001,3.0,Chrome,Windows,desktop,1,1.0,39.0,0,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,
2,SESSION_000002,USER_000002,1.0,Samsung Internet,Android,mobile,1,1.0,0.0,1,0.0,0.0,Asia,Southeast Asia,Malaysia,(direct),(none),,
3,SESSION_000003,USER_000003,1.0,Chrome,Macintosh,desktop,1,1.0,0.0,1,0.0,0.0,Americas,Northern America,United States,Partners,affiliate,,
4,SESSION_000004,USER_000004,1.0,Chrome,iOS,mobile,0,1.0,0.0,1,0.0,0.0,Americas,Northern America,United States,groups.google.com,referral,,Category6_Path_0000


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252289 entries, 0 to 252288
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   sessionID            252289 non-null  object 
 1   userID               252289 non-null  object 
 2   TARGET               252289 non-null  float64
 3   browser              252289 non-null  object 
 4   OS                   252289 non-null  object 
 5   device               252289 non-null  object 
 6   new                  252289 non-null  int64  
 7   quality              252289 non-null  float64
 8   duration             252289 non-null  float64
 9   bounced              252289 non-null  int64  
 10  transaction          252289 non-null  float64
 11  transaction_revenue  252289 non-null  float64
 12  continent            252289 non-null  object 
 13  subcontinent         252289 non-null  object 
 14  country              252289 non-null  object 
 15  traffic_source   

## 1. 컬럼분류

In [51]:
import numpy as np

In [52]:
df.select_dtypes(np.number).columns

Index(['TARGET', 'new', 'quality', 'duration', 'bounced', 'transaction',
       'transaction_revenue'],
      dtype='object')

In [53]:
# 종속변수
target = 'TARGET'
# 연속형 변수만
num_selector = df.select_dtypes(np.number).columns.difference([target])
# 범주형 변수만
cate_selector = df.select_dtypes('object').columns.difference(['sessionID'])

In [54]:
cate_selector

Index(['OS', 'browser', 'continent', 'country', 'device', 'keyword',
       'referral_path', 'subcontinent', 'traffic_medium', 'traffic_source',
       'userID'],
      dtype='object')

## 2. 전처리 과정 세분화

### 1) 연속형 변수

In [55]:
num_pipe = skpi.make_pipeline(
    skim.SimpleImputer(strategy='median'), #nan값은 중앙값으로
    skpre.StandardScaler()                 #scaling
    )

In [56]:
num_pipe.fit(df[num_selector])

### 2) 범주형 변수

In [57]:
cate_pipe = skpi.make_pipeline(
    skim.SimpleImputer(strategy='most_frequent'),
    skpre.OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)   #sparse_output=False : 희소행렬은 제외
)

In [58]:
cate_pipe.fit(df[cate_selector])

### 3) 합치기

In [59]:
total_pipe = skcom.make_column_transformer(
    (num_pipe, num_selector),
    (cate_pipe, cate_selector),
    remainder='passthrough'
)

In [60]:
total_pipe

In [62]:
total_pipe.fit(df.head(5000)) #메모리 문제 상 5000개만

In [63]:
total_pipe.transform(df.head(5000))

array([[-1.0610609502455806, 2.361818884833031, -1.756407504043819, ...,
        0.0, 'SESSION_000000', 17.0],
       [-1.0610609502455806, -0.21106736688091665, 0.5693439578786109,
        ..., 0.0, 'SESSION_000001', 3.0],
       [0.9424529286169205, -0.3364955716519716, 0.5693439578786109, ...,
        0.0, 'SESSION_000002', 1.0],
       ...,
       [0.9424529286169205, -0.3364955716519716, -1.756407504043819, ...,
        0.0, 'SESSION_004997', 1.0],
       [0.9424529286169205, -0.3364955716519716, 0.5693439578786109, ...,
        0.0, 'SESSION_004998', 1.0],
       [0.9424529286169205, -0.3364955716519716, 0.5693439578786109, ...,
        1.0, 'SESSION_004999', 1.0]], dtype=object)

> 컬럼 이름을 모른다...

### 4) 컬럼 이름

In [64]:
total_pipe.get_feature_names_out()

array(['pipeline-1__bounced', 'pipeline-1__duration', 'pipeline-1__new',
       ..., 'pipeline-2__userID_USER_004500', 'remainder__sessionID',
       'remainder__TARGET'], dtype=object)

In [65]:
def columns_name(x):
    x=x.replace('pipeline-1__','')
    x=x.replace('pipeline-2__','')
    x=x.replace('remainder__','')
    return x

In [66]:
col_names = list(map(columns_name, total_pipe.get_feature_names_out()))

In [68]:
data = pd.DataFrame(total_pipe.transform(df.head(5000)), columns=col_names)
data.head()

Unnamed: 0,bounced,duration,new,quality,transaction,transaction_revenue,OS_Android,OS_BlackBerry,OS_Chrome OS,OS_Linux,...,userID_USER_004493,userID_USER_004494,userID_USER_004495,userID_USER_004496,userID_USER_004497,userID_USER_004498,userID_USER_004499,userID_USER_004500,remainder__sessionID,remainder__TARGET
0,-1.061061,2.361819,-1.756408,4.688281,-0.070888,-0.029319,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SESSION_000000,17.0
1,-1.061061,-0.211067,0.569344,-0.228519,-0.070888,-0.029319,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SESSION_000001,3.0
2,0.942453,-0.336496,0.569344,-0.228519,-0.070888,-0.029319,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SESSION_000002,1.0
3,0.942453,-0.336496,0.569344,-0.228519,-0.070888,-0.029319,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SESSION_000003,1.0
4,0.942453,-0.336496,-1.756408,-0.228519,-0.070888,-0.029319,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SESSION_000004,1.0


> 파이프라인 순서대로 컬럼생성