## Import

In [1]:
# for read data
import os
import numpy as np
import pandas as pd
import warnings; warnings.filterwarnings("ignore")

# for feature transform
# Scaler
from sklearn.preprocessing import MinMaxScaler, RobustScaler, Normalizer, StandardScaler, MaxAbsScaler
# Transformer
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, FunctionTransformer
# Encoder
from sklearn.preprocessing import LabelEncoder

# Evaluate Transformation
from lightgbm import LGBMClassifier   
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, make_scorer

## Read Data

In [2]:
path = (os.path.abspath("./input"))

# Word2Vec, BOW는 Transfomation하지 않아 1-1에서 만든 Feature만을 Transformation한다.
feature_train = pd.read_csv(path +'/feature_train.csv', encoding='cp949')
feature_test = pd.read_csv(path +'/feature_test.csv', encoding='cp949')
y_train = pd.read_csv(path +'/y_train.csv', encoding='cp949').group

## Classify Features
feature 데이터가 Tail data로 Boosting모델을 사용할 것이다. Boosting모델 중 속도가 빠른 `LGBM`을 기준모델로 하여 <br>
여러 조합의 Feature Transformation의 Log Loss값을 보고 최적의 Transformation 조합을 찾는다.

In [3]:
classify_list = feature_train.columns.tolist()

In [4]:
# custid 제외
classify_list.remove('custid')

In [5]:
# Encoding 이전의 Category Feature
cat_uncode = feature_train.select_dtypes(include='O').columns.tolist()
cat_store = [i for i in cat_uncode if ('매장' in i) or ('지점' in i)]
cat_brd = [i for i in cat_uncode if '브랜드' in i]
cat_corner = [i for i in cat_uncode if '코너' in i]
cat_pc = [i for i in cat_uncode if '상품군' in i]
cat_part = [i for i in cat_uncode if '파트' in i]
cat_team = [i for i in cat_uncode if '팀' in i]
cat_buyer = [i for i in cat_uncode if '바이어' in i]
classify_list = [i for i in classify_list if i not in cat_uncode]

# goodcd가 numeric으로 작성되어 자동으로 Encoding된 Category Feature와 여부를 0/1로 나타낸 Category Feature
cat_encode = [i for i in classify_list if '주상품' in i] + ['1일1개']
classify_list = [i for i in classify_list if i not in cat_encode]

cat = cat_uncode + cat_encode

In [6]:
# 금액 Feature
num_amount = [i for i in classify_list if (i[-1]=="액") or (i[-3]=="액")]
classify_list = [i for i in classify_list if i not in num_amount]

# 비율 Feature
num_percent = [i for i in classify_list if (i[-1]=="율") or (i[-3]=="율") or (i[-1]=="률") or (i[-3]=="률")]
classify_list = [i for i in classify_list if i not in num_percent]

# 개수/건수 Feature
num_count = [i for i in classify_list if (i[-1]=="수") or (i[-3]=="수")]
classify_list = [i for i in classify_list if i not in num_count]

# 평균개월 등의 산출 Feature
num_calculate = classify_list

num = num_amount+num_percent+num_count+num_calculate

- **[Imputation]**

In [7]:
print('train NAN:', feature_train.isna().sum().sum(), ',', 'test NAN:',feature_test.isna().sum().sum())

train NAN: 0 , test NAN: 0


- **[Outlier]**

In [8]:
feature_train[num] = feature_train[num].apply(lambda x: x.clip(x.quantile(.1), x.quantile(.1)), axis=0)
feature_test[num] = feature_test[num].apply(lambda x: x.clip(x.quantile(.1), x.quantile(.1)), axis=0)

In [9]:
# scaler와 transformer는 LGBM을 이용해 최적의 조합을 찾는다.

- **[Categorical Feature Encoding]**<br>
  LGBM과 DNN에는 Encoding된 Feature를, Catboost에는 Unencoding된 Feature를 넣을 것이다. 분리해서 저장해야 한다.<br>
  EDA에서 저장한 Frequency Encoding을 진행해보았으나 Label Encoding이 더 성능이 좋았다.

In [9]:
# feature train과 feature test를 합쳐서 Encoding 후 custid를 기준으로 분리한다.
train_ID, test_ID = feature_train.custid, feature_test.custid
feature = pd.concat([feature_train, feature_test], ignore_index=True)

In [10]:
# Label Encoding
for i in cat_uncode:
    label = LabelEncoder()
    feature[i] = label.fit_transform(feature[i].astype(str))

In [11]:
feature_train = feature.query('custid in @train_ID').reset_index(drop=True)
feature_test = feature.query('custid in @test_ID').reset_index(drop=True)

- **[Choose Best Feature Transformation Set for LGBM]**

In [12]:
# 모델 사용을 위해 custid 제거 후 추후 concat한다.
del feature_train['custid'], feature_test['custid']

In [14]:
X_train, X_dev, y__train, y_dev = train_test_split(feature_train, y_train, test_size=0.3, random_state=0)

In [15]:
# Scaler만 사용할 때
set_score = dict()

def choose_best(scaler):
    train_x, dev_x, train_y, dev_y = X_train.copy(), X_dev.copy(), y__train.copy(), y_dev.copy()
    train_x[num] = scaler.fit_transform(train_x[num])
    dev_x[num] = scaler.transform(dev_x[num])
    
    model = LGBMClassifier(objective='multiclass', metrics='multi_logloss', num_gpu=1, random_state=0)
    model.fit(train_x, train_y)
    set_score[f'{scaler}'] = log_loss(dev_y, model.predict_proba(dev_x))    

In [16]:
for s in [MinMaxScaler(), RobustScaler(), Normalizer(), StandardScaler(), MaxAbsScaler()]:
    choose_best(s)
    
sorted([(logloss, sets) for sets, logloss in set_score.items()])[0]

(1.6479987931433973, 'MaxAbsScaler()')

In [17]:
# Transformer만 사용할 때
set_score = dict()

def choose_best(transformer):
    train_x, dev_x, train_y, dev_y = X_train.copy(), X_dev.copy(), y__train.copy(), y_dev.copy()
    train_x[num] = transformer.fit_transform(train_x[num])
    dev_x[num] = transformer.transform(dev_x[num])
    
    model = LGBMClassifier(objective='multiclass', metrics='multi_logloss', num_gpu=1, random_state=0)
    model.fit(train_x, train_y)
    set_score[f'{transformer}'] = log_loss(dev_y, model.predict_proba(dev_x))    

In [18]:
for t in [FunctionTransformer(np.log1p), PowerTransformer(), QuantileTransformer(output_distribution='normal')]:
    choose_best(t)
    
sorted([(logloss, sets) for sets, logloss in set_score.items()])[0]

(1.6479987931433973, "FunctionTransformer(func=<ufunc 'log1p'>)")

In [19]:
# Scaler와 Transformer 모두 사용할 때
set_score = dict()

def choose_best(scaler, transformer):
    train_x, dev_x, train_y, dev_y = X_train.copy(), X_dev.copy(), y__train.copy(), y_dev.copy()
    train_x[num] = scaler.fit_transform(train_x[num])
    dev_x[num] = scaler.transform(dev_x[num])
    
    train_x[num] = transformer.fit_transform(train_x[num])
    dev_x[num] = transformer.transform(dev_x[num])
    
    model = LGBMClassifier(objective='multiclass', metrics='multi_logloss', num_gpu=1, random_state=0)
    model.fit(train_x, train_y)
    set_score[f'({scaler}, {transformer})'] = log_loss(dev_y, model.predict_proba(dev_x))    

In [20]:
for s in [MinMaxScaler(), RobustScaler(), Normalizer(), StandardScaler(), MaxAbsScaler()]:
    for t in [FunctionTransformer(np.log1p), PowerTransformer(), QuantileTransformer(output_distribution='normal')]:
        choose_best(s, t)

In [21]:
# log transform만 한다.
sorted([(logloss, sets) for sets, logloss in set_score.items()])[0]

(1.6479987931433973,
 "(MaxAbsScaler(), FunctionTransformer(func=<ufunc 'log1p'>))")

- **[Scaling]**

In [22]:
# scaler = MaxAbsScaler()
# feature_train[num] = scaler.fit_transform(feature_train[num])
# feature_test[num] = scaler.transform(feature_test[num])

- **[Transform]**

In [13]:
transformer =  FunctionTransformer(np.log1p)
feature_train[num] = transformer.fit_transform(feature_train[num])
feature_test[num] = transformer.transform(feature_test[num])

### Save data

In [14]:
feature_train =  pd.concat([train_ID, feature_train], axis=1)
feature_test =  pd.concat([test_ID, feature_test], axis=1)

In [15]:
feature_train.to_csv(path+'/feature_train_transformation.csv', index=False, encoding='cp949')
feature_test.to_csv(path+'/feature_test_transformation.csv', index=False, encoding='cp949')