In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import category_encoders as ce

In [3]:
dr = 'data/'

# Read the three competition CSVs from the data directory in one pass.
train, test, sample = (
    pd.read_csv(dr + fname)
    for fname in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [4]:
# Separate the target column from the training features; the raw `train` and
# `test` frames are left untouched.
y_train = train['target'].copy()
X_train = train.drop(columns='target')
X_test = test.copy()

# EDA

In [5]:
# Number of distinct values per column (NaN is counted as its own value)
[(col, train[col].nunique(dropna=False)) for col in train.columns]

[('id', 600000),
 ('bin_0', 3),
 ('bin_1', 3),
 ('bin_2', 3),
 ('bin_3', 3),
 ('bin_4', 3),
 ('nom_0', 4),
 ('nom_1', 7),
 ('nom_2', 7),
 ('nom_3', 7),
 ('nom_4', 5),
 ('nom_5', 1221),
 ('nom_6', 1520),
 ('nom_7', 223),
 ('nom_8', 223),
 ('nom_9', 2219),
 ('ord_0', 4),
 ('ord_1', 6),
 ('ord_2', 7),
 ('ord_3', 16),
 ('ord_4', 27),
 ('ord_5', 191),
 ('day', 8),
 ('month', 13),
 ('target', 2)]

In [6]:
# Compare the share of each value (NaN included) per feature: train first, then test
for col in X_train.columns.drop('id'):
    train_ratio = X_train[col].value_counts(dropna=False, normalize=True)
    test_ratio = X_test[col].value_counts(dropna=False, normalize=True)
    print(col)
    print(train_ratio, '\n')
    print(test_ratio)
    print('ㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡ')

bin_0
0.0    0.880628
1.0    0.089548
NaN    0.029823
Name: bin_0, dtype: float64 

0.0    0.879443
1.0    0.090805
NaN    0.029753
Name: bin_0, dtype: float64
ㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡ
bin_1
0.0    0.790030
1.0    0.179965
NaN    0.030005
Name: bin_1, dtype: float64 

0.0    0.788382
1.0    0.181523
NaN    0.030095
Name: bin_1, dtype: float64
ㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡ
bin_2
0.0    0.699742
1.0    0.270375
NaN    0.029883
Name: bin_2, dtype: float64 

0.0    0.699995
1.0    0.270075
NaN    0.029930
Name: bin_2, dtype: float64
ㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡ
bin_3
F      0.610353
T      0.359623
NaN    0.030023
Name: bin_3, dtype: float64 

F      0.610230
T      0.359893
NaN    0.029878
Name: bin_3, dtype: float64
ㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡ
bin_4
N      0.520573
Y      0.449348
NaN    0.030078
Name: bin_4, dtype: float64 

N      0.521068
Y      0.449055
NaN    0.029878
Name: bin_4, dtype: float64
ㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡ
nom_0
Red      0.538810
Blue     0.343102
Green    0.087668
NaN      0.030420
Name: nom_0, dtype: float64 

Red      0.541130
Blue

# Preprocessing

In [7]:
# Remove the row identifier so it is not treated as a feature.
# errors='ignore' keeps the cell idempotent: re-running it after `id` is
# already gone no longer raises KeyError.
X_train = X_train.drop(columns=['id'], errors='ignore')
X_test = X_test.drop(columns=['id'], errors='ignore')

In [8]:
# Every feature column, in dataset order: bin_0..bin_4, nom_0..nom_9,
# ord_0..ord_5, then day and month.
cat_all = (
    [f'bin_{k}' for k in range(5)]
    + [f'nom_{k}' for k in range(10)]
    + [f'ord_{k}' for k in range(6)]
    + ['day', 'month']
)

In [9]:
# A submission needs a prediction for every test row, so missing values are
# replaced with the most frequent value (mode) instead of dropping rows.
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy="most_frequent")
imp.fit(X_train[cat_all])

# Fill values are learned from the training data only and then written back
# into both frames.
X_train[cat_all] = imp.transform(X_train[cat_all])
X_test[cat_all] = imp.transform(X_test[cat_all])

# ENCODING & MODELING

## OneHotEncoding

In [10]:
from sklearn.preprocessing import OneHotEncoder

# Sparse output is OneHotEncoder's default, so the `sparse=` keyword (renamed
# and later removed in newer scikit-learn releases) is not passed explicitly.
# Categories unseen during fit are encoded as all-zero rows.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train)

# transform() returns a SciPy sparse matrix; the plain DataFrame constructor
# does not turn that into one column per category, so build a sparse-backed
# frame explicitly.
ohe_train = pd.DataFrame.sparse.from_spmatrix(ohe.transform(X_train))
ohe_test = pd.DataFrame.sparse.from_spmatrix(ohe.transform(X_test))

In [11]:
# Re-fit the one-hot encoder on the training features and keep the raw
# encoder outputs for modelling.
ohe.fit(X_train)
train_ohe = ohe.transform(X_train)
test_ohe = ohe.transform(X_test)

In [12]:
# LR_sub(train_ohe, test_ohe)
# DT_sub(train_ohe, test_ohe)
# MLP_sub(train_ohe, test_ohe)

## LabelEncoding

In [13]:
from sklearn.preprocessing import LabelEncoder

# Train rows followed by test rows; fitting each LabelEncoder on this stacked
# frame means transform() on the test column never meets an unknown label.
all_le = pd.concat([X_train, X_test])
all_train_le = all_le  # same object as all_le: the new '<col>le' columns are added to it
X_test_le = X_test.copy()
encoder = LabelEncoder()

for i in cat_all:
    all_train_le[i+'le'] = encoder.fit_transform(all_le[i])
    X_test_le[i+'le'] = encoder.transform(X_test[i])

# Keep only the encoded columns. The training rows are the first len(X_train)
# rows of the stacked frame; selecting by name/length replaces the former
# hard-coded 600000 / 23 so the cell does not silently break on other data.
le_cols = [col + 'le' for col in cat_all]
X_train_le = all_train_le.iloc[:len(X_train)][le_cols]
X_test_le = X_test_le[le_cols]

In [14]:
# LR_sub(X_train_le,X_test_le)
# DT_sub(X_train_le,X_test_le)
# MLP_sub(X_train_le, X_test_le)

## OrdinalEncoding

In [15]:
from sklearn.preprocessing import OrdinalEncoder

# Integer codes learned from the training data; categories that only occur in
# the test data are encoded as -1.
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, dtype=int)
oe.fit(X_train[cat_all])

X_train_oe = X_train.copy()
X_train_oe[cat_all] = oe.transform(X_train[cat_all])

X_test_oe = X_test.copy()
X_test_oe[cat_all] = oe.transform(X_test[cat_all])

In [16]:
# LR_sub(X_train_oe,X_test_oe)
# DT_sub(X_train_oe,X_test_oe)
# MLP_sub(X_train_oe, X_test_oe)

## BinaryEncoding

In [17]:
X_train_be, X_test_be = X_train.copy(), X_test.copy()
encoder = ce.BinaryEncoder()
# Learn the category -> bit-pattern mapping from the training data only ...
X_train_be2 = encoder.fit_transform(X_train_be[cat_all])
# ... and reuse that same mapping for the test data. Re-fitting on the test
# frame (as before) would give the same category different codes in train and
# test, making the test features meaningless to a model trained on X_train_be2.
X_test_be2 = encoder.transform(X_test_be[cat_all])

In [18]:
# LR_sub(X_train_be2,X_test_be2)
# DT_sub(X_train_be2,X_test_be2)
# MLP_sub(X_train_be2, X_test_be2)

## FrequencyEncoding

In [19]:
X_train_fe = X_train.copy()
X_test_fe = X_test.copy()

# Replace each category by its relative frequency in the training data.
# Test values that never occur in train have no entry in the mapping and
# therefore come out as NaN.
n_train = len(X_train_fe)
for feat in cat_all:
    fe = X_train_fe.groupby(feat).size() / n_train
    X_train_fe[feat + '_fe'] = X_train_fe[feat].map(fe)
    X_test_fe[feat + '_fe'] = X_test_fe[feat].map(fe)

# Keep only the '<col>_fe' columns.
X_train_fe = X_train_fe.drop(columns=cat_all)
X_test_fe = X_test_fe.drop(columns=cat_all)

In [20]:
# After encoding, one missing value was found in the test data, so it is
# filled with the mode of the corresponding training column.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form may operate on a temporary copy and
# leave X_test_fe unchanged under pandas copy-on-write.
X_test_fe['nom_6_fe'] = X_test_fe['nom_6_fe'].fillna(X_train_fe['nom_6_fe'].mode()[0])

In [21]:
# LR_sub(X_train_fe,X_test_fe)
# DT_sub(X_train_fe,X_test_fe)
# MLP_sub(X_train_fe, X_test_fe)

## MeanEncoding

In [22]:
from category_encoders import TargetEncoder

# Encode every feature column. The column list is taken from cat_all instead
# of concatenating the full train and test frames just to read their column
# names, which copied all rows of both frames for no benefit.
te = TargetEncoder(cols=cat_all, min_samples_leaf=5, smoothing=50)

# The target statistics are learned from the training data only.
X_train_te = te.fit_transform(X_train, y_train)
X_test_te = te.transform(X_test)

In [23]:
# LR_sub(X_train_te,X_test_te)
# DT_sub(X_train_te,X_test_te)
# MLP_sub(X_train_te, X_test_te)

## Weight of Evidence Encoding

In [24]:
# Weight-of-Evidence encoding: fit on the training features and target, then
# apply the fitted encoder to the test features.
# This rebinds the notebook-level name `encoder` used by other encoding cells.
# NOTE(review): no `cols` is passed, so which columns get encoded is left to
# the encoder's default column selection — confirm all of cat_all are covered.
encoder = ce.WOEEncoder()
X_train_we = encoder.fit_transform(X_train, y_train)
X_test_we = encoder.transform(X_test)

In [25]:
# LR_sub(X_train_we,X_test_we)
# DT_sub(X_train_we,X_test_we)
# MLP_sub(X_train_we, X_test_we)

## Probability Ratio Encoding

In [26]:
# Probability-ratio encoding: each category is replaced by
#   mean(target) / (1 - mean(target))
# computed on the training data.
X_y_train = pd.concat([X_train, y_train], axis=1)
X_train_pre, X_test_pre = X_train.copy(), X_test.copy()

# Ratio over the whole training set, used for test categories that never
# occur in train (they have no entry in the per-category mapping).
overall_good = X_y_train['target'].mean()
overall_pr = overall_good / max(1 - overall_good, 0.000001)

for feat in cat_all:
    pr = X_y_train.groupby(feat)['target'].mean()
    pr = pd.DataFrame(pr)
    pr = pr.rename(columns={'target': 'good'})
    pr['bad'] = 1 - pr['good']
    # Guard against division by zero when every row of a category has target == 1.
    pr['bad'] = np.where(pr['bad'] == 0, 0.000001, pr['bad'])
    pr['PR'] = pr['good'] / pr['bad']
    X_train_pre.loc[:, 'pr_' + feat] = X_train_pre[feat].map(pr['PR'])
    # Map the TEST column's own categories. The previous code mapped
    # X_train_pre[feat] here, which filled the test frame with the encodings
    # of whichever training rows happened to share the same index.
    X_test_pre.loc[:, 'pr_' + feat] = X_test_pre[feat].map(pr['PR']).fillna(overall_pr)

X_train_pre.drop(columns=cat_all, inplace=True)
X_test_pre.drop(columns=cat_all, inplace=True)

In [100]:
# LR_sub(X_train_pre,X_test_pre)
# DT_sub(X_train_pre,X_test_pre)
# MLP_sub(X_train_pre, X_test_pre)

## Hashing Encoding

In [None]:
import category_encoders as ce

# Hashing encoder configured with n_components=46; it is fitted on the
# training features and the fitted encoder is applied to both frames.
encoder = ce.HashingEncoder(n_components=46)
encoder.fit(X_train)

X_train_he = encoder.transform(X_train)
X_test_he = encoder.transform(X_test)

In [None]:
# LR_sub(X_train_he,X_test_he)
# DT_sub(X_train_he,X_test_he)
# MLP_sub(X_train_he, X_test_he)

## Leave One Out Encoding

In [59]:
from category_encoders import LeaveOneOutEncoder 

# Work on copies so the shared X_train / X_test frames stay unchanged.
X_train_loe, X_test_loe = X_train.copy(), X_test.copy()
loe = LeaveOneOutEncoder()

# Fit on the training features together with the target and overwrite the
# cat_all columns of the copy with the encoded values; the test copy is
# encoded with the already-fitted encoder (no target available).
# NOTE(review): fit_transform(X, y) on train may not equal fit(X, y) followed
# by transform(X) for this encoder — keep the call as is unless confirmed.
X_train_loe[cat_all]=loe.fit_transform(X_train_loe[cat_all],y_train)
X_test_loe[cat_all]=loe.transform(X_test[cat_all])

In [12]:
# LR_sub(X_train_loe,X_test_loe)
# DT_sub(X_train_loe,X_test_loe)
# MLP_sub(X_train_loe, X_test_loe)

## James-Stein Encoding

In [29]:
import category_encoders as ce
# James-Stein encoding: fit on the training features and target, then apply
# the fitted encoder to the test features.
# This rebinds the notebook-level name `encoder` used by other encoding cells.
# NOTE(review): no `cols` is passed — confirm the encoder's default column
# selection covers every column in cat_all.
encoder = ce.JamesSteinEncoder()
X_train_jse = encoder.fit_transform(X_train, y_train)
X_test_jse = encoder.transform(X_test)

In [None]:
# LR_sub(X_train_jse,X_test_jse)
# DT_sub(X_train_jse,X_test_jse)
# MLP_sub(X_train_jse, X_test_jse)

## M-estimator Encoding

In [155]:
# M-estimate encoding with the encoder's default settings: fit on the training
# features and target, then apply the fitted encoder to the test features.
# This rebinds the notebook-level name `encoder` used by other encoding cells.
encoder = ce.MEstimateEncoder()
X_train_mee = encoder.fit_transform(X_train, y_train)
X_test_mee = encoder.transform(X_test)

In [156]:
# LR_sub(X_train_mee,X_test_mee)
# DT_sub(X_train_mee,X_test_mee)
# MLP_sub(X_train_mee, X_test_mee)

## CatBoost Encoding

In [112]:
# CatBoost encoding with the encoder's default settings: fit on the training
# features and target, then apply the fitted encoder to the test features.
# This rebinds the notebook-level name `encoder` used by other encoding cells.
# NOTE(review): presumably the training-time encoding depends on row order and
# differs from a plain fit(...).transform(...) — confirm before refactoring.
encoder = ce.CatBoostEncoder()
X_train_cbe = encoder.fit_transform(X_train, y_train)
X_test_cbe = encoder.transform(X_test)

In [None]:
# LR_sub(X_train_cbe,X_test_cbe)
# DT_sub(X_train_cbe,X_test_cbe)
# MLP_sub(X_train_cbe, X_test_cbe)


## BaseN Encoding

In [111]:
# Base-5 encoding, fitted on the training features and applied to both frames.
# Results are stored as X_train_bn / X_test_bn: these are the names the
# submission calls for this section use, and the former X_train_be / X_test_be
# silently overwrote the frames created in the BinaryEncoding section.
bn = ce.BaseNEncoder(return_df=True, base=5)
X_train_bn = bn.fit_transform(X_train)
X_test_bn = bn.transform(X_test)

In [None]:
# LR_sub(X_train_bn,X_test_bn)
# DT_sub(X_train_bn,X_test_bn)
# MLP_sub(X_train_bn, X_test_bn)

##  Submission def 

In [10]:
def LR_sub(X_train, X_test, y=None, id_offset=600000):
    """Fit a logistic regression, write a submission CSV and return the scores.

    Parameters
    ----------
    X_train, X_test
        Encoded training and test feature matrices accepted by scikit-learn.
    y : array-like, optional
        Training target. When omitted, the notebook-level ``y_train`` is used.
    id_offset : int, default 600000
        ``id`` written for the first test row; the following rows count up
        from it. Assumes test ids are consecutive — confirm against test.csv.

    Returns
    -------
    The second column of ``predict_proba`` for every row of ``X_test``.

    Side effects
    ------------
    Writes ``submission_LR<MMDD>_<HHMM>.csv`` to the working directory; two
    runs within the same minute overwrite each other.
    """
    if y is None:
        y = y_train  # fall back to the notebook-level target

    LR = LogisticRegression()
    LR.fit(X_train, y)
    predict = LR.predict_proba(X_test)[:, 1]

    # Build the two submission columns directly; the former
    # `submission.set_index('id')` discarded its result and had no effect.
    submission = pd.DataFrame({
        'id': range(id_offset, id_offset + len(predict)),
        'target': predict,
    })

    t = pd.Timestamp.now()
    fname = f"submission_LR{t.month:02}{t.day:02}_{t.hour:02}{t.minute:02}.csv"
    submission.to_csv(fname, index=False)
    return predict

In [11]:
def DT_sub(X_train, X_test, y=None, id_offset=600000):
    """Fit a depth-12 decision tree, write a submission CSV and return the scores.

    Parameters
    ----------
    X_train, X_test
        Encoded training and test feature matrices accepted by scikit-learn.
    y : array-like, optional
        Training target. When omitted, the notebook-level ``y_train`` is used.
    id_offset : int, default 600000
        ``id`` written for the first test row; the following rows count up
        from it. Assumes test ids are consecutive — confirm against test.csv.

    Returns
    -------
    The second column of ``predict_proba`` for every row of ``X_test``.

    Side effects
    ------------
    Writes ``submission_DT<MMDD>_<HHMM>.csv`` to the working directory; two
    runs within the same minute overwrite each other.
    """
    if y is None:
        y = y_train  # fall back to the notebook-level target

    DT = DecisionTreeClassifier(max_depth=12, random_state=0)
    DT.fit(X_train, y)
    predict = DT.predict_proba(X_test)[:, 1]

    # Build the two submission columns directly; the former
    # `submission.set_index('id')` discarded its result and had no effect.
    submission = pd.DataFrame({
        'id': range(id_offset, id_offset + len(predict)),
        'target': predict,
    })

    t = pd.Timestamp.now()
    fname = f"submission_DT{t.month:02}{t.day:02}_{t.hour:02}{t.minute:02}.csv"
    submission.to_csv(fname, index=False)
    return predict

In [13]:
def MLP_sub(X_train, X_test, y=None, id_offset=600000):
    """Fit a one-hidden-layer MLP, write a submission CSV and return the scores.

    Parameters
    ----------
    X_train, X_test
        Encoded training and test feature matrices accepted by scikit-learn.
    y : array-like, optional
        Training target. When omitted, the notebook-level ``y_train`` is used.
    id_offset : int, default 600000
        ``id`` written for the first test row; the following rows count up
        from it. Assumes test ids are consecutive — confirm against test.csv.

    Returns
    -------
    The second column of ``predict_proba`` for every row of ``X_test``.

    Side effects
    ------------
    Writes ``submission_MLP<MMDD>_<HHMM>.csv`` to the working directory; two
    runs within the same minute overwrite each other.
    """
    if y is None:
        y = y_train  # fall back to the notebook-level target

    # Fixed seed (same value DT_sub uses) so repeated runs on the same
    # encoding give the same submission and encoders can be compared fairly.
    MLP = MLPClassifier(hidden_layer_sizes=(50,), random_state=0)
    MLP.fit(X_train, y)
    predict = MLP.predict_proba(X_test)[:, 1]

    # Build the two submission columns directly; the former
    # `submission.set_index('id')` discarded its result and had no effect.
    submission = pd.DataFrame({
        'id': range(id_offset, id_offset + len(predict)),
        'target': predict,
    })

    t = pd.Timestamp.now()
    fname = f"submission_MLP{t.month:02}{t.day:02}_{t.hour:02}{t.minute:02}.csv"
    submission.to_csv(fname, index=False)
    return predict