In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler

In [2]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from collections import Counter

In [3]:
%matplotlib inline

pd.set_option('display.max_columns', 50)

In [4]:
df = pd.read_csv('./data/creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# 전처리

- outlier 제거 및 oversampling, undersampling 이용
- class = 1 에서 강한 상관관계를 갖는 변수만 사용해 분석 진행(V1 ~ V12, V14, V16 ~ V22) 
- 나머지는 그대로

In [5]:
df_v = df.drop(['Time', 'V13', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'], axis=1)
df_v.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V14,V15,V16,V17,V18,V19,V20,V21,V22,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,0


In [6]:
def drop_outlier(data):
    for column in data.columns[:-1]: #target column 제외
        q25, q75 = np.quantile(data[column], 0.25), np.quantile(data[column], 0.75)
        iqr = q75 - q25
        lower, upper = q25 - iqr*1.5, q75 + iqr*1.5
        
        df_no = data[data[column] > lower]
        df_no = df_no[df_no[column] < upper]
        df_no.reset_index(drop=True)
        
    return df_no

In [7]:
df_wo = drop_outlier(df_v)
df_wo.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V14,V15,V16,V17,V18,V19,V20,V21,V22,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,0


In [8]:
df_wo['Class'].value_counts()
# outlier 제거 시 class=1이 이전에 비해 덜 줄어듦

0    283026
1       464
Name: Class, dtype: int64

In [9]:
df_wo.describe()
# class=1일 때 상관관계가 낮은 변수들이 outlier의 대부분을 차지하고 있었음

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V14,V15,V16,V17,V18,V19,V20,V21,V22,Class
count,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0,283490.0
mean,0.022847,0.002753,0.010282,-0.002163,0.00281,-0.006775,0.011929,0.032275,0.000628,0.001577,0.000756,-0.002151,-0.003406,0.000659,-0.00053,-0.001475,-0.000192,0.000946,0.000783,-0.009914,0.00592,0.001637
std,1.887156,1.54749,1.486556,1.406641,1.336316,1.307465,1.139677,0.959358,1.093113,1.07121,1.019281,0.993357,0.95067,0.913537,0.871147,0.84126,0.83579,0.811539,0.692761,0.538849,0.693767,0.040424
min,-56.40751,-72.715728,-33.680984,-5.683171,-42.147898,-26.160506,-26.548144,-34.535,-9.283925,-18.271168,-4.797473,-18.683715,-19.214325,-4.391307,-14.129855,-25.162799,-9.498746,-4.932733,-28.009635,-11.263235,-2.14797,0.0
25%,-0.911114,-0.597661,-0.881447,-0.847413,-0.688505,-0.769414,-0.548962,-0.205725,-0.641554,-0.534484,-0.761254,-0.406558,-0.426436,-0.581822,-0.467345,-0.484764,-0.498565,-0.45457,-0.210548,-0.228147,-0.537982,0.0
50%,0.026714,0.063979,0.184485,-0.019039,-0.053302,-0.277172,0.042061,0.023922,-0.05135,-0.093334,-0.031315,0.138486,0.048925,0.048681,0.06597,-0.067323,-0.00445,0.004742,-0.062112,-0.02998,0.008451,0.0
75%,1.317413,0.798279,1.030384,0.741952,0.611356,0.389629,0.571003,0.32871,0.596322,0.451001,0.740057,0.615702,0.490579,0.648963,0.522189,0.397008,0.499575,0.459213,0.132641,0.184563,0.528011,0.0
max,2.45493,18.902453,4.187811,16.491217,34.801666,23.917837,44.054461,18.748872,10.392889,15.331742,12.018913,4.318071,7.692209,5.784514,8.28989,9.253526,4.295648,5.572113,23.643417,10.378272,2.133863,1.0


In [10]:
df_shuffled=df_wo.sample(frac=1).reset_index(drop=True)
df_shuffled.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V14,V15,V16,V17,V18,V19,V20,V21,V22,Class
0,2.091799,-0.767442,-1.195123,-0.754119,-0.632621,-0.325355,-1.091859,0.125028,-0.120063,0.253358,1.314819,-0.403043,-1.797417,-0.21456,1.848811,1.202086,0.054372,0.42403,0.028609,0.286668,0.820654,0
1,2.080972,-0.978301,-0.858798,-0.504128,-0.872309,-0.393714,-0.788675,-0.004609,-0.241086,1.045959,0.097,0.016773,0.291787,-0.397538,-1.007724,-0.472184,1.717649,-0.406685,-0.604898,-0.337934,-0.477585,0
2,-0.835629,1.324078,0.154558,1.130829,0.057534,-1.187229,0.645624,0.30799,-0.686062,-0.680586,-0.522373,-0.78238,-0.154355,1.08611,0.080934,0.97388,0.644019,0.30719,0.121676,0.064668,0.039153,0
3,-1.687972,1.579214,1.299132,-0.784778,1.265735,0.652041,1.293312,-3.084464,1.496694,2.375033,-0.602206,-0.953167,-1.357924,1.314201,-0.516707,-0.991221,-0.623162,0.070026,0.143391,1.147074,-0.831285,0
4,0.864773,-1.282571,0.4139,-0.446349,-1.398205,-0.847529,-0.224845,-0.296435,-0.88832,0.445532,-0.00237,0.059851,-0.387702,0.817923,1.298895,0.114378,-1.50974,0.426417,0.708836,0.171504,-0.188199,0


In [11]:
df_x = df_shuffled[df_shuffled.columns[:-1]]
df_y = df_shuffled[df_shuffled.columns[-1]]

scaler = StandardScaler()
df_x_scale = scaler.fit_transform(df_x)

In [12]:
over = SMOTE(sampling_strategy=0.2)
under = RandomUnderSampler(sampling_strategy=0.8)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

re_x, re_y = pipeline.fit_resample(df_x_scale, df_y)

In [13]:
re_x = pd.DataFrame(re_x)
re_x.columns = df_x.columns

re_x.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V14,V15,V16,V17,V18,V19,V20,V21,V22
count,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0,127361.0
mean,-0.925199,0.909554,-1.919317,1.394178,-0.910902,-0.546003,-1.815768,0.758355,-1.011036,-2.23305,1.679687,-2.770029,-3.361907,0.013651,-2.074547,-3.484204,-1.151004,0.387773,0.208801,0.559797,0.006697
std,2.397269,1.92581,3.635731,2.151561,2.672181,1.201486,3.88323,2.687728,1.881825,3.716696,2.680501,4.472376,4.853271,0.976149,3.856626,6.815283,2.722489,1.505143,1.090204,1.699027,0.998959
min,-19.914081,-28.848048,-19.830442,-3.849,-31.542536,-16.767239,-21.400008,-22.906509,-8.493697,-17.058075,-4.595095,-18.80653,-20.207798,-4.196091,-16.219245,-29.90913,-11.364785,-5.639772,-40.433107,-20.884032,-3.104636
25%,-1.190608,-0.126045,-2.922344,-0.162681,-1.124512,-1.120571,-2.103194,-0.198585,-1.820911,-3.73179,-0.290239,-4.670872,-6.594597,-0.569738,-3.204077,-5.538532,-1.669245,-0.496577,-0.261107,-0.283766,-0.739485
50%,-0.325352,0.529967,-0.736136,0.780336,-0.268788,-0.497999,-0.453257,0.137428,-0.566307,-0.621865,0.889423,-0.583311,-0.751652,0.044509,-0.521778,-0.504994,-0.332486,0.257802,0.036656,0.26494,0.026279
75%,0.569407,1.5074,0.30363,2.723456,0.36917,0.017596,0.274831,0.839976,0.18675,0.060016,3.129082,0.281177,0.180219,0.699763,0.402113,0.343083,0.447021,1.135567,0.537345,1.091344,0.747544
max,1.280426,10.258027,2.810211,9.010788,24.626451,17.236496,32.347337,19.509535,9.434859,12.73598,11.790834,4.143174,6.989771,6.203381,9.516686,8.489777,5.077881,6.441347,15.405576,19.217138,3.06681


In [14]:
re_y = pd.DataFrame(re_y)

re_y.value_counts()

Class
0        70756
1        56605
dtype: int64

In [15]:
x_train, x_test, y_train, y_test = train_test_split(re_x, re_y, test_size=0.1, random_state=42)

# 모델 학습

- logstic regression with penalty
- decision tree
- random forest
- adaboost
- lightgbmboost
- catboost

이후 잘 나온 모델 2개로 stacking  
최종 모델은 xgboost로 진행

## logstic regression

In [16]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(random_state=42)

param_grid = [{'penalty' : ['none', 'l2']},
              {'penalty' : ['elasticnet'], 'l1_ratio' : [0.5,0.25,0.75], 'solver' : ['saga']},
              {'penalty' : ['l1'], 'solver' : ['saga']}]

cross_validation = StratifiedKFold(n_splits=5)

log_grid = GridSearchCV(log_reg, param_grid, cv=cross_validation, scoring='accuracy')
log_grid.fit(x_train, y_train)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(random_state=42),
             param_grid=[{'penalty': ['none', 'l2']},
                         {'l1_ratio': [0.5, 0.25, 0.75],
                          'penalty': ['elasticnet'], 'solver': ['saga']},
                         {'penalty': ['l1'], 'solver': ['saga']}],
             scoring='accuracy')

In [17]:
log_result = pd.DataFrame(log_grid.cv_results_)
log_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,param_l1_ratio,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.232972,0.008475,0.004387,0.00049,none,,,{'penalty': 'none'},0.945692,0.945649,0.945125,0.942639,0.944774,0.944776,0.001122,1
1,0.211236,0.008354,0.004186,0.000398,l2,,,{'penalty': 'l2'},0.945649,0.945649,0.945125,0.942639,0.944774,0.944767,0.001115,2
2,2.610835,0.357781,0.003968,1.2e-05,elasticnet,0.5,saga,"{'l1_ratio': 0.5, 'penalty': 'elasticnet', 'so...",0.945649,0.945649,0.945125,0.942639,0.944774,0.944767,0.001115,2
3,2.617815,0.360353,0.004168,0.000399,elasticnet,0.25,saga,"{'l1_ratio': 0.25, 'penalty': 'elasticnet', 's...",0.945649,0.945649,0.945125,0.942639,0.944774,0.944767,0.001115,2
4,2.59068,0.377637,0.003578,0.000478,elasticnet,0.75,saga,"{'l1_ratio': 0.75, 'penalty': 'elasticnet', 's...",0.945649,0.945649,0.945125,0.942639,0.944774,0.944767,0.001115,2
5,2.571739,0.389317,0.003769,0.000399,l1,,saga,"{'penalty': 'l1', 'solver': 'saga'}",0.945649,0.945649,0.945125,0.942639,0.944774,0.944767,0.001115,2


In [18]:
log_grid.best_params_

{'penalty': 'none'}

In [19]:
log_model = log_grid.best_estimator_
log_y_pred = log_model.predict(x_test)

log_cm = confusion_matrix(y_test, log_y_pred)
log_cm

array([[7008,  144],
       [ 579, 5006]], dtype=int64)

## decision tree

In [20]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=42)

parameter_grid = {'max_depth': [5, 10, 20],
                  'max_features': [1, 5, 10, 20, 25]}

cross_validation = StratifiedKFold(n_splits=5)

tree_grid = GridSearchCV(tree, param_grid = parameter_grid,
                          cv = cross_validation)

tree_grid.fit(x_train, y_train)

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 903, in fit
    super().fit(
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 289, in fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must be in (0, n_features]

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 903, in fit
    super().fit(
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 289, in fit
    raise ValueError("max_features must be in (0, n_feature

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_depth': [5, 10, 20],
                         'max_features': [1, 5, 10, 20, 25]})

In [21]:
tree_result = pd.DataFrame(tree_grid.cv_results_)
tree_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.087768,0.004677,0.005385,0.0004885583,5,1,"{'max_depth': 5, 'max_features': 1}",0.900676,0.915115,0.912497,0.913108,0.912755,0.91083,0.00516,12
1,0.298206,0.004768,0.004987,1.202538e-06,5,5,"{'max_depth': 5, 'max_features': 5}",0.934351,0.933653,0.933828,0.934046,0.934087,0.933993,0.000238,11
2,0.554913,0.003371,0.00479,0.000399997,5,10,"{'max_depth': 5, 'max_features': 10}",0.948571,0.946957,0.946041,0.945387,0.945734,0.946538,0.001143,10
3,1.080513,0.006787,0.004979,0.0006245381,5,20,"{'max_depth': 5, 'max_features': 20}",0.956728,0.951363,0.953282,0.955376,0.953019,0.953954,0.001885,8
4,0.022151,0.001588,0.0,0.0,5,25,"{'max_depth': 5, 'max_features': 25}",,,,,,,,13
5,0.126064,0.002874,0.006582,0.001196504,10,1,"{'max_depth': 10, 'max_features': 1}",0.957863,0.949836,0.959564,0.947088,0.949005,0.952671,0.005042,9
6,0.515018,0.018848,0.005789,0.0004009789,10,5,"{'max_depth': 10, 'max_features': 5}",0.96554,0.962181,0.972083,0.97169,0.973957,0.96909,0.004467,7
7,0.983169,0.026787,0.00578,0.0003970195,10,10,"{'max_depth': 10, 'max_features': 10}",0.975921,0.971341,0.978277,0.979455,0.976575,0.976314,0.002781,6
8,1.960558,0.014236,0.005186,0.0003992807,10,20,"{'max_depth': 10, 'max_features': 20}",0.984558,0.982116,0.981112,0.981985,0.981809,0.982316,0.001173,5
9,0.02075,0.000412,0.0,0.0,10,25,"{'max_depth': 10, 'max_features': 25}",,,,,,,,14


In [22]:
tree_grid.best_params_

{'max_depth': 20, 'max_features': 10}

In [23]:
tree_model = tree_grid.best_estimator_
tree_y_pred = tree_model.predict(x_test)

tree_cm = confusion_matrix(y_test, tree_y_pred)
tree_cm

array([[7090,   62],
       [  28, 5557]], dtype=int64)

## random forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state=42, n_estimators = 100, max_depth = 5, max_features=20)
forest.fit(x_train, y_train)

  forest.fit(x_train, y_train)


RandomForestClassifier(max_depth=5, max_features=20, random_state=42)

In [25]:
forest_y_pred = forest.predict(x_test)

forest_cm = confusion_matrix(y_test, forest_y_pred)
forest_cm

array([[7050,  102],
       [ 419, 5166]], dtype=int64)

## adaboost

In [26]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5, random_state=42)
ada.fit(x_train, y_train)
ada_y_pred = ada.predict(x_test)

ada_cm = confusion_matrix(y_test, ada_y_pred)
ada_cm

  return f(*args, **kwargs)


array([[7024,  128],
       [ 271, 5314]], dtype=int64)

## lightgbmboost

In [27]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(random_state=42)
lgbm.fit(x_train, y_train)
lgbm_y_pred = lgbm.predict(x_test)

lgbm_cm = confusion_matrix(y_test, lgbm_y_pred)
lgbm_cm

  return f(*args, **kwargs)


array([[7126,   26],
       [   4, 5581]], dtype=int64)

## catboost

In [28]:
from catboost import CatBoostClassifier

cat = CatBoostClassifier(verbose=0, n_estimators=100)
cat.fit(x_train, y_train)
cat_y_pred = cat.predict(x_test)

cat_cm = confusion_matrix(y_test, cat_y_pred)
cat_cm

array([[7131,   21],
       [   0, 5585]], dtype=int64)

## score

In [29]:
def evaluation_score(cm):
    accuracy = (cm[0,0] + cm[1,1]) / (cm[0,0]+cm[1,0]+cm[0,1]+cm[1,1])
    error_rate = 1-accuracy
    specificity = cm[1,1] / (cm[0,1] + cm[1,1])
    recall = cm[0,0] / (cm[0,0] + cm[1,0])
    precision = cm[0,0] / (cm[0,0] + cm[0,1])
    f1_score = 2 * (precision*recall) / (precision+recall)
    
    score_array = np.array([round(accuracy,6), round(error_rate,6), round(specificity,6), round(recall,6), round(precision,6), round(f1_score,6)])
    
    return score_array

In [30]:
log_score = evaluation_score(log_cm)
tree_score = evaluation_score(tree_cm)
forest_score = evaluation_score(forest_cm)
ada_score = evaluation_score(ada_cm)
lgbm_score = evaluation_score(lgbm_cm)
cat_score = evaluation_score(cat_cm)

In [31]:
df_score = pd.DataFrame([log_score, tree_score, forest_score, ada_score, lgbm_score, cat_score], 
                        columns = ['accuracy', 'error_rate', 'specificity', 'recall', 'precision', 'f1_score'],
                       index = ['log', 'tree', 'forest', 'ada', 'lbgm', 'cat'])
df_score

Unnamed: 0,accuracy,error_rate,specificity,recall,precision,f1_score
log,0.943236,0.056764,0.972039,0.923685,0.979866,0.950946
tree,0.992934,0.007066,0.988966,0.996066,0.991331,0.993693
forest,0.959096,0.040904,0.980638,0.943901,0.985738,0.964366
ada,0.968674,0.031326,0.976479,0.962851,0.982103,0.972382
lbgm,0.997645,0.002355,0.995363,0.999439,0.996365,0.997899
cat,0.998351,0.001649,0.996254,1.0,0.997064,0.99853


lbgmboosting과 catboosting의 결과가 가장 좋으므로 lbgm과 cat boosting의 예측값을 이용해 xgboost에 fit해  
최종 stacking model을 만듦

## stacking with xgboost

In [32]:
new_data = np.array([lgbm_y_pred, cat_y_pred])
new_data.shape

(2, 12737)

In [33]:
new_data = np.transpose(new_data)
new_data.shape

(12737, 2)

In [34]:
import xgboost as xgb

xgbc = xgb.XGBClassifier(random_state=42, n_estimators = 100, max_depth = 5)
xgbc.fit(new_data, y_test)
xg_y_pred = xgbc.predict(new_data)

xgbc_cm = confusion_matrix(y_test, xg_y_pred)
xgbc_cm



  return f(*args, **kwargs)


array([[7140,   12],
       [   4, 5581]], dtype=int64)

In [35]:
xg_score = evaluation_score(xgbc_cm)

df_score = pd.DataFrame([xg_score], 
                        columns = ['accuracy', 'error_rate', 'specificity', 'recall', 'precision', 'f1_score'],
                       index = ['xg'])
df_score

Unnamed: 0,accuracy,error_rate,specificity,recall,precision,f1_score
xg,0.998744,0.001256,0.997854,0.99944,0.998322,0.998881


stacking 이후 recall값만 catboost에 비해 조금 하락했을 뿐 다른 모든 지표에서 상승된 모습을 보여주었다.