In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('./data/creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# 전처리
- outlier 제거 후 진행
- 나머지는 1과 동일

In [3]:
def drop_outlier(data):
    for column in data.columns[:-1]: #target column 제외
        q25, q75 = np.quantile(data[column], 0.25), np.quantile(data[column], 0.75)
        iqr = q75 - q25
        lower, upper = q25 - iqr*1.5, q75 + iqr*1.5
        
        df_no = data[data[column] > lower]
        df_no = df_no[df_no[column] < upper]
        df_no.reset_index(drop=True)
        
    return df_no

In [4]:
df_wo = drop_outlier(df)
df_wo.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0


In [5]:
df_wo.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,252903.0,252903.0,252903.0,252903.0,252903.0,252903.0,252903.0,252903.0,252903.0,252903.0,...,252903.0,252903.0,252903.0,252903.0,252903.0,252903.0,252903.0,252903.0,252903.0,252903.0
mean,94986.492228,0.094504,0.230435,0.060689,-0.01903,0.116567,-0.059286,-0.083767,0.024857,0.008731,...,-0.018557,0.002675,0.006637,0.001982,0.002301,-0.001892,0.002238,-0.001567,34.974887,0.001586
std,47594.978003,1.820271,1.293522,1.471043,1.393801,1.214682,1.264431,1.038737,1.156919,1.081381,...,0.705072,0.714132,0.433929,0.602656,0.505696,0.469089,0.371147,0.266247,42.17225,0.039788
min,0.0,-40.470142,-47.429676,-33.680984,-5.683171,-23.669726,-23.496714,-43.557242,-41.484823,-13.434066,...,-22.797604,-8.887017,-36.666,-2.836627,-7.495741,-1.732008,-9.895244,-8.478686,0.0,0.0
25%,54533.5,-0.859648,-0.397907,-0.816557,-0.850251,-0.567459,-0.793053,-0.576955,-0.196253,-0.615814,...,-0.237517,-0.544744,-0.141823,-0.35517,-0.314581,-0.322206,-0.063949,-0.054103,4.49,0.0
50%,85041.0,0.035238,0.165842,0.225603,-0.024476,0.021158,-0.318741,0.016034,0.029424,-0.045243,...,-0.047347,0.005992,-0.005033,0.040299,0.012927,-0.041085,0.006947,0.008817,16.0,0.0
75%,139586.0,1.356675,0.878353,1.067617,0.717123,0.666514,0.311231,0.515631,0.335343,0.583818,...,0.158554,0.52877,0.141507,0.439066,0.349265,0.229586,0.096233,0.072347,49.99,0.0
max,172788.0,2.45493,22.057729,4.226108,12.114672,34.099309,8.933762,6.491054,20.007208,10.392889,...,27.202839,8.361985,22.083545,4.022866,7.519589,3.463246,8.254376,22.620072,184.5,1.0


In [6]:
df_wo['Class'].value_counts()

0    252502
1       401
Name: Class, dtype: int64

모두 0으로 예측했을 때의 accuracy : 99.8414

In [7]:
df_shuffled=df_wo.sample(frac=1).reset_index(drop=True)
df_shuffled.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,108290.0,1.715029,0.016069,-1.880746,1.620445,0.618137,-0.537272,0.399738,-0.199777,1.350286,...,-0.213389,-0.635737,0.075981,0.542026,0.034369,-0.839976,-0.054965,-0.010526,144.0,0
1,139006.0,1.721304,0.118082,-0.805279,4.02974,0.187426,-0.143213,0.323,-0.13895,-0.611922,...,0.194038,0.491491,-0.073738,-0.0249,0.196902,0.17762,-0.057419,-0.03854,111.42,0
2,23336.0,1.109568,-0.083944,0.263154,0.120583,-0.070614,0.121725,-0.250985,0.094248,1.209889,...,-0.056006,0.045357,0.003168,-0.274889,0.156588,1.040256,-0.098048,-0.019352,37.34,0
3,169500.0,-0.636645,0.062345,1.995548,-1.163866,-0.685714,-0.279846,0.313491,-0.031192,-1.285576,...,-0.578838,-1.318619,0.049513,-0.069844,0.02096,0.438778,0.004405,0.072369,72.48,0
4,77688.0,-0.360745,1.002601,1.15811,-0.121413,0.159357,-0.495619,0.533699,0.150905,-0.48301,...,-0.245278,-0.717728,-0.057303,-0.074936,-0.180497,0.078079,0.235154,0.082631,2.69,0


In [8]:
df_x = df_shuffled[df_shuffled.columns[:-1]]
df_y = df_shuffled[df_shuffled.columns[-1]]

scaler = StandardScaler()
df_x_scale = scaler.fit_transform(df_x)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(df_x_scale, df_y, test_size=0.1, random_state=42)

# 모델 학습

- logistic regresstion with penalty
- decision tree
- random forest
- xgboost
- lightgbmboost
- catboost

## logistic regression

In [10]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(random_state=42)

param_grid = [{'penalty' : ['none', 'l2']},
              {'penalty' : ['elasticnet'], 'l1_ratio' : [0.5,0.25,0.75], 'solver' : ['saga']},
              {'penalty' : ['l1'], 'solver' : ['saga']}]

cross_validation = StratifiedKFold(n_splits=5)

log_grid = GridSearchCV(log_reg, param_grid, cv=cross_validation, scoring='accuracy')
log_grid.fit(x_train, y_train)



GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(random_state=42),
             param_grid=[{'penalty': ['none', 'l2']},
                         {'l1_ratio': [0.5, 0.25, 0.75],
                          'penalty': ['elasticnet'], 'solver': ['saga']},
                         {'penalty': ['l1'], 'solver': ['saga']}],
             scoring='accuracy')

In [11]:
log_result = pd.DataFrame(log_grid.cv_results_)
log_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,param_l1_ratio,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.172852,0.088522,0.005385,0.000489,none,,,{'penalty': 'none'},0.999187,0.999253,0.999275,0.999297,0.999231,0.999249,3.8e-05,1
1,1.231704,0.01663,0.006383,0.00185,l2,,,{'penalty': 'l2'},0.999187,0.999253,0.999275,0.999297,0.999231,0.999249,3.8e-05,1
2,12.33899,0.336913,0.004579,0.000482,elasticnet,0.5,saga,"{'l1_ratio': 0.5, 'penalty': 'elasticnet', 'so...",0.999143,0.999231,0.999231,0.999341,0.999165,0.999222,6.9e-05,3
3,12.129171,0.180261,0.00476,0.000759,elasticnet,0.25,saga,"{'l1_ratio': 0.25, 'penalty': 'elasticnet', 's...",0.999143,0.999231,0.999231,0.999341,0.999165,0.999222,6.9e-05,3
4,11.691932,0.04009,0.004967,0.000637,elasticnet,0.75,saga,"{'l1_ratio': 0.75, 'penalty': 'elasticnet', 's...",0.999143,0.999231,0.999231,0.999341,0.999165,0.999222,6.9e-05,3
5,11.63529,0.06916,0.004961,5e-06,l1,,saga,"{'penalty': 'l1', 'solver': 'saga'}",0.999143,0.999231,0.999231,0.999341,0.999143,0.999218,7.3e-05,6


In [12]:
log_grid.best_params_

{'penalty': 'none'}

In [13]:
log_model = log_grid.best_estimator_
y_pred = log_model.predict(x_test)

log_cm = confusion_matrix(y_test, y_pred)
log_cm

array([[25245,     5],
       [   19,    22]], dtype=int64)

## decision tree

In [14]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=42)

parameter_grid = {'max_depth': [5, 10, 20],
                  'max_features': [1, 5, 10, 20, 25]}

cross_validation = StratifiedKFold(n_splits=5)

tree_grid = GridSearchCV(tree, param_grid = parameter_grid,
                          cv = cross_validation)

tree_grid.fit(x_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_depth': [5, 10, 20],
                         'max_features': [1, 5, 10, 20, 25]})

In [15]:
tree_result = pd.DataFrame(tree_grid.cv_results_)
tree_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.189898,0.007009,0.007378,0.001018,5,1,"{'max_depth': 5, 'max_features': 1}",0.998726,0.99888,0.998594,0.998968,0.998528,0.998739,0.000166,15
1,0.679183,0.019329,0.007584,0.000794,5,5,"{'max_depth': 5, 'max_features': 5}",0.999407,0.999407,0.999385,0.999429,0.999385,0.999402,1.6e-05,8
2,1.252734,0.00699,0.006984,4e-06,5,10,"{'max_depth': 5, 'max_features': 10}",0.999517,0.999451,0.999429,0.999517,0.999451,0.999473,3.7e-05,4
3,2.440667,0.009257,0.006778,0.000397,5,20,"{'max_depth': 5, 'max_features': 20}",0.999429,0.999407,0.999473,0.999297,0.99967,0.999455,0.000122,6
4,3.007161,0.010568,0.006982,0.000631,5,25,"{'max_depth': 5, 'max_features': 25}",0.999517,0.999407,0.999495,0.999385,0.999605,0.999482,7.9e-05,2
5,0.305377,0.003968,0.007979,0.000631,10,1,"{'max_depth': 10, 'max_features': 1}",0.999143,0.999165,0.999187,0.999055,0.999429,0.999196,0.000125,13
6,1.267417,0.023179,0.006978,9e-06,10,5,"{'max_depth': 10, 'max_features': 5}",0.999517,0.999451,0.999385,0.999429,0.999583,0.999473,6.9e-05,4
7,2.437074,0.014421,0.00738,0.000489,10,10,"{'max_depth': 10, 'max_features': 10}",0.999451,0.999407,0.999407,0.999495,0.999341,0.99942,5.1e-05,7
8,4.763659,0.034042,0.007577,0.000487,10,20,"{'max_depth': 10, 'max_features': 20}",0.999385,0.999473,0.999473,0.999539,0.999561,0.999486,6.2e-05,1
9,5.961448,0.05378,0.00738,0.000488,10,25,"{'max_depth': 10, 'max_features': 25}",0.999517,0.999319,0.999473,0.999539,0.999539,0.999477,8.3e-05,3


In [16]:
tree_grid.best_params_

{'max_depth': 10, 'max_features': 20}

In [17]:
tree_model = tree_grid.best_estimator_
y_pred = tree_model.predict(x_test)

tree_cm = confusion_matrix(y_test, y_pred)
tree_cm

array([[25246,     4],
       [    9,    32]], dtype=int64)

## random forest

In [18]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state=42, n_estimators = 100, max_depth = 5, max_features=20)
forest.fit(x_train, y_train)

RandomForestClassifier(max_depth=5, max_features=20, random_state=42)

In [19]:
y_pred = forest.predict(x_test)

forest_cm = confusion_matrix(y_test, y_pred)
forest_cm

array([[25248,     2],
       [   11,    30]], dtype=int64)

## xgboost

In [20]:
import xgboost as xgb

xgbc = xgb.XGBClassifier(random_state=42, n_estimators = 100, max_depth = 5)
xgbc.fit(x_train, y_train)
y_pred = xgbc.predict(x_test)

xgbc_cm = confusion_matrix(y_test, y_pred)
xgbc_cm





array([[25247,     3],
       [   10,    31]], dtype=int64)

## lightgbmboost

In [21]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(random_state=42)
lgbm.fit(x_train, y_train)
y_pred = lgbm.predict(x_test)

lgbm_cm = confusion_matrix(y_test, y_pred)
lgbm_cm

array([[25130,   120],
       [   30,    11]], dtype=int64)

## catboost

In [22]:
from catboost import CatBoostClassifier

cat = CatBoostClassifier(verbose=0, n_estimators=100)
cat.fit(x_train, y_train)
y_pred = cat.predict(x_test)

cat_cm = confusion_matrix(y_test, y_pred)
cat_cm

array([[25249,     1],
       [   10,    31]], dtype=int64)

## score

In [23]:
def evaluation_score(cm):
    accuracy = (cm[0,0] + cm[1,1]) / (cm[0,0]+cm[1,0]+cm[0,1]+cm[1,1])
    error_rate = 1-accuracy
    specificity = cm[1,1] / (cm[0,1] + cm[1,1])
    recall = cm[0,0] / (cm[0,0] + cm[1,0])
    precision = cm[0,0] / (cm[0,0] + cm[0,1])
    f1_score = 2 * (precision*recall) / (precision+recall)
    
    score_array = np.array([round(accuracy,6), round(error_rate,6), round(specificity,6), round(recall,6), round(precision,6), round(f1_score,6)])
    
    return score_array

In [24]:
log_score = evaluation_score(log_cm)
tree_score = evaluation_score(tree_cm)
forest_score = evaluation_score(forest_cm)
xgb_score = evaluation_score(xgbc_cm)
lgbm_score = evaluation_score(lgbm_cm)
cat_score = evaluation_score(cat_cm)

In [26]:
df_score = pd.DataFrame([log_score, tree_score, forest_score, xgb_score, lgbm_score, cat_score], 
                        columns = ['accuracy', 'error_rate', 'specificity', 'recall', 'precision', 'f1_score'],
                       index = ['log', 'tree', 'forest', 'xgb', 'lbgm', 'cat'])
df_score

Unnamed: 0,accuracy,error_rate,specificity,recall,precision,f1_score
log,0.999051,0.000949,0.814815,0.999248,0.999802,0.999525
tree,0.999486,0.000514,0.888889,0.999644,0.999842,0.999743
forest,0.999486,0.000514,0.9375,0.999565,0.999921,0.999743
xgb,0.999486,0.000514,0.911765,0.999604,0.999881,0.999743
lbgm,0.994069,0.005931,0.083969,0.998808,0.995248,0.997024
cat,0.999565,0.000435,0.96875,0.999604,0.99996,0.999782
