In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('./data/creditcard.csv')

In [3]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# 전처리
- 이상치는 그대로
- standard scaling
- train_test set은 9:1로 split
- split전에 데이터가 시계열 순으로 나열되어 있으므로 shuffle 후에 진행

In [4]:
df_shuffled = df.iloc[np.random.permutation(df.index)].reset_index(drop=True)
df_shuffled.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,141208.0,-0.549672,0.709835,0.295759,-0.38146,0.519664,-0.758593,0.844141,-0.161443,0.030325,...,-0.030154,-0.14259,0.157727,-0.038255,-0.954897,-0.695925,-0.211981,0.292587,9.99,0
1,149583.0,-0.11585,0.969169,-0.073874,0.8124,1.366801,-0.322698,1.179056,-0.132904,-0.958899,...,0.188397,0.681596,-0.354177,-0.537679,-0.007553,-0.379147,0.187612,0.168458,2.68,0
2,43336.0,1.323025,-0.84039,1.328027,-0.86369,-1.840103,-0.537218,-1.237185,-0.023234,-0.282464,...,-0.161942,0.053896,0.072817,0.475328,-0.100226,1.490852,-0.028146,0.025586,21.76,0
3,149848.0,-0.815299,0.995734,0.711254,-0.958272,0.091578,-0.311825,0.018335,0.646972,0.031721,...,-0.149705,-0.581679,-0.126189,-0.644711,-0.303184,-0.261362,0.000463,0.076526,0.77,0
4,40205.0,1.152226,-0.012603,1.287978,1.339342,-0.954464,-0.190147,-0.540572,0.075514,0.849772,...,-0.059184,0.048147,-0.008726,0.398104,0.424847,-0.409919,0.074166,0.038993,9.99,0


In [5]:
df_x = df_shuffled[df_shuffled.columns[:-1]]
df_y = df_shuffled[df_shuffled.columns[-1]]

In [6]:
scaler = StandardScaler()
df_x_scale = scaler.fit_transform(df_x)

In [7]:
x_train, x_test, y_train, y_test = train_test_split(df_x_scale, df_y, test_size=0.1, random_state=42)

# 모델 학습
불균형 데이터이므로 stratifiedKFold 사용, 5-Fold validation

- logistic regression (with penalty)
- decision tree
- random forest
- xg boost

## logistic regression (with penalty)

In [8]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(random_state=42)

param_grid = [{'penalty' : ['none', 'l2']},
              {'penalty' : ['elasticnet'], 'l1_ratio' : [0.5,0.25,0.75], 'solver' : ['saga']},
              {'penalty' : ['l1'], 'solver' : ['saga']}]

cross_validation = StratifiedKFold(n_splits=5)

log_grid = GridSearchCV(log_reg, param_grid, cv=cross_validation, scoring='accuracy')
log_grid.fit(x_train, y_train)



GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(random_state=42),
             param_grid=[{'penalty': ['none', 'l2']},
                         {'l1_ratio': [0.5, 0.25, 0.75],
                          'penalty': ['elasticnet'], 'solver': ['saga']},
                         {'penalty': ['l1'], 'solver': ['saga']}],
             scoring='accuracy')

In [9]:
log_result = pd.DataFrame(log_grid.cv_results_)
log_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,param_l1_ratio,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.363641,0.113402,0.005784,0.000399,none,,,{'penalty': 'none'},0.9992,0.999181,0.999064,0.999181,0.999356,0.999196,9.3e-05,1
1,1.348791,0.180781,0.005784,0.000746,l2,,,{'penalty': 'l2'},0.999181,0.999181,0.999064,0.999181,0.999356,0.999192,9.4e-05,2
2,13.361573,0.048462,0.005573,0.000487,elasticnet,0.5,saga,"{'l1_ratio': 0.5, 'penalty': 'elasticnet', 'so...",0.999122,0.999142,0.999044,0.999181,0.999337,0.999165,9.7e-05,3
3,13.393765,0.038211,0.005978,0.000886,elasticnet,0.25,saga,"{'l1_ratio': 0.25, 'penalty': 'elasticnet', 's...",0.999122,0.999142,0.999044,0.999181,0.999337,0.999165,9.7e-05,3
4,13.35317,0.228673,0.005581,0.000486,elasticnet,0.75,saga,"{'l1_ratio': 0.75, 'penalty': 'elasticnet', 's...",0.999122,0.999142,0.999044,0.999181,0.999337,0.999165,9.7e-05,3
5,13.185924,0.090379,0.005377,0.000496,l1,,saga,"{'penalty': 'l1', 'solver': 'saga'}",0.999122,0.999142,0.999044,0.999181,0.999337,0.999165,9.7e-05,3


In [10]:
log_grid.best_params_

{'penalty': 'none'}

In [11]:
log_model = log_grid.best_estimator_
y_pred = log_model.predict(x_test)

log_cm = confusion_matrix(y_test, y_pred)
log_cm

array([[28423,     2],
       [   26,    30]], dtype=int64)

## decision tree

In [12]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=42)

parameter_grid = {'max_depth': [5, 10, 20],
                  'max_features': [1, 5, 10, 20, 25]}

cross_validation = StratifiedKFold(n_splits=5)

tree_grid = GridSearchCV(tree, param_grid = parameter_grid,
                          cv = cross_validation)

tree_grid.fit(x_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_depth': [5, 10, 20],
                         'max_features': [1, 5, 10, 20, 25]})

In [13]:
tree_result = pd.DataFrame(tree_grid.cv_results_)
tree_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.205849,0.007715,0.007983,0.0006307299,5,1,"{'max_depth': 5, 'max_features': 1}",0.998615,0.998654,0.998674,0.998576,0.998654,0.998635,3.5e-05,15
1,0.741221,0.007205,0.007979,4.623108e-07,5,5,"{'max_depth': 5, 'max_features': 5}",0.999337,0.999493,0.999083,0.999376,0.99922,0.999302,0.00014,9
2,1.426184,0.016131,0.007779,0.0003988508,5,10,"{'max_depth': 5, 'max_features': 10}",0.999278,0.999434,0.999161,0.999415,0.999454,0.999348,0.000112,7
3,2.73708,0.011937,0.007779,0.0007464931,5,20,"{'max_depth': 5, 'max_features': 20}",0.999395,0.999551,0.999317,0.999454,0.999454,0.999434,7.7e-05,5
4,3.442397,0.006454,0.007575,0.0004854188,5,25,"{'max_depth': 5, 'max_features': 25}",0.999454,0.999571,0.999415,0.99959,0.999551,0.999516,6.9e-05,1
5,0.345276,0.008795,0.008378,0.0004883443,10,1,"{'max_depth': 10, 'max_features': 1}",0.999064,0.998986,0.998849,0.99883,0.999005,0.998947,9.1e-05,13
6,1.423985,0.008061,0.008378,0.0004971049,10,5,"{'max_depth': 10, 'max_features': 5}",0.999317,0.999454,0.999473,0.999493,0.999454,0.999438,6.2e-05,4
7,2.766605,0.034639,0.008577,0.0004886945,10,10,"{'max_depth': 10, 'max_features': 10}",0.999337,0.999512,0.999142,0.999415,0.999493,0.99938,0.000134,6
8,5.508415,0.064301,0.008382,0.0004850723,10,20,"{'max_depth': 10, 'max_features': 20}",0.999415,0.999571,0.999356,0.999395,0.999551,0.999458,8.7e-05,3
9,6.80679,0.067814,0.008174,0.0003905537,10,25,"{'max_depth': 10, 'max_features': 25}",0.999415,0.999532,0.999415,0.999454,0.999551,0.999473,5.8e-05,2


In [14]:
tree_grid.best_params_

{'max_depth': 5, 'max_features': 25}

In [15]:
tree_model = tree_grid.best_estimator_
y_pred = tree_model.predict(x_test)

tree_cm = confusion_matrix(y_test, y_pred)
tree_cm

array([[28421,     4],
       [   17,    39]], dtype=int64)

## random forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state=42, n_estimators = 100, max_depth = 5, max_features=20)
forest.fit(x_train, y_train)

RandomForestClassifier(max_depth=5, max_features=20, random_state=42)

In [17]:
y_pred = forest.predict(x_test)

forest_cm = confusion_matrix(y_test, y_pred)
forest_cm

array([[28422,     3],
       [   16,    40]], dtype=int64)

## XG Boost

In [18]:
import xgboost as xgb

xgbc = xgb.XGBClassifier(random_state=42, n_estimators = 100, max_depth = 5)
xgbc.fit(x_train, y_train)
y_pred = xgbc.predict(x_test)

xgbc_cm = confusion_matrix(y_test, y_pred)
xgbc_cm





array([[28422,     3],
       [   16,    40]], dtype=int64)

## score

In [19]:
def evaluation_score(cm):
    accuracy = (cm[0,0] + cm[1,1]) / (cm[0,0]+cm[1,0]+cm[0,1]+cm[1,1])
    error_rate = 1-accuracy
    specificity = cm[1,1] / (cm[0,1] + cm[1,1])
    recall = cm[0,0] / (cm[0,0] + cm[1,0])
    precision = cm[0,0] / (cm[0,0] + cm[0,1])
    f1_score = 2 * (precision*recall) / (precision+recall)
    
    score_array = np.array([round(accuracy,6), round(error_rate,6), round(specificity,6), round(recall,6), round(precision,6), round(f1_score,6)])
    
    return score_array

In [20]:
log_score = evaluation_score(log_cm)
log_score

array([9.99017e-01, 9.83000e-04, 9.37500e-01, 9.99086e-01, 9.99930e-01,
       9.99508e-01])

In [21]:
tree_score = evaluation_score(tree_cm)
forest_score = evaluation_score(forest_cm)
xgb_score = evaluation_score(xgbc_cm)

In [22]:
df_score = pd.DataFrame([log_score, tree_score, forest_score, xgb_score], 
                        columns = ['accuracy', 'error_rate', 'specificity', 'recall', 'precision', 'f1_score'],
                       index = ['log', 'tree', 'forest', 'xgb'])
df_score

Unnamed: 0,accuracy,error_rate,specificity,recall,precision,f1_score
log,0.999017,0.000983,0.9375,0.999086,0.99993,0.999508
tree,0.999263,0.000737,0.906977,0.999402,0.999859,0.999631
forest,0.999333,0.000667,0.930233,0.999437,0.999894,0.999666
xgb,0.999333,0.000667,0.930233,0.999437,0.999894,0.999666
