# Model Compare

XGBoost, AutoEncoder, Local Outlier Factor, One Class SVM, Isolation Forest 모델 비교 

테스트 데이터의 정상:이상 비율을 다르게 하여 각 테스트 데이터에서 성능 측정

In [2]:
import pandas as pd
import numpy as np
import time 

from sklearn.metrics import f1_score, auc, roc_auc_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle, resample

from xgboost import XGBClassifier
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [9]:
# XGBoost: upsampling 
def make_train_dataset(X_train, seed):
    """Build a class-balanced training set by oversampling the anomalies.

    Rows with ``Class == 1`` are drawn with replacement until there are as
    many of them as rows with ``Class == 0``. The two groups are then
    concatenated, split into features and labels, and shuffled together.

    Args:
        X_train: DataFrame that contains the feature columns and a ``Class``
            column.
        seed: Random state used for both the resampling and the shuffle.

    Returns:
        ``(X_res, y_res)``: the shuffled feature frame (without ``Class``)
        and the matching label series.
    """
    majority = X_train[X_train.Class == 0]
    minority = X_train[X_train.Class == 1]

    # Oversample the anomalies up to the size of the normal class.
    oversampled = resample(
        minority,
        replace=True,
        n_samples=len(majority),
        random_state=seed,
    )
    balanced = pd.concat([majority, oversampled])

    labels = balanced.Class
    features = balanced.drop('Class', axis=1)

    # Shuffle features and labels in lockstep so rows stay aligned.
    X_res, y_res = shuffle(features, labels, random_state=seed)
    return X_res, y_res

In [10]:
def make_test_dataset(X_test, seed, ratios=(1, 2, 5, 10, 20, 50)):
    """Derive test sets with different normal:abnormal ratios from ``X_test``.

    For every ratio ``r`` in ``ratios`` a subset is built that keeps all
    anomalies (``Class == 1``) and ``r`` times as many randomly sampled
    normal rows (``Class == 0``). The full ``X_test`` is appended as the
    last element. Every returned frame is shuffled.

    Args:
        X_test: DataFrame that contains the feature columns and a ``Class``
            column.
        seed: Random state used for sampling the normal rows and for every
            shuffle.
        ratios: Normal-to-abnormal multipliers, one test set per entry.
            The default reproduces the 1:1, 2:1, 5:1, 10:1, 20:1 and 50:1
            sets.

    Returns:
        A list with one DataFrame per ratio (in the order of ``ratios``)
        followed by the shuffled full ``X_test``.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when a ratio asks for
            more normal rows than ``X_test`` contains.
    """
    # Split once; the original re-filtered X_test for every ratio.
    normal = X_test[X_test.Class == 0]
    abnormal = X_test[X_test.Class == 1]
    n_anomalies = len(abnormal)

    test_sets = []
    for ratio in ratios:
        sampled_normal = normal.sample(n_anomalies * ratio, random_state=seed)
        subset = pd.concat([sampled_normal, abnormal])
        test_sets.append(shuffle(subset, random_state=seed))

    # Original (unbalanced) test data goes last.
    test_sets.append(shuffle(X_test, random_state=seed))
    return test_sets

In [11]:
def test_prediction(model, test_list, result):
    """Score ``model`` on the seven test sets and append the metrics.

    ``test_list[0]`` .. ``test_list[6]`` are matched, in order, with the
    suffixes ``1:1, 2:1, 5:1, 10:1, 20:1, 50:1, all``. For each set the
    F1 score, the area under the precision-recall curve and the ROC AUC
    are appended to ``result['F1_<suffix>']``,
    ``result['PRE_REC_AUC_<suffix>']`` and ``result['ROC_AUC_<suffix>']``.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        test_list: Sequence of at least seven DataFrames, each with a
            ``Class`` column holding the true labels.
        result: Dict of lists keyed as described above; mutated in place.

    Returns:
        The same ``result`` dict, for call-site convenience.

    Raises:
        IndexError: If ``test_list`` has fewer than seven entries.
    """
    # NOTE(review): both AUC metrics are computed from `model.predict`
    # output rather than from continuous scores. If `predict` returns hard
    # labels the curves have a single operating point - confirm this is the
    # intended definition before comparing with score-based AUCs.
    suffixes = ('1:1', '2:1', '5:1', '10:1', '20:1', '50:1', 'all')

    for position, suffix in enumerate(suffixes):
        frame = test_list[position]
        y_test = frame.Class
        X_test = frame.drop('Class', axis=1)

        preds = model.predict(X_test)
        precision, recall, _ = precision_recall_curve(y_test, preds)

        result['F1_' + suffix].append(f1_score(y_test, preds))
        result['PRE_REC_AUC_' + suffix].append(auc(recall, precision))
        result['ROC_AUC_' + suffix].append(roc_auc_score(y_test, preds))

    return result

## XGBoost

In [12]:
# XGBoost with Resample
# F1, precision_recall_auc, roc_auc
# every test dataset -> ratio 1:1, 2:1, ..., 50:1, original (total 7 test dataset)
# generate (n_iter x 22) dataframe

xgb_result = {'Elapsed_Time': [],
              'F1_1:1': [], 'PRE_REC_AUC_1:1': [], 'ROC_AUC_1:1': [],
              'F1_2:1': [], 'PRE_REC_AUC_2:1': [], 'ROC_AUC_2:1': [],
              'F1_5:1': [], 'PRE_REC_AUC_5:1': [], 'ROC_AUC_5:1': [],
              'F1_10:1': [], 'PRE_REC_AUC_10:1': [], 'ROC_AUC_10:1': [],
              'F1_20:1': [], 'PRE_REC_AUC_20:1': [], 'ROC_AUC_20:1': [],
              'F1_50:1': [], 'PRE_REC_AUC_50:1': [], 'ROC_AUC_50:1': [],
              'F1_all': [], 'PRE_REC_AUC_all': [], 'ROC_AUC_all': [],
              }

# The raw data does not depend on the seed, so it is loaded once instead of
# being re-read from disk on each of the 50 iterations. train_test_split
# hands back newly indexed frames, so the per-seed scaling below does not
# write into `data`.
data = pd.read_csv('./creditcard.csv')
data.drop('Time', axis=1, inplace=True)

# repeat: n_iter(50)
for seed in tqdm(range(50)):
    X_train, X_test = train_test_split(data, stratify=data.Class, test_size=0.25, random_state=seed)

    # Fit the scaler on the training split only, then apply it to the test split.
    ss = StandardScaler()
    X_train['Amount'] = ss.fit_transform(X_train['Amount'].values.reshape(-1, 1))
    X_test['Amount'] = ss.transform(X_test['Amount'].values.reshape(-1, 1))

    X_res, y_res = make_train_dataset(X_train, seed)
    test_list = make_test_dataset(X_test, seed)

    # Elapsed_Time covers fitting plus prediction on all seven test sets.
    start_time = time.time()

    # default hyperparameters
    xgb = XGBClassifier(seed=seed)
    xgb.fit(X_res, y_res)

    xgb_result = test_prediction(xgb, test_list, xgb_result)

    end_time = time.time()
    xgb_result['Elapsed_Time'].append(end_time - start_time)

xgb_df = pd.DataFrame(xgb_result)

100%|██████████| 50/50 [04:22<00:00,  5.25s/it]


In [16]:
# Preview the first five iterations of the XGBoost results.
xgb_df.head(n=5)

Unnamed: 0,Elapsed_Time,F1_1:1,PRE_REC_AUC_1:1,ROC_AUC_1:1,F1_2:1,PRE_REC_AUC_2:1,ROC_AUC_2:1,F1_5:1,PRE_REC_AUC_5:1,ROC_AUC_5:1,...,ROC_AUC_10:1,F1_20:1,PRE_REC_AUC_20:1,ROC_AUC_20:1,F1_50:1,PRE_REC_AUC_50:1,ROC_AUC_50:1,F1_all,PRE_REC_AUC_all,ROC_AUC_all
0,2.995911,0.886878,0.949187,0.898374,0.886878,0.932249,0.898374,0.886878,0.915312,0.898374,...,0.898374,0.886878,0.903213,0.898374,0.886878,0.900367,0.898374,0.859649,0.865216,0.898325
1,2.466345,0.930435,0.96748,0.934959,0.930435,0.95664,0.934959,0.930435,0.945799,0.934959,...,0.934959,0.930435,0.938057,0.934959,0.930435,0.936235,0.934959,0.887967,0.888462,0.934882
2,2.49829,0.925764,0.965447,0.930894,0.925764,0.95393,0.930894,0.925764,0.942412,0.930894,...,0.930894,0.925764,0.934185,0.930894,0.925764,0.932249,0.930894,0.879668,0.880166,0.93081
3,2.362713,0.925764,0.965447,0.930894,0.925764,0.95393,0.930894,0.925764,0.942412,0.930894,...,0.930894,0.925764,0.934185,0.930894,0.925764,0.932249,0.930894,0.894515,0.895926,0.930838
4,2.367961,0.911504,0.95935,0.918699,0.911504,0.945799,0.918699,0.911504,0.932249,0.918699,...,0.918699,0.907489,0.917763,0.918496,0.907489,0.915486,0.918618,0.869198,0.870594,0.918622


In [14]:
# Persist the per-iteration XGBoost metrics (the row index is written too).
xgb_df.to_csv(path_or_buf='xgb_iter_test.csv')

## LOF

In [19]:
def test_prediction_replace(model, test_list, result):
    """Score an outlier detector on the seven test sets and append the metrics.

    Works like ``test_prediction`` but first converts the detector's output
    to the labelling used by the ``Class`` column: a prediction of ``-1``
    becomes ``1`` and every other prediction becomes ``0``.

    ``test_list[0]`` .. ``test_list[6]`` are matched, in order, with the
    suffixes ``1:1, 2:1, 5:1, 10:1, 20:1, 50:1, all``. For each set the
    F1 score, the area under the precision-recall curve and the ROC AUC
    are appended to ``result['F1_<suffix>']``,
    ``result['PRE_REC_AUC_<suffix>']`` and ``result['ROC_AUC_<suffix>']``.

    Args:
        model: Fitted detector whose ``predict(X)`` output uses ``-1`` for
            the rows to be scored as ``Class == 1``.
        test_list: Sequence of at least seven DataFrames, each with a
            ``Class`` column holding the true labels.
        result: Dict of lists keyed as described above; mutated in place.

    Returns:
        The same ``result`` dict, for call-site convenience.

    Raises:
        IndexError: If ``test_list`` has fewer than seven entries.
    """
    # NOTE(review): both AUC metrics are computed from the binarised
    # predictions, not from continuous anomaly scores - confirm this is the
    # intended definition.
    suffixes = ('1:1', '2:1', '5:1', '10:1', '20:1', '50:1', 'all')

    for position, suffix in enumerate(suffixes):
        frame = test_list[position]
        y_test = frame.Class
        X_test = frame.drop('Class', axis=1)

        raw = np.asarray(model.predict(X_test))
        # Map -1 -> 1 and 1 -> 0 as a flat array; the previous DataFrame
        # round-trip handed an (n, 1) column vector to the metric functions.
        preds = np.where(raw == -1, 1, 0)
        precision, recall, _ = precision_recall_curve(y_test, preds)

        result['F1_' + suffix].append(f1_score(y_test, preds))
        result['PRE_REC_AUC_' + suffix].append(auc(recall, precision))
        result['ROC_AUC_' + suffix].append(roc_auc_score(y_test, preds))

    return result

In [22]:
# LOF
# F1, precision_recall_auc, roc_auc
# every test dataset -> ratio 1:1, 2:1, ..., 50:1, original (total 7 test dataset)
# generate (n_iter x 22) dataframe

from sklearn.neighbors import LocalOutlierFactor

lof_result = {'Elapsed_Time': [],
              'F1_1:1': [], 'PRE_REC_AUC_1:1': [], 'ROC_AUC_1:1': [],
              'F1_2:1': [], 'PRE_REC_AUC_2:1': [], 'ROC_AUC_2:1': [],
              'F1_5:1': [], 'PRE_REC_AUC_5:1': [], 'ROC_AUC_5:1': [],
              'F1_10:1': [], 'PRE_REC_AUC_10:1': [], 'ROC_AUC_10:1': [],
              'F1_20:1': [], 'PRE_REC_AUC_20:1': [], 'ROC_AUC_20:1': [],
              'F1_50:1': [], 'PRE_REC_AUC_50:1': [], 'ROC_AUC_50:1': [],
              'F1_all': [], 'PRE_REC_AUC_all': [], 'ROC_AUC_all': [],
              }

# The raw data does not depend on the seed, so it is loaded once instead of
# being re-read from disk on each of the 50 iterations. train_test_split
# hands back newly indexed frames, so the per-seed scaling below does not
# write into `data`.
data = pd.read_csv('./creditcard.csv')
data.drop('Time', axis=1, inplace=True)

# repeat: n_iter(50)
for seed in tqdm(range(50)):
    X_train, X_test = train_test_split(data, stratify=data.Class, test_size=0.25, random_state=seed)

    # Fit the scaler on the training split only, then apply it to the test split.
    ss = StandardScaler()
    X_train['Amount'] = ss.fit_transform(X_train['Amount'].values.reshape(-1, 1))
    X_test['Amount'] = ss.transform(X_test['Amount'].values.reshape(-1, 1))

    y_train = X_train.Class
    X_train = X_train.drop('Class', axis=1)

    test_list = make_test_dataset(X_test, seed)

    # Elapsed_Time covers fitting plus prediction on all seven test sets.
    start_time = time.time()

    lof = LocalOutlierFactor(n_neighbors=20, contamination='auto', novelty=True, n_jobs=-1)
    # train with normal data
    lof.fit(X_train[y_train==0])

    lof_result = test_prediction_replace(lof, test_list, lof_result)

    end_time = time.time()
    lof_result['Elapsed_Time'].append(end_time - start_time)

lof_df = pd.DataFrame(lof_result)

100%|██████████| 50/50 [52:52<00:00, 63.46s/it] 


In [23]:
# Preview the first five iterations of the LOF results.
lof_df.head(n=5)

Unnamed: 0,Elapsed_Time,F1_1:1,PRE_REC_AUC_1:1,ROC_AUC_1:1,F1_2:1,PRE_REC_AUC_2:1,ROC_AUC_2:1,F1_5:1,PRE_REC_AUC_5:1,ROC_AUC_5:1,...,ROC_AUC_10:1,F1_20:1,PRE_REC_AUC_20:1,ROC_AUC_20:1,F1_50:1,PRE_REC_AUC_50:1,ROC_AUC_50:1,F1_all,PRE_REC_AUC_all,ROC_AUC_all
0,73.316534,0.380952,0.670596,0.577236,0.361582,0.549684,0.585366,0.32,0.399527,0.593496,...,0.595935,0.191617,0.223526,0.593699,0.116152,0.174718,0.597886,0.013746,0.13425,0.598419
1,58.895219,0.390244,0.705285,0.593496,0.367816,0.567113,0.591463,0.326531,0.410912,0.596748,...,0.59878,0.205788,0.232803,0.598374,0.122371,0.177335,0.600163,0.013847,0.134277,0.598659
2,71.552189,0.411765,0.693479,0.593496,0.391061,0.574018,0.599593,0.339806,0.41274,0.603252,...,0.605285,0.20649,0.240329,0.605488,0.122807,0.18844,0.60878,0.015231,0.146807,0.611058
3,58.569276,0.360248,0.690522,0.581301,0.331429,0.524104,0.571138,0.294416,0.377518,0.581301,...,0.585772,0.189542,0.215317,0.586585,0.107011,0.159985,0.586179,0.012343,0.121715,0.585901
4,69.189393,0.425287,0.687948,0.593496,0.413408,0.597295,0.611789,0.368159,0.445852,0.617073,...,0.620325,0.228395,0.259094,0.617073,0.133333,0.200085,0.618293,0.015883,0.155089,0.618759


In [24]:
# Persist the per-iteration LOF metrics (the row index is written too).
lof_df.to_csv(path_or_buf='lof_iter_test.csv')

## AE

In [44]:
from keras import optimizers
from keras.models import Model
from keras.layers import Dense, Input, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [49]:
def test_prediction_AE(model, test_list, threshold, result):
    """Score an autoencoder on the seven test sets and append the metrics.

    A row is predicted as an anomaly (``1``) when the Euclidean norm of its
    reconstruction error is greater than or equal to ``threshold``;
    otherwise it is predicted as normal (``0``).

    ``test_list[0]`` .. ``test_list[6]`` are matched, in order, with the
    suffixes ``1:1, 2:1, 5:1, 10:1, 20:1, 50:1, all``. For each set the
    F1 score, the area under the precision-recall curve and the ROC AUC
    are appended to ``result['F1_<suffix>']``,
    ``result['PRE_REC_AUC_<suffix>']`` and ``result['ROC_AUC_<suffix>']``.

    Args:
        model: Trained model whose ``predict(values, verbose=0)`` returns a
            reconstruction with the same shape as its input.
        test_list: Sequence of at least seven DataFrames, each with a
            ``Class`` column holding the true labels.
        threshold: Reconstruction-error cut-off for flagging an anomaly.
        result: Dict of lists keyed as described above; mutated in place.

    Returns:
        The same ``result`` dict, for call-site convenience.

    Raises:
        IndexError: If ``test_list`` has fewer than seven entries.
    """
    # NOTE(review): both AUC metrics are computed from the thresholded
    # labels, not from the reconstruction errors themselves - confirm this
    # is the intended definition.
    suffixes = ('1:1', '2:1', '5:1', '10:1', '20:1', '50:1', 'all')

    for position, suffix in enumerate(suffixes):
        frame = test_list[position]
        y_test = frame.Class
        features = frame.drop('Class', axis=1).values

        reconstruction = model.predict(features, verbose=0)
        errors = np.linalg.norm(features - reconstruction, axis=-1)
        # Vectorised replacement for the per-row if/else append loop.
        y_preds = (errors >= threshold).astype(int)

        precision, recall, _ = precision_recall_curve(y_test, y_preds)
        result['F1_' + suffix].append(f1_score(y_test, y_preds))
        result['PRE_REC_AUC_' + suffix].append(auc(recall, precision))
        result['ROC_AUC_' + suffix].append(roc_auc_score(y_test, y_preds))

    return result

In [52]:
import os

def train_AE(X_train, y_train):
    """Train a dense autoencoder on the normal rows and return it.

    The network narrows the input through Dense layers of 16, 8 and 4 units
    (with dropout after the first two) and widens it back through 8 and 16
    units to the input width. Only rows where ``y_train == 0`` are used,
    with the input as its own reconstruction target. The weights with the
    lowest validation loss are checkpointed during training and loaded back
    before the model is returned.

    Args:
        X_train: Feature DataFrame (no label column).
        y_train: Labels aligned with ``X_train``; ``0`` marks normal rows.

    Returns:
        The compiled Keras ``Model`` carrying the checkpointed weights.
    """
    encoding_dim = 16
    input_dim = X_train.shape[1]

    # Encoder: input_dim -> 16 -> 8 -> 4
    inputArray = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='relu')(inputArray)
    encoded = Dropout(0.3)(encoded)
    encoded = Dense(8, activation='relu')(encoded)
    encoded = Dropout(0.3)(encoded)
    encoded = Dense(4, activation='relu')(encoded)

    # Decoder: 4 -> 8 -> 16 -> input_dim (linear output)
    decoded = Dense(8, activation='relu')(encoded)
    decoded = Dense(encoding_dim, activation='relu')(decoded)
    decoded = Dense(input_dim)(decoded)

    AE = Model(inputArray, decoded)

    optimizer = optimizers.Adam()
    AE.compile(optimizer=optimizer,
               loss='mean_squared_error',)

    batch_size = 1024
    epochs = 30

    filepath = './tmp/AE_checkpoints'
    # Create the checkpoint directory up front so saving does not depend on
    # the callback creating it.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    # restore_best_weights is off because the best weights are reloaded
    # from the checkpoint file after training instead.
    early_stopping_cb = EarlyStopping(patience=10, restore_best_weights=False)
    checkpoint_cb = ModelCheckpoint(filepath, monitor='val_loss', save_weights_only=True,
                                    mode='min', save_best_only=True, verbose=0)

    # Select the normal rows once; they are both the input and the target.
    X_normal = X_train[y_train==0].values

    AE.fit(X_normal,
           X_normal,
           batch_size=batch_size,
           epochs=epochs,
           shuffle=True,
           verbose=0,
           validation_split=0.1,
           callbacks=[early_stopping_cb, checkpoint_cb])

    AE.load_weights(filepath)
    return AE

In [53]:
# AutoEncoder
# F1, precision_recall_auc, roc_auc
# every test dataset -> ratio 1:1, 2:1, ..., 50:1, original (total 7 test dataset)
# generate (n_iter x 22) dataframe

ae_result = {'Elapsed_Time': [],
             'F1_1:1': [], 'PRE_REC_AUC_1:1': [], 'ROC_AUC_1:1': [],
             'F1_2:1': [], 'PRE_REC_AUC_2:1': [], 'ROC_AUC_2:1': [],
             'F1_5:1': [], 'PRE_REC_AUC_5:1': [], 'ROC_AUC_5:1': [],
             'F1_10:1': [], 'PRE_REC_AUC_10:1': [], 'ROC_AUC_10:1': [],
             'F1_20:1': [], 'PRE_REC_AUC_20:1': [], 'ROC_AUC_20:1': [],
             'F1_50:1': [], 'PRE_REC_AUC_50:1': [], 'ROC_AUC_50:1': [],
             'F1_all': [], 'PRE_REC_AUC_all': [], 'ROC_AUC_all': [],
             }

# The raw data does not depend on the seed, so it is loaded once instead of
# being re-read from disk on each of the 50 iterations. train_test_split
# hands back newly indexed frames, so the per-seed scaling below does not
# write into `data`.
data = pd.read_csv('./creditcard.csv')
data.drop('Time', axis=1, inplace=True)

# repeat: n_iter(50)
for seed in tqdm(range(50)):
    X_train, X_test = train_test_split(data, stratify=data.Class, test_size=0.25, random_state=seed)

    # Fit the scaler on the training split only, then apply it to the test split.
    ss = StandardScaler()
    X_train['Amount'] = ss.fit_transform(X_train['Amount'].values.reshape(-1, 1))
    X_test['Amount'] = ss.transform(X_test['Amount'].values.reshape(-1, 1))

    y_train = X_train.Class
    X_train = X_train.drop('Class', axis=1)

    test_list = make_test_dataset(X_test, seed)

    # Elapsed_Time covers training, threshold selection and prediction on
    # all seven test sets.
    start_time = time.time()

    ae = train_AE(X_train, y_train)

    # Threshold = 95th percentile of the reconstruction error on the normal
    # training rows (selected once and reused for both operands).
    X_normal = X_train[y_train==0].values
    y_normal_pred = ae(X_normal)
    y_normal_dist = np.linalg.norm(X_normal - y_normal_pred, axis=-1)
    threshold = np.quantile(y_normal_dist, 0.95)

    ae_result = test_prediction_AE(ae, test_list, threshold, ae_result)

    end_time = time.time()
    ae_result['Elapsed_Time'].append(end_time - start_time)

ae_df = pd.DataFrame(ae_result)

100%|██████████| 50/50 [18:13<00:00, 21.87s/it]


In [54]:
# Preview the first five iterations of the autoencoder results.
ae_df.head(n=5)

Unnamed: 0,Elapsed_Time,F1_1:1,PRE_REC_AUC_1:1,ROC_AUC_1:1,F1_2:1,PRE_REC_AUC_2:1,ROC_AUC_2:1,F1_5:1,PRE_REC_AUC_5:1,ROC_AUC_5:1,...,ROC_AUC_10:1,F1_20:1,PRE_REC_AUC_20:1,ROC_AUC_20:1,F1_50:1,PRE_REC_AUC_50:1,ROC_AUC_50:1,F1_all,PRE_REC_AUC_all,ROC_AUC_all
0,20.989951,0.90678,0.940931,0.910569,0.877049,0.898788,0.906504,0.795539,0.812238,0.903252,...,0.906911,0.5961,0.664751,0.90874,0.378092,0.557002,0.907642,0.055816,0.449488,0.909607
1,21.188569,0.926829,0.945122,0.926829,0.919355,0.93161,0.941057,0.860377,0.870921,0.94065,...,0.940244,0.631579,0.704653,0.938211,0.420664,0.60017,0.938618,0.060238,0.479043,0.938456
2,20.80819,0.942149,0.960699,0.943089,0.926829,0.939024,0.945122,0.838235,0.852063,0.934959,...,0.937805,0.622951,0.699725,0.937195,0.418349,0.599203,0.938374,0.058642,0.478617,0.937732
3,20.589591,0.921811,0.944309,0.922764,0.89243,0.90769,0.922764,0.82963,0.84369,0.926829,...,0.927236,0.630986,0.698793,0.930894,0.41791,0.591755,0.930813,0.058978,0.4706,0.930221
4,20.955493,0.921162,0.945949,0.922764,0.884462,0.901073,0.916667,0.795699,0.815119,0.914634,...,0.918699,0.6,0.678239,0.923577,0.398564,0.580056,0.924959,0.057543,0.466163,0.925727


In [55]:
# Persist the per-iteration autoencoder metrics (the row index is written too).
ae_df.to_csv(path_or_buf='ae_iter_test.csv')

## OCSVM

In [59]:
# One Class SVM
# F1, precision_recall_auc, roc_auc
# every test dataset -> ratio 1:1, 2:1, ..., 50:1, original (total 7 test dataset)
# generate (n_iter x 22) dataframe

from sklearn.svm import OneClassSVM

ocsvm_result = {'Elapsed_Time': [],
                'F1_1:1': [], 'PRE_REC_AUC_1:1': [], 'ROC_AUC_1:1': [],
                'F1_2:1': [], 'PRE_REC_AUC_2:1': [], 'ROC_AUC_2:1': [],
                'F1_5:1': [], 'PRE_REC_AUC_5:1': [], 'ROC_AUC_5:1': [],
                'F1_10:1': [], 'PRE_REC_AUC_10:1': [], 'ROC_AUC_10:1': [],
                'F1_20:1': [], 'PRE_REC_AUC_20:1': [], 'ROC_AUC_20:1': [],
                'F1_50:1': [], 'PRE_REC_AUC_50:1': [], 'ROC_AUC_50:1': [],
                'F1_all': [], 'PRE_REC_AUC_all': [], 'ROC_AUC_all': [],
                }

# The raw data does not depend on the seed, so it is loaded once instead of
# being re-read from disk on each of the 50 iterations. train_test_split
# hands back newly indexed frames, so the per-seed scaling below does not
# write into `data`.
data = pd.read_csv('./creditcard.csv')
data.drop('Time', axis=1, inplace=True)

# repeat: n_iter(50)
for seed in tqdm(range(50)):
    X_train, X_test = train_test_split(data, stratify=data.Class, test_size=0.25, random_state=seed)

    # Fit the scaler on the training split only, then apply it to the test split.
    ss = StandardScaler()
    X_train['Amount'] = ss.fit_transform(X_train['Amount'].values.reshape(-1, 1))
    X_test['Amount'] = ss.transform(X_test['Amount'].values.reshape(-1, 1))

    y_train = X_train.Class
    X_train = X_train.drop('Class', axis=1)

    test_list = make_test_dataset(X_test, seed)

    # Elapsed_Time covers fitting plus prediction on all seven test sets.
    start_time = time.time()

    ocsvm = OneClassSVM(kernel='rbf', nu=0.01)
    # train with normal data
    ocsvm.fit(X_train[y_train==0])

    ocsvm_result = test_prediction_replace(ocsvm, test_list, ocsvm_result)

    end_time = time.time()
    ocsvm_result['Elapsed_Time'].append(end_time - start_time)

ocsvm_df = pd.DataFrame(ocsvm_result)

100%|██████████| 50/50 [1:33:52<00:00, 112.66s/it]


In [60]:
# Preview the first five iterations of the One-Class SVM results.
ocsvm_df.head(n=5)

Unnamed: 0,Elapsed_Time,F1_1:1,PRE_REC_AUC_1:1,ROC_AUC_1:1,F1_2:1,PRE_REC_AUC_2:1,ROC_AUC_2:1,F1_5:1,PRE_REC_AUC_5:1,ROC_AUC_5:1,...,ROC_AUC_10:1,F1_20:1,PRE_REC_AUC_20:1,ROC_AUC_20:1,F1_50:1,PRE_REC_AUC_50:1,ROC_AUC_50:1,F1_all,PRE_REC_AUC_all,ROC_AUC_all
0,115.599591,0.893805,0.945576,0.902439,0.882096,0.916794,0.900407,0.863248,0.880429,0.902439,...,0.904472,0.795276,0.800324,0.904472,0.660131,0.688279,0.903902,0.186691,0.463383,0.904534
1,110.62196,0.932773,0.958218,0.934959,0.92887,0.945928,0.941057,0.91358,0.92185,0.943902,...,0.943089,0.825279,0.833679,0.944106,0.713826,0.747389,0.944959,0.208451,0.510221,0.945374
2,110.645132,0.925764,0.965447,0.930894,0.925764,0.95393,0.930894,0.917749,0.933153,0.929268,...,0.927642,0.837945,0.841877,0.926016,0.731034,0.749615,0.925935,0.198131,0.48698,0.924978
3,109.392602,0.925764,0.965447,0.930894,0.909871,0.935748,0.922764,0.894515,0.907324,0.92439,...,0.92561,0.84127,0.845038,0.92622,0.754448,0.767692,0.926667,0.205825,0.489448,0.92526
4,111.895552,0.908297,0.951948,0.914634,0.908297,0.939076,0.918699,0.885106,0.899923,0.91626,...,0.915447,0.815686,0.820382,0.917073,0.69103,0.716413,0.916748,0.193669,0.477577,0.916806


In [61]:
# Persist the per-iteration One-Class SVM metrics (the row index is written too).
ocsvm_df.to_csv(path_or_buf='ocsvm_iter_test.csv')

## Isolation Forest

In [62]:
# Isolation Forest
# F1, precision_recall_auc, roc_auc
# every test dataset -> ratio 1:1, 2:1, ..., 50:1, original (total 7 test dataset)
# generate (n_iter x 22) dataframe

from sklearn.ensemble import IsolationForest

iso_result = {'Elapsed_Time': [],
              'F1_1:1': [], 'PRE_REC_AUC_1:1': [], 'ROC_AUC_1:1': [],
              'F1_2:1': [], 'PRE_REC_AUC_2:1': [], 'ROC_AUC_2:1': [],
              'F1_5:1': [], 'PRE_REC_AUC_5:1': [], 'ROC_AUC_5:1': [],
              'F1_10:1': [], 'PRE_REC_AUC_10:1': [], 'ROC_AUC_10:1': [],
              'F1_20:1': [], 'PRE_REC_AUC_20:1': [], 'ROC_AUC_20:1': [],
              'F1_50:1': [], 'PRE_REC_AUC_50:1': [], 'ROC_AUC_50:1': [],
              'F1_all': [], 'PRE_REC_AUC_all': [], 'ROC_AUC_all': [],
              }

# The raw data does not depend on the seed, so it is loaded once instead of
# being re-read from disk on each of the 50 iterations. train_test_split
# hands back newly indexed frames, so the per-seed scaling below does not
# write into `data`.
data = pd.read_csv('./creditcard.csv')
data.drop('Time', axis=1, inplace=True)

# repeat: n_iter(50)
for seed in tqdm(range(50)):
    X_train, X_test = train_test_split(data, stratify=data.Class, test_size=0.25, random_state=seed)

    # Fit the scaler on the training split only, then apply it to the test split.
    ss = StandardScaler()
    X_train['Amount'] = ss.fit_transform(X_train['Amount'].values.reshape(-1, 1))
    X_test['Amount'] = ss.transform(X_test['Amount'].values.reshape(-1, 1))

    y_train = X_train.Class
    X_train = X_train.drop('Class', axis=1)

    test_list = make_test_dataset(X_test, seed)

    # Elapsed_Time covers fitting plus prediction on all seven test sets.
    start_time = time.time()

    iso = IsolationForest(n_jobs=-1, random_state=seed)
    # train with normal data
    iso.fit(X_train[y_train==0])

    iso_result = test_prediction_replace(iso, test_list, iso_result)

    end_time = time.time()
    iso_result['Elapsed_Time'].append(end_time - start_time)

iso_df = pd.DataFrame(iso_result)

100%|██████████| 50/50 [02:38<00:00,  3.16s/it]


In [63]:
# Preview the first five iterations of the Isolation Forest results.
iso_df.head(n=5)

Unnamed: 0,Elapsed_Time,F1_1:1,PRE_REC_AUC_1:1,ROC_AUC_1:1,F1_2:1,PRE_REC_AUC_2:1,ROC_AUC_2:1,F1_5:1,PRE_REC_AUC_5:1,ROC_AUC_5:1,...,ROC_AUC_10:1,F1_20:1,PRE_REC_AUC_20:1,ROC_AUC_20:1,F1_50:1,PRE_REC_AUC_50:1,ROC_AUC_50:1,F1_all,PRE_REC_AUC_all,ROC_AUC_all
0,0.942671,0.872247,0.927181,0.882114,0.842553,0.876924,0.876016,0.767442,0.785366,0.873171,...,0.880488,0.616822,0.657085,0.882317,0.439024,0.555267,0.883821,0.071635,0.42135,0.884558
1,0.909865,0.921162,0.945949,0.922764,0.917355,0.933866,0.934959,0.880952,0.889582,0.936585,...,0.933333,0.695925,0.736706,0.933943,0.493333,0.621901,0.933659,0.080029,0.472239,0.933352
2,0.918502,0.913793,0.951686,0.918699,0.902128,0.927144,0.918699,0.837945,0.850104,0.911382,...,0.913008,0.652308,0.696561,0.911382,0.45987,0.589054,0.912033,0.069033,0.448992,0.910902
3,0.890618,0.909871,0.947265,0.914634,0.887029,0.910826,0.910569,0.844622,0.856474,0.913008,...,0.913821,0.695082,0.725394,0.915447,0.497653,0.607167,0.914878,0.074491,0.450478,0.912485
4,0.907999,0.892704,0.934109,0.898374,0.866667,0.892954,0.896341,0.8,0.815199,0.895935,...,0.899187,0.64,0.683868,0.902846,0.425358,0.566355,0.901463,0.069218,0.440941,0.903223


In [64]:
# Persist the per-iteration Isolation Forest metrics (the row index is written too).
iso_df.to_csv(path_or_buf='iso_iter_test.csv')