## 데이터 불러오기

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

train_df = pd.read_csv('./data/train.csv')
val_df = pd.read_csv('./data/val.csv')
test_df = pd.read_csv('./data/test.csv')

In [2]:
# val_df 의 오염도 계산
# Class=0 정상거래, Class=1 오염거래
val_normal, val_fraud = val_df['Class'].value_counts()
val_contamination = val_fraud / val_normal
print(f'Validation contamination : [{val_contamination}]')

Validation contamination : [0.0010551491277433877]


In [3]:
train_df.drop(columns=['ID']).describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
count,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,...,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0
mean,0.000197,0.001289,0.009717,-0.004169,0.000475,0.005141,0.005769,-0.002451,-0.002107,0.005209,...,-0.001242,4.088347e-07,-0.001317,-0.000884,0.00168,-0.000293,-0.000234,-0.000508,0.927253,0.116232
std,1.95106,1.651064,1.496916,1.412633,1.367533,1.330583,1.204111,1.185504,1.095415,1.071337,...,0.722001,0.7238291,0.636061,0.605854,0.520069,0.480979,0.399505,0.35613,3.412933,0.558161
min,-56.40751,-72.715728,-32.454198,-5.600607,-42.147898,-26.160506,-41.506796,-50.943369,-13.434066,-24.403185,...,-22.75754,-8.887017,-44.807735,-2.824849,-10.295397,-1.855355,-9.895244,-9.617915,-0.307413,-0.994972
25%,-0.923479,-0.595602,-0.883877,-0.853728,-0.689853,-0.766094,-0.552071,-0.209492,-0.647477,-0.533477,...,-0.22971,-0.5402665,-0.16218,-0.355582,-0.31547,-0.32616,-0.070847,-0.053249,-0.23056,-0.360304
50%,0.012074,0.06639,0.183868,-0.019359,-0.05406,-0.272436,0.039036,0.02097,-0.052157,-0.09081,...,-0.030281,0.008345807,-0.012261,0.040573,0.018278,-0.052815,0.001502,0.011158,-0.000699,-0.00259
75%,1.315373,0.801687,1.03712,0.742208,0.614214,0.405285,0.56875,0.328303,0.590705,0.455287,...,0.186001,0.5287508,0.147474,0.438225,0.353989,0.240838,0.091279,0.077851,0.768532,0.640653
max,2.45493,21.467203,4.187811,16.491217,34.801666,23.917837,44.054461,20.007208,10.392889,15.331742,...,27.202839,8.361985,22.528412,4.022866,7.519589,3.119295,11.13574,33.847808,180.101027,1.034951


## 정규화

In [4]:
#StandardScaler : 평균 0, 표준편차 1 기준 정규화
from sklearn.preprocessing import StandardScaler

scaler_std = StandardScaler()
train_df_std = scaler_std.fit_transform(train_df.drop(columns=['ID']))
train_df_std = pd.DataFrame(train_df_std, columns = ['V1', 'V2','V3','V4', 'V5','V6','V7', 'V8','V9','V10', 'V11','V12','V13', 'V14','V15','V16', 'V17','V18','V19', 'V20','V21','V22', 'V23','V24','V25', 'V26','V27','V28', 'V29','V30'])
train_df_std.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
count,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,...,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0
mean,9.98637e-18,-6.990459e-18,3.994548e-17,-3.994548e-17,1.5978190000000002e-17,5.991822e-18,1.5978190000000002e-17,3.994548e-18,3.432815e-18,1.672717e-17,...,-7.489778e-18,-2.995911e-18,-1.5978190000000002e-17,-8.488415e-18,2.3967290000000002e-17,2.546524e-17,8.738074e-18,5.991822e-18,2.9959110000000004e-17,1.597819e-16
std,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,...,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004
min,-28.91143,-44.04272,-21.6873,-3.961724,-30.82086,-19.66489,-34.47585,-42.97002,-12.26203,-22.78321,...,-31.5185,-12.27784,-70.44391,-4.661148,-19.79951,-3.856866,-24.76825,-27.00547,-0.3617625,-1.990839
25%,-0.4734248,-0.3615209,-0.5969594,-0.6014037,-0.5048002,-0.5796248,-0.4632818,-0.1746449,-0.5891582,-0.5028191,...,-0.3164376,-0.7464045,-0.252906,-0.5854532,-0.609826,-0.6775104,-0.1767528,-0.1480942,-0.3392442,-0.8537647
50%,0.006087142,0.03943019,0.11634,-0.01075321,-0.03987825,-0.2086138,0.02762811,0.0197556,-0.04569051,-0.08962586,...,-0.04022034,0.01152957,-0.0172053,0.06842869,0.03191485,-0.1091987,0.004344384,0.03275862,-0.2718939,-0.2128823
75%,0.6740856,0.4847794,0.6863496,0.528361,0.4487943,0.3007295,0.4675513,0.2789995,0.5411779,0.4201101,...,0.2593411,0.7304939,0.2339279,0.7247792,0.6774301,0.5013363,0.2290662,0.2200319,-0.04650593,0.9395568
max,1.258159,13.00132,2.791147,11.6771,25.44826,17.97167,36.58209,16.87868,9.489591,14.30605,...,37.67888,11.55248,35.42087,6.641477,14.45565,6.485947,27.87452,95.04534,52.49871,1.645983


In [5]:
#RobustScaler : 중앙값 0, 사분위수 IQR 기준 정규화
from sklearn.preprocessing import RobustScaler

scaler_robust = RobustScaler()
train_df_robust = scaler_robust.fit_transform(train_df.drop(columns=['ID']))
train_df_robust = pd.DataFrame(train_df_robust, columns = ['V1', 'V2','V3','V4', 'V5','V6','V7', 'V8','V9','V10', 'V11','V12','V13', 'V14','V15','V16', 'V17','V18','V19', 'V20','V21','V22', 'V23','V24','V25', 'V26','V27','V28', 'V29','V30'])
train_df_robust.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
count,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,...,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0,113842.0
mean,-0.005305,-0.04659126,-0.090656,0.009518101,0.04181888,0.236966,-0.02968106,-0.04354864,0.040422,0.09711,...,0.06985382,-0.007807,0.0353412,-0.052226,-0.024793,0.09263181,-0.010705,-0.08898751,0.928795,0.1187084
std,0.871456,1.181619,0.779239,0.885144,1.048668,1.135911,1.074311,2.204379,0.884697,1.083511,...,1.736786,0.677098,2.054098,0.763227,0.77685,0.8482905,2.464164,2.716472,3.416036,0.5576269
min,-25.200226,-52.08808,-16.990171,-3.497162,-32.2789,-22.100505,-37.06731,-94.76536,-10.80771,-24.58865,...,-54.6708,-8.321065,-144.6627,-3.609724,-15.405971,-3.179097,-61.043467,-73.44829,-0.306993,-0.991432
25%,-0.417872,-0.4737693,-0.555828,-0.5228084,-0.4875468,-0.421434,-0.5273877,-0.4285306,-0.480802,-0.447698,...,-0.4797282,-0.513193,-0.4841502,-0.499058,-0.498534,-0.4820913,-0.446251,-0.4912795,-0.23007,-0.3573717
50%,0.0,4.967001e-18,0.0,1.08759e-18,2.660478e-18,0.0,3.095482e-18,-3.225501e-18,0.0,0.0,...,-4.1741780000000004e-18,0.0,2.8019850000000002e-18,0.0,0.0,6.118966e-18,0.0,6.615751e-18,0.0,2.1599339999999999e-19
75%,0.582128,0.5262307,0.444172,0.4771916,0.5124532,0.578566,0.4726123,0.5714694,0.519198,0.552302,...,0.5202718,0.486807,0.5158498,0.500942,0.501466,0.5179087,0.553749,0.5087205,0.76993,0.6426283
max,1.09112,15.31595,2.084305,10.34539,26.72848,20.651105,39.27069,37.1633,8.435793,15.597806,...,65.50973,7.814316,72.79295,5.016704,11.205024,5.594575,68.67636,258.0979,180.265455,1.036549


In [7]:
def get_pred_label(model_pred):
    # IsolationForest 모델 출력은 (1:정상, -1:불량(사기))
    # csv 데이터의 (0:정상, 1:불량(사기))에서 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

## Train

### Local Outlier Factor (LOF)

## Test

In [None]:
test_df = pd.read_csv('./test.csv') # Train
test_df.head()

In [None]:
test_x = test_df.drop(columns=['ID'])

In [None]:
test_pred = model.predict(test_x) # model prediction
test_pred = get_pred_label(test_pred)

## Submission

In [None]:
submit = pd.read_csv('./sample_submission.csv')
submit.head()

In [12]:
submit['Class'] = test_pred
submit.to_csv('./submit.csv', index=False)