<a href="https://colab.research.google.com/github/JHyunjun/SNU/blob/main/data_imbalance_Creditcard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Credit Card Fraud Detection

In [116]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [117]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import StandardScaler

### Load Data

In [118]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/snu/w4/실습자료/hw-fnnae-creditcard/data/creditcard.csv')

In [119]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


### Split dataset into train and test 

In [120]:
fraud = df[df.Class == 1]
normal = df[df.Class == 0]

In [121]:
# To distribute classes evenly in both train and test dataset
X_train = pd.concat([fraud.sample(frac = 0.8, random_state = 0), 
                     normal.sample(frac = 0.8, random_state = 0)],
                     axis = 0)
# Create test data using a non-index of the train data set
X_test = df.loc[~df.index.isin(X_train.index)]

In [122]:
X_train['Class']

177195    1
30384     1
151011    1
46918     1
6717      1
         ..
233352    0
42092     0
26402     0
127718    0
269464    0
Name: Class, Length: 227846, dtype: int64

In [123]:
# Use `shuffle` function from sklearn.utils
# Shuffle the sorted data can increase performances of model
from sklearn.utils import shuffle

X_train = shuffle(X_train, random_state=0)
X_test = shuffle(X_test, random_state=0)

In [124]:
y_train = X_train['Class'].values.astype('float')
y_train = np.stack([1 - y_train, y_train]).T

y_test = X_test['Class'].values.astype('float')
y_test = np.stack([1 - y_test, y_test]).T

In [125]:
print(y_train)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [126]:
# Abnormal Data의 수가 적으므로 Loss값이 커지게 Abnormal Data에 곱해줌(Weight조절)
ratio = len(X_train) / len(X_train[X_train.Class == 1])
y_train[:,1] *= ratio
print(ratio)
print(y_train)

578.2893401015228
[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


### Normalize data

In [127]:
#Class column을 빼내는 부분
fields = ['Class']
X_train = X_train.drop(fields, axis=1)
X_test = X_test.drop(fields, axis=1)

In [128]:
# Data preprocessing

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Define the model

In [129]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

In [135]:
def create_mlp(pkeep=0.5):
  model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation = 'relu'),
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dropout(pkeep),
    tf.keras.layers.Dense(2, activation = 'softmax')                            
  ])
  return model

### Train the model

In [136]:
pkeep = 0.5
batch_size = 256
epochs = 10

In [137]:
model = create_mlp(pkeep)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [138]:
# 0.1을 잘라서 EarlyStopping
# Validation Loss가 7번올라가면 Stop하겠다.
model.fit(X_train, y_train, batch_size = batch_size, epochs = epochs, validation_split = 0.1, callbacks = [EarlyStopping(monitor = "val_loss", patience= 7)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3e30061b90>

### Evaluate the model with other metrics

In [147]:
# Accuracy는 70%대이나 Abnormal때문에 이걸로는 정확한 판단이 어려움
preds = tf.argmax(model.predict(X_test), 1)
labels = tf.argmax(y_test,1)
correct = tf.equal(preds, labels)
tf.math.reduce_mean(tf.cast(correct, dtype = tf.float32))


<tf.Tensor: shape=(), dtype=float32, numpy=0.68624145>

In [143]:
from keras.metrics import Recall
from keras.metrics import Precision

In [144]:
recall = Recall()
precision = Precision()

In [148]:
rec_result = recall(labels,preds).numpy()
pre_result = precision(labels,preds).numpy()

In [150]:
print("recall : ",rec_result*100, "%")
print("Precision : ", pre_result*100, "%")

recall :  96.93877696990967 %
Precision :  0.5288354586809874 %
