<a href="https://colab.research.google.com/github/Lee-Gunju/AI-paper-code-review-for-personal-project/blob/master/Imbalanced_classification_credit_card_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import numpy as np

In [None]:
# Path of the credit-card transactions CSV that the loading cell opens.
# NOTE(review): "/content" looks like the Colab runtime directory this notebook
# links to - adjust the path when running somewhere else.
fname = "/content/creditcard.csv"

In [None]:
# Parse the CSV into two parallel lists: every column but the last is a float
# feature, the last column is the integer class label.
all_features = []
all_targets = []

# newline="" is what the csv module expects so it can handle line endings itself.
with open(fname, newline="") as f:
  # Echo the raw header line (quotes included) before handing the rest of the
  # file to the csv parser.
  header_line = next(f, None)
  if header_line is not None:
    print('header', header_line.strip())

  # csv.reader removes the double quotes around quoted fields, so no manual
  # quote stripping is needed.
  for row in csv.reader(f):
    # Skip blank / whitespace-only lines (e.g. a trailing newline at EOF);
    # they would otherwise fail in int('').
    if not any(field.strip() for field in row):
      continue
    all_features.append([float(v) for v in row[:-1]])
    all_targets.append([int(row[-1])])
    if len(all_features) == 1:
      print('example feature', all_features[-1])


# float32 features and uint8 labels; targets keep a trailing axis of size 1
# because each label was appended as a one-element list.
features = np.array(all_features, dtype='float32')
targets = np.array(all_targets, dtype='uint8')
print("features.shape:", features.shape)
print("targets.shape:", targets.shape)

header "Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount","Class"
example feature [0.0, -1.3598071336738, -0.0727811733098497, 2.53634673796914, 1.37815522427443, -0.338320769942518, 0.462387777762292, 0.239598554061257, 0.0986979012610507, 0.363786969611213, 0.0907941719789316, -0.551599533260813, -0.617800855762348, -0.991389847235408, -0.311169353699879, 1.46817697209427, -0.470400525259478, 0.207971241929242, 0.0257905801985591, 0.403992960255733, 0.251412098239705, -0.018306777944153, 0.277837575558899, -0.110473910188767, 0.0669280749146731, 0.128539358273528, -0.189114843888824, 0.133558376740387, -0.0210530534538215, 149.62]
features.shape: (284807, 30)
targets.shape: (284807, 1)


In [None]:
# Display the first parsed row of `features` as a quick sanity check.
features[0]

array([ 0.0000000e+00, -1.3598071e+00, -7.2781175e-02,  2.5363467e+00,
        1.3781552e+00, -3.3832076e-01,  4.6238777e-01,  2.3959856e-01,
        9.8697901e-02,  3.6378697e-01,  9.0794168e-02, -5.5159956e-01,
       -6.1780083e-01, -9.9138987e-01, -3.1116936e-01,  1.4681770e+00,
       -4.7040051e-01,  2.0797125e-01,  2.5790580e-02,  4.0399295e-01,
        2.5141209e-01, -1.8306779e-02,  2.7783757e-01, -1.1047391e-01,
        6.6928074e-02,  1.2853935e-01, -1.8911484e-01,  1.3355838e-01,
       -2.1053053e-02,  1.4962000e+02], dtype=float32)

In [None]:
# Hold out the last 20% of the rows (in file order, no shuffling) for validation.
num_val_samples = int(len(features) * 0.2)

# Split on an explicit index instead of negative slicing: when num_val_samples
# is 0, `features[:-0]` would be empty and `features[-0:]` would be everything.
split_at = len(features) - num_val_samples

# Copy the slices so later in-place edits to the splits do not write through to
# `features` / `targets`; re-running this cell then always starts from raw data.
train_features = features[:split_at].copy()
train_targets = targets[:split_at].copy()
val_features = features[split_at:].copy()
val_targets = targets[split_at:].copy()


print("Number of training samples:", len(train_features))
print("Number of validation samples:", len(val_features))

Number of training samples: 227846
Number of validation samples: 56961


In [None]:
# Number of training rows per class label (index = label value).
# minlength=2 keeps counts[1] addressable even when no positive row is present.
counts = np.bincount(train_targets[:, 0], minlength=2)

# Inverse-frequency weights are undefined if either class is missing; fail with
# a clear message instead of an IndexError or a divide-by-zero weight.
if counts[0] == 0 or counts[1] == 0:
    raise ValueError(
        "Training split must contain both classes to compute class weights; "
        "got class counts {}".format(counts.tolist())
    )

print(
    "Number of positive samples in training data: {} ({:.2f}% of total)".format(
        counts[1], 100 * float(counts[1]) / len(train_targets)
    )
)

# Weight each class by the inverse of its frequency so the rare class counts
# proportionally more per sample.
weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]

Number of positive samples in training data: 417 (0.18% of total)


In [None]:
# Column-wise mean over the training rows only.
mean = train_features.mean(axis=0)

In [None]:
# Center the training features on the training mean.
# This is an in-place update: running the cell a second time subtracts `mean` again.
train_features -= mean

In [None]:
# Shift the validation features by the *training* mean (no validation statistics are used).
# In-place update: running the cell a second time subtracts `mean` again.
val_features -= mean

In [None]:
# Column-wise standard deviation over the training rows only.
std = np.std(train_features, axis=0)
# A constant column has std == 0 and would turn into NaN/inf when used as a
# divisor; use 1 for such columns so they are left unscaled.
std = np.where(std == 0, 1, std).astype(std.dtype, copy=False)

In [None]:
# Scale both splits by the training standard deviation.
# Both divisions are in place: running the cell a second time divides by `std` again.
train_features /= std 
val_features /= std

In [None]:
from tensorflow import keras

In [None]:
# Binary classifier: three 256-unit ReLU layers, dropout (rate 0.3) after the
# second and third, and a single sigmoid output unit.
model = keras.Sequential()

# The first layer declares the input width: one value per feature column.
model.add(
    keras.layers.Dense(256, activation='relu', input_shape=(train_features.shape[-1],))
)
model.add(keras.layers.Dense(256, activation='relu'))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(256, activation="relu"))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(1, activation='sigmoid'))


model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               7936      
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 139,777
Trainable params: 139,777
Non-trainable params: 0
__________________________________________________

In [None]:
# Track the confusion-matrix counts plus precision/recall next to accuracy:
# with positives far below 1% of the training rows, accuracy alone says little.
metrics = [
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
    keras.metrics.BinaryAccuracy(name = 'binary_acc')
]

# Adam with learning rate 1e-2 and binary cross-entropy on the sigmoid output.
model.compile(optimizer=keras.optimizers.Adam(1e-2), loss = 'binary_crossentropy', metrics=metrics)

# No save_best_only here, so one .h5 file is written per epoch
# ({epoch} is filled in by the callback).
callbacks = [keras.callbacks.ModelCheckpoint('fraud_model_at_epoch_{epoch}.h5')]

# Per-class loss weights. Keras applies class_weight to the training loss only,
# so the logged `loss` and `val_loss` are on different scales and should not be
# compared directly.
class_weight = {0: weight_for_0, 1: weight_for_1}

# Keep the returned History object so the per-epoch metrics can be inspected or
# plotted afterwards instead of being discarded as a bare cell output.
history = model.fit(
    train_features,
    train_targets,
    batch_size=2048,
    epochs=30,
    verbose=2,
    callbacks=callbacks,
    validation_data=(val_features, val_targets),
    class_weight=class_weight,
)

Epoch 1/30
112/112 - 3s - loss: 2.4459e-07 - fn: 6.0000 - fp: 2015.0000 - tn: 225414.0000 - tp: 411.0000 - precision: 0.1694 - recall: 0.9856 - binary_acc: 0.9911 - val_loss: 0.0188 - val_fn: 10.0000 - val_fp: 309.0000 - val_tn: 56577.0000 - val_tp: 65.0000 - val_precision: 0.1738 - val_recall: 0.8667 - val_binary_acc: 0.9944
Epoch 2/30
112/112 - 1s - loss: 1.9414e-07 - fn: 1.0000 - fp: 1682.0000 - tn: 225747.0000 - tp: 416.0000 - precision: 0.1983 - recall: 0.9976 - binary_acc: 0.9926 - val_loss: 0.0353 - val_fn: 10.0000 - val_fp: 297.0000 - val_tn: 56589.0000 - val_tp: 65.0000 - val_precision: 0.1796 - val_recall: 0.8667 - val_binary_acc: 0.9946
Epoch 3/30
112/112 - 1s - loss: 2.1050e-07 - fn: 1.0000 - fp: 1489.0000 - tn: 225940.0000 - tp: 416.0000 - precision: 0.2184 - recall: 0.9976 - binary_acc: 0.9935 - val_loss: 0.1119 - val_fn: 8.0000 - val_fp: 730.0000 - val_tn: 56156.0000 - val_tp: 67.0000 - val_precision: 0.0841 - val_recall: 0.8933 - val_binary_acc: 0.9870
Epoch 4/30
112/11

<tensorflow.python.keras.callbacks.History at 0x7f6994393d10>

In [None]:
# Score the final model on the validation split. return_dict=True labels each
# value with its metric name (loss, fn, fp, tn, tp, precision, recall,
# binary_acc) instead of returning an unlabeled list in compile order.
model.evaluate(x=val_features, y=val_targets, return_dict=True)



[0.010843335650861263,
 13.0,
 137.0,
 56749.0,
 62.0,
 0.311557799577713,
 0.8266666531562805,
 0.9973666071891785]