# Credit Card Fraud Detection

In [2]:
import pandas as pd
import numpy as np

# Import and store dataset
credit_card_data = pd.read_csv('CSV_Files/creditcard.csv')

np.__version__
#print(credit_card_data)

# Splitting data into 4 sets
# 1. Shuffle/randomize data
# 2. One-hot encoding
# 3. Normalize
# 4. Splitting up X/y values
# 5. Convert data_frames to numpy arrays (float32)
# 6. Splitting the final data into X/y train/test

# Shuffle and randomize data
shuffled_data = credit_card_data.sample(frac=1)
# Change Class column into Class_0 ([1 0] for legit data) and Class_1 ([0 1] for fraudulent data)
one_hot_data = pd.get_dummies(shuffled_data, columns=['Class'])
# Change all values into numbers between 0 and 1
normalized_data = (one_hot_data - one_hot_data.min()) / (one_hot_data.max() - one_hot_data.min())
# Store just columns V1 through V28 in df_X and columns Class_0 and Class_1 in df_y
df_X = normalized_data.drop(['Class_0', 'Class_1'], axis=1)
df_y = normalized_data[['Class_0', 'Class_1']]
# Convert both data_frames into np arrays of float32
ar_X, ar_y = np.asarray(df_X.values, dtype='float32'), np.asarray(df_y.values, dtype='float32')
# Allocate first 80% of data into training data and remaining 20% into testing data
train_size = int(0.8 * len(ar_X))
(raw_X_train, raw_y_train) = (ar_X[:train_size], ar_y[:train_size])
(raw_X_test, raw_y_test) = (ar_X[train_size:], ar_y[train_size:])

# Gets a percent of fraud vs legit transactions (0.0017% of transactions are fraudulent)
count_legit, count_fraud = np.unique(credit_card_data['Class'], return_counts=True)[1]
fraud_ratio = float(count_fraud / (count_legit + count_fraud))
print('Percent of fraudulent transactions: ', fraud_ratio)

#Applies a logit weighting of 578 (1/0.0017) to fraudulent transactions to cause model to pay more attention to them
weighting = 1 / fraud_ratio
raw_y_train[:, 1] = raw_y_train[:, 1] * weighting

import tensorflow.compat.v1 as tf

# tf.disable_v2_behavior()
tf.compat.v1.disable_eager_execution()

# 30 cells for the input
input_dimensions = ar_X.shape[1]
# 2 cells for the output
output_dimensions = ar_y.shape[1]
# 100 cells for the 1st layer
num_layer_1_cells = 100
# 150 cells for the second layer
num_layer_2_cells = 150

# We will use these as inputs to the model when it comes time to train it (assign values at run time)
X_train_node = tf.placeholder(tf.float32, [None, input_dimensions], name='X_train')
y_train_node = tf.placeholder(tf.float32, [None, output_dimensions], name='y_train')

# We will use these as inputs to the model once it comes time to test it
X_test_node = tf.constant(raw_X_test, name='X_test')
y_test_node = tf.constant(raw_y_test, name='y_test')

# First layer takes in input and passes output to 2nd layer
weight_1_node = tf.Variable(tf.zeros([input_dimensions, num_layer_1_cells]), name='weight_1')
biases_1_node = tf.Variable(tf.zeros([num_layer_1_cells]), name='biases_1')

# Second layer takes in input from 1st layer and passes output to 3rd layer
weight_2_node = tf.Variable(tf.zeros([num_layer_1_cells, num_layer_2_cells]), name='weight_2')
biases_2_node = tf.Variable(tf.zeros([num_layer_2_cells]), name='biases_2')

# Third layer takes in input from 2nd layer and outputs [1 0] or [0 1] depending on fraud vs legit
weight_3_node = tf.Variable(tf.zeros([num_layer_2_cells, output_dimensions]), name='weight_3')
biases_3_node = tf.Variable(tf.zeros([output_dimensions]), name='biases_3')


# Function to run an input tensor through the 3 layers and output a tensor that will give us a fraud/legit result
# Each layer uses a different function to fit lines through the data and predict whether a given input tensor will
# result in a fraudulent or legitimate transaction
def network(input_tensor):
    # Sigmoid fits modified data well
    layer1 = tf.nn.sigmoid(tf.matmul(input_tensor, weight_1_node) + biases_1_node)
    # Dropout prevents model from becoming lazy and over confident
    layer2 = tf.nn.dropout(tf.nn.sigmoid(tf.matmul(layer1, weight_2_node) + biases_2_node), 0.85)
    # Softmax works very well with one hot encoding which is how results are outputted
    layer3 = tf.nn.softmax(tf.matmul(layer2, weight_3_node) + biases_3_node)
    return layer3


# Used to predict what results will be given training or testing input data
# Remember, X_train_node is just a placeholder for now. We will enter values at run time
y_train_prediction = network(X_train_node)
y_test_prediction = network(X_test_node)

# Cross entropy loss function measures differences between actual output and predicted output
cross_entropy = tf.losses.softmax_cross_entropy(y_train_node, y_train_prediction)

# Adam optimizer function will try to minimize loss (cross_entropy) but changing the 3 layers' variable values at a
# learning rate of 0.005
optimizer = tf.train.AdamOptimizer(0.005).minimize(cross_entropy)


# Function to calculate the accuracy of the actual result vs the predicted result
def calculate_accuracy(actual, predicted):
    actual = np.argmax(actual, 1)
    predicted = np.argmax(predicted, 1)
    return (100 * np.sum(np.equal(predicted, actual)) / predicted.shape[0])

num_epochs = 100

import time

with tf.Session() as session:
    tf.global_variables_initializer().run()
    for epoch in range(num_epochs):

        start_time = time.time()

        _, cross_entropy_score = session.run([optimizer, cross_entropy],
                                             feed_dict={X_train_node: raw_X_train, y_train_node: raw_y_train})
#         print(tf.size(X_train_node))

        if epoch % 10 == 0:
            timer = time.time() - start_time

            print('Epoch: {}'.format(epoch), 'Current loss: {0:.4f}'.format(cross_entropy_score),
                  'Elapsed time: {0:.2f} seconds'.format(timer))

            final_y_test = y_test_node.eval()
            final_y_test_prediction = y_test_prediction.eval()
            final_accuracy = calculate_accuracy(final_y_test, final_y_test_prediction)
            print("Current accuracy: {0:.2f}%".format(final_accuracy))

    final_y_test = y_test_node.eval()
    final_y_test_prediction = y_test_prediction.eval()
    final_accuracy = calculate_accuracy(final_y_test, final_y_test_prediction)
    print("Final accuracy: {0:.2f}%".format(final_accuracy))

final_fraud_y_test = final_y_test[final_y_test[:, 1] == 1]
final_fraud_y_test_prediction = final_y_test_prediction[final_y_test[:, 1] == 1]
final_fraud_accuracy = calculate_accuracy(final_fraud_y_test, final_fraud_y_test_prediction)
print('Final fraud specific accuracy: {0:.2f}%'.format(final_fraud_accuracy))


Percent of fraudulent transactions:  0.001727485630620034
Epoch: 0 Current loss: 1.3700 Elapsed time: 0.34 seconds
Current accuracy: 99.81%
Epoch: 10 Current loss: 1.3677 Elapsed time: 0.26 seconds
Current accuracy: 12.03%
Epoch: 20 Current loss: 1.3352 Elapsed time: 0.27 seconds
Current accuracy: 73.74%
Epoch: 30 Current loss: 1.1917 Elapsed time: 0.26 seconds
Current accuracy: 98.25%
Epoch: 40 Current loss: 1.0067 Elapsed time: 0.26 seconds
Current accuracy: 98.73%
Epoch: 50 Current loss: 0.9020 Elapsed time: 0.26 seconds
Current accuracy: 99.29%
Epoch: 60 Current loss: 0.8476 Elapsed time: 0.27 seconds
Current accuracy: 99.80%
Epoch: 70 Current loss: 0.8249 Elapsed time: 0.28 seconds
Current accuracy: 99.81%
Epoch: 80 Current loss: 0.8062 Elapsed time: 0.28 seconds
Current accuracy: 99.61%
Epoch: 90 Current loss: 0.7980 Elapsed time: 0.27 seconds
Current accuracy: 99.41%
Final accuracy: 99.82%
Final fraud specific accuracy: 80.37%


## Basic Outputs

In [4]:
credit_card_data.sample(frac=1) # Shuffled Data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
27260,34468.0,-0.707406,-0.320654,2.116384,-3.817880,-1.037899,-1.157513,0.247264,-0.277725,0.578492,...,-0.109384,0.373900,-0.317614,0.576847,0.418381,-0.773982,-0.163617,-0.142157,22.75,0
149724,91805.0,2.112087,0.140567,-1.512163,0.631141,0.328237,-1.273783,0.416259,-0.543936,1.658423,...,-0.197056,-0.085191,0.073871,-0.106779,0.181114,0.529405,-0.121911,-0.085276,0.00,0
160405,113344.0,-2.018896,-0.865235,0.208403,1.597729,1.378682,-1.158660,1.424418,-0.136448,-0.895810,...,0.117797,-0.602548,0.563987,-0.280433,0.954904,-0.428603,-0.093968,0.136336,325.10,0
241244,150946.0,2.115738,-0.024751,-1.366591,0.233706,0.278598,-0.768632,0.211223,-0.322503,0.498047,...,-0.331499,-0.786182,0.239725,-0.698616,-0.176785,0.240468,-0.067615,-0.066216,1.79,0
256624,157780.0,-1.000951,0.092664,0.993434,-2.372935,-0.828564,0.447452,-0.030684,0.229574,-0.716573,...,0.574577,1.698488,-0.245973,0.786508,-0.233665,-0.224636,-0.465867,-0.145249,99.63,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13595,24083.0,1.148494,0.261323,0.819129,1.482143,-0.505666,-0.637016,-0.166071,-0.212335,1.438847,...,0.041669,0.357511,-0.146462,0.356061,0.590253,-0.265012,0.002858,0.027749,40.30,0
65098,51427.0,-0.814584,-0.562000,2.249918,-1.626980,-0.887820,0.214793,-0.217454,0.311487,-0.651560,...,0.170801,0.318908,-0.022815,0.003637,0.137963,-0.404086,0.064765,0.096780,95.58,0
37717,39076.0,1.117749,-1.544665,-0.145357,-1.307482,-1.392833,-0.630833,-0.563659,-0.038716,-2.173892,...,-0.334155,-1.108220,0.070461,0.115485,0.078726,-0.460844,-0.029143,0.029028,183.19,0
10680,18061.0,-1.171358,0.779933,2.452124,-1.698950,-0.247130,-0.227596,0.527284,0.035597,1.782288,...,-0.240604,-0.310657,-0.290945,-0.083429,0.257523,0.455880,0.116261,-0.082435,39.99,0


In [5]:
pd.get_dummies(shuffled_data, columns=['Class']) #One-Hot-Data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class_0,Class_1
122673,76648.0,-0.839197,0.984687,-0.316741,0.223696,2.211161,3.862391,-0.202664,1.315757,-0.708427,...,-0.236144,-0.239256,1.019107,0.366559,-0.215970,-0.118382,0.037242,22.58,1,0
61920,50049.0,1.256199,-0.015231,-1.469229,-0.093911,2.128520,3.459439,-0.660356,0.902599,0.461278,...,-0.584964,-0.130329,0.941959,0.706134,0.426656,0.003528,0.036224,12.31,1,0
22377,32215.0,1.301938,0.254550,-0.313735,0.318508,0.495277,0.201020,-0.012140,0.021446,0.119586,...,-0.973633,-0.019085,-1.367371,0.329302,0.210307,-0.009191,0.014162,6.88,1,0
171760,120800.0,-1.692418,-1.685046,2.101802,-1.841842,0.864505,-0.534625,-0.976574,0.302474,-0.301231,...,-0.238390,-0.031030,-0.776897,0.656432,-0.279035,-0.016226,0.079810,56.00,1,0
114928,73678.0,-0.820922,0.572614,2.047231,0.025542,0.436528,-0.540911,0.282369,0.093850,-0.187276,...,-0.095015,-0.153644,-0.158692,0.020820,-0.623517,0.105864,0.124468,2.50,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181439,124973.0,-0.076355,0.536324,0.266222,-0.546938,-0.061743,-0.044964,-0.208108,0.487463,0.353722,...,0.972050,0.025150,0.553081,-0.666679,0.461973,-0.050401,-0.010450,11.53,1,0
61516,49875.0,-1.343802,0.139507,2.525678,0.029761,-0.255481,1.038981,-0.179687,0.173212,1.224492,...,0.506282,-0.774488,-0.404616,0.340409,0.640504,-0.871851,-0.786528,22.45,1,0
228958,145735.0,-0.497182,0.324551,0.289849,-0.786302,1.124324,-1.019470,0.723914,-0.041791,-0.258870,...,1.001613,-0.305009,-0.337627,-0.054100,-0.181593,0.122473,0.161939,4.51,1,0
176573,122826.0,-0.878073,-0.351656,-1.440820,-4.729482,0.330640,0.245026,1.101921,0.448696,-0.301924,...,0.676822,0.263636,-1.652350,-0.170024,-0.907102,0.331618,0.188944,188.53,1,0


In [6]:
normalized_data = (one_hot_data - one_hot_data.min()) / (one_hot_data.max() - one_hot_data.min()) 
print(normalized_data)

            Time        V1        V2        V3        V4        V5        V6  \
136759  0.473714  0.947309  0.780322  0.859819  0.253685  0.765745  0.255759   
24484   0.192474  0.978475  0.768679  0.834708  0.305529  0.765228  0.262739   
201143  0.774017  0.994720  0.771342  0.790368  0.266944  0.772690  0.248850   
233778  0.854779  0.993078  0.766157  0.818733  0.270931  0.764409  0.250685   
124583  0.447648  0.977634  0.766312  0.846188  0.282300  0.763132  0.263698   
...          ...       ...       ...       ...       ...       ...       ...   
86596   0.354768  0.976638  0.762247  0.858729  0.254661  0.758353  0.265210   
55883   0.272999  0.937147  0.780604  0.855048  0.277047  0.761978  0.265951   
244933  0.882807  0.990010  0.762806  0.830222  0.263967  0.763100  0.263062   
46659   0.248131  0.977004  0.748471  0.857386  0.199675  0.754735  0.278050   
102634  0.395203  0.978062  0.767197  0.840158  0.261150  0.761687  0.247913   

              V7        V8        V9  .

In [8]:
one_hot_data.min()

Time         0.000000
V1         -56.407510
V2         -72.715728
V3         -48.325589
V4          -5.683171
V5        -113.743307
V6         -26.160506
V7         -43.557242
V8         -73.216718
V9         -13.434066
V10        -24.588262
V11         -4.797473
V12        -18.683715
V13         -5.791881
V14        -19.214325
V15         -4.498945
V16        -14.129855
V17        -25.162799
V18         -9.498746
V19         -7.213527
V20        -54.497720
V21        -34.830382
V22        -10.933144
V23        -44.807735
V24         -2.836627
V25        -10.295397
V26         -2.604551
V27        -22.565679
V28        -15.430084
Amount       0.000000
Class_0      0.000000
Class_1      0.000000
dtype: float64

In [9]:
df_X = normalized_data.drop(['Class_0', 'Class_1'], axis=1)
print(df_X)

            Time        V1        V2        V3        V4        V5        V6  \
136759  0.473714  0.947309  0.780322  0.859819  0.253685  0.765745  0.255759   
24484   0.192474  0.978475  0.768679  0.834708  0.305529  0.765228  0.262739   
201143  0.774017  0.994720  0.771342  0.790368  0.266944  0.772690  0.248850   
233778  0.854779  0.993078  0.766157  0.818733  0.270931  0.764409  0.250685   
124583  0.447648  0.977634  0.766312  0.846188  0.282300  0.763132  0.263698   
...          ...       ...       ...       ...       ...       ...       ...   
86596   0.354768  0.976638  0.762247  0.858729  0.254661  0.758353  0.265210   
55883   0.272999  0.937147  0.780604  0.855048  0.277047  0.761978  0.265951   
244933  0.882807  0.990010  0.762806  0.830222  0.263967  0.763100  0.263062   
46659   0.248131  0.977004  0.748471  0.857386  0.199675  0.754735  0.278050   
102634  0.395203  0.978062  0.767197  0.840158  0.261150  0.761687  0.247913   

              V7        V8        V9  .

In [10]:
df_y = normalized_data[['Class_0', 'Class_1']]
print(df_y)

        Class_0  Class_1
136759      1.0      0.0
24484       1.0      0.0
201143      1.0      0.0
233778      1.0      0.0
124583      1.0      0.0
...         ...      ...
86596       1.0      0.0
55883       1.0      0.0
244933      1.0      0.0
46659       1.0      0.0
102634      1.0      0.0

[284807 rows x 2 columns]


In [12]:
ar_X, ar_y = np.asarray(df_X.values, dtype='float32'), np.asarray(df_y.values, dtype='float32')
print(ar_X)

[[4.7371405e-01 9.4730937e-01 7.8032184e-01 ... 4.2337692e-01
  3.1627369e-01 1.7476828e-04]
 [1.9247419e-01 9.7847480e-01 7.6867920e-01 ... 4.1704097e-01
  3.1373540e-01 5.1652006e-04]
 [7.7401733e-01 9.9471986e-01 7.7134216e-01 ... 4.1577402e-01
  3.1206670e-01 1.5297091e-04]
 ...
 [8.8280708e-01 9.9000955e-01 7.6280570e-01 ... 4.1695261e-01
  3.1263420e-01 1.9458055e-03]
 [2.4813069e-01 9.7700405e-01 7.4847144e-01 ... 4.1893715e-01
  3.1377369e-01 3.3474548e-03]
 [3.9520347e-01 9.7806156e-01 7.6719695e-01 ... 4.1439626e-01
  3.1345311e-01 2.5678093e-03]]


In [14]:
train_size = int(0.8 * len(ar_X))
print(train_size)

227845


In [15]:
raw_X_train

array([[4.7371405e-01, 9.4730937e-01, 7.8032184e-01, ..., 4.2337692e-01,
        3.1627369e-01, 1.7476828e-04],
       [1.9247419e-01, 9.7847480e-01, 7.6867920e-01, ..., 4.1704097e-01,
        3.1373540e-01, 5.1652006e-04],
       [7.7401733e-01, 9.9471986e-01, 7.7134216e-01, ..., 4.1577402e-01,
        3.1206670e-01, 1.5297091e-04],
       ...,
       [3.2823280e-01, 9.7808051e-01, 7.7085721e-01, ..., 4.1709685e-01,
        3.1358060e-01, 1.5180319e-04],
       [6.9079590e-01, 9.4696945e-01, 7.8320301e-01, ..., 4.2061251e-01,
        3.1591466e-01, 2.0832068e-03],
       [9.1988057e-01, 9.5967406e-01, 7.7880114e-01, ..., 4.1597149e-01,
        3.1342179e-01, 6.1032665e-04]], dtype=float32)

In [16]:
raw_y_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [17]:
raw_X_test

array([[0.49102968, 0.9779506 , 0.7633109 , ..., 0.41786912, 0.313986  ,
        0.00155657],
       [0.35161927, 0.9791837 , 0.75967646, ..., 0.41515136, 0.31334576,
        0.00240005],
       [0.8125145 , 0.9558728 , 0.7764009 , ..., 0.4154647 , 0.31444487,
        0.00324275],
       ...,
       [0.8828071 , 0.99000955, 0.7628057 , ..., 0.4169526 , 0.3126342 ,
        0.00194581],
       [0.2481307 , 0.97700405, 0.74847144, ..., 0.41893715, 0.3137737 ,
        0.00334745],
       [0.39520347, 0.97806156, 0.76719695, ..., 0.41439626, 0.3134531 ,
        0.00256781]], dtype=float32)

In [20]:
raw_y_test

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [21]:
np.unique(credit_card_data['Class'], return_counts=True)[1]

array([284315,    492])

In [22]:
np.unique(credit_card_data['Class'], return_counts=True)

(array([0, 1]), array([284315,    492]))

In [23]:
np.unique(credit_card_data['Class'])

array([0, 1])

In [25]:
weighting = 1 / fraud_ratio
raw_y_train[:, 1] = raw_y_train[:, 1] * weighting
print(weighting)

578.8760162601626


In [26]:
ar_X.shape[1]

30

In [29]:
ar_y.shape[1]

2

In [6]:
x = tf.Variable(tf.zeros([input_dimensions, num_layer_1_cells]), name='weight_1')
y = tf.Variable(tf.zeros([num_layer_1_cells]), name='biases_1')

init = tf.global_variables_initializer()

sess = tf.Session()
sess.run(init)
# your_var = sess.run(x)
your_var = sess.run(y)
print(weight_1_node)

<tf.Variable 'weight_1:0' shape=(30, 100) dtype=float32>


In [5]:
print(weight_1_node)

<tf.Variable 'weight_1:0' shape=(30, 100) dtype=float32>
