# Credit Card Fraud Detection and Prediction Using Autoencoder
This project demonstrates the use of unsupervised training for Credit Card Fraud Detection and Prediction. The main model implemented here is an Autoencoder, which achieves an AUC of 0.96 on the test set.

# Load libraries 

In [None]:
import pandas as pd
import numpy as np
import pickle

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 
import os
from datetime import datetime 
import seaborn as sns

from pylab import rcParams
import sklearn
from sklearn.metrics import roc_auc_score as auc 
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# Notebook-wide plotting defaults: seaborn theme and a square default figure.
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = (10, 10)

# Tick labels used for the two values of the 'Class' column.
LABELS = ["Normal", "Fraud"]

# Data Exploration

The dataset we're going to use can be downloaded from [Kaggle](https://www.kaggle.com/mlg-ulb/creditcardfraud). The dataset contains transactions made by credit cards in September 2013 by European cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.

It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, … V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-sensitive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.

In [None]:
# Read the transactions CSV from the notebook's input directory.
CSV_PATH = '../input/creditcardfraud/creditcard.csv'
data_fdp = pd.read_csv(CSV_PATH)

In [None]:
# True if at least one cell anywhere in the frame is missing.
data_fdp.isnull().values.any()

In [None]:
# Preview the first rows of the frame.
data_fdp.head()

In [None]:
# Summary statistics for the frame's columns.
data_fdp.describe()

In [None]:
# Column labels of the frame.
data_fdp.columns

In [None]:
# Storage dtype of every column.
data_fdp.dtypes

## Class distribution

In [None]:
# Bar chart of the number of rows per value of the 'Class' column.
plt.figure(figsize=(7,7))
# Series.value_counts replaces the deprecated top-level pd.value_counts.
# Sorting by index puts class 0 first and class 1 second, so the positional
# LABELS below always line up with the right bar regardless of which class
# happens to be more frequent.
count_classes = data_fdp['Class'].value_counts().sort_index()
count_classes.plot(kind='bar', rot=0)
plt.title("Transaction class distribution")
plt.xticks(range(2), LABELS)
plt.xlabel("Class")
plt.ylabel("Frequency");

We have a highly imbalanced dataset on our hands. Normal transactions overwhelm the fraudulent ones by a large margin. Let's look at the two types of transactions: 

In [None]:
# One panel per column for the first 13 columns, laid out on a 7x2 grid.
# Each panel overlays the histogram of the Class == 1 rows with that of the
# Class == 0 rows.
plt.figure(figsize=(20, 7 * 7))
gs = gridspec.GridSpec(7, 2)
fraud_mask = data_fdp.Class == 1
normal_mask = data_fdp.Class == 0
for slot, column in enumerate(data_fdp.columns[:13]):
    ax = plt.subplot(gs[slot])
    sns.histplot(data_fdp.loc[fraud_mask, column], bins=50, ax=ax)
    sns.histplot(data_fdp.loc[normal_mask, column], bins=50, ax=ax)
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(column))
plt.show()

In [None]:
# Heat map of the pairwise correlation between all columns; the colour scale
# is capped at 0.8.
fig = plt.figure(figsize=(12, 9))
corrmat = data_fdp.corr()
sns.heatmap(corrmat, vmax=0.8, square=True)
plt.show()

In [None]:
# Split the rows by label: Class == 1 goes to `frauds`, Class == 0 to `normal`.
fraud_rows = data_fdp['Class'] == 1
normal_rows = data_fdp['Class'] == 0
frauds = data_fdp.loc[fraud_rows]
normal = data_fdp.loc[normal_rows]

In [None]:
# Two stacked histograms of transaction Amount, one per class, sharing the
# x axis.
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
f.suptitle('Amount per transaction by class')

bins = 50

for panel, subset, heading in ((ax1, frauds, 'Fraud'), (ax2, normal, 'Normal')):
    panel.hist(subset.Amount, bins=bins)
    panel.set_title(heading)

# The pyplot calls below act on the current axes; the x-limits reach both
# panels because the x axis is shared.
plt.xlabel('Amount ($)')
plt.ylabel('Number of Transactions')
plt.xlim((0, 20000))
plt.yscale('log')
plt.show();

In [None]:
# Two stacked scatter plots of Amount against Time, one per class, sharing
# the x axis.
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
f.suptitle('Time of transaction vs Amount by class')

for panel, subset, heading in ((ax1, frauds, 'Fraud'), (ax2, normal, 'Normal')):
    panel.scatter(subset.Time, subset.Amount)
    panel.set_title(heading)

# Axis labels are set through pyplot, i.e. on the current axes.
plt.xlabel('Time (in Seconds)')
plt.ylabel('Amount')
plt.show()

# Train/test split based on time order
I split the data based on the Time column, using the first 75% as training/validation data and the last 25% as test data. Since this is a very unbalanced data set, this is just to ensure we won’t have a test set that contains too few positive cases (as could happen with, for example, a 90 – 10 split).

Hence, the train_x and test_x will be our data that we will feed to model

In [None]:
# Fraction of rows (the latest ones by 'Time') held out as the test set.
TEST_RATIO = 0.25
# Sort the frame chronologically in place; every positional slice below, and
# any later positional use of data_fdp, sees this order.
data_fdp.sort_values('Time', inplace = True)
# Row position where the training part ends and the test part begins.
TRA_INDEX = int((1-TEST_RATIO) * data_fdp.shape[0])
# Features: columns 1:-2 skip the first column and the last two columns.
# Presumably that removes Time, Amount and Class and keeps V1..V28 -- this
# relies on the CSV column order; confirm against data_fdp.columns.
train_x = data_fdp.iloc[:TRA_INDEX, 1:-2].values
# Labels: the last column.
train_y = data_fdp.iloc[:TRA_INDEX, -1].values

test_x = data_fdp.iloc[TRA_INDEX:, 1:-2].values
test_y = data_fdp.iloc[TRA_INDEX:, -1].values

In [None]:
# Report the size of the training split and how many of its labels are 1.
n_train = train_x.shape[0]
n_train_fraud = np.sum(train_y)
train_summary = "Total train examples: {}, total fraud cases: {}, equal to {:.5f} of total cases. "
print(train_summary.format(n_train, n_train_fraud, n_train_fraud / n_train))

In [None]:
# Report the size of the test split and how many of its labels are 1.
n_test = test_x.shape[0]
n_test_fraud = np.sum(test_y)
test_summary = "Total test examples: {}, total fraud cases: {}, equal to {:.5f} of total cases. "
print(test_summary.format(n_test, n_test_fraud, n_test_fraud / test_y.shape[0]))

# Data Standardization and Activation functions

I have considered two types of standardization here – 
1. z-score 
2. min-max scaling.

### Feature Normalization - z score (used for tanh activation)

**z-score** will normalize each column to have a mean of zero and a standard deviation of one, which is a good choice if we are using an output function like the **tanh activation function**, which outputs values on both sides of zero. Besides, values that are extreme will still keep some of their extremeness after normalization (e.g. remain more than 2 standard deviations away). This might be useful to detect outliers in this case.

In [None]:
# Standardise every feature column in place. The mean and standard deviation
# are measured on the training rows only and then applied to both the
# training and the test rows; the per-column statistics are kept in
# cols_mean / cols_std.
cols_mean = []
cols_std = []
for col_idx in range(train_x.shape[1]):
    mu = train_x[:, col_idx].mean()
    sigma = train_x[:, col_idx].std()
    cols_mean.append(mu)
    cols_std.append(sigma)
    train_x[:, col_idx] = (train_x[:, col_idx] - mu) / sigma
    test_x[:, col_idx] = (test_x[:, col_idx] - mu) / sigma

### Feature Normalization - min max score (used for sigmoid activation)

**min-max scaling** approach will ensure all values to be within 0 - 1, all positive. This is the default approach if we are using **sigmoid activation function** as our output activation.

In [None]:
# NOTE(review): although this cell sits under the min-max heading, the code is
# the same mean/std standardisation as the z-score cell -- no minimum or
# maximum is computed anywhere. It also resets cols_mean / cols_std, so after
# this cell they hold the statistics of whatever train_x contains at this
# point (already-standardised data if the z-score cell ran first), not those
# of the raw features. TODO confirm whether a real min-max scaling was intended.
cols_mean = []
cols_std = []
for c in range(train_x.shape[1]):
    cols_mean.append(train_x[:,c].mean())
    cols_std.append(train_x[:,c].std())
    train_x[:, c] = (train_x[:, c] - cols_mean[-1]) / cols_std[-1]
    test_x[:, c] =  (test_x[:, c] - cols_mean[-1]) / cols_std[-1]

The differences between sigmoid and tanh is shown in the image below (sigmoid will squash the values into range between (0, 1); whereas tanh, or hyperbolic tangent, squash them into (-1, 1)):

![](https://vanishingcodes.files.wordpress.com/2017/06/tanh-and-sigmoid.png)

I have used the validation set to decide on the data standardization approach as well as the activation function. Based on related works, I found tanh to perform better than sigmoid when used together with z-score normalization. Therefore, I chose tanh together with z-score normalization.

# Autoencoder



## Autoencoder as unsupervised learning

An autoencoder is a type of neural network that approximates the function **f(x) = x**. Basically, given an **input x**, the network will learn to **output f(x) that is as close as possible to x**. The **error between output and input (x)** is measured here using the **Mean Squared Error (MSE) – mean((f(x) – x)<sup>2</sup>)** – which is the loss function we try to minimise in our network. (The rest of this notebook loosely calls this quantity "RMSE"; the code computes the mean of the squared differences without taking a square root.)

An autoencoder looks like one below. It follows a typical feed-forward neural networks architecture except that the output layer has exactly same number of neurons as input layer. And it uses the input data itself as its target. Therefore it works in a way of unsupervised learning – learn without predicting an actual label.

The lower part of the network shown below is usually called an ‘encoder’ – whose job is to ’embed’ the input data into a lower dimensional array. The upper part of network, or ‘decoder’, will try to decode the embedding array into the original one.

We can have either one hidden layer, or in the case below, have multiple layers depending on the complexity of our features.

![](https://vanishingcodes.files.wordpress.com/2017/06/stackedae.png)




## Autoencoder for Fraud Detection and Prediction

We rely on autoencoder to ‘learn’ and ‘memorize’ the common patterns that are shared by the majority training data. And during reconstruction, the **RMSE** will be high for the data who do not conform to those patterns. And these are the **‘anomalies’** we are detecting. And hopefully, these ‘anomalies’ are also equal to the **‘fraudulent’** transactions we are after.

### During prediction – 
1. We can select a threshold for RMSE based on validation data and flag all data with RMSE above the threshold as fraudulent. 
2. Alternatively, if we believe 0.1% of all transactions are fraudulent, we can also rank the data based on reconstruction error for each data (i.e. the RMSEs), then select the top 0.1% to be the frauds.

### Evaluation metric – 
We will evaluate our model’s performance using AUC score on test data set.

In [None]:
# Training hyper-parameters for the autoencoder
learning_rate = 0.001
training_epochs = 10
batch_size = 256
display_step = 1  # print a log line every `display_step` epochs

# Network Parameters
n_hidden_1 = 15 # width of the hidden (encoding) layer
#n_hidden_2 = 15 # 2nd layer num features
n_input = train_x.shape[1] # number of feature columns in train_x
data_dir = '.'  # directory the model checkpoints are written to

### Build the Model - (1 hidden layer turned out to be enough)

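The network built below uses a **single hidden layer of 15 neurons**, i.e. the architecture is 28(input) -> 15 -> 28(output). A deeper variant with a second hidden layer (for example 28(input) -> 15 -> 5 -> 15 -> 28(output)) was also considered; its layers are left commented out in the code.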

The activation functions for each layer used is tanh, as I explained earlier. 
The objective function here – or the cost as above – measures the total RMSE of our predicted and input arrays in one batch – which means it’s a scalar. We then run the optimizer every time we want to do a batch update.

However, we also define batch_mse, which returns the reconstruction error for each input row in a batch – a vector whose length equals the number of rows in the input data. 
These will be the predicted values – or fraud scores if you want to call it – for the input (be it training, validation or test data), which we can extract out after prediction.

In [None]:
# Input placeholder: a float matrix with n_input columns and any number of rows.
X = tf.placeholder("float", [None, n_input])

# Trainable parameters of the autoencoder, all drawn from tf.random_normal.
# Only the first encoder/decoder layer is active; the second-layer entries
# are kept as comments.
weights = {
    'encoder_h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    #'encoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'decoder_h1': tf.Variable(tf.random_normal([n_hidden_1, n_input])),
    #'decoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_input])),
}
biases = {
    'encoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),
    #'encoder_b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'decoder_b1': tf.Variable(tf.random_normal([n_input])),
    #'decoder_b2': tf.Variable(tf.random_normal([n_input])),
}


# Building the encoder
def encoder(x):
    """Map input rows to the hidden (embedding) layer.

    Applies one affine transform with weights['encoder_h1'] and
    biases['encoder_b1'], followed by a tanh activation. A second encoder
    layer existed in an earlier version and is not used.

    Args:
        x: tensor that can be matrix-multiplied with weights['encoder_h1'].

    Returns:
        The tanh-activated hidden layer tensor.
    """
    projected = tf.matmul(x, weights['encoder_h1'])
    shifted = tf.add(projected, biases['encoder_b1'])
    return tf.nn.tanh(shifted)


# Building the decoder
def decoder(x):
    """Map hidden-layer activations back to the input space.

    Applies one affine transform with weights['decoder_h1'] and
    biases['decoder_b1'], followed by a tanh activation, so every
    reconstructed value lies in (-1, 1). A second decoder layer existed in
    an earlier version and is not used.

    Args:
        x: tensor that can be matrix-multiplied with weights['decoder_h1'].

    Returns:
        The tanh-activated reconstruction tensor.
    """
    projected = tf.matmul(x, weights['decoder_h1'])
    shifted = tf.add(projected, biases['decoder_b1'])
    return tf.nn.tanh(shifted)

# Wire the model together: the input is encoded, then decoded again.
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

# The reconstruction is the prediction; the target is the input itself.
y_pred = decoder_op
y_true = X

# Element-wise squared reconstruction error, shared by both reductions below.
squared_error = tf.pow(y_true - y_pred, 2)

# One mean squared error per input row (reduction over axis 1).
batch_mse = tf.reduce_mean(squared_error, 1)

# Scalar training loss: mean squared error over every element of the batch.
cost = tf.reduce_mean(squared_error)
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)

## Train and validate the Model

The training code below is straightforward. At every step we randomly sample a mini batch of size 256 from train_x, feed it into the model as the input X, and run the optimizer (RMSProp) to update the parameters.

However, one thing worth highlighting here – we are using the same data for training as well as for validation! This is reflected in the line of: **"# Display logs per epoch step"**

This may seem counter-intuitive in the beginning, but since we are doing unsupervised training and the model never ‘see’ the labels during training, this will not lead to overfitting. This validation process is used for monitoring **‘early stopping’ as well as model hyper-parameter tuning**. 

In [None]:
# Train the autoencoder, log the training-set AUC per epoch, then checkpoint.
save_model = os.path.join(data_dir, 'temp_saved_model_1layer.ckpt')
saver = tf.train.Saver()

# Op that gives every variable its initial value.
init = tf.global_variables_initializer()

with tf.Session() as sess:
    now = datetime.now()
    sess.run(init)

    n_train_rows = train_x.shape[0]
    total_batch = int(n_train_rows / batch_size)

    for epoch in range(training_epochs):
        for step in range(total_batch):
            # Draw batch_size random row indices and take one optimizer step
            # on those rows; `c` keeps the loss of the most recent step.
            batch_idx = np.random.choice(n_train_rows, batch_size)
            _, c = sess.run([optimizer, cost], feed_dict={X: train_x[batch_idx]})

        if epoch % display_step != 0:
            continue

        # Score the whole training set: the per-row reconstruction error is
        # ranked against the labels, which are never fed to the model.
        train_batch_mse = sess.run(batch_mse, feed_dict={X: train_x})
        train_auc = auc(train_y, train_batch_mse)
        print("Epoch:", '%04d' % (epoch+1),
              "cost=", "{:.9f}".format(c),
              "Train auc=", "{:.6f}".format(train_auc),
              "Time elapsed=", "{}".format(datetime.now() - now))

    print("Optimization Finished!")

    save_path = saver.save(sess, save_model)
    print("Model saved in file: %s" % save_path)

### Test model - on later 25% test data

We have finalized our model and hyper-parameters, we can evaluate its performance on our separate test_x data set, which is shown in codes below (test_batch_mse is our fraud scores for test data) :

We obtained an AUC of around **0.947**

In [None]:
# Score the test rows with the trained autoencoder and report the AUC.
save_model = os.path.join(data_dir, 'temp_saved_model_1layer.ckpt')
saver = tf.train.Saver()

with tf.Session() as sess:
    # The variables get their values from the checkpoint, so no initializer
    # op is created or run in this cell.
    saver.restore(sess, save_model)

    # Per-row reconstruction error, used directly as the fraud score.
    test_batch_mse = sess.run(batch_mse, feed_dict={X: test_x})

    print("Test AUC score: {:.6f}".format(auc(test_y, test_batch_mse)))

## Visualize the Prediction Result


In [None]:
# ROC curve of the autoencoder's reconstruction-error scores on the test rows.
# sklearn.metrics.auc is imported under an alias so that it does not rebind
# the notebook-wide name `auc`, which is an alias of roc_auc_score and is
# called with (labels, scores) by the training and test cells.
from sklearn.metrics import roc_curve
from sklearn.metrics import auc as area_under_curve

fpr, tpr, _ = roc_curve(test_y, test_batch_mse)
roc_auc = area_under_curve(fpr, tpr)

plt.plot(fpr, tpr, color='darkorange',
         lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line for a random ranking.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The curve is computed from test_y / test_batch_mse, i.e. the test split.
plt.title('ROC curve on test data set')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Precision-recall curve of the reconstruction-error scores on the test rows.
# precision, recall and th stay in scope for the threshold plots that follow.
from sklearn.metrics import precision_recall_curve

precision, recall, th = precision_recall_curve(test_y, test_batch_mse)

plt.title('Recall vs Precision')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.plot(recall, precision, 'b', label='Precision-Recall curve')
plt.show()

In [None]:
# Precision as a function of the decision threshold.
# precision_recall_curve returns one more precision value than thresholds:
# precision[i] belongs to th[i] and the final entry has no threshold, so the
# trailing element (not the leading one) is dropped to align the two arrays.
plt.plot(th, precision[:-1], 'b', label='Threshold-Precision curve')
plt.title('Precision for different threshold values')
plt.xlabel('Threshold')
plt.ylabel('Precision')
plt.show()

In [None]:
# Recall as a function of the decision threshold.
# precision_recall_curve returns one more recall value than thresholds:
# recall[i] belongs to th[i] and the final entry has no threshold, so the
# trailing element (not the leading one) is dropped to align the two arrays.
plt.plot(th, recall[:-1], 'b', label='Threshold-Recall curve')
plt.title('Recall for different threshold values')
plt.xlabel('Reconstruction error')
plt.ylabel('Recall')
plt.show()

In [None]:
# Overlaid, density-normalised histograms of the fraud score (per-row
# reconstruction error) on the test rows, split by true label. Scores of
# 1000 or more are left out of the plot.
below_cap = test_batch_mse < 1000
# The data plotted here is the test split.
plt.title('Fraud score (mse) distribution of test set')
plt.xlabel('Fraud score')
plt.ylabel('Probabilities')
plt.hist(test_batch_mse[(test_y == 0) & below_cap], bins = 100, color='green', density=True, label='Non-Fraud')
plt.hist(test_batch_mse[(test_y == 1) & below_cap], bins = 100, color='red', density=True, label = 'Fraud')

plt.legend(loc="upper right")
plt.show()

#### 1. Display fraud score (mse) distribution for Fraud cases

In [None]:
# Density-normalised histogram of the fraud score for test rows labelled 1,
# restricted to scores below 500.
fraud_scores = test_batch_mse[(test_y == 1) & (test_batch_mse < 500)]
plt.hist(fraud_scores, bins=100, color='red', density=True)
plt.title('Fraud score (mse) distribution of Fraud cases')
plt.xlabel('Fraud score')
plt.ylabel('Probabilities')
plt.show()

Zoom into (0, 20) range

#### 2. Display fraud score (mse) distribution for Non-fraud cases

In [None]:
# Density-normalised histogram of the fraud score for test rows labelled 0,
# restricted to scores below 500.
normal_scores = test_batch_mse[(test_y == 0) & (test_batch_mse < 500)]
plt.hist(normal_scores, bins=100, color='green', density=True)
plt.title('Fraud score (mse) distribution of non-fraud cases')
plt.xlabel('Fraud score')
plt.ylabel('Probabilities')
plt.show()

## Threshold Test

Threshold Test value = 7

### Observation 

Our precision increased by a factor of 60, from 0.132% to 7.86%. However, the detection precision is still low (below 8%), mainly because the overall percentage of fraud cases is so low. 

In [None]:
# Flag every test row whose fraud score exceeds THRE_TEST and report how
# many of the flagged rows are labelled 1.
THRE_TEST = 7
flagged = test_batch_mse > THRE_TEST
n_flagged = np.sum(flagged)
n_true_pos = np.sum(test_y[flagged])
# Avoid a division by zero when no row exceeds the threshold.
precision_pct = n_true_pos / n_flagged * 100 if n_flagged else 0.0
# Share of rows labelled 1 in the test set, computed rather than hard-coded.
base_rate_pct = np.mean(test_y) * 100
# The threshold and the base rate are interpolated so the message stays
# correct when THRE_TEST or the data changes.
print("Let's, for example, use {} as our detection threshold: \n\
Number of detected cases above treshold: {}, \n\
Number of pos cases only above threshold: {}, \n\
The percentage of accuracy above treshold (Precision): {:0.2f}%. \n\
Compared to the average percentage of fraud in test set: {:0.3f}%".format(
    THRE_TEST, n_flagged, n_true_pos, precision_pct, base_rate_pct))
      

# Build a Binary Classifier that Predicts Fraud 

Using the Autoencoder embedding layers as Model Inputs, we will build a Fully Connected(FC) Feedforward Neural Network Model 

Until now we have covered all the necessary steps to train an autoencoder and make predictions on test data. However, the next thing I will do is ‘pre-train’ our data set from scratch using autoencoder, fetch out the embedding layers, and feed those embeddings to a FC Feedforward Neural Network that will do the task of binary classification.

The rationale is simple – since our autoencoder is able to differentiate between frauds and non-frauds, the lower dimensional features it’s derived (the embedding layer) during training should include some useful latent features that would help the task of fraud classification, or at least, it should speed up classifier’s learning process, compared to letting it adapt to the raw features from scratch.

First, let’s fetch the embedding layer for the whole data set (i.e. both train and test data). It is the tensor op **encoder_op**, whose values we can obtain with **sess.run**.

In [None]:
# Run the trained encoder over the test and training rows to get their
# hidden-layer representations.
save_model = os.path.join(data_dir, 'temp_saved_model_1layer.ckpt')
saver = tf.train.Saver()

with tf.Session() as sess:
    # The variables get their values from the checkpoint, so no initializer
    # op is created or run in this cell.
    saver.restore(sess, save_model)

    test_encoding = sess.run(encoder_op, feed_dict={X: test_x})
    train_encoding = sess.run(encoder_op, feed_dict={X: train_x})

    print("Dim for test_encoding and train_encoding are: \n", test_encoding.shape, '\n', train_encoding.shape)

### Build the graph for FC layers 

Second, we build the graph for FC feed-forward neural network as follows (again, you could use validation to fine tune hyper-parameters such as hidden layer numbers and sizes):

Best hidden size based on validation is found to be 4

In [None]:
# Graph of the binary classifier that consumes the autoencoder embeddings.
# Note that this cell rebinds X, weights, biases, optimizer and n_input,
# which previously referred to the autoencoder's objects.
n_input = test_encoding.shape[1]

hidden_size = 4
output_size = 2

# Inputs: embedding rows and their two-column target rows.
X = tf.placeholder(tf.float32, [None, n_input], name='input_x')
y_ = tf.placeholder(tf.int32, shape=[None, output_size], name='target_y')

weights = {
    'W1': tf.Variable(tf.truncated_normal([n_input, hidden_size])),
    'W2': tf.Variable(tf.truncated_normal([hidden_size, output_size])),
}
biases = {
    'b1': tf.Variable(tf.zeros([hidden_size])),
    'b2': tf.Variable(tf.zeros([output_size])),
}

# One ReLU hidden layer followed by a linear layer producing the logits.
hidden_pre_activation = tf.add(tf.matmul(X, weights['W1']), biases['b1'])
hidden_layer = tf.nn.relu(hidden_pre_activation)
pred_logits = tf.add(tf.matmul(hidden_layer, weights['W2']), biases['b2'])
pred_probs = tf.nn.softmax(pred_logits)

# Mean softmax cross-entropy over the batch, minimised with Adam.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=pred_logits)
cross_entropy = tf.reduce_mean(per_example_loss)

optimizer = tf.train.AdamOptimizer(2e-4).minimize(cross_entropy)


### Prepare the data set. 

We will prepare our data set, and train the model while monitoring its validation scores. 
Here, we will further split our **train_encoding** into **train_enc_x** and **val_enc_x**, each taking up **80**% and **20**% of the previous train_encoding , respectively. 
As a typical supervised training approach, we will use **train_enc_x** as our **training data** and **val_enc_x** as **validation data**.

We will therefore use 80% of our previous training data as our new training set, and the remaining 20% as the new validation set.


In [None]:
n_epochs = 80
batch_size = 256

# PREPARE DATA
VAL_PERC = 0.2

# Two-column one-hot labels for every row of data_fdp: column 0 is set for
# Class 0, column 1 for Class 1. The slices below pair these labels with the
# encodings by position, which assumes data_fdp is still in the row order
# used to build train_x / test_x.
n_total = data_fdp.shape[0]
all_y_bin = np.zeros((n_total, 2))
all_y_bin[range(n_total), data_fdp['Class'].values] = 1

# The first (1 - VAL_PERC) share of the training encodings is used for
# training, the remainder for validation; everything after the training
# encodings is the test part.
n_trainval = train_encoding.shape[0]
split_at = int(n_trainval * (1-VAL_PERC))

train_enc_x = train_encoding[:split_at]
train_enc_y = all_y_bin[:split_at]

val_enc_x = train_encoding[split_at:]
val_enc_y = all_y_bin[split_at:n_trainval]

test_enc_y = all_y_bin[n_trainval:]
print("Num of data for train, val and test are: \n{}, \n{}, \n{}".format(
    train_enc_x.shape[0], val_enc_x.shape[0], test_encoding.shape[0]))

### Train Binary Classifier

In [None]:
# Bind `auc` to roc_auc_score for the calls in this cell.
from sklearn.metrics import roc_auc_score as auc 

# Train the classifier, log the validation AUC per epoch, then checkpoint.
save_model = os.path.join(data_dir, 'temp_saved_model_FCLayers.ckpt')
saver = tf.train.Saver()

# Op that gives every variable its initial value.
init = tf.global_variables_initializer()

with tf.Session() as sess:
    now = datetime.now()
    sess.run(init)

    n_train_rows = train_enc_x.shape[0]
    total_batch = int(n_train_rows / batch_size)

    for epoch in range(n_epochs):
        for step in range(total_batch):
            # Draw batch_size random row indices and take one optimizer step
            # on those rows; `c` keeps the loss of the most recent step.
            batch_idx = np.random.choice(n_train_rows, batch_size)
            feed = {X: train_enc_x[batch_idx], y_: train_enc_y[batch_idx]}
            _, c = sess.run([optimizer, cross_entropy], feed_dict=feed)

        if epoch % display_step != 0:
            continue

        # Column 1 of the probabilities is ranked against column 1 of the
        # one-hot validation labels.
        val_probs = sess.run(pred_probs, feed_dict={X: val_enc_x})
        val_auc = auc(val_enc_y[:, 1], val_probs[:, 1])
        print("Epoch:", '%04d' % (epoch+1),
              "cost=", "{:.9f}".format(c),
              "Val auc=", "{:.6f}".format(val_auc),
              "Time elapsed=", "{}".format(datetime.now() - now))

    print("Optimization Finished!")

    save_path = saver.save(sess, save_model)
    print("Model saved in file: %s" % save_path)

### Test the Model 

We have finalized our model, now we will evaluate its performance on the same test data as before,

We obtained a slightly improved AUC of around **0.96**

In [None]:
# Score the test embeddings with the trained classifier and report the AUC.
save_model = os.path.join(data_dir, 'temp_saved_model_FCLayers.ckpt')
saver = tf.train.Saver()

with tf.Session() as sess:
    # The variables get their values from the checkpoint, so no initializer
    # op is created or run in this cell.
    saver.restore(sess, save_model)

    test_probs = sess.run(pred_probs, feed_dict={X: test_encoding})

    # Column 1 of the probabilities is ranked against column 1 of the
    # one-hot test labels.
    print("\nTest auc score: {}".format(auc(test_enc_y[:, 1], test_probs[:, 1])))

## Visualize the Prediction Result


In [None]:
# ROC curve of the classifier's column-1 probabilities on the test rows.
# sklearn.metrics.auc is imported under an alias so that it does not rebind
# the notebook-wide name `auc`, which is an alias of roc_auc_score and is
# called with (labels, scores) by the training and test cells.
from sklearn.metrics import roc_curve
from sklearn.metrics import auc as area_under_curve

fpr, tpr, _ = roc_curve(test_enc_y[:, 1], test_probs[:, 1])
roc_auc = area_under_curve(fpr, tpr)

plt.plot(fpr, tpr, color='darkorange',
         lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line for a random ranking.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The curve is computed from test_enc_y / test_probs, i.e. the test split.
plt.title('ROC curve on test data set')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Precision-recall curve of the classifier on the test rows.
# This section evaluates the classifier, so the curve is built from its
# column-1 probabilities and the matching one-hot label column rather than
# from the autoencoder's reconstruction errors.
from sklearn.metrics import precision_recall_curve
precision, recall, th = precision_recall_curve(test_enc_y[:, 1], test_probs[:, 1])
plt.plot(recall, precision, 'b', label='Precision-Recall curve')
plt.title('Recall vs Precision')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

In [None]:
# Precision as a function of the decision threshold.
# precision_recall_curve returns one more precision value than thresholds:
# precision[i] belongs to th[i] and the final entry has no threshold, so the
# trailing element (not the leading one) is dropped to align the two arrays.
plt.plot(th, precision[:-1], 'b', label='Threshold-Precision curve')
plt.title('Precision for different threshold values')
plt.xlabel('Threshold')
plt.ylabel('Precision')
plt.show()

In [None]:
# Recall as a function of the decision threshold.
# precision_recall_curve returns one more recall value than thresholds:
# recall[i] belongs to th[i] and the final entry has no threshold, so the
# trailing element (not the leading one) is dropped to align the two arrays.
plt.plot(th, recall[:-1], 'b', label='Threshold-Recall curve')
plt.title('Recall for different threshold values')
# Neutral axis label: the x values are whatever thresholds `th` holds.
plt.xlabel('Threshold')
plt.ylabel('Recall')
plt.show()

In [None]:
# Overlaid, density-normalised histograms of the classifier's fraud score on
# the test rows, split by true label.
# test_probs and test_enc_y both have two columns, so the fraud score is
# column 1 of test_probs and the label is column 1 of test_enc_y. Masking
# the 2-D arrays directly would mix column-0 and column-1 probabilities in
# each histogram.
fraud_prob = test_probs[:, 1]
test_is_fraud = test_enc_y[:, 1] == 1
plt.title('Fraud score (predicted probability) distribution of test set')
plt.xlabel('Fraud score')
plt.ylabel('Probabilities')
plt.hist(fraud_prob[~test_is_fraud], bins = 100, color='green', density=True, label='Non-Fraud')
plt.hist(fraud_prob[test_is_fraud], bins = 100, color='red', density=True, label = 'Fraud')

plt.legend(loc="upper right")
plt.show()

#### 1. Display fraud score distribution for fraud cases

In [None]:
# Density-normalised histogram of the classifier's fraud score (column 1 of
# test_probs) for the test rows whose one-hot label marks them as fraud.
# Selecting the columns first keeps column-0 probabilities out of the plot.
fraud_case_prob = test_probs[:, 1][test_enc_y[:, 1] == 1]
plt.hist(fraud_case_prob, bins = 100, color='red', density=True)
plt.title('Fraud score (predicted probability) distribution of Fraud cases')
plt.xlabel('Fraud score')
plt.ylabel('Probabilities')
plt.show()

Zoom into (0, 20) range

#### 2. Display fraud score distribution for non-fraud cases

In [None]:
# Density-normalised histogram of the classifier's fraud score (column 1 of
# test_probs) for the test rows whose one-hot label marks them as non-fraud.
# Selecting the columns first keeps column-0 probabilities out of the plot.
normal_case_prob = test_probs[:, 1][test_enc_y[:, 1] == 0]
plt.hist(normal_case_prob, bins = 100, color='green', density=True)
plt.title('Fraud score (predicted probability) distribution of Non-fraud cases')
plt.xlabel('Fraud score')
plt.ylabel('Probabilities')
plt.show()