# Credit Card Fraud Detection Project
*By: Herman Lin and Mahika Jain*
---

In [None]:
# Importing the libraries to be used:
import sklearn
from sklearn import preprocessing, svm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, plot_precision_recall_curve, average_precision_score, precision_recall_fscore_support
from sklearn.neural_network import MLPClassifier

import numpy as np
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import numpy.polynomial.polynomial as poly
%matplotlib inline

# Constants used throughout the code
# - cVals is the grid of C values passed as the `c` argument of sklearn_model()
#   by the regularized logistic regression and SVM training cells
cVals = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

In [None]:
# Read the dataset from 'archive.zip' into a pandas dataframe and print its
# (rows, columns) shape
df = pd.read_csv('archive.zip')
print(df.shape)

In [None]:
# Drop every column that contains at least one missing value.
# `axis` is passed by keyword: newer pandas releases no longer accept it as a
# positional argument.
df1 = df.dropna(axis='columns')
print(df1.shape)

In [None]:
# Convert dataframe into a numpy array; the later cells index df2 by row and
# column position
df2 = np.array(df1)
print(df2.shape)

In [None]:
# Printing the names of all the features
# - Note: Most feature names have been anonymized to preserve confidentiality
# - The names are read from df1 (the cleaned frame that df2 is built from) so
#   they always line up with the columns of the array used for modelling.
features = np.array(df1.columns[:30])
print('Feature Names:', features)

### Note:

The credit card dataset we are using for this project is naturally unbalanced: there are significantly more examples classified as non-fraudulent than as fraudulent. One way to help counteract this is to undersample the majority class and/or oversample the minority class. In this project we undersample the majority class, so that the reduced dataset contains a 5:1 ratio of non-fraudulent to fraudulent examples; this fraction of the original dataset is then split into our training and validation sets.

In [None]:
# Separate the data into class_0 and class_1 examples.
# Column 30 holds the label. A boolean mask splits the rows in one vectorized
# pass instead of a Python-level loop over every row; rows keep their original
# relative order and both results stay 2-D even when a class is empty.
num_examples = df2.shape[0]

is_class_0 = df2[:, 30] == 0
class_0 = df2[is_class_0]
class_1 = df2[~is_class_0]  # everything that is not labelled 0

In [None]:
# Report how many rows ended up in each class
for class_label, class_rows in ((0, class_0), (1, class_1)):
    print('Number of Class {}:'.format(class_label), class_rows.shape[0])

In [None]:
# Randomly undersample class_0 (without replacement) to 5x the size of class_1.
# The counts are derived from the arrays instead of being hard-coded, so the
# cell keeps working if the dataset changes; for 284315 class_0 rows and 492
# class_1 rows this is the same 2460-of-284315 draw as before.
n_class_0 = class_0.shape[0]
n_keep = min(5 * class_1.shape[0], n_class_0)
class_0_reduced = class_0[np.random.choice(n_class_0, n_keep, replace=False), :]

In [None]:
# Combine the sampled class_0 rows with all class_1 rows, then shuffle the
# rows in place so the two classes are interleaved
reduced_data = np.concatenate((class_0_reduced, class_1))
np.random.shuffle(reduced_data)
print(reduced_data.shape)

In [None]:
# Split the columns into the feature matrix and the target vector
n_features = 30
X = np.array(reduced_data[:, :n_features])  # every row, columns 0..29
y = np.array(reduced_data[:, n_features])   # every row, column 30 (the label)

# sklearn Model Implementation

We have created a function for easy model testing of the data. By specifying certain parameters, we are able to run a Logistic Regression model (with different regularization methods), an SVM model (with different kernels), or a Neural Network (MLP) model.

In [None]:
def sklearn_model(X_tr, y_tr, X_ts, y_ts, m_type, c, iters, penalty='none', kernel=None, hidden_layer_sizes=None, activation=None, alpha=0.0001):
    """Train one sklearn classifier and collect its accuracies and reports.

    Parameters
    ----------
    X_tr, y_tr : training features and labels.
    X_ts, y_ts : test features and labels.
    m_type : int
        Selects the model: 0 -> LogisticRegression (saga solver),
        1 -> svm.SVC (with probability estimates), 2 -> MLPClassifier.
    c : value passed as ``C`` for m_type 0 and 1. It is not passed to the
        neural network, but it is always recorded in the returned tuple.
    iters : value passed as ``max_iter`` for m_type 0 and 2 (unused for 1).
    penalty : regularization passed to LogisticRegression (m_type 0 only).
    kernel : kernel passed to svm.SVC (m_type 1 only).
    hidden_layer_sizes, activation, alpha : passed to MLPClassifier
        (m_type 2 only).

    Returns
    -------
    tuple
        ``((acc_tr, acc_ts, c_vals), (report_tr, report_ts), model)``.
        ``acc_tr``, ``acc_ts`` and ``c_vals`` are single-element lists holding
        the training accuracy, test accuracy and ``c``; the reports are the
        ``classification_report(..., output_dict=True)`` dicts for the
        training and test predictions; ``model`` is the fitted estimator.

    Raises
    ------
    ValueError
        If ``m_type`` is not 0, 1 or 2.
    """
    # create model
    if m_type == 0:
        print('Training Logistic Regression Model...')
        model = LogisticRegression(penalty=penalty, C=c, solver='saga', max_iter=iters)
    elif m_type == 1:
        print('Training SVM Model...')
        model = svm.SVC(probability=True, kernel=kernel, C=c)
    elif m_type == 2:
        print('Training Neural Network...')
        model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, alpha=alpha, max_iter=iters)
    else:
        # Fail early with a clear message instead of crashing later on
        # ``None.fit(...)``.
        raise ValueError(
            "m_type must be 0 (logistic regression), 1 (SVM) or "
            "2 (neural network), got {!r}".format(m_type)
        )

    # fit the model
    model.fit(X_tr, y_tr)

    # predictions on both sets, used for the classification reports
    yhat_tr = model.predict(X_tr)
    yhat_ts = model.predict(X_ts)

    # mean accuracy on the training and test sets
    acc_tr = model.score(X_tr, y_tr)
    acc_ts = model.score(X_ts, y_ts)

    # per-class precision / recall / f1 / support as nested dicts
    class_report_tr = classification_report(y_tr, yhat_tr, output_dict=True)
    class_report_ts = classification_report(y_ts, yhat_ts, output_dict=True)

    print("Model Complete!")

    # The accuracies and c are wrapped in single-element lists because the
    # display helpers index them as model[0][k][0].
    return ([acc_tr], [acc_ts], [c]), (class_report_tr, class_report_ts), model

# Data Displaying

Convert the return values obtained from the sklearn_model function into readable tables

In [None]:
def results_to_dataframe(model_set):
    """Turn a list of sklearn_model() results into display-ready tables.

    Each entry of ``model_set`` is expected to look like
    ``((acc_tr, acc_ts, c_vals), (report_tr, report_ts), model)`` where the
    first three items are single-element lists.

    Returns a tuple ``(c_acc_df, prfs_lst)``:
    - ``c_acc_df``: DataFrame indexed by C value with the columns
      'Acc_tr' and 'Acc_ts'.
    - ``prfs_lst``: one dict per entry holding the '0.0' and '1.0' rows of
      the entry's first classification report (``entry[1][0]``).
    """
    c_values = [entry[0][2][0] for entry in model_set]

    # map every C value to its training / test accuracy
    train_by_c = dict(zip(c_values, [entry[0][0][0] for entry in model_set]))
    test_by_c = dict(zip(c_values, [entry[0][1][0] for entry in model_set]))

    # the outer keys become the columns, the C values become the index
    c_acc_df = pd.DataFrame({'Acc_tr': train_by_c, 'Acc_ts': test_by_c})

    # keep only the per-class rows (precision, recall, fscore, support)
    prfs_lst = [
        {class_key: entry[1][0][class_key] for class_key in ('0.0', '1.0')}
        for entry in model_set
    ]

    return c_acc_df, prfs_lst

# Plot Displaying

Create plots for the Precision-Recall curve and the Accuracy versus C Values curve for the models

In [None]:
def show_plots(title, model_set, X_test, y_test, acc_c_plot=True):
    """Draw the Precision-Recall curves (left) and, optionally, the
    Accuracy-vs-C curves (right) for a list of sklearn_model() results.

    title      -- figure title
    model_set  -- list of sklearn_model() return values
    X_test, y_test -- data the Precision-Recall curves are computed on
    acc_c_plot -- when False the right-hand axes is left empty
    """
    fig, (pr_ax, acc_ax) = plt.subplots(1, 2, figsize=(16, 6))
    fig.subplots_adjust(wspace=0.25)
    fig.suptitle(title)

    # one Precision-Recall curve per model, labelled with its C value
    c_values = []
    for entry in model_set:
        c_value = entry[0][2][0]
        plot_precision_recall_curve(entry[2], X_test, y_test, ax=pr_ax, name='cVal: {}'.format(c_value))
        c_values.append(c_value)
    pr_ax.set_title('Precision-Recall Curve')

    if not acc_c_plot:
        return

    # entry[0][0] / entry[0][1] are the single-element accuracy lists
    train = [entry[0][0] for entry in model_set]
    test = [entry[0][1] for entry in model_set]
    acc_ax.plot(c_values, train, '.r-', label='Training Accuracy')
    acc_ax.plot(c_values, test, '.b-', label='Test Accuracy')
    acc_ax.set_title('Accuracy vs C Values')
    acc_ax.set_xlabel('C Value')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.set_xscale('log')
    acc_ax.legend()

# Logistic Regression


### Step 1: Data Preprocessing

In [None]:
# Scale the data by preprocessing
# - StandardScaler transforms each feature so that its distribution has a
#   mean of 0 and a standard deviation of 1.
# - Mean subtraction: subtract the feature's mean from every value
#   Normalization: divide by the feature's standard deviation so that all
#   features are roughly the same size
#       x' = (x - mean) / std

X_scale = preprocessing.StandardScaler().fit_transform(X)

In [None]:
# Split the reduced_data into the training and testing sets.
# The split is done on the raw features and the scaler is then fitted on the
# training rows only, so the mean/std of the test rows never leak into the
# data the models are trained on. The test rows are scaled with the training
# statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=133)
scaler = preprocessing.StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Print the raw and the standardized feature matrices for comparison
# (the banners now spell the class name correctly: StandardScaler)
print('\n======================Before StandardScaler==========================')
print(X)
print('\n=======================After StandardScaler==========================')
print(X_scale)

### Step 2: Create and run the logistic regression model

In [None]:
# Logistic Regression with no Regularization
# - A very large C (C = 100000000) makes lambda (C = 1/lambda) nearly 0.
# - The single result is kept in a list so that results_to_dataframe() and
#   show_plots() can treat it like the regularized model sets.
logreg = [sklearn_model(X_train, y_train, X_test, y_test, 0, 100000000, 10000)]

# Logistic Regression with regularization
# - C is the inverse of lambda: a small C (between 0 and 1) means stronger
#   regularization, a large C means weaker regularization.
# - One model is trained per value in cVals.

# L1 Regularization
logreg_l1 = [
    sklearn_model(X_train, y_train, X_test, y_test, 0, c, 10000, 'l1')
    for c in cVals
]

# L2 Regularization
logreg_l2 = [
    sklearn_model(X_train, y_train, X_test, y_test, 0, c, 10000, 'l2')
    for c in cVals
]


### Step 3: Display data from the logistic regression models

In [None]:
# Section headers printed above each model set's tables; the order must match
# the (logreg, logreg_l1, logreg_l2) tuple iterated in the next cell
titles = ["\n========== Logistic Regression without Regularization ==========",
          "\n========== Logistic Regression with L1 Regularization ==========",
          "\n========== Logistic Regression with L2 Regularization =========="]

In [None]:
# Display the accuracy table and the per-class precision/recall/fscore/support
# tables for every logistic regression model set.
# The C values are kept in a local name (c_values): reassigning the
# module-level cVals here would overwrite the grid the training cells use.
for title, model_set in zip(titles, (logreg, logreg_l1, logreg_l2)):
    print(title)
    c_acc, prfs_lst = results_to_dataframe(model_set)
    display(c_acc)

    c_values = [model[0][2][0] for model in model_set]
    for c_value, prfs in zip(c_values, prfs_lst):
        print("\ncVal: {}".format(c_value))
        display(pd.DataFrame(prfs))

### Step 4: Plot results from the logistic regression models

We will be plotting the results using Precision-Recall Curves since we are dealing with a large class imbalance

### Plot PR Curve for Logistic Regression without Regularization

In [None]:
# Only the Precision-Recall curve is drawn: there is a single C value, so an
# Accuracy-vs-C curve would not be meaningful
show_plots("LogReg no Regularization: PR Curve", logreg, X_test, y_test, acc_c_plot=False)

### Plot PR Curve and Acc vs C Curve for Logistic Regression with L1 Regularization

In [None]:
# acc_c_plot defaults to True, so both the PR curves and the Accuracy-vs-C
# curves are drawn
show_plots('Logistic Regression with L1 Regularization', logreg_l1, X_test, y_test)

### Plot PR Curve and Acc vs C Curve for Logistic Regression with L2 Regularization

In [None]:
# acc_c_plot defaults to True, so both the PR curves and the Accuracy-vs-C
# curves are drawn
show_plots('Logistic Regression with L2 Regularization', logreg_l2, X_test, y_test)

### Logistic Regression with Polynomial Feature Transformation

In order to explore more options, we will see what effect a Polynomial Feature Transformation has on a logistic regression model.

### Step 1: Perform Polynomial Feature Transformation

In [None]:
# Degree-2 polynomial feature transformer, applied to the train and test
# features in the next cell
poly_d2 = PolynomialFeatures(degree=2)

In [None]:
# Fit the degree-2 expansion on the training features only, then apply that
# same fitted transformer to the test features (a transformer should never be
# re-fitted on the test set).
X_tr_transd2 = poly_d2.fit_transform(X_train)
X_ts_transd2 = poly_d2.transform(X_test)

In [None]:
# Verify that the data has been transformed: the second shape should have
# more columns than the first
print(X_train.shape)
print(X_tr_transd2.shape)

### Step 2: Train the Logistic Regression models using the polynomial feature transformed data

In [None]:
# Polynomial Feature Transformation of Degree 2

# Logistic Regression with no Regularization (very large C); kept in a list so
# the display helpers can treat it like the regularized model sets
logreg_transd2 = [sklearn_model(X_tr_transd2, y_train, X_ts_transd2, y_test, 0, 10000000, 10000)]

# Logistic Regression with L1 Regularization: one model per value in cVals
logreg_l1_transd2 = [
    sklearn_model(X_tr_transd2, y_train, X_ts_transd2, y_test, 0, c, 10000, 'l1')
    for c in cVals
]

# Logistic Regression with L2 Regularization: one model per value in cVals
logreg_l2_transd2 = [
    sklearn_model(X_tr_transd2, y_train, X_ts_transd2, y_test, 0, c, 10000, 'l2')
    for c in cVals
]

### Step 3: Display data from the new Logistic Regression models with PFTransformations

In [None]:
# Section headers printed above each model set's tables; the order must match
# the (logreg_transd2, logreg_l1_transd2, logreg_l2_transd2) tuple iterated in
# the next cell
titles = ['\n========== LogReg without Regularization: PFT of degree 2 ==========',
          '\n========== LogReg with L1 Regularization: PFT of degree 2 ==========',
          '\n========== LogReg with L2 Regularization: PFT of degree 2 ==========']

In [None]:
# Display the accuracy table and the per-class precision/recall/fscore/support
# tables for every polynomial-feature logistic regression model set.
# The C values are kept in a local name (c_values): reassigning the
# module-level cVals here would overwrite the grid the training cells use.
for title, model_set in zip(titles, (logreg_transd2, logreg_l1_transd2, logreg_l2_transd2)):
    print(title)
    c_acc, prfs_lst = results_to_dataframe(model_set)
    display(c_acc)

    c_values = [model[0][2][0] for model in model_set]
    for c_value, prfs in zip(c_values, prfs_lst):
        print("\ncVal: {}".format(c_value))
        display(pd.DataFrame(prfs))

### Step 4: Plot results from the new Logistic Regression Models with PFTransformations

In [None]:
# Only the Precision-Recall curve is drawn: there is a single C value, so an
# Accuracy-vs-C curve would not be meaningful
show_plots('LogReg without Regularization: PFT of degree 2', logreg_transd2, X_ts_transd2, y_test, acc_c_plot=False)

In [None]:
# The transformed test features are passed because these models were trained
# on the polynomial-expanded inputs
show_plots('LogReg with L1 Regularization: PFT of degree 2', logreg_l1_transd2, X_ts_transd2, y_test)

In [None]:
# The transformed test features are passed because these models were trained
# on the polynomial-expanded inputs
show_plots('LogReg with L2 Regularization: PFT of degree 2', logreg_l2_transd2, X_ts_transd2, y_test)

# Support Vector Machines

### Step 1: Run SVM with each kernel

In [None]:
# SVM with Linear Kernel: one model per value in cVals
# (0 is passed for iters because the SVM branch of sklearn_model ignores it)
svm_linear_results = [
    sklearn_model(X_train, y_train, X_test, y_test, 1, c, 0, kernel='linear')
    for c in cVals
]

In [None]:
# SVM with Radial Basis Function Kernel: one model per value in cVals
# (0 is passed for iters because the SVM branch of sklearn_model ignores it)
svm_rbf_results = [
    sklearn_model(X_train, y_train, X_test, y_test, 1, c, 0, kernel='rbf')
    for c in cVals
]

In [None]:
# SVM with Polynomial Kernel
# - uses its own grid of 7 geometrically spaced C values between 1e-6 and 1e-3
#   instead of cVals
cValsGeom = np.geomspace(0.000001, 0.001, 7)
svm_poly_results = [
    sklearn_model(X_train, y_train, X_test, y_test, 1, c, 0, kernel='poly')
    for c in cValsGeom
]

### Step 2: Display data obtained from the SVMs

In [None]:
# Section headers printed above each model set's tables; the order must match
# the (svm_linear_results, svm_rbf_results, svm_poly_results) tuple iterated in
# the next cell
titles = ['\n========== SVM with Linear Kernel ==========',
          '\n========== SVM with RBF Kernel ==========',
          '\n========== SVM with Polynomial Kernel ==========']

In [None]:
# Display the accuracy table and the per-class precision/recall/fscore/support
# tables for every SVM model set.
# The C values are kept in a local name (c_values): reassigning the
# module-level cVals here would replace the shared grid with the polynomial
# kernel's geomspace values once the loop finishes.
for title, model_set in zip(titles, (svm_linear_results, svm_rbf_results, svm_poly_results)):
    print(title)
    c_acc, prfs_lst = results_to_dataframe(model_set)
    display(c_acc)

    c_values = [model[0][2][0] for model in model_set]
    for c_value, prfs in zip(c_values, prfs_lst):
        print("\ncVal: {}".format(c_value))
        display(pd.DataFrame(prfs))

### Step 3: Plot the results obtained from the SVM models

In [None]:
# linear kernel: PR curves plus Accuracy-vs-C curves on the test set
show_plots("SVM with Linear Kernel", svm_linear_results, X_test, y_test)

In [None]:
# radial basis function kernel: PR curves plus Accuracy-vs-C curves on the
# test set
show_plots("SVM with RBF Kernel", svm_rbf_results, X_test, y_test)

In [None]:
# polynomial kernel: PR curves plus Accuracy-vs-C curves on the test set
show_plots("SVM with Polynomial Kernel", svm_poly_results, X_test, y_test)

# Neural Networks

### Step 1: Run Neural Networks

In [None]:
# One result list per activation function; each is filled by the training loop
relu_results = []
tanh_results = []
logi_results = []

# Activation names as MLPClassifier spells them
activations = ['relu', 'tanh', 'logistic']
iters = 10000 # sklearn's MLPClassifier halts further iterations after the training converges
# Hidden layer layouts. Single-layer entries are written as 1-tuples: (22) is
# just the int 22, whereas (22,) is a tuple like the multi-layer entries.
layers = [(22,), (22, 22), (22, 22, 22), (30,)]
alphas = [0.01, 0.001, 0.0001, 0.00001] # L2 penalty (regularization term) parameter

In [None]:
# Train one network per (layers, alpha) combination and activation function.
# c is passed as 0 because the neural-network branch of sklearn_model does not
# use it.
for l in layers:
    for a in alphas:
        print("layers: {0}, alpha: {1}".format(l, a))
        shared = dict(hidden_layer_sizes=l, alpha=a)
        relu_results.append(
            sklearn_model(X_train, y_train, X_test, y_test, 2, 0, iters, activation='relu', **shared)
        )
        tanh_results.append(
            sklearn_model(X_train, y_train, X_test, y_test, 2, 0, iters, activation='tanh', **shared)
        )
        logi_results.append(
            sklearn_model(X_train, y_train, X_test, y_test, 2, 0, iters, activation='logistic', **shared)
        )

### Step 2: Display data obtained from the neural networks run

In [None]:
def nn_results_to_dataframe(model_set):
    """Turn a list of neural-network sklearn_model() results into tables.

    ``model_set`` must be ordered like the training loop: for every entry of
    the module-level ``layers``, one result per entry of ``alphas``.

    Returns a tuple ``(c_acc_df, prfs_lst)``:
    - ``c_acc_df``: DataFrame indexed by "layers=..., alpha=..." with the
      columns 'Acc_tr' and 'Acc_ts'.
    - ``prfs_lst``: one dict per entry holding the '0.0' and '1.0' rows of
      the entry's first classification report (``entry[1][0]``).
    """
    train_acc = [entry[0][0][0] for entry in model_set]
    test_acc = [entry[0][1][0] for entry in model_set]

    # row names, in the same order the models were trained
    row_names = [
        "layers={0}, alpha={1}".format(l, a)
        for l in layers
        for a in alphas
    ]

    tr_dict, ts_dict = {}, {}
    for position, row_name in enumerate(row_names):
        tr_dict[row_name] = train_acc[position]
        ts_dict[row_name] = test_acc[position]

    # the outer keys become the columns, the row names become the index
    c_acc_df = pd.DataFrame({'Acc_tr': tr_dict, 'Acc_ts': ts_dict})

    # keep only the per-class rows (precision, recall, fscore, support)
    prfs_lst = [
        {class_key: entry[1][0][class_key] for class_key in ('0.0', '1.0')}
        for entry in model_set
    ]

    return c_acc_df, prfs_lst

In [None]:
# Activation names used in the printed section headers; the order must match
# the (relu_results, tanh_results, logi_results) tuple iterated in the next
# cell. ('Logisitic' was a typo for 'Logistic'.)
titles = ['ReLU', 'tanh', 'Logistic/Sigmoid']

In [None]:
# Display the accuracy table and the per-configuration precision/recall/
# fscore/support tables for every activation function
for i, model_set in enumerate((relu_results, tanh_results, logi_results)):
    print("\n========== Neural Network with {} Activation ==========".format(titles[i]))
    c_acc, prfs_lst = nn_results_to_dataframe(model_set)
    display(c_acc)

    # same (layers, alpha) order as the training loop
    configurations = [(l, a) for l in layers for a in alphas]
    for j, (l, a) in enumerate(configurations):
        print("\nlayers={0}, alpha={1}".format(l, a))
        display(pd.DataFrame(prfs_lst[j]))

### Step 3: Gather data from the costs of each neural network

sklearn's MLPClassifier has a `loss_curve_` attribute; per the documentation, "the ith element in the list represents the loss at the ith iteration."

In [None]:
# average of each network's loss curve, one list per activation function
# (r[2] is the fitted model returned by sklearn_model)
avg_cost_relu, avg_cost_tanh, avg_cost_logi = (
    [np.average(r[2].loss_curve_) for r in results]
    for results in (relu_results, tanh_results, logi_results)
)

In [None]:
# labels for the legends of each model's plots, in the same (layers, alpha)
# order as the training loop; H_L is the hidden layer layout, a is alpha
labels = ["H_L={0}, a={1}".format(l, a) for l in layers for a in alphas]

In [None]:
def nn_show_plots(title, model_set, avg_cost):
    """Plot the Precision-Recall curves (left) and the loss curves (right)
    of a list of neural-network results.

    title     -- activation name shown in the figure title
    model_set -- list of sklearn_model() return values; entry[2] is the
                 fitted model
    avg_cost  -- average loss of every model, in the same order as model_set

    The curves are labelled with the module-level ``labels`` and the
    Precision-Recall curves are computed on the module-level ``X_test`` /
    ``y_test``.
    """
    fig, ax = plt.subplots(1, 2, figsize=(16, 6))
    fig.subplots_adjust(wspace=1)
    fig.suptitle("Neural Network: {} Activation".format(title))

    for i, result in enumerate(model_set):
        plot_precision_recall_curve(result[2], X_test, y_test, ax=ax[0], name='{}'.format(labels[i]))
    ax[0].set_title('Precision-Recall Curve')
    ax[0].legend(bbox_to_anchor=(1, 1))

    # One legend entry per plotted model: sized from model_set instead of a
    # hard-coded 16, so other numbers of configurations also work.
    label_avg_cost = [
        "{0}, Avg Cost: {1:2.2}".format(labels[i], avg_cost[i])
        for i in range(len(model_set))
    ]
    for result in model_set:
        ax[1].plot(result[2].loss_curve_)
    # The legend is built once, after every curve has been drawn, rather than
    # being rebuilt on each loop iteration.
    ax[1].legend(label_avg_cost, bbox_to_anchor=(2, 1))
    ax[1].set_title("Loss Curves")
    ax[1].set_xlabel('Iterations')
    ax[1].set_ylabel('Cost')

### Step 4: Plot results and loss curves obtained from the neural networks

In [None]:
# NN with ReLU: PR curves and loss curves for every (layers, alpha) run
nn_show_plots("ReLU", relu_results, avg_cost_relu)

In [None]:
# NN with tanh: PR curves and loss curves for every (layers, alpha) run
nn_show_plots("tanh", tanh_results, avg_cost_tanh)

In [None]:
# NN with logistic (sigmoid): PR curves and loss curves for every
# (layers, alpha) run
nn_show_plots("Logistic/Sigmoid", logi_results, avg_cost_logi)