# Credit Card Fraud Detection Project
*By: Herman Lin and Mahika Jain*
---
blah

In [43]:
# Importing the libraries to be used:
import sklearn
from sklearn import preprocessing, svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neural_network import MLPClassifier

import numpy as np
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import numpy.polynomial.polynomial as poly
%matplotlib inline

# Constants used throughout
cVals = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

In [13]:
# Read .csv file and put data into a pandas dataframe
df = pd.read_csv('archive.zip')
print(df.shape)

(284807, 31)


In [14]:
# Drop empty columns
df1 = df.dropna('columns')
print(df1.shape)

(284807, 31)


In [15]:
# Convert dataframe into a numpy array
df2 = np.array(df1)
print(df2.shape)

(284807, 31)


In [16]:
# Printing the names of all the features
# - Note: Most feature names have been anonymized to preserve confidentiality
features = np.array(df.columns[:30])
print('Feature Names:', features)

Feature Names: ['Time' 'V1' 'V2' 'V3' 'V4' 'V5' 'V6' 'V7' 'V8' 'V9' 'V10' 'V11' 'V12'
 'V13' 'V14' 'V15' 'V16' 'V17' 'V18' 'V19' 'V20' 'V21' 'V22' 'V23' 'V24'
 'V25' 'V26' 'V27' 'V28' 'Amount']


### Note:

The credit card dataset we are using for this project is naturally unbalanced. There are significantly more examples that are classified as non-fradulent than there are fradulent. One way to help counteract this is to undersample the majority class and oversample the minority class. Thus, we will be scaling our training set to contain a ratio of 5:1 non-fradulent to fradulent as well as using a fraction of the original dataset as our training and validation sets.

In [17]:
# Separate the data into class_0 and class_1 examples
zero = []
one = []
num_examples = df2.shape[0]

for i in range(num_examples):
    if df2[i][30] == 0:
        zero.append(df2[i])
    else: 
        one.append(df2[i])

class_0 = np.array(zero)
class_1 = np.array(one)

In [18]:
# Verify shapes of class_0 and class_1
print('Number of Class 0:', class_0.shape[0])
print('Number of Class 1:', class_1.shape[0])

Number of Class 0: 284315
Number of Class 1: 492


In [19]:
# Randomly choose 2460 examples from the entire class_0 set
class_0_reduced = class_0[np.random.choice(284315, 2460, replace=False),:] 

In [20]:
# Combine samples together and randomize the samples
reduced_data = np.concatenate((class_0_reduced, class_1))
np.random.shuffle(reduced_data)
print(reduced_data.shape)

(2952, 31)


In [21]:
# Separate columns into features and target
X = np.array(reduced_data[:,0:30]) # all rows, first 30 columns
y = np.array(reduced_data[:,30]) # all rows, last column

# sklearn Model Implementation

We have created a function for easy model testing of the data. By specifying certain parameters, we are able to run either a Logisitic Regression Model (with different regularization methods) or an SVM Model (with different kernels).

In [45]:
#
#           ====================
#           TEST DEGREES LATER!! (for polynomial kernels)
#           ====================
#

def sklearn_model(X_tr, y_tr, X_ts, y_ts, m_type, c, iters, penalty='none', kernel=None, hidden_layer_sizes=None, activation=None, alpha=0):
    acc_tr_model = []
    acc_ts_model = []
    c_model = []
    model = None

    # create model 
    if m_type == 0:
        model = LogisticRegression(penalty=penalty, C=c, solver='saga', max_iter=iters)
        print('Training Logistic Regression Model...')
    elif m_type == 1:
        model = svm.SVC(probability=True, kernel=kernel, C=c)
        print('Training SVM Model...')
    
    # fit the model
    model.fit(X_tr, y_tr)

    # find the prediction on the training and testing set
    yhat_tr = model.predict_proba(X_tr)
    yhat_ts = model.predict_proba(X_ts)

    # calculate and add accuracy values to respective lists
    acc_tr = model.score(X_tr, y_tr)
    acc_tr_model.append(acc_tr)
    print("Accuracy on training data = %f" % acc_tr)
    acc_ts = model.score(X_ts, y_ts)
    acc_ts_model.append(acc_ts)
    print("Accuracy on test data = %f" % acc_ts)

    # appending value of c for graphing purposes if needed
    c_model.append(c)

    # creating a confusion matrix for analysis
    confuse_matrix_tr = confusion_matrix(y_tr, model.predict(X_tr))
    class_report_tr = classification_report(y_tr, model.predict(X_tr))
    confuse_matrix_ts = confusion_matrix(y_ts, model.predict(X_ts))
    class_report_ts = classification_report(y_ts, model.predict(X_ts))

    return (acc_tr_model, acc_ts_model, c_model), (confuse_matrix_tr, confuse_matrix_ts), (class_report_tr, class_report_ts)

# sklearn Neural Network Model Implementation

In [None]:
def neural_network():
    

# Logisitic Regression


### Step 1: Data Preprocessing

In [23]:
# Scale the data by preprocessing
# - The idea behind StandardScaler is that it will transform your data such that 
#   its distribution will have a mean value 0 and standard deviation of 1.
# - Mean Subtraction: for every feature subtract the mean
#   Normalization: make all features roughly the same size
#       X’ = (x-mean)/std

X_scale = preprocessing.StandardScaler().fit_transform(X)

In [27]:
# Split the reduced_data into the training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size=0.3, random_state=133)

In [28]:
print('\n======================Before StandardScalar==========================')
print(X)
print('\n=======================After StandardScalar==========================')
print(X_scale)


[[ 4.68810000e+04 -1.84264628e+00  1.59061918e+00 ... -8.91202286e-01
  -3.02532802e-01  1.79900000e+01]
 [ 1.45405000e+05 -5.13686821e-01 -3.63866420e-01 ...  9.82829020e-03
   4.01364699e-02  1.50000000e+01]
 [ 7.95860000e+04  1.00499536e+00 -3.57895870e-01 ...  7.73105736e-02
   1.87041617e-02  3.90500000e+01]
 ...
 [ 1.44932000e+05  1.73423535e+00 -8.39642643e-01 ...  3.64383401e-02
  -6.77362031e-03  1.22000000e+02]
 [ 6.42510000e+04 -8.95225867e-01  1.64295678e+00 ... -1.49201063e-01
   3.64295229e-02  1.00000000e+00]
 [ 1.31024000e+05  4.69749517e-01 -1.23755547e+00 ... -1.17857669e-01
   1.44774181e-01  7.23210000e+02]]

[[-0.97669032 -0.28735855  0.36512541 ... -1.43392324 -1.04110483
  -0.3460759 ]
 [ 1.09178882  0.06890657 -0.37523487 ... -0.03318204  0.09100576
  -0.36068779]
 [-0.29005955  0.47603224 -0.37297322 ...  0.07172585  0.02019771
  -0.24315739]
 ...
 [ 1.08185834  0.67152562 -0.55545917 ...  0.00818592 -0.06397578
   0.16221256]
 [-0.61201286 -0.03337575  0.3849

### Step 2: Create and run the logistic regression model

In [29]:
# Perform Logisitic Regression with no Regularization
accs_c,confusion_matrices, class_reports = sklearn_model(X_train, y_train, X_test, y_test, 0, 100000000, 10000)

Training Logistic Regression Model...
  "Setting penalty='none' will ignore the C and l1_ratio "
Accuracy on training data = 0.980639
Accuracy on test data = 0.972912


In [36]:
# write results into logreg_results.txt
with open("logreg_results.txt", 'a') as file:
    file.write("\n\n==========================\n")
    file.write("Accuracy on training data = {}\n".format(accs_c[0]))
    file.write("Accuracy on testing data = {}\n".format(accs_c[1]))
    file.write("=== Confusion Matrices ===\n")
    file.write("Training: \n{}\n".format(confusion_matrices[0]))
    file.write("Testing: \n{}\n".format(confusion_matrices[1]))
    file.write("=== Class Reports ===\n")
    file.write("Training: \n{}\n".format(class_reports[0]))
    file.write("Testing: \n{}\n".format(class_reports[1]))

### Step 3: Plot results from the logistic regression models

# Support Vector Machines

### Step 1: Run SVM with a linear kernel

In [47]:
svm_linear_results = []
for c in cVals:
    svm_linear_results.append((sklearn_model(X_train, y_train, X_test, y_test, 1, c, 0, kernel='linear')))

Training SVM Model...
Accuracy on training data = 0.894966
Accuracy on test data = 0.881490
Training SVM Model...
Accuracy on training data = 0.968054
Accuracy on test data = 0.954853
Training SVM Model...
Accuracy on training data = 0.970958
Accuracy on test data = 0.965011
Training SVM Model...
Accuracy on training data = 0.979671
Accuracy on test data = 0.970655
Training SVM Model...
Accuracy on training data = 0.980639
Accuracy on test data = 0.971783
Training SVM Model...
Accuracy on training data = 0.980639
Accuracy on test data = 0.971783
Training SVM Model...
Accuracy on training data = 0.980639
Accuracy on test data = 0.971783


### Step 2: Run SVM with a radial basis function kernel

In [48]:
svm_rbf_results = []
for c in cVals:
    svm_rbf_results.append((sklearn_model(X_train, y_train, X_test, y_test, 1, c, 0, kernel='rbf')))

Training SVM Model...
Accuracy on training data = 0.834463
Accuracy on test data = 0.830700
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Training SVM Model...
Accuracy on training data = 0.834463
Accuracy on test data = 0.830700
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Training SVM Model...
Accuracy on training data = 0.836399
Accuracy on test data = 0.830700
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_p

### Step 3: Run SVM with a polynomial kernel

In [50]:
cValsGeom = np.geomspace(0.000001, 0.001, 5)
svm_poly_results = []
for c in cValsGeom:
    svm_poly_results.append((sklearn_model(X_train, y_train, X_test, y_test, 1, c, 0, kernel='poly')))

Training SVM Model...
Accuracy on training data = 0.834463
Accuracy on test data = 0.830700
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Training SVM Model...
Accuracy on training data = 0.834947
Accuracy on test data = 0.831828
Training SVM Model...
Accuracy on training data = 0.844143
Accuracy on test data = 0.840858
Training SVM Model...
Accuracy on training data = 0.876089
Accuracy on test data = 0.861174
Training SVM Model...
Accuracy on training data = 0.876089
Accuracy on test data = 0.861174


# Neural Networks