# Credit Card Fraud Detection Project
*By: Herman Lin and Mahika Jain*
---
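This notebook trains and evaluates scikit-learn classifiers (Logistic Regression and SVM) to detect fraudulent credit card transactions.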

In [69]:
# Importing the libraries to be used:
import sklearn
from sklearn import preprocessing, svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import numpy as np
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import numpy.polynomial.polynomial as poly
%matplotlib inline

In [26]:
# Load the zipped CSV archive into a pandas DataFrame and report its dimensions
df = pd.read_csv('archive.zip')
print(df.shape)

(284807, 31)


In [27]:
# Drop every column that contains at least one missing value.
# `axis` is passed by keyword: the positional form `dropna('columns')` was
# deprecated and later removed from pandas, so it fails on current releases.
df1 = df.dropna(axis='columns')
print(df1.shape)

(284807, 31)


In [28]:
# Turn the cleaned DataFrame into a plain NumPy array so that rows and
# columns can be addressed by integer position in the following cells
df2 = np.array(df1)
print(df2.shape)

(284807, 31)


In [29]:
# Collect and display the names of the first 30 columns (the model features)
# - Note: Most feature names have been anonymized to preserve confidentiality
feature_columns = df.columns[:30]
features = np.array(feature_columns)
print('Feature Names:', features)

Feature Names: ['Time' 'V1' 'V2' 'V3' 'V4' 'V5' 'V6' 'V7' 'V8' 'V9' 'V10' 'V11' 'V12'
 'V13' 'V14' 'V15' 'V16' 'V17' 'V18' 'V19' 'V20' 'V21' 'V22' 'V23' 'V24'
 'V25' 'V26' 'V27' 'V28' 'Amount']


### Note:

The credit card dataset we are using for this project is naturally imbalanced: there are significantly more examples classified as non-fraudulent than fraudulent. One way to help counteract this is to undersample the majority class and oversample the minority class. Thus, we will be scaling our training set to contain a 5:1 ratio of non-fraudulent to fraudulent examples, as well as using a fraction of the original dataset as our training and validation sets.

In [30]:
# Separate the data into class_0 and class_1 examples.
# Column 30 holds the class label. A boolean mask splits all rows in one
# vectorized step instead of appending them one by one in a Python loop, and
# the results stay 2-D even if one of the classes turns out to be empty.
num_examples = df2.shape[0]
is_class_0 = df2[:, 30] == 0

class_0 = df2[is_class_0]     # rows whose label equals 0
class_1 = df2[~is_class_0]    # every other row (the original else-branch)

In [31]:
# Verify shapes of class_0 and class_1
print('Number of Class 0:', class_0.shape[0])
print('Number of Class 1:', class_1.shape[0])

Number of Class 0: 284315
Number of Class 1: 492


In [32]:
# Undersample the majority class: keep 5 class_0 rows for every class_1 row
# (the 5:1 ratio described above), drawn without replacement.
# - Sizes are derived from the arrays rather than hard-coded, so the cell
#   still works if the dataset changes.
# - A seeded generator makes the same rows get picked on every run.
undersample_rng = np.random.default_rng(133)
n_class_0_keep = min(5 * class_1.shape[0], class_0.shape[0])
class_0_reduced = class_0[undersample_rng.choice(class_0.shape[0], n_class_0_keep, replace=False), :]

In [33]:
# Combine samples together and randomize the row order.
# The shuffle uses its own seeded generator so the row order (and therefore
# the later train/test membership) is the same on every run.
reduced_data = np.concatenate((class_0_reduced, class_1))
np.random.default_rng(133).shuffle(reduced_data)  # in place, along the rows
print(reduced_data.shape)

(2952, 31)


In [34]:
# Split the sampled rows column-wise: the first 30 columns are the model
# inputs, column 30 is the class label to predict
feature_part = reduced_data[:, :30]
label_part = reduced_data[:, 30]
X = np.array(feature_part)
y = np.array(label_part)

# sklearn Model Implementation

We have created a function for easy model testing of the data. By specifying certain parameters, we are able to run either a Logistic Regression model (with different regularization methods) or an SVM model (with different kernels).

In [70]:
#
#           ====================
#           TEST DEGREES LATER!! (for polynomial kernels)
#           ====================
#

def sklearn_model(X_tr, y_tr, X_ts, y_ts, m_type, c, iters, penalty='none', kernel=None):
    """Fit one scikit-learn classifier and report its train/test performance.

    Parameters
    ----------
    X_tr, y_tr
        Training features and labels.
    X_ts, y_ts
        Test features and labels.
    m_type : int
        0 selects ``LogisticRegression`` (solver ``'saga'``), 1 selects ``svm.SVC``.
    c
        Passed as ``C`` to the selected estimator.
    iters
        Passed as ``max_iter`` to logistic regression; not used by the SVM.
    penalty
        Passed as ``penalty`` to logistic regression; not used by the SVM.
        NOTE(review): newer scikit-learn releases expect ``None`` rather than
        the string ``'none'`` -- confirm against the installed version.
    kernel
        Passed as ``kernel`` to the SVM; ``None`` falls back to ``'rbf'``
        (SVC's own default). Not used by logistic regression.

    Returns
    -------
    tuple
        ``(acc_tr_model, acc_ts_model, c_model, model,
        (confusion_tr, confusion_ts), (report_tr, report_ts))`` where the
        first three items are single-element lists (training accuracy, test
        accuracy, and ``c``) kept as lists for graphing purposes.

    Raises
    ------
    ValueError
        If ``m_type`` is neither 0 nor 1.
    """
    # create model
    if m_type == 0:
        model = LogisticRegression(penalty=penalty, C=c, solver='saga', max_iter=iters)
        print('Training Logistic Regression Model...')
    elif m_type == 1:
        # kernel=None is not a valid SVC kernel, so use SVC's default instead
        model = svm.SVC(kernel='rbf' if kernel is None else kernel, C=c)
        print('Training SVM Model...')
    else:
        # Fail early with a clear message instead of crashing on `None.fit`
        raise ValueError(
            "m_type must be 0 (logistic regression) or 1 (SVM), got %r" % (m_type,)
        )

    # fit the model
    model.fit(X_tr, y_tr)

    # Predict class labels once per data set and reuse them below.
    # (predict_proba is deliberately not used: SVC only provides it when built
    # with probability=True, and the probabilities were never consumed.)
    yhat_tr = model.predict(X_tr)
    yhat_ts = model.predict(X_ts)

    # calculate accuracy values
    acc_tr = model.score(X_tr, y_tr)
    print("Accuracy on training data = %f" % acc_tr)
    acc_ts = model.score(X_ts, y_ts)
    print("Accuracy on test data = %f" % acc_ts)

    # single-element lists (accuracies and c) for graphing purposes if needed
    acc_tr_model = [acc_tr]
    acc_ts_model = [acc_ts]
    c_model = [c]

    # confusion matrices and classification reports for analysis
    confuse_matrix_tr = confusion_matrix(y_tr, yhat_tr)
    class_report_tr = classification_report(y_tr, yhat_tr)
    confuse_matrix_ts = confusion_matrix(y_ts, yhat_ts)
    class_report_ts = classification_report(y_ts, yhat_ts)

    return acc_tr_model, acc_ts_model, c_model, model, (confuse_matrix_tr, confuse_matrix_ts), (class_report_tr, class_report_ts)

# Logistic Regression


### Step 1: Data Preprocessing

In [45]:
# Standardize every feature column with StandardScaler:
# - Mean subtraction: subtract the column mean from every value
# - Normalization: divide by the column standard deviation so that all
#   features end up on roughly the same scale
#       X' = (x - mean) / std
# The transformed data has mean 0 and standard deviation 1 per feature.
# NOTE(review): the scaler is fit on all of X before the train/test split,
# so test-set statistics influence the scaling -- consider fitting on the
# training rows only.

scaler = preprocessing.StandardScaler()
X_scale = scaler.fit_transform(X)

In [46]:
# Split the scaled data into training (70%) and testing (30%) sets.
# `train_test_split` is called through the name imported at the top of the
# notebook; the module name `model_selection` itself is never imported, so
# `model_selection.train_test_split` raises NameError on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size=0.3, random_state=133)

In [53]:
# Show the feature matrix before and after standardization, each under its own banner
for banner, matrix in (
    ('\n======================Before StandardScalar==========================', X),
    ('\n=======================After StandardScalar==========================', X_scale),
):
    print(banner)
    print(matrix)

[[ 1.18356000e+05 -9.04911725e-01 -9.39549794e-01 ... -8.91547618e-02
   4.10001376e-02  2.67950000e+02]
 [ 1.72900000e+04  8.90523708e-01 -7.59219675e-01 ...  6.06610928e-02
   5.40860299e-02  1.65600000e+02]
 [ 3.81870000e+04 -1.12944870e+00  6.40992600e-01 ... -4.72280137e-02
   3.00811990e-02  3.56400000e+01]
 ...
 [ 4.22260000e+04 -2.11690519e-01  5.56751933e-01 ...  1.58927648e-01
   2.22483930e-01  2.77300000e+01]
 [ 7.30180000e+04  1.15502006e+00 -8.77870787e-02 ...  5.80874894e-02
   1.96268764e-02  1.00000000e+00]
 [ 1.42865000e+05 -3.15565336e-01  6.04198629e-01 ...  2.50562693e-02
   3.97947366e-02  8.28000000e+00]]

[[ 5.84134246e-01 -3.59965785e-02 -5.83708612e-01 ... -1.97913344e-01
   9.52579574e-02  6.35859474e-01]
 [-1.55916916e+00  4.46834574e-01 -5.15521857e-01 ...  3.51293799e-02
   1.36948772e-01  2.53335806e-01]
 [-1.11600715e+00 -9.63793932e-02  1.39289751e-02 ... -1.32695123e-01
   6.04709181e-02 -2.32377685e-01]
 ...
 [-1.03035221e+00  1.50425494e-01 -1.792426

### Step 2: Create and run the logistic regression model

In [71]:
# Perform Logistic Regression with no regularization
# (m_type=0 selects logistic regression; penalty keeps its default of 'none')
(acc_tr_logreg, acc_ts_logreg, c_logreg,
 logreg_model, confusion_matrices, class_reports) = sklearn_model(
    X_train, y_train, X_test, y_test,
    m_type=0, c=100000000, iters=10000,
)

Training Logistic Regression Model...
Accuracy on training data = 0.982575
Accuracy on test data = 0.974041


In [76]:
# Print the (train, test) confusion matrices, then the training and the test
# classification reports, each separated by a blank line
print(confusion_matrices, "", class_reports[0], "", class_reports[1], sep="\n")

(array([[1712,    4],
       [  32,  318]], dtype=int64), array([[739,   5],
       [ 18, 124]], dtype=int64))

              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      1716
         1.0       0.99      0.91      0.95       350

    accuracy                           0.98      2066
   macro avg       0.98      0.95      0.97      2066
weighted avg       0.98      0.98      0.98      2066


              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98       744
         1.0       0.96      0.87      0.92       142

    accuracy                           0.97       886
   macro avg       0.97      0.93      0.95       886
weighted avg       0.97      0.97      0.97       886

