In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
%matplotlib inline


In [2]:
rawdata = pd.read_csv('../../datasets/creditcardfraud/creditcard.csv')

print(rawdata.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...         V21       V22       V23       V24  \
0  0.098698  0.363787  ...   -0.018307  0.277838 -0.110474  0.066928   
1  0.085102 -0.255425  ...   -0.225775 -0.638672  0.101288 -0.339846   
2  0.247676 -1.514654  ...    0.247998  0.771679  0.909412 -0.689281   
3  0.377436 -1.387024  ...   -0.108300  0.005274 -0.190321 -1.175575   
4 -0.270533  0.817739  ...   -0.009431  0.798278 -0.137458  0.141267   

        V25       V26       V27       V28  Amount  Class  
0  0.128539 -0.189115

Someone already performed PCA on the data, so the features are not correlated with each other. This can be verified by checking a correlation matrix of the data. For the record, Time is the number of seconds passed since the data began being collected (observations where Time = 0). Amount is the amount of the purchase, and Class is either 1 or 0. Class = 1 indicates a fraudulent transaction. Class = 0 indicates a legitimate transaction. 

In [3]:
x = rawdata.drop('Class', axis=1)
y = rawdata['Class']

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=.2, random_state=10)

Let's try a random forest, an SVM, a boosting classifier, and a logistic regression, and optimize them a bit.

First we'll implement a random forest.

In [4]:
from sklearn.model_selection import GridSearchCV

In [5]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators = 10, min_samples_leaf=50, n_jobs=-1)
rfc.fit(x, y)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=50, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [6]:
yconfidences_rfc = rfc.predict_proba(xtest)


In [7]:
avg_precision_rfc = average_precision_score(ytest, yconfidences_rfc[:, 1])
print('Average Precision-Recall Score: {0:0.5f}'.format(avg_precision_rfc))

Average Precision-Recall Score: 0.86553


In [8]:
rfc_params = {
    'n_estimators' : [10, 20, 30],
    'min_samples_leaf' : [25, 50, 100],
}

gridsearch_rfc_auc = GridSearchCV(rfc, rfc_params, cv=3, scoring='roc_auc')

gridsearch_rfc_auc.fit(xtrain, ytrain)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=50, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 20, 30], 'min_samples_leaf': [25, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [9]:
#This is where we'll retrieve the best params and add them to the model once the above cell finishes running
print(gridsearch_rfc_auc.best_params_)
rfc.set_params(**gridsearch_rfc_auc.best_params_)

{'min_samples_leaf': 100, 'n_estimators': 30}


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=100, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
from sklearn.svm import LinearSVC

#create a linear support vector classifier model
svc = LinearSVC()
svc.get_params()

svc.fit(xtrain, ytrain)
yconfidences_svc = svc.decision_function(xtest)

In [11]:
#define the SVC parameters to test
svc_params = {
    'C' : [.2, .8, 1]
}

#search for the best parameters
gridsearch_svc_auc = GridSearchCV(svc, svc_params, cv=3, scoring='roc_auc')

gridsearch_svc_auc.fit(xtrain, ytrain)

GridSearchCV(cv=3, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.2, 0.8, 1]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=0)

In [12]:
#Retrieve the best parameters from the search and apply them to our model
print(gridsearch_svc_auc.best_params_)
svc.set_params(**gridsearch_svc_auc.best_params_)

{'C': 0.2}


LinearSVC(C=0.2, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

#create a gradient boosting model
gbc = GradientBoostingClassifier(max_features = 5)
gbc.get_params()

gbc.fit(xtrain, ytrain)
yconfidences_gbc = gbc.decision_function(xtest)

In [20]:
#define parameters to test
gbc_params = {
    'min_samples_split' : [2, 3, 5],
    'max_depth' : [1, 3, 5, 8, 10],
    'n_estimators' : [100, 150, 200]
}

#search for the optimal parameters
gridsearch_gbc_auc = GridSearchCV(gbc, gbc_params, cv=3, scoring='roc_auc')

gridsearch_gbc_auc.fit(xtrain, ytrain)

GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=5, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [2, 3, 5], 'max_depth': [1, 3, 5, 8, 10], 'n_estimators': [100, 150, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [21]:
#set the optimized parameters from the gridsearch
print(gridsearch_gbc_auc.best_params_)
gbc.set_params(**gridsearch_gbc_auc.best_params_)

{'max_depth': 8, 'min_samples_split': 2, 'n_estimators': 100}


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=8,
              max_features=5, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [26]:
from sklearn.linear_model import LogisticRegressionCV

#create a logistic regression model
#Selection of an optimized C value is BUILT IN to this model
lrcv = LogisticRegressionCV(n_jobs=-1)
print(lrcv.get_params())

lrcv.fit(xtrain, ytrain)

{'Cs': 10, 'class_weight': None, 'cv': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1.0, 'max_iter': 100, 'multi_class': 'ovr', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'refit': True, 'scoring': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0}


LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [36]:
print(rfc.predict_proba(xtest)[:, 1])
print(rfc.predict(xtest))
rfc.predict_proba(xtest)

[0. 0. 0. ... 0. 0. 0.]
[0 0 0 ... 0 0 0]


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [39]:
#now test the rfc, svc, gbc, and lr(cv) on the test data
#we have a random forest classifier, support vector machine, gradient boosting classifer, and logistic regression model
from sklearn.metrics import roc_auc_score 

#sic rfc on test data
rfc_roc_auc = roc_auc_score(ytest, rfc.predict_proba(xtest)[:, 1])
print('Random Forest Classifier AUROC: ' + str(rfc_roc_auc))

#sic svc on test data
#the decision function returns the distance of a prediction from the hyperplane, I guess, which I guess is like confidence
#since a higher distance from the hyperplane should mean higher confidence
svc_roc_auc = roc_auc_score(ytest, svc.decision_function(xtest))
print('Support Vector Classifier AUROC: ' + str(svc_roc_auc))

#sic gbc on test data
gbc_roc_auc = roc_auc_score(ytest, gbc.predict_proba(xtest)[:, 1])
print('Gradient Boosting Classifier AUROC: ' + str(gbc_roc_auc))

#sic lrcv on test data
lrcv_roc_auc = roc_auc_score(ytest, lrcv.predict_proba(xtest)[:, 1])
print('Logistic Regression AUROC: '  + str(lrcv_roc_auc))


Random Forest Classifier AUROC: 0.9991385425599261
Support Vector Classifier AUROC: 0.8607375946387229
Gradient Boosting Classifier AUROC: 0.8570422134723339
Logistic Regression AUROC: 0.9469342216914423


In [40]:
rfc.score(xtest, ytest)

0.999420666409185

It looks like the random forest classifier does an extremely great job of predicting fraudulent credit card transactions correctly. Nice!