## Introduction

> This project is concerned with the credit card fraud detection in imbalanced data. Particularly, the project aims at  proposing a novel model referred to as SMOTE-CS-SVM integrating the application of The Synthetic Minority Oversampling Technique (SMOTE) with Cost sensitive support vector machine (CS-SVM). Importantly, the results reveal the superiority of the proposed model over other state-of-the-art imbalanced classifiers in terms of accuracy and ROC-AUC scores.

In [76]:
# Imported Libraries
import numpy as np
from numpy import linalg
import cvxopt
import cvxopt.solvers
import pandas as pd
#from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from cvxopt import matrix as cvxopt_matrix
from cvxopt import solvers as cvxopt_solvers
from sklearn import svm
import math 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE 
from sklearn.metrics import roc_auc_score

In [78]:
# Loading data and print out a few lines. 
df = pd.read_csv(r"C:\Users\Kimo Store\fraud_data.csv")

df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,1.176563,0.323798,0.536927,1.047002,-0.368652,-0.728586,0.084678,-0.069246,-0.266389,0.155315,1.535776,1.019947,-0.096511,0.573023,0.215214,0.265249,-0.612064,0.093821,-0.06489,-0.137258,-0.109627,-0.341365,0.057845,0.49918,0.415211,-0.581949,0.015472,0.018065,4.67,0
1,0.681109,-3.934776,-3.801827,-1.147468,-0.73554,-0.501097,1.038865,-0.626979,-2.274423,1.527782,-0.007688,-1.087403,-0.72027,0.855185,-1.070011,-0.869661,0.395302,0.635695,0.249401,1.341809,0.652202,0.272684,-0.982151,0.1659,0.360251,0.195321,-0.256273,0.056501,912.0,0
2,1.140729,0.453484,0.24701,2.383132,0.343287,0.432804,0.09338,0.17331,-0.808999,0.775436,0.726218,0.347648,-0.806752,0.531268,-0.806538,0.154996,-0.319935,-0.33555,-0.648994,-0.232185,-0.003802,0.058556,-0.121177,-0.304215,0.645893,0.1226,-0.012115,-0.005945,1.0,0
3,-1.107073,-3.298902,-0.184092,-1.795744,2.137564,-1.684992,-2.015606,-0.007181,-0.16576,0.869659,-1.027847,-3.220699,3.007008,0.794679,-1.504351,-0.380985,0.667344,0.676138,-0.419469,0.348269,0.130648,0.329445,0.927656,-0.04956,-1.892866,-0.575431,0.266573,0.414184,62.1,0
4,-0.314818,0.866839,-0.124577,-0.627638,2.651762,3.428128,0.194637,0.670674,-0.442658,0.133499,0.148566,-0.474103,-0.011319,-0.410223,1.648979,0.218394,-0.239475,0.422933,1.274986,0.402329,-0.312774,-0.799494,-0.064488,0.953062,-0.42955,0.158225,0.076943,-0.015051,2.67,0


In [82]:
# checking the data shape (we have 21693 observations with 30 features)
df.shape

(21693, 30)

# Describe the data

In [80]:
df.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0,21693.0
mean,-0.032403,0.047565,-0.091623,0.057805,-0.033983,-0.023207,-0.074203,0.002096,-0.044311,-0.091073,0.067173,-0.094268,-0.000667,-0.091499,-0.003917,-0.055079,-0.098357,-0.033488,0.021861,-0.001762,0.012471,0.003743,-0.001662,-0.002446,-0.000406,0.002367,0.001514,0.003203,86.776247,0.016411
std,2.106997,1.690911,1.870289,1.540329,1.530508,1.340599,1.596775,1.41265,1.158554,1.354886,1.154227,1.364933,0.989655,1.356199,0.916582,1.096472,1.424975,0.936769,0.843902,0.727979,0.850009,0.741348,0.629987,0.600144,0.520949,0.478279,0.424688,0.302048,235.644479,0.127052
min,-41.928738,-40.803981,-31.103685,-4.848504,-32.092129,-20.367836,-41.506796,-38.987263,-13.434066,-24.403185,-3.995739,-18.553697,-3.844974,-19.214325,-4.498945,-14.129855,-24.019099,-9.498746,-4.395283,-21.024817,-21.453736,-8.887017,-21.303666,-2.766638,-4.541819,-1.855355,-7.764147,-6.520075,0.0,0.0
25%,-0.929371,-0.592921,-0.962975,-0.850069,-0.698296,-0.779041,-0.565297,-0.205943,-0.669752,-0.554596,-0.739193,-0.43873,-0.63403,-0.438416,-0.582193,-0.492689,-0.498984,-0.513216,-0.444441,-0.20971,-0.225142,-0.538258,-0.162395,-0.356356,-0.317296,-0.326141,-0.069938,-0.053334,5.37,0.0
50%,0.007545,0.075215,0.176534,-0.012868,-0.063948,-0.281565,0.030859,0.023159,-0.073996,-0.099291,0.005596,0.126666,-0.018642,0.04487,0.048753,0.059504,-0.075797,-0.019269,0.021719,-0.05733,-0.024133,0.007273,-0.012327,0.036878,0.011561,-0.04487,0.002475,0.011765,21.95,0.0
75%,1.315678,0.819749,1.020809,0.772388,0.615287,0.383633,0.563751,0.328411,0.590212,0.445474,0.786044,0.613676,0.652241,0.490003,0.642463,0.525327,0.389992,0.4947,0.48493,0.139059,0.192954,0.530333,0.146616,0.431931,0.354043,0.238629,0.095859,0.081749,76.48,0.0
max,2.451888,21.467203,4.069865,12.114672,29.162172,21.393069,34.303177,20.007208,9.125535,12.701539,12.018913,3.966626,4.099352,6.441021,5.720479,6.442798,6.609366,3.790316,4.851255,13.119819,27.202839,8.361985,15.626067,4.014444,5.541598,3.463246,9.879903,9.876371,7712.43,1.0


In [83]:
# inspect data types and look for missing values (no missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21693 entries, 0 to 21692
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      21693 non-null  float64
 1   V2      21693 non-null  float64
 2   V3      21693 non-null  float64
 3   V4      21693 non-null  float64
 4   V5      21693 non-null  float64
 5   V6      21693 non-null  float64
 6   V7      21693 non-null  float64
 7   V8      21693 non-null  float64
 8   V9      21693 non-null  float64
 9   V10     21693 non-null  float64
 10  V11     21693 non-null  float64
 11  V12     21693 non-null  float64
 12  V13     21693 non-null  float64
 13  V14     21693 non-null  float64
 14  V15     21693 non-null  float64
 15  V16     21693 non-null  float64
 16  V17     21693 non-null  float64
 17  V18     21693 non-null  float64
 18  V19     21693 non-null  float64
 19  V20     21693 non-null  float64
 20  V21     21693 non-null  float64
 21  V22     21693 non-null  float64
 22

In [85]:
#The percentage of the observations in the dataset are instances of fraud (the data is highly imbalanced)
fraudInstances = float(sum(df['Class']==1))
fraudInstances/len(df)

0.016410823768035772

In [86]:
# Splitting the features into dependent and explanatory features
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X.shape, y.shape

((21693, 30), (21693, 29), (21693,))

# SVM Classifier with RBF kernel before applying SMOTE

In [137]:
# splitting original data into train and test
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y,test_size=0.2,random_state=10)

In [139]:
clf_svm1 = svm.SVC(kernel='rbf', gamma=0.0007, C=10000)
clf_svm1.fit(X_train1, y_train1)
y_pred_svm1 = clf_svm1.predict(X_test1) 
acc_svm1 = accuracy_score(y_test1, y_pred_svm1)
auc_svm1 = roc_auc_score(y_test1, y_pred_svm1, average=None)
print ("Overall RBF KERNEL SVM accuracy: ",acc_svm1)
print ("Overall RBF KERNEL SVM ROC_AUC: ", auc_svm1)
print(confusion_matrix(y_test1, y_pred_svm1))

Overall RBF KERNEL SVM accuracy:  0.9923945609587462
Overall RBF KERNEL SVM ROC_AUC:  0.874954241565998
[[4251   15]
 [  18   55]]


# CS-SVM Classifier with RBF kernel before applying SMOTE

In [150]:
cs_svm1 = svm.SVC(kernel='rbf', class_weight={0: 1, 1: 4}, gamma=0.0007, C=10000) 

cs_svm1.fit(X_train1, y_train1)
y_pred_cs_svm1 = cs_svm1.predict(X_test1) 
acc_cs_svm1 = accuracy_score(y_test1, y_pred_cs_svm1)
auc_cs_svm1 = roc_auc_score(y_test1, y_pred_cs_svm1, average=None)
print ("Overall RBF KERNEL CS-SVM accuracy: ", acc_cs_svm1)
print ("Overall RBF KERNEL CS-SVM ROC_AUC: ", auc_cs_svm1)
print(confusion_matrix(y_test1, y_pred_cs_svm1))

Overall RBF KERNEL CS-SVM accuracy:  0.9912422217100715
Overall RBF KERNEL CS-SVM ROC_AUC:  0.8743682124989564
[[4246   20]
 [  18   55]]


# Easy Ensemble Classifier before applying SMOTE

In [141]:
from imblearn.ensemble import EasyEnsembleClassifier 

eec1 = EasyEnsembleClassifier(random_state=42)
eec1.fit(X_train1, y_train1) 
EasyEnsembleClassifier(...)
y_pred_eec1 = eec1.predict(X_test1)
acc_eec1 = accuracy_score(y_test1, y_pred_eec1)

auc_eec1 = roc_auc_score(y_test1, y_pred_eec1, average=None)
print ("Overall Easy Ensemble Classifier accuracy: ", acc_eec1)
print ("Overall Easy Ensemble Classifier ROC_AUC: ", auc_eec1)
print(confusion_matrix(y_test1, y_pred_eec1))

Overall Easy Ensemble Classifier accuracy:  0.9596681262963817
Overall Easy Ensemble Classifier ROC_AUC:  0.96602476414337
[[4093  173]
 [   2   71]]


## SMOTE Technique (Over-Sampling):
The Synthetic Minority Oversampling Technique where new examples can be synthesized from the existing examples. This is a type of data augmentation for the minority class to deal with the problem of imbalanced data.

In [94]:
## applying oversampling technique to make the ratio of minority class to majority class 1 to 6
sm = SMOTE(sampling_strategy ={1: 3560, 0: 21337}, random_state=42)
X_res, y_res = sm.fit_resample(X, y)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({0: 21337, 1: 3560})


In [95]:
## splitting synthetic data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res,test_size=0.2,random_state=10)


(4980, 29)

## Comparing CS-SVM with state-of-the-art calssifiers in dealing with Imbalanced data after  applying SMOTE

## Logistic Regression

In [103]:
from sklearn import datasets, linear_model

estimator = linear_model.LogisticRegression(solver="liblinear", multi_class="ovr")
estimator.fit(X_train, y_train)
y_pred_lr = estimator.predict(X_test) 
acc_lr = accuracy_score(y_test, y_pred_lr)

auc_cs_lr = roc_auc_score(y_test, y_pred_lr, average=None)
print ("Overall LR accuracy: ",acc_lr)

print ("Overall LR ROC_AUC: ", auc_cs_lr)

print(confusion_matrix(y_test, y_pred_lr))

Overall LR accuracy:  0.9785140562248996
Overall LR ROC_AUC:  0.9392424364419573
[[4209   19]
 [  88  664]]


# SVM Classifier with RBF kernel

In [135]:

clf_svm = svm.SVC(kernel='rbf', gamma=0.0007, C=10000)
clf_svm.fit(X_train, y_train)
y_pred_svm = clf_svm.predict(X_test) 
acc_svm = accuracy_score(y_test, y_pred_svm)
auc_svm = roc_auc_score(y_test, y_pred_svm, average=None)
print ("Overall RBF KERNEL SVM accuracy: ",acc_svm)
print ("Overall RBF KERNEL SVM ROC_AUC: ", auc_svm)
print(confusion_matrix(y_test, y_pred_svm))

Overall RBF KERNEL SVM accuracy:  0.9939759036144579
Overall RBF KERNEL SVM ROC_AUC:  0.9915325137381993
[[4207   21]
 [   9  743]]


# Cost-sensitive SVM Classifier with RBF kernel

In [149]:
cs_svm = svm.SVC(kernel='rbf', class_weight={0: 1, 1: 4}, gamma=0.0007, C=10000) 

cs_svm.fit(X_train, y_train)#0:1,1:2
y_pred_cs_svm = cs_svm.predict(X_test) 
acc_cs_svm = accuracy_score(y_test, y_pred_cs_svm)
auc_cs_svm = roc_auc_score(y_test, y_pred_cs_svm, average=None)
print ("Overall RBF KERNEL CS-SVM accuracy: ", acc_cs_svm)
print ("Overall RBF KERNEL CS-SVM ROC_AUC: ", auc_cs_svm)
print(confusion_matrix(y_test, y_pred_cs_svm))

Overall RBF KERNEL CS-SVM accuracy:  0.9943775100401606
Overall RBF KERNEL CS-SVM ROC_AUC:  0.9923156665794401
[[4208   20]
 [   8  744]]


## Easy Ensemble Classifier

In [107]:
from imblearn.ensemble import EasyEnsembleClassifier 

eec = EasyEnsembleClassifier(random_state=42)
eec.fit(X_train, y_train) 
EasyEnsembleClassifier(...)
y_pred_eec = eec.predict(X_test)
acc_eec = accuracy_score(y_test, y_pred_eec)

auc_eec = roc_auc_score(y_test, y_pred_eec, average=None)
print ("Overall Easy Ensemble Classifier accuracy: ", acc_eec)
print ("Overall Easy Ensemble Classifier ROC_AUC: ", auc_eec)
print(confusion_matrix(y_test, y_pred_eec))

Overall Easy Ensemble Classifier accuracy:  0.9825301204819277
Overall Easy Ensemble Classifier ROC_AUC:  0.9760455876728598
[[4166   62]
 [  25  727]]


## AdaBoost Classifier

In [124]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Create and fit an AdaBoosted decision tree
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=50)

bdt.fit(X_train, y_train) 
y_pred_bdt = bdt.predict(X_test)
acc_bdt = accuracy_score(y_test, y_pred_bdt)

auc_bdt = roc_auc_score(y_test, y_pred_bdt, average=None)
print ("Overall AdaBoost Classifier accuracy: ", acc_bdt)
print ("Overall AdaBoost Classifier ROC_AUC: ", auc_bdt)
print(confusion_matrix(y_test, y_pred_bdt))

Overall AdaBoost Classifier accuracy:  0.9761044176706827
Overall AdaBoost Classifier ROC_AUC:  0.9378233257513234
[[4197   31]
 [  88  664]]


## Cost-sensitive AdaBoost Classifier

In [117]:

# Create and fit an AdaBoosted decision tree sample weighted

sample_weight_constant = np.ones(len(y_train))
# and bigger weights to some outliers
sample_weight_constant[np.where(y_train == 1)] *= 5
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=50)

y_pred_cs_bdt = bdt.fit(X_train, y_train, sample_weight_constant).predict(X_test) 
acc_cs_bdt = accuracy_score(y_test, y_pred_cs_bdt)

auc_cs_bdt = roc_auc_score(y_test, y_pred_cs_bdt, average=None)
print ("Overall Cost-sensitive AdaBoost Classifier accuracy: ", acc_cs_bdt)
print ("Overall Cost-sensitive AdaBoost Classifier ROC_AUC: ", auc_cs_bdt)
print(confusion_matrix(y_test, y_pred_cs_bdt))


Overall Cost-sensitive AdaBoost Classifier accuracy:  0.9736947791164658
Overall Cost-sensitive AdaBoost Classifier ROC_AUC:  0.9544431500231486
[[4152   76]
 [  55  697]]


## Random Forest Classifier

In [134]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

rfc = RandomForestClassifier(max_depth=4, random_state=0)
rfc.fit(X_train, y_train) 
RandomForestClassifier(...)

y_pred_rfc = rfc.predict(X_test)
acc_cs_rfc = accuracy_score(y_test, y_pred_rfc)

auc_rfc = roc_auc_score(y_test, y_pred_rfc, average=None)
print ("Overall Random Forest Classifier accuracy: ", acc_cs_rfc)
print ("Overall Random Forest Classifier ROC_AUC: ", auc_rfc)
print(confusion_matrix(y_test, y_pred_rfc))


Overall Random Forest Classifier accuracy:  0.9771084337349397
Overall Random Forest Classifier ROC_AUC:  0.9252953964451781
[[4226    2]
 [ 112  640]]


## Conclusion

Cost sensitive support vector machine (CS-SVM) is another version of support vector machine (SVM) that is modified in order to deal with the issue of class imbalance. I propose a model referred to as SMOTE-CS-SVM integrating the application of The Synthetic Minority Oversampling Technique (SMOTE) with CS-SVM . The experimental findings reveal that the proposed model outperforms not only LR, SVM and CS-SVM but also three other state-of-the-art classifiers including EasyEnsemble, cost-sensitive adaptive boosting, random forest. In this regard, I conclude that the proposed  model is a decent and practical approach to support credit card fraud detection of highly imbalanced data.