In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


In [3]:
# Loading the training sets and the test set 
X_train = np.load('~/creditcardfraud/np_X_train.npy')
X_test = np.load('~/creditcardfraud/np_X_test.npy')
X_train_balanced = np.load('~/creditcardfraud/np_X_train_balanced.npy')
Y_train = np.load('~/creditcardfraud/np_Y_train.npy')
Y_test = np.load('~/creditcardfraud/np_Y_test.npy')
Y_train_balanced = np.load('~/creditcardfraud/np_Y_train_balanced.npy')
X_train_balanced42 = np.load('~/creditcardfraud/np_X_train_balanced42.npy')
Y_train_balanced42 = np.load('~/creditcardfraud/np_Y_train_balanced42.npy')
X_train_balanced2 = np.load('~/creditcardfraud/np_X_train_balanced2.npy')
Y_train_balanced2 = np.load('~/creditcardfraud/np_Y_train_balanced2.npy')

In [4]:
# Assigning the classifiers
Classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Ada Boost": AdaBoostClassifier(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier()
}

In [4]:
# Evaluating the accuracy of classifiers with cross fold validation - Overfitting
for key, classifier in Classifiers.items():
    classifier.fit(X_train, Y_train)
    Score = cross_val_score(classifier, X_train, Y_train)
    print('Classifier: ', classifier.__class__.__name__, ': {:.2f}%'.format(round(Score.mean()*100, 2)))

Classifier:  LogisticRegression : 99.89%
Classifier:  AdaBoostClassifier : 99.92%
Classifier:  RandomForestClassifier : 99.95%
Classifier:  KNeighborsClassifier : 99.83%


In [5]:
# Evaluating the accuracy of classifiers with cross fold validation
for key, classifier in Classifiers.items():
    classifier.fit(X_train_balanced, Y_train_balanced)
    Score = cross_val_score(classifier, X_train_balanced, Y_train_balanced)
    print('Classifier: ', classifier.__class__.__name__, ': {:.2f}%'.format(round(Score.mean()*100, 2)))

Classifier:  LogisticRegression : 97.26%
Classifier:  AdaBoostClassifier : 98.49%
Classifier:  RandomForestClassifier : 99.98%
Classifier:  KNeighborsClassifier : 94.78%


In [6]:
# Evaluating the accuracy of classifiers with cross fold validation - Different dataset 
for key, classifier in Classifiers.items():
    classifier.fit(X_train_balanced42, Y_train_balanced42)
    Score = cross_val_score(classifier, X_train_balanced42, Y_train_balanced42)
    print('Classifier: ', classifier.__class__.__name__, ': {:.2f}%'.format(round(Score.mean()*100, 2)))

Classifier:  LogisticRegression : 97.18%
Classifier:  AdaBoostClassifier : 98.49%
Classifier:  RandomForestClassifier : 99.98%
Classifier:  KNeighborsClassifier : 94.78%


In [7]:
# Evaluating the accuracy of classifiers with cross fold validation - Different dataset
for key, classifier in Classifiers.items():
    classifier.fit(X_train_balanced2, Y_train_balanced2)
    Score = cross_val_score(classifier, X_train_balanced2, Y_train_balanced2)
    print('Classifier: ', classifier.__class__.__name__, ': {:.2f}%'.format(round(Score.mean()*100, 2)))

Classifier:  LogisticRegression : 96.55%
Classifier:  AdaBoostClassifier : 98.50%
Classifier:  RandomForestClassifier : 99.98%
Classifier:  KNeighborsClassifier : 94.78%


In [5]:
from sklearn.metrics import confusion_matrix

In [9]:
# Running the classiffiers
for key, classifier in Classifiers.items():
    classifier_fitted = classifier.fit(X_train_balanced, Y_train_balanced.ravel())
    y_pred = classifier_fitted.predict(X_test)
    print('\n Classifier: ', classifier.__class__.__name__,'\n', classification_report(Y_test, y_pred))
    print('\n Confusion Matrix: ', classifier.__class__.__name__,'\n', confusion_matrix(Y_test, y_pred))
    print('\n ROC Score: ', classifier.__class__.__name__, '\n', roc_auc_score(Y_test,y_pred))


 Classifier:  LogisticRegression 
               precision    recall  f1-score   support

         0.0       1.00      0.99      0.99     85308
         1.0       0.09      0.84      0.16       135

    accuracy                           0.99     85443
   macro avg       0.55      0.91      0.58     85443
weighted avg       1.00      0.99      0.99     85443


 Confusion Matrix:  LogisticRegression 
 [[84174  1134]
 [   22   113]]

 ROC Score:  LogisticRegression 
 0.9118720140875156

 Classifier:  AdaBoostClassifier 
               precision    recall  f1-score   support

         0.0       1.00      0.99      0.99     85308
         1.0       0.12      0.84      0.20       135

    accuracy                           0.99     85443
   macro avg       0.56      0.92      0.60     85443
weighted avg       1.00      0.99      0.99     85443


 Confusion Matrix:  AdaBoostClassifier 
 [[84434   874]
 [   21   114]]

 ROC Score:  AdaBoostClassifier 
 0.9170996076960348

 Classifier:  Rando

In [10]:
# Running the classifiers - Different training set
for key, classifier in Classifiers.items():
    classifier_fitted = classifier.fit(X_train_balanced42, Y_train_balanced42.ravel())
    y_pred = classifier_fitted.predict(X_test)
    print('\n Classifier: ', classifier.__class__.__name__,'\n', classification_report(Y_test, y_pred))
    print('\n Confusion Matrix: ', classifier.__class__.__name__,'\n', confusion_matrix(Y_test, y_pred))
    print('\n ROC Score: ', classifier.__class__.__name__, '\n', roc_auc_score(Y_test, y_pred))


 Classifier:  LogisticRegression 
               precision    recall  f1-score   support

         0.0       1.00      0.99      0.99     85308
         1.0       0.09      0.84      0.16       135

    accuracy                           0.99     85443
   macro avg       0.55      0.91      0.58     85443
weighted avg       1.00      0.99      0.99     85443


 Confusion Matrix:  LogisticRegression 
 [[84172  1136]
 [   22   113]]

 ROC Score:  LogisticRegression 
 0.9118602918574785

 Classifier:  AdaBoostClassifier 
               precision    recall  f1-score   support

         0.0       1.00      0.99      0.99     85308
         1.0       0.12      0.84      0.20       135

    accuracy                           0.99     85443
   macro avg       0.56      0.91      0.60     85443
weighted avg       1.00      0.99      0.99     85443


 Confusion Matrix:  AdaBoostClassifier 
 [[84453   855]
 [   22   113]]

 ROC Score:  AdaBoostClassifier 
 0.9135072651776829

 Classifier:  Rando

In [6]:
# Running the classifiers - Different training set
for key, classifier in Classifiers.items():
    classifier_fitted = classifier.fit(X_train_balanced2, Y_train_balanced2.ravel())
    y_pred = classifier_fitted.predict(X_test)
    print('\n Classifier: ', classifier.__class__.__name__,'\n', classification_report(Y_test, y_pred))
    print('\n Confusion Matrix: ', classifier.__class__.__name__,'\n', confusion_matrix(Y_test, y_pred))
    print('\n ROC Score: ', classifier.__class__.__name__, '\n', roc_auc_score(Y_test, y_pred))


 Classifier:  LogisticRegression 
               precision    recall  f1-score   support

         0.0       1.00      0.99      0.99     85308
         1.0       0.09      0.84      0.16       135

    accuracy                           0.99     85443
   macro avg       0.54      0.91      0.58     85443
weighted avg       1.00      0.99      0.99     85443


 Confusion Matrix:  LogisticRegression 
 [[84169  1139]
 [   22   113]]

 ROC Score:  LogisticRegression 
 0.911842708512423

 Classifier:  AdaBoostClassifier 
               precision    recall  f1-score   support

         0.0       1.00      0.99      0.99     85308
         1.0       0.11      0.85      0.20       135

    accuracy                           0.99     85443
   macro avg       0.55      0.92      0.59     85443
weighted avg       1.00      0.99      0.99     85443


 Confusion Matrix:  AdaBoostClassifier 
 [[84379   929]
 [   20   115]]

 ROC Score:  AdaBoostClassifier 
 0.9204809500737199

 Classifier:  Random