In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.svm import SVC

In [2]:
# Load the transactions table from the CSV file (path is relative to the notebook)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='creditcard.csv')
df.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Features: every column except the target and the 'Time' / 'Amount' columns.
X = df.drop(columns=['Class','Time','Amount'])
# Target: the 'Class' column.
y = df['Class']
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# col_names = ['Amount','Time']
# features = X_train_standard [col_names]
# ss = StandardScaler()
# scaler = ss.fit(features.values)
# features = scaler.transform(features.values)
# X_train_standard [col_names]=features


In [None]:
#X_train_standard['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1,1))

In [4]:
# Oversample the training split with SMOTE and print the resulting class counts.
# `fit_resample` is the supported imbalanced-learn method; `fit_sample` was a
# deprecated alias that newer releases removed. The seed makes the synthetic
# samples repeatable between runs.
X_train_resampled, y_train_resampled = SMOTE(random_state=123).fit_resample(X_train, y_train)
print(pd.Series(y_train_resampled).value_counts())

1    227468
0    227468
Name: Class, dtype: int64


In [5]:
# Undersample the training split and print the resulting class counts.
# The seed fixes which rows are kept, so every model trained on
# X_train_rus / y_train_rus sees the same data on each run.
rus = RandomUnderSampler(random_state=123)
# `fit_resample` replaces the removed `fit_sample` alias.
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
print(pd.Series(y_train_rus).value_counts())

1    377
0    377
Name: Class, dtype: int64


In [6]:
#Function
def perform_evaluate(title,cm,X,y):
    """Print accuracy, precision, recall and F1 for a fitted classifier.

    Parameters
    ----------
    title : str
        Heading printed before the metrics.
    cm : estimator
        Fitted classifier; only its ``predict`` method is used.
    X : array-like
        Features passed to ``cm.predict``.
    y : array-like
        True labels the predictions are compared against.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    print(title)
    y_pred = cm.predict(X)
    print("accuracy: {}".format(accuracy_score(y,y_pred)))
    print("precision: {}".format(precision_score(y,y_pred)))
    print("recall: {}".format(recall_score(y,y_pred)))
    # The value is computed with f1_score, so it is labelled "f1"
    # (it used to be printed under the misleading label "f2").
    print("f1: {}".format(f1_score(y,y_pred,average='binary')))
    print(" ")

def grid_search(cm, param_grid, cv, X_train,y_train,X_test, y_test ):
    """Run a cross-validated grid search and print a short summary.

    Parameters
    ----------
    cm : estimator
        Estimator to tune (GridSearchCV works on clones of it).
    param_grid : dict or list of dict
        Candidate hyper-parameter values.
    cv : int or CV splitter
        Cross-validation strategy forwarded to ``GridSearchCV``.
    X_train, y_train : array-like
        Data the search is fitted on.
    X_test, y_test : array-like
        Data the refitted best estimator is scored on.

    Returns
    -------
    None
        The scores and the best parameters are printed.
    """
    # Use the arguments supplied by the caller; the earlier version ignored
    # them and always searched the global rf_clf / rf_param_grid with cv=3.
    search = GridSearchCV(cm, param_grid, cv=cv,
                          return_train_score=True)
    search.fit(X_train, y_train)

    # Average of the per-candidate mean training scores.
    mean_training_score = np.mean(search.cv_results_['mean_train_score'])
    # Score of the refitted best estimator on the test data.
    test_score = search.score(X_test, y_test)
    print(f"Mean Training Score: {mean_training_score :.2%}")
    print(f"Mean Test Score: {test_score :.2%}")
    print(f"Best Parameter Combination Found During Grid Search: {search.best_params_ :}")

def plot_conf_matrix(cm,x,y,class_names,title):
    """Draw the confusion matrix of a fitted classifier and show the figure.

    ``cm`` is the fitted estimator, ``x`` / ``y`` the features and true labels,
    ``class_names`` the tick labels and ``title`` the figure title.
    """
    display_options = {"display_labels": class_names, "cmap": plt.cm.Blues}
    plot_confusion_matrix(cm, x, y, **display_options)
    plt.title(title)
    plt.show()
    
def plot_roc(cm,X,y,title):
    """Plot the ROC curve of a fitted classifier and show the figure.

    Parameters
    ----------
    cm : estimator
        Fitted classifier handed to ``plot_roc_curve``.
    X, y : array-like
        Features and true labels the curve is computed on.
    title : str
        Figure title.
    """
    # plot_roc_curve obtains the scores from the estimator itself, so the
    # separate (unused) predict call was dropped.
    plot_roc_curve(cm, X, y)
    # Dashed red diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
    plt.title(title)
    plt.show()
    
# Display labels passed to the confusion-matrix plots and classification reports.
# presumably ordered to match Class 0 / Class 1 — confirm against the dataset.
class_names = ["Normal","Fraud"]

In [7]:
# Baseline random forest (depth capped at 5, fixed seed) trained on the
# undersampled training data.
rf_clf = RandomForestClassifier(
    max_depth=5,
    criterion='gini',
    random_state=123,
)
rf_clf.fit(X=X_train_rus, y=y_train_rus)

RandomForestClassifier(max_depth=5, random_state=123)

In [8]:
# Report the baseline forest's metrics on the undersampled training data and on
# the untouched test split ("Traning" in the heading was a typo).
perform_evaluate("Random Forest Classifier Training data:",rf_clf,X_train_rus,y_train_rus)
perform_evaluate("Random Forest Classifier Test data:",rf_clf,X_test,y_test)

Random Forest Classifier Traning data:
accuracy: 0.9602122015915119
precision: 1.0
recall: 0.9204244031830239
f2: 0.9585635359116021
 
Random Forest Classifier Test data:
accuracy: 0.9846388820617253
precision: 0.10824742268041238
recall: 0.9130434782608695
f2: 0.19354838709677422
 


In [9]:
# Candidate hyper-parameters for the random-forest grid search.
rf_param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 6],
    min_samples_split=[5, 10],
    min_samples_leaf=[1, 3],
    n_estimators=[200, 500],
)

In [None]:
# Tune the random forest with 3-fold cross-validation on the undersampled data
# and score the result on the test split.
grid_search(
    cm=rf_clf,
    param_grid=rf_param_grid,
    cv=3,
    X_train=X_train_rus,
    y_train=y_train_rus,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Random forest with hard-coded hyper-parameters
# (presumably the grid-search winners — confirm against the search output).
# A seed is set so the forest is repeatable between runs.
rf = RandomForestClassifier(criterion='entropy',
                            max_depth=None,
                            min_samples_split=5,
                            min_samples_leaf=1,
                            n_estimators=500,
                            random_state=123)
rf.fit(X_train_rus, y_train_rus)
# Evaluate the model fitted above (`rf`); the earlier version reported the
# baseline `rf_clf` again under the "with GridSearch" heading.
perform_evaluate("Random Forest Classifier with GridSearch Training data:",rf,X_train_rus,y_train_rus)
perform_evaluate("Random Forest Classifier with GridSearch Test data:",rf,X_test,y_test)

In [None]:
# Training-data confusion matrix and ROC curve for the random forest `rf`
# (titles corrected from "Farest" to "Forest").
plot_conf_matrix(rf,X_train_rus, y_train_rus,class_names,'Random Forest Confusion matrix')
plot_roc(rf,X_train_rus, y_train_rus,'Random Forest ROC Curve')


In [None]:
# Training-data diagnostics for the random forest `rf`, drawn with the shared
# plotting helpers instead of a copy of their bodies. The redundant re-import
# and the unused prediction were dropped; the title typo ("Farest") is fixed.
plot_conf_matrix(rf, X_train_rus, y_train_rus, class_names,
                 'Random Forest Confusion matrix')
plot_roc(rf, X_train_rus, y_train_rus, 'ROC Curve for Training data')

In [None]:
# Confusion matrix of the random forest `rf` on the test split.
# The title used to say "Training Data" although X_test / y_test are plotted.
plot_conf_matrix(rf, X_test, y_test, class_names,
                 'Random Forest Confusion matrix for Test Data')


In [None]:
# Test-set ROC curve and classification report for the random forest `rf`,
# the same model the neighbouring confusion-matrix cells plot. The earlier
# version used the baseline `rf_clf` here.
plot_roc(rf, X_test, y_test, 'ROC Curve')

y_pred = rf.predict(X_test)
print(classification_report( y_test, y_pred, target_names=class_names))

In [None]:
# Baseline logistic regression on the undersampled training data.
# The search below tries both 'l1' and 'l2' penalties. The default 'lbfgs'
# solver does not support 'l1', so those candidates would fail to fit;
# 'liblinear' supports both penalties.
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train_rus, y_train_rus)

param_grid = {'C':[0.1,1, 10, 100],'penalty' : ['l1','l2'],
            'class_weight' : ['balanced', None]}

# Randomized search with 5-fold CV; the seed fixes which candidates are sampled.
logre_rs_search = RandomizedSearchCV(logreg, param_grid,cv=5,
                                     return_train_score=True,
                                     verbose = 1, n_jobs = -1,
                                     random_state=123)
logre_rs_search.fit(X_train_rus, y_train_rus)
# Average of the per-candidate mean training scores.
logre_rs_training_score = np.mean(logre_rs_search.cv_results_['mean_train_score'])
# Score of the refitted best estimator on the test split.
logre_testing_score = logre_rs_search.score(X_test, y_test)

print(f"Mean Training Score: {logre_rs_training_score :.2%}")
print(f"Mean Test Score: {logre_testing_score :.2%}")
print("Best Parameter Combination Found During Randomized Search:")
logre_rs_search.best_params_

In [None]:
# Logistic regression with hard-coded hyper-parameters
# (presumably chosen from the randomized search — confirm).
logre = LogisticRegression(
    C=0.1,
    penalty='l2',
    class_weight='balanced',
)
logre.fit(X=X_train_rus, y=y_train_rus)

In [None]:
# Training-data diagnostics for the logistic regression model, drawn with the
# shared plotting helpers; the unused probability scores were dropped.
plot_conf_matrix(logre, X_train_rus, y_train_rus, class_names,
                 'Logistic Regression Confusion matrix')
plot_roc(logre, X_train_rus, y_train_rus, 'ROC Curve for Training data')

y_pred = logre.predict(X_train_rus)
print(classification_report( y_train_rus, y_pred, target_names=class_names))

In [None]:
# Test-set diagnostics for the logistic regression model, drawn with the shared
# plotting helpers; the redundant re-import and unused scores were dropped.
plot_conf_matrix(logre, X_test, y_test, class_names,
                 'Logistic Regression Confusion matrix for Test Data')
plot_roc(logre, X_test, y_test, 'ROC Curve')

y_pred = logre.predict(X_test)
print(classification_report( y_test, y_pred, target_names=class_names))

In [None]:
#Extreme Gradient Boosting (XGB)
# Baseline XGBoost classifier (library defaults, all cores) fitted on the
# undersampled training data. xgboost is already imported in the first cell;
# the predictions computed here were never read before being overwritten,
# so they were dropped.
xgb_cfl = xgb.XGBClassifier(n_jobs = -1)
xgb_cfl.fit(X_train_rus, y_train_rus)

In [None]:
from sklearn.metrics import accuracy_score

# Hyper-parameter candidates: only the number of trees varies, the other
# settings are single values.
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.1],
    'max_depth': [6],
    'min_child_weight': [10],
    'subsample': [0.7],
}
# Grid search over the XGBoost model, selecting candidates by F1.
CV_xgb_cfl = GridSearchCV(xgb_cfl, param_grid, scoring='f1')
CV_xgb_cfl.fit(X_train_rus, y_train_rus)

# Accuracy of the refitted best model on the training data and the test split.
training_preds = CV_xgb_cfl.predict(X_train_rus)
training_accuracy = accuracy_score(y_train_rus, training_preds)
val_preds = CV_xgb_cfl.predict(X_test)
val_accuracy = accuracy_score(y_test, val_preds)

print("")
print("Training Accuracy: {:.4}%".format(100 * training_accuracy))
print("Validation accuracy: {:.4}%".format(100 * val_accuracy))

best_parameters = CV_xgb_cfl.best_params_
print("The best parameters for using this model is", best_parameters)


In [None]:
# XGBoost with hard-coded hyper-parameters
# (presumably taken from the grid search — confirm).
xgb_cfl = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=10,
    subsample=0.7,
    n_jobs=-1,
)
xgb_cfl.fit(X=X_train_rus, y=y_train_rus)


In [None]:
# Training-data diagnostics for the XGBoost model, drawn with the shared
# plotting helpers; the unused probability scores were dropped.
plot_conf_matrix(xgb_cfl, X_train_rus, y_train_rus, class_names,
                 'XGB Confusion matrix')
plot_roc(xgb_cfl, X_train_rus, y_train_rus, 'ROC Curve for Training data')

y_pred = xgb_cfl.predict(X_train_rus)
print(classification_report( y_train_rus, y_pred, target_names=class_names))

In [None]:
# Test-set diagnostics for the XGBoost model.
# The titles were copy-paste leftovers ("Logistic Regression ...",
# "... Training data"); they now name the model and data actually plotted.
plot_conf_matrix(xgb_cfl, X_test, y_test, class_names,
                 'XGB Confusion matrix for Test Data')
plot_roc(xgb_cfl, X_test, y_test, 'ROC Curve for Test data')

y_pred = xgb_cfl.predict(X_test)
print(classification_report( y_test, y_pred, target_names=class_names))

In [None]:
#GBM
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

# Baseline gradient-boosting classifier (defaults plus a fixed seed), fitted on
# the undersampled training data.
gbt_clf = GradientBoostingClassifier(random_state=42)
gbt_clf.fit(X=X_train_rus, y=y_train_rus)

In [None]:
# Hyper-parameter candidates for the gradient-boosting grid search.
param_grid = {
    'n_estimators': [100, 300, 400],
    'learning_rate': [0.01, .005],
    'max_depth': [5, 6],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [5, 6],
    'subsample': [0.85],
    'random_state': [10],
    'max_features': [7],
}
# Grid search over the gradient-boosting model, selecting candidates by F1.
gbt_clf_grid = GridSearchCV(gbt_clf, param_grid, scoring='f1')
gbt_clf_grid.fit(X_train_rus, y_train_rus)

# Accuracy of the refitted best model on the training data and the test split.
training_preds = gbt_clf_grid.predict(X_train_rus)
training_accuracy = accuracy_score(y_train_rus, training_preds)
val_preds = gbt_clf_grid.predict(X_test)
val_accuracy = accuracy_score(y_test, val_preds)

print("")
print("Training Accuracy: {:.4}%".format(100 * training_accuracy))
print("Validation accuracy: {:.4}%".format(100 * val_accuracy))

best_parameters = gbt_clf_grid.best_params_
print("The best parameters for using this model is", best_parameters)

In [None]:
# Gradient boosting with hard-coded hyper-parameters
# (presumably taken from the grid search — confirm).
gbt_clf = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=5,
    max_features=7,
    min_samples_split=5,
    min_samples_leaf=5,
    subsample=0.85,
    random_state=10,
)
gbt_clf.fit(X=X_train_rus, y=y_train_rus)

In [None]:
# Training-data diagnostics for the gradient-boosting model, drawn with the
# shared plotting helpers; the unused probability scores were dropped.
plot_conf_matrix(gbt_clf, X_train_rus, y_train_rus, class_names,
                 'GradientBoostingClassifier Confusion matrix')
plot_roc(gbt_clf, X_train_rus, y_train_rus, 'ROC Curve for Training data')

y_pred = gbt_clf.predict(X_train_rus)
print(classification_report( y_train_rus, y_pred, target_names=class_names))

In [None]:
# Test-set diagnostics for the gradient-boosting model.
# The ROC title used to say "Training data" although X_test / y_test are plotted.
plot_conf_matrix(gbt_clf, X_test, y_test, class_names,
                 'GradientBoostingClassifier Confusion matrix for Test Data')
plot_roc(gbt_clf, X_test, y_test, 'ROC Curve for Test data')

y_pred = gbt_clf.predict(X_test)
print(classification_report( y_test, y_pred, target_names=class_names))

In [None]:
#Adaboost
# Baseline AdaBoost classifier (defaults plus a fixed seed), fitted on the
# undersampled training data.
adaboost_clf = AdaBoostClassifier(random_state=42)
# AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
#                    n_estimators=50, random_state=42)
adaboost_clf.fit(X=X_train_rus, y=y_train_rus)

In [None]:
# Hyper-parameter candidates for AdaBoost. Only parameters AdaBoostClassifier
# itself accepts are listed; the tree-specific entries copied from the
# gradient-boosting grid (max_depth, min_samples_*, subsample, max_features)
# are not AdaBoost parameters.
param_grid = {'n_estimators': [100, 300, 400],
              'learning_rate': [0.01, .005],
              'random_state': [10]}
# Search over the AdaBoost model; the earlier version passed `gbt_clf` and so
# tuned gradient boosting a second time.
adaboost_clf_grid = GridSearchCV(estimator = adaboost_clf, param_grid = param_grid,
                          scoring ='f1')
adaboost_clf_grid.fit(X_train_rus, y_train_rus)

# Accuracy of the refitted best model on the training data and the test split.
training_preds = adaboost_clf_grid.predict(X_train_rus)
val_preds = adaboost_clf_grid.predict(X_test)
training_accuracy = accuracy_score(y_train_rus, training_preds)
val_accuracy = accuracy_score(y_test, val_preds)

print("")
print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))


best_parameters = adaboost_clf_grid.best_params_
print("The best parameters for using this model is", best_parameters)

In [None]:
# Build an actual AdaBoost model; the earlier version assigned a
# GradientBoostingClassifier to `adaboost_clf`. The values that also exist on
# AdaBoostClassifier (learning_rate, n_estimators, random_state) are kept and
# the tree-only settings are dropped.
# NOTE(review): replace these with the grid-search winners once the search has run.
adaboost_clf = AdaBoostClassifier(learning_rate=0.01,
                                  n_estimators=300,
                                  random_state=10)
adaboost_clf.fit(X_train_rus, y_train_rus)

In [None]:
# Training-data diagnostics for `adaboost_clf`, drawn with the shared plotting
# helpers. The confusion-matrix title was a copy-paste leftover naming
# GradientBoostingClassifier; the unused probability scores were dropped.
plot_conf_matrix(adaboost_clf, X_train_rus, y_train_rus, class_names,
                 'AdaBoost Confusion matrix')
plot_roc(adaboost_clf, X_train_rus, y_train_rus, 'ROC Curve for Training data')

y_pred = adaboost_clf.predict(X_train_rus)
print(classification_report( y_train_rus, y_pred, target_names=class_names))

In [None]:
# Test-set diagnostics for `adaboost_clf`.
# All three outputs now use the same model: the confusion matrix used to be
# drawn from `adaboost_clf_grid` while the ROC curve and report used
# `adaboost_clf`. The titles now name the model and data actually plotted.
plot_conf_matrix(adaboost_clf, X_test, y_test, class_names,
                 'AdaBoost Confusion matrix for Test Data')
plot_roc(adaboost_clf, X_test, y_test, 'ROC Curve for Test data')

y_pred = adaboost_clf.predict(X_test)
print(classification_report( y_test, y_pred, target_names=class_names))

In [None]:
#SVM
# Baseline support-vector classifier (defaults plus a fixed seed), fitted on the
# undersampled training data. Only the `SVC` class is imported (first cell), so
# it is called directly; `svm.SVC` raised a NameError because no `svm` module
# name is bound.
svm_clf = SVC(random_state=42)
svm_clf.fit(X_train_rus, y_train_rus)

In [None]:
#SVC

# NOTE(review): gamma_range and scores are not referenced anywhere in the
# visible notebook — confirm whether they are still needed.
gamma_range = np.array([0.1, 1, 100])
scores = ['precision', 'recall']

# Two sub-grids: an RBF kernel with gamma and C candidates, and a linear kernel
# with C candidates only.
param_grid = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100]},
    {'kernel': ['linear'], 'C': [1, 10, 100]},
]

# Grid search over the SVM using GridSearchCV's default scoring and CV settings.
svc_clf_grid = GridSearchCV(svm_clf, param_grid)
svc_clf_grid.fit(X_train_rus, y_train_rus)

# Accuracy of the refitted best model on the training data and the test split.
training_preds = svc_clf_grid.predict(X_train_rus)
training_accuracy = accuracy_score(y_train_rus, training_preds)
val_preds = svc_clf_grid.predict(X_test)
val_accuracy = accuracy_score(y_test, val_preds)

print("")
print("Training Accuracy: {:.4}%".format(100 * training_accuracy))
print("Validation accuracy: {:.4}%".format(100 * val_accuracy))

best_parameters = svc_clf_grid.best_params_
print("The best parameters for using this model is", best_parameters)

In [None]:
# SVM with hard-coded hyper-parameters (presumably taken from the grid search —
# confirm). probability=True enables predict_proba, which the bagging cells
# further down call on an ensemble built from this model.
# `SVC` is called directly because no `svm` module name is imported.
svm_clf = SVC(kernel='rbf' ,gamma=0.001 , C=10 , random_state=42, probability=True)
svm_clf.fit(X_train_rus, y_train_rus)

In [None]:
# Training-data diagnostics for the SVM, drawn with the shared plotting helpers;
# the unused probability scores were dropped.
plot_conf_matrix(svm_clf, X_train_rus, y_train_rus, class_names,
                 'SVM Confusion matrix')
plot_roc(svm_clf, X_train_rus, y_train_rus, 'ROC Curve for Training data')

y_pred = svm_clf.predict(X_train_rus)
print(classification_report( y_train_rus, y_pred, target_names=class_names))

In [None]:
# Test-set diagnostics for the SVM.
# The confusion matrix used to be drawn from the training data again, while the
# ROC curve and report used the test split; all three now use X_test / y_test
# and the titles say so.
plot_conf_matrix(svm_clf, X_test, y_test, class_names,
                 'SVM Confusion matrix for Test Data')
plot_roc(svm_clf, X_test, y_test, 'ROC Curve for Test data')

y_pred = svm_clf.predict(X_test)
print(classification_report( y_test, y_pred, target_names=class_names))

In [None]:
#Bagging
# Bagging ensemble that uses the SVM as its base estimator, fitted on the
# undersampled training data. BaggingClassifier is already imported in the
# first cell; the seed makes the bootstrap samples repeatable between runs.
bagging_clf = BaggingClassifier(svm_clf, random_state=42)
bagging_clf.fit(X_train_rus, y_train_rus)

In [None]:
# Training-data diagnostics for the bagging ensemble, drawn with the shared
# plotting helpers; the unused probability scores were dropped.
plot_conf_matrix(bagging_clf, X_train_rus, y_train_rus, class_names,
                 'Bagging Confusion matrix')
plot_roc(bagging_clf, X_train_rus, y_train_rus, 'ROC Curve for Training data')

y_pred = bagging_clf.predict(X_train_rus)
print(classification_report( y_train_rus, y_pred, target_names=class_names))

In [None]:
# Test-set diagnostics for the bagging ensemble.
# The titles were copy-paste leftovers ("SVM ...", "... Training data"); they
# now name the model and data actually plotted.
plot_conf_matrix(bagging_clf, X_test, y_test, class_names,
                 'Bagging Confusion matrix for Test Data')
plot_roc(bagging_clf, X_test, y_test, 'ROC Curve for Test data')

y_pred = bagging_clf.predict(X_test)
print(classification_report( y_test, y_pred, target_names=class_names))