In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

## 1 Data Preparation

In [None]:
# Load the transactions table from the project-relative CSV and preview the first rows.
df = pd.read_csv('data/creditcard.csv')
df.head()

In [None]:
# Column dtypes and non-null counts, to spot missing values before modelling.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Number of rows per target class, shown as a one-column table.
df['Class'].value_counts().to_frame()

In [None]:
# Share of fraudulent rows, selected by label rather than by position in
# value_counts(), whose ordering depends on class frequencies.
# Assumes fraud is encoded as Class == 1 (the positive label used by the
# recall/ROC code below) -- TODO confirm against the dataset description.
fraud_pct = (df['Class'] == 1).mean() * 100
print('Fraud proportion takes up {}%'.format(round(fraud_pct, 4)))

## 2 Data Processing 
### 2.1 Data Splitting and Standardization

In [None]:
# Separate the predictors from the target label.
X = df.drop('Class', axis = 1)
y = df['Class']
no_stand_X_features = X.columns

# Hold out 20% of the rows for testing. The split is stratified on the label
# so that the rare positive class keeps the same proportion in both parts;
# an unstratified split lets the number of test positives drift by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [None]:
# Display the training feature table produced by the split above.
X_train

In [None]:
def standardization(df, name):
    """Add a z-scored version of column ``name`` to ``df`` as ``<name>_stand``.

    The column is centred on its own mean and divided by its own standard
    deviation (pandas ``Series.std``). ``df`` is modified in place and the
    same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to standardize; receives the new column.
    name : str
        Name of the column to standardize.

    Returns
    -------
    pandas.DataFrame
        The input ``df`` (same object) with the ``<name>_stand`` column added.
    """
    new_name = name+'_stand'
    center = df[name].mean()
    scale = df[name].std()
    # A constant column has std == 0 and a single-row frame has std == NaN;
    # dividing by either would fill the new column with NaN/inf. Use a unit
    # scale instead so such columns standardize to zeros.
    if pd.isna(scale) or scale == 0:
        scale = 1.0
    df[new_name] = (df[name] - center) / scale

    return df

# Add a '<col>_stand' companion for every original feature of the training set.
# standardization() writes into X_train in place, so its return value is unused.
features_lst = X.columns
for feature_name in features_lst:
    standardization(X_train, feature_name)

In [None]:
def standardize_test(x_train, x_test, name):
    """Standardize column ``name`` of ``x_test`` with statistics from ``x_train``.

    The mean and standard deviation are taken from ``x_train[name]`` so that
    no information from the test rows enters the scaling. The result is
    written to ``x_test`` in place as ``<name>_stand``; ``x_train`` is only
    read.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Frame supplying the mean and standard deviation.
    x_test : pandas.DataFrame
        Frame whose column is standardized; receives the new column.
    name : str
        Name of the column, present in both frames.

    Returns
    -------
    pandas.DataFrame
        The input ``x_test`` (same object) with ``<name>_stand`` added.
    """
    new_name = name+'_stand'
    center = x_train[name].mean()
    scale = x_train[name].std()
    # A constant (std == 0) or single-row (std == NaN) training column would
    # turn every test value into NaN/inf; fall back to a unit scale instead.
    if pd.isna(scale) or scale == 0:
        scale = 1.0
    x_test[new_name] = (x_test[name] - center) / scale

    return x_test

# Add the '<col>_stand' columns to the test set, scaled with training statistics.
for feature_name in features_lst:
    standardize_test(X_train, X_test, feature_name)

# Names of the standardized columns, in the same order as the original features.
stand_X_features = [f'{column}_stand' for column in X.columns]

In [None]:
# Preview the training set, which now also carries the '*_stand' columns.
X_train.head()

In [None]:
# Preview the test set, which now also carries the '*_stand' columns.
X_test.head()

### 2.2 Handling the Imbalanced Dataset
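**SMOTE** (Synthetic Minority Over-sampling Technique): balances the classes by adding synthetic examples of the minority class. The standalone cell below is kept commented out because the resampling is performed inside `model()` further down.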

In [None]:
# # SMOTE
# sm = SMOTE(random_state = 0)
# X_sampled_smote, y_sampled_smote = sm.fit_resample(X_train, y_train)
# # X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_sampled_smote, y_sampled_smote, test_size = 0.2, random_state = 0)

# print("Before performing smote : ", Counter(y_train))
# print("After performing smote : ", Counter(y_sampled_smote))

## Supervised Learning 
**Logistic Regression** - ysy <br>
**K-Nearest Neighbors** - mql <br>
**Decision Tree** <br>
**Random Forest** - isha <br>
**Support Vector Machine Classifier (SVC)** <br>
**AdaBoost** <br>
**Gradient Boosting Classifier** - zzy <br>

**TODO:** EDA and feature engineering still need to be added.

In [None]:
def confusion_matrix(test_y, pred_y):
    """Print and plot the confusion matrix for a set of predictions.

    Prints the confusion matrix normalized over all samples, the raw-count
    confusion matrix and the classification report, then draws a heatmap
    whose cells are annotated with both the count and the overall share.

    NOTE: this function has the same name as ``confusion_matrix`` imported
    from ``sklearn.metrics`` at the top of the notebook and replaces it in
    the notebook namespace; the sklearn implementation is therefore called
    through the ``metrics`` module alias here.

    Parameters
    ----------
    test_y : array-like
        Ground-truth labels.
    pred_y : array-like
        Predicted labels.

    Returns
    -------
    None
    """
    cf_matrix = metrics.confusion_matrix(test_y, pred_y)
    cf_normalized = cf_matrix / np.sum(cf_matrix)

    # The first printout is the matrix of overall shares, not an accuracy.
    print("Normalized confusion matrix is: \n", cf_normalized)
    print("Confusion Matrix is: \n", cf_matrix)
    print("Classification report is: \n", classification_report(test_y, pred_y))

    # One "count\npercentage" annotation per cell, reshaped to the matrix's
    # own shape so inputs with a number of classes other than two also work.
    labels = [
        f"{count:0.0f}\n{share:.2%}"
        for count, share in zip(cf_matrix.flatten(), cf_normalized.flatten())
    ]
    labels = np.asarray(labels).reshape(cf_matrix.shape)

    sns.heatmap(cf_normalized, annot=labels, fmt='', cmap='Blues')
    plt.title("Confusion Matrix with Normalization")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()

In [None]:
def model(features, model_classifier, model_name):
    """Fit and evaluate a classifier without and with SMOTE oversampling.

    Uses the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``.

    1. Baseline: fit on ``X_train[features]`` and evaluate on the test set.
    2. SMOTE: hold out a validation part of the training data, oversample the
       remainder with SMOTE, refit the *same* estimator object on it, and
       evaluate on both the validation part and the test set.

    Because both stages use the one estimator passed in, that object is left
    fitted on the SMOTE-resampled data when the function returns.

    Parameters
    ----------
    features : sequence of str
        Column names to use as predictors.
    model_classifier : estimator
        Classifier exposing ``fit``, ``predict``, ``predict_proba`` and
        ``score``.
    model_name : str
        Label used in the printed headings.

    Returns
    -------
    tuple
        ``(y_pred, y_pred_proba, y_pred_smote, y_pred_proba_smote)``: test-set
        class predictions and positive-class probabilities for the baseline
        and the SMOTE fit respectively.
    """
    train_X = X_train[features]
    test_X = X_test[features]

    print(model_name+' without using smote')
    clf = model_classifier
    clf.fit(train_X, y_train)

    # Capture the baseline predictions now: the estimator is refitted below.
    y_pred = clf.predict(test_X)
    y_pred_proba = clf.predict_proba(test_X)[:, 1] # Keep probabilities for the positive outcome only

    confusion_matrix(y_test, y_pred)
    # For scikit-learn classifiers score() is the mean accuracy (not R^2), so
    # the test figure is derived from the predictions already computed rather
    # than running another prediction pass.
    test_accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy of Train Dataset: ", clf.score(train_X, y_train))
    print("Accuracy of Test Dataset: ", test_accuracy)
    print('Accuracy score overall: ', test_accuracy)
    print('Recall score is {}%'.format(round(metrics.recall_score(y_test, y_pred) * 100, 2)))

    print('\n'+model_name+' with using smote')
    smote_m = model_classifier

    # SMOTE
    # Fixed seed so the validation split, and hence the resampled training
    # set and every number printed below, is reproducible between runs.
    fit_X, val_X, fit_y, val_y = train_test_split(train_X, y_train, random_state=0)
    sm = SMOTE(random_state = 0)
    X_sampled_smote, y_sampled_smote = sm.fit_resample(fit_X, fit_y)

    smote_m.fit(X_sampled_smote, y_sampled_smote)
    y_pred_smote = smote_m.predict(test_X)
    y_pred_proba_smote = smote_m.predict_proba(test_X)[:, 1]
    confusion_matrix(y_test, y_pred_smote)

    # Predict the validation part once and reuse it for every metric.
    val_pred = smote_m.predict(val_X)
    val_accuracy = metrics.accuracy_score(val_y, val_pred)
    print('Validation test results:')
    print('Test Score:', val_accuracy)
    print('Accuracy Score:', val_accuracy)
    print('Recall Score:', metrics.recall_score(val_y, val_pred))

    smote_test_accuracy = metrics.accuracy_score(y_test, y_pred_smote)
    print('\nTest Results:')
    print('Test Score:', smote_test_accuracy)
    print('Accuracy Score:', smote_test_accuracy)
    print('Recall Score:', metrics.recall_score(y_test, y_pred_smote))

    return y_pred, y_pred_proba, y_pred_smote, y_pred_proba_smote

In [None]:
def ROC(Y_test, Y_pred, Y_pred_prob, name):
    """Plot the ROC curve and report its AUC in the legend.

    Parameters
    ----------
    Y_test : array-like
        Ground-truth labels.
    Y_pred : array-like
        Predicted class labels. Not needed to draw the curve; kept so that
        existing calls keep working.
    Y_pred_prob : array-like
        Scores for the positive class, from which the curve is computed.
    name : str
        Identifier of the experiment, shown in the figure title.

    Returns
    -------
    None
    """
    fpr, tpr, _ = metrics.roc_curve(Y_test, Y_pred_prob)
    auc_value = metrics.auc(fpr, tpr)

    plt.figure(figsize=(5,4), dpi=256)
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % auc_value)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal for reference
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title(f'ROC curve: {name}')
    # plt.savefig(name + '.png')
    plt.show()

### Logistic Regression

In [None]:
# Logistic regression with a raised iteration cap for the lbfgs solver.
lr_name = 'Logistic Regression'
lr = LogisticRegression(max_iter=1000, solver='lbfgs')

In [None]:
# Case 1: logistic regression on the original (non-standardized) features
y_pred, y_pred_proba, y_pred_smote, y_pred_proba_smote = model(
    features=no_stand_X_features,
    model_classifier=lr,
    model_name=lr_name,
)
ROC(Y_test=np.array(y_test), Y_pred=y_pred_smote, Y_pred_prob=y_pred_proba_smote, name='LR_nosd_sm')
ROC(Y_test=np.array(y_test), Y_pred=y_pred, Y_pred_prob=y_pred_proba, name='LR_nosd_nosm')

In [None]:
# Case 2: with standardization
y_pred_lr, y_pred_proba_lr, y_pred_smote, y_pred_proba_smote = model(stand_X_features, lr, lr_name)
ROC(np.array(y_test), y_pred_smote, y_pred_proba_smote, 'LR_sd_sm')
# Plot the baseline from THIS run (y_pred_lr / y_pred_proba_lr); the plain
# y_pred / y_pred_proba names still hold the Case 1 results at this point.
ROC(np.array(y_test), y_pred_lr, y_pred_proba_lr, 'LR_sd_nosm')

## Random Forest

In [None]:
# Random forest with a fixed seed so repeated runs build the same trees.
rf_name = 'Random Forest'
rf = RandomForestClassifier(random_state=42)

In [None]:
# Case 1: random forest on the original (non-standardized) features
y_pred, y_pred_proba, y_pred_smote, y_pred_proba_smote = model(
    features=no_stand_X_features,
    model_classifier=rf,
    model_name=rf_name,
)
ROC(Y_test=np.array(y_test), Y_pred=y_pred, Y_pred_prob=y_pred_proba, name='RF_nosd_nosm')
ROC(Y_test=np.array(y_test), Y_pred=y_pred_smote, Y_pred_prob=y_pred_proba_smote, name='RF_nosd_sm')

In [None]:
# Case 2: random forest on the standardized features
y_pred, y_pred_proba, y_pred_smote, y_pred_proba_smote = model(
    features=stand_X_features,
    model_classifier=rf,
    model_name=rf_name,
)
ROC(Y_test=np.array(y_test), Y_pred=y_pred_smote, Y_pred_prob=y_pred_proba_smote, name='RF_sd_sm')
ROC(Y_test=np.array(y_test), Y_pred=y_pred, Y_pred_prob=y_pred_proba, name='RF_sd_nosm')

## KNN

In [None]:
# k-nearest-neighbours classifier voting over the 50 closest training points.
knn = KNeighborsClassifier(n_neighbors=50)
knn_name = 'KNeighborsClassifier'

In [None]:
# Case 1: KNN on the original (non-standardized) features
y_pred, y_pred_proba, y_pred_smote, y_pred_proba_smote = model(
    features=no_stand_X_features,
    model_classifier=knn,
    model_name=knn_name,
)
ROC(Y_test=np.array(y_test), Y_pred=y_pred_smote, Y_pred_prob=y_pred_proba_smote, name='knn_nosd_sm')
ROC(Y_test=np.array(y_test), Y_pred=y_pred, Y_pred_prob=y_pred_proba, name='knn_nosd_nosm')

In [None]:
# Case 2: KNN on the standardized features
y_pred, y_pred_proba, y_pred_smote, y_pred_proba_smote = model(
    features=stand_X_features,
    model_classifier=knn,
    model_name=knn_name,
)
ROC(Y_test=np.array(y_test), Y_pred=y_pred_smote, Y_pred_prob=y_pred_proba_smote, name='knn_sd_sm')
ROC(Y_test=np.array(y_test), Y_pred=y_pred, Y_pred_prob=y_pred_proba, name='knn_sd_nosm')

## GBDT

In [None]:
# Gradient boosting with a fixed seed, matching the random forest above, so
# that repeated runs of the notebook produce the same fitted model.
gbdt = GradientBoostingClassifier(random_state=42)
gbdt_name = 'Gradient Boosting Classifier'

In [None]:
# Case 1: gradient boosting on the original (non-standardized) features
y_pred, y_pred_proba, y_pred_smote, y_pred_proba_smote = model(
    features=no_stand_X_features,
    model_classifier=gbdt,
    model_name=gbdt_name,
)
ROC(Y_test=np.array(y_test), Y_pred=y_pred_smote, Y_pred_prob=y_pred_proba_smote, name='GBDT_nosd_sm')
ROC(Y_test=np.array(y_test), Y_pred=y_pred, Y_pred_prob=y_pred_proba, name='GBDT_nosd_nosm')

In [None]:
# Case 2: gradient boosting on the standardized features
y_pred, y_pred_proba, y_pred_smote, y_pred_proba_smote = model(
    features=stand_X_features,
    model_classifier=gbdt,
    model_name=gbdt_name,
)
ROC(Y_test=np.array(y_test), Y_pred=y_pred_smote, Y_pred_prob=y_pred_proba_smote, name='GBDT_sd_sm')
ROC(Y_test=np.array(y_test), Y_pred=y_pred, Y_pred_prob=y_pred_proba, name='GBDT_sd_nosm')