In [1]:
################################################################################
#   Basic configuration steps
################################################################################

#- import basic python packages
import warnings
import tkinter # to show plot in Pycharm

#- import packages for data manipulations
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler
from imblearn.over_sampling import SMOTE

#- import packages for unsupervised machine learning
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA, SparsePCA
from sklearn.manifold import TSNE

#- import packages for supervised machine learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_selection import RFE

#- import packages for model evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_curve, roc_curve, recall_score, precision_score, average_precision_score
from sklearn.metrics import auc, roc_auc_score, classification_report

#- import packages for visualizations
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.tools import FigureFactory as FF

#- packages created
from utils import corr_heatmap, tsne_plotting, pca_plotting
from utils import model_scores, multiple_roc_curves, create_scores_df, create_prec_recall_df, precision_recall


#- configuration steps
warnings.simplefilter(action='ignore', category = FutureWarning)
palette = sns.color_palette("PiYG_r")
sns.set_palette("Set3")

In [2]:
################################################################################
#   Read CSV data file
################################################################################
'''
- all the features of the data have already been PCA-transformed except 'time' and 'amount'
- since all features need to be scaled before PCA is implemented, we can assume that all 'V' features are scaled
'''

credit_df = pd.read_csv('creditcard.csv')
print("We look at a sample of the data: \n", credit_df.head())

#- let us scale the 'time' and 'amount' features using 'RobustScaler' which is less susceptible to outliers
#- RobustScaler removes the median and scales the data according to the IQR (i.e. between 1st quartile and 3rd quartile)
robust_scaler = RobustScaler()
credit_df["robustscaled_time"] = robust_scaler.fit_transform(credit_df["Time"].values.reshape(-1,1))
credit_df["robustscaled_amount"] = robust_scaler.fit_transform(credit_df["Amount"].values.reshape(-1,1))
credit_df.head()

#- we drop some of the columns
cols_to_drop = ["Time", "Amount", "robustscaled_time"]
X = credit_df.drop(cols_to_drop + ["Class"], axis = 1)
y = credit_df.loc[:,"Class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
print("Shape of the X_train dataset: ", X_train.shape)
print("Shape of the y_train dataset: ", y_train.shape)

We look at a sample of the data: 
    Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010  

In [3]:
################################################################################
#   Examine balance of class in dataset
################################################################################
#- plot a simple bar graph
class_counts_df = pd.DataFrame(credit_df.Class.value_counts()).sort_values(by = "Class").reset_index(drop = True)
class_counts_df["Class_name"] = ["Minority_class","Majority_class"]
class_counts_df.columns = ["Count", "Class_name"]

#- plot the graphs
fig1 = px.bar(class_counts_df, x = "Class_name", y = "Count", color = "Class_name", width = 600)
# fig1.show()

#- do a check on the actual counts
prop_result = class_counts_df.Count.min() / class_counts_df.Count.sum()
print("Minority class proportion: " + str(prop_result) + "%")
if prop_result < 1:
    print("We are dealing with a highly unbalanced dataset")
else:
    print("The dataset is not highly unbalanced")

Minority class proportion: 0.001727485630620034%
We are dealing with a highly unbalanced dataset


In [4]:
################################################################################
#   apply SMOTE technique to over-sample the minority class
################################################################################

#=== NOTE1: we apply SMOTE first, then apply train_test_split; apply SMOTE on training data only
#=== NOTE2: we apply SMOTE first, then execute cross-validation later

#- we have an unbalanced training dataset
print("Before OverSampling, counts of label 'Minority class' in training dataset: {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label 'Majority class' in training dataset: {} \n".format(sum(y_train == 0)))


#- we now implement the SMOTE technique
#- note that we need to ensure that "y_train" is continuous; so we use the "ravel" method to do that
smote = SMOTE(random_state = 88)
X_train_oversample, y_train_oversample = smote.fit_resample(X_train, y_train.ravel())

#- we now have a balanced training dataset
print("After OverSampling, counts of label 'Minority class' in training dataset: {}".format(sum(y_train_oversample == 1)))
print("After OverSampling, counts of label 'Majority class' in training dataset: {} \n".format(sum(y_train_oversample == 0)))

Before OverSampling, counts of label 'Minority class' in training dataset: 391
Before OverSampling, counts of label 'Majority class' in training dataset: 227454 

After OverSampling, counts of label 'Minority class' in training dataset: 227454
After OverSampling, counts of label 'Majority class' in training dataset: 227454 



In [7]:
#- we check out the new counts plot
y_train_oversample_df = pd.DataFrame(pd.DataFrame(y_train_oversample).value_counts()).reset_index(drop = True).sort_values(by = 0)
y_train_oversample_df["Class_name"] = ["Minority_class","Majority_class"]
y_train_oversample_df.columns = ["Count", "Class_name"]

#- plot the graphs
fig2 = px.bar(y_train_oversample_df, x="Class_name", y="Count", color = "Class_name", width = 600)
fig2.show()

py.iplot(fig2)

NameError: name 'py' is not defined

In [6]:
#- plot correlation plots on the oversampled training data
corr_heatmap(X_train_oversample)