## Final Project Submission

Please fill out:
* Student name: Kelvin Waters
* Student pace: online-ds-ft-012120
* Scheduled project review date/time: 
* Instructor name: Abhineet Kulkarni
* Blog post URL: 

All imports will be moved here upon completion of the notebook 

In [None]:
# imports required for the notebook
import pandas as pd 
import numpy as np
import xgboost as xgb
import winsound

from scipy import stats
from importlib import reload
from pandas.plotting import scatter_matrix

from sklearn.metrics import (accuracy_score, roc_auc_score, 
confusion_matrix, precision_score, f1_score, roc_curve, auc, plot_roc_curve, classification_report)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from joblib import dump, load
from sklearn import svm, tree
from IPython.core.interactiveshell import InteractiveShell

import pickle
import sklearn.metrics as metrics
import itertools

from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Show the result of every top-level expression in a cell, not just the last one.
# Later cells rely on this to display bare expressions (e.g. a confusion matrix)
# that are followed by print() calls.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the dataset from a CSV file expected in the working directory
df = pd.read_csv('creditcard.csv')
# Preview the first five rows of the dataset
df.head() # NOTE(review): author states the dataset is already normalized — confirm against the data source

checking initial shape

In [None]:
# good amount of data for ML
df.shape

all continuous except target variable

In [None]:
df.info()

In [None]:
# taking note of min - max values
df.describe()

In [None]:
# no missing data
df.isna().sum()

In [None]:
# Absolute z-score of every value, computed per column over the whole frame
# (the target column `Class` is included because the full `df` is passed in).
# A common outlier rule is |z| >= 3.
# NOTE(review): the author was unsure whether z-scoring an already normalized
# dataset is meaningful — treat this as exploratory only.
z = np.abs(stats.zscore(df))
print(z)

In [None]:
# Outlier filter deliberately left disabled: per the author's notes, keeping only
# rows with |z| < 3 in every column would remove all the 1 values from the
# target Class (and lose 37,864 rows overall).
# df = df[(z < 3).all(axis= 1)]
# With the filter disabled, the shape is the same as after loading
df.shape

In [None]:
# features, dependant variable
# data header 'cleaned' possibly for privacy concerns, would be great to know
# what features we're dealing with here
df.columns

dataset is highly imbalanced!

In [None]:
# Separate the target from the features.
# y: the `Class` column; X: every column except the last one.
y = df['Class']
X = df.iloc[:, :-1]

# Per-class row counts of the target
class_counts = y.value_counts()
print(class_counts)

split data

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
# NOTE(review): the split is not stratified (no `stratify=y`). With a highly
# imbalanced target, consider stratifying so train and test keep the same
# class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state= 42)

In [None]:
# initial shape pre SMOTING
X_train.shape

### Logistic Regression (on unbalanced dataset)

In [None]:
# Baseline: logistic regression with default hyper-parameters, trained on the
# original (imbalanced) training split.
logreg = LogisticRegression()

# fit model
logreg.fit(X_train, y_train)

# generate predictions
y_hat_train = logreg.predict(X_train)
y_hat_test = logreg.predict(X_test)

# Absolute residuals: 0 = correct prediction, 1 = misclassification.
# Train and test residuals are kept in separate variables; previously the
# training residuals were overwritten by the test residuals before being used.
train_residuals = np.abs(y_train - y_hat_train)
test_residuals = np.abs(y_test - y_hat_test)
residuals = test_residuals  # original variable name, kept for later reference

# how many times was the classifier correct on training?
print('Training residuals')
print(pd.Series(train_residuals).value_counts())
print('------------------------------------')
print(pd.Series(train_residuals).value_counts(normalize= True))
print('====================================')

# how many times correct on test set?
print('Test residuals')
print(pd.Series(residuals).value_counts())
print('------------------------------------')
print(pd.Series(residuals).value_counts(normalize= True))

In [None]:
# Logistic Regression imbalanced: confusion matrix and per-class metrics on the test set.
# The bare confusion_matrix(...) expression is only displayed because the notebook
# is configured to show every expression result (ast_node_interactivity = "all").
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_test, y_hat_test)
print('--------------------------------')
print(classification_report(y_test, y_hat_test))

Logistic Regression ROC Curve (imbalanced dataset)

In [None]:
# ROC curve for the default logistic regression trained on the imbalanced split

# Fit on the imbalanced training data, then score the test rows with the
# model's decision function (continuous confidence scores, not class labels).
logreg.fit(X_train, y_train)
y_score = logreg.decision_function(X_test)

fpr, tpr, thresholds = roc_curve(y_test, y_score)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

roc_area = auc(fpr, tpr)
print('AUC: {}'.format(roc_area))

lw = 2
axis_ticks = [i/20.0 for i in range(21)]  # 0.00, 0.05, ..., 1.00

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
# diagonal reference line of a no-skill classifier
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(axis_ticks)
plt.yticks(axis_ticks)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

What options are available to address the imbalance? 
1. We can undersample '0' values, which would slash a HUGE amount of data points from the dataset. 
2. We can oversample '1' values, which would roughly double the number of data points in the training set.

SMOTE: synthetic minority oversampling technique

In [None]:
# instantiate a smote obj (fixed seed so the synthetic samples are reproducible)
smote = SMOTE(random_state= 42)

# Oversample the minority class in the TRAINING data only; the test set keeps
# its real class distribution so the evaluation stays honest.
# `fit_resample` is the supported imbalanced-learn API; the older `fit_sample`
# alias is deprecated and no longer available in current releases.
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# verifying the 'smoting' results: class counts before and after resampling
# more data points is preferable than fewer data points when it comes to machine learning
print('Before SMOTE :', Counter(y_train))
print('After SMOTE :', Counter(y_train_smote))

In [None]:
X_train_smote.shape

In [None]:
y_train_smote.shape

### Decision Tree Classifier

In [None]:
# Decision tree trained on the SMOTE-balanced training data.
# NOTE(review): the name `tree` shadows the `sklearn.tree` module imported at the
# top of the notebook; it is kept because later cells refer to this model as `tree`.
tree = DecisionTreeClassifier(criterion= 'entropy', splitter= 'best')

# Do NOT call train_test_split again here. Re-splitting into
# X_train_smote / y_train_smote replaced the oversampled training data with the
# original imbalanced split, so this and every later "balanced" model would
# silently train on unbalanced data. X_test / y_test from the earlier split
# are reused as they are.

# fit model
tree.fit(X_train_smote, y_train_smote)

# predict
y_pred = tree.predict(X_test)

acc = accuracy_score(y_test, y_pred) * 100

print('Accuracy is: {0}'.format(acc))

# ROC points and AUC computed from the hard class predictions
# (a single roc_curve call; it used to be computed twice with identical inputs)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print('AUC is: {0}'.format(roc_auc))

plt.clf()
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show()

In [None]:
# Decision Tree Confusion Matrix and per-class metrics on the test set.
# The bare confusion_matrix(...) expression is only displayed because the notebook
# is configured to show every expression result (ast_node_interactivity = "all").
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_test, y_pred)
print('----------------------------')
print(classification_report(y_test, y_pred))

In [None]:
# Author's observation: the anonymised feature V17 shows up as an important
# variable; it would be good to know what it actually represents.
def plot_feature_importances(model):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    One bar is drawn per column of the global ``X_train_smote`` frame and the
    bars are labelled with that frame's column names, so ``model`` is expected
    to expose one importance value per column.
    """
    feature_names = X_train_smote.columns.values
    feature_count = X_train_smote.shape[1]

    plt.figure(figsize=(8,8))
    plt.barh(range(feature_count), model.feature_importances_, align='center')
    plt.yticks(np.arange(feature_count), feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')


# Importances of the fitted decision tree
plot_feature_importances(tree)

Decision Tree GridSearchCV

In [None]:
# # default values are first in each dict list
# param_grid = {
#     'criterion': ['gini', 'entropy'],
#     'splitter': ['best', 'random'],
# }
# gs_tree = GridSearchCV(tree, param_grid, cv=5) # 5 K fold
# gs_tree.fit(X_train_smote, y_train_smote)
# gs_tree.best_params_

# {'criterion': 'entropy', 'splitter': 'best'} results! 

Decision Tree ROC curve

In [None]:
# # Decision Tree confusion matrix
# cnf_matrix = confusion_matrix(y_test, y_hat_test)
# print('Confusion Matrix:\n', cnf_matrix)
# print(classification_report(y_test, y_hat_test))

### LogisticRegression Classifier (balanced)

In [None]:
# Logistic regression using the hyper-parameters recorded from the grid search
# (L1 penalty, very large C i.e. almost no regularisation, liblinear solver),
# trained on the X_train_smote / y_train_smote data.
logreg_bal = LogisticRegression(penalty= 'l1', C= 1e12, solver= 'liblinear')

# fit model
logreg_bal.fit(X_train_smote, y_train_smote)

# generate predictions
y_hat_train = logreg_bal.predict(X_train_smote)
y_hat_test = logreg_bal.predict(X_test)

# Absolute residuals: 0 = correct prediction, 1 = misclassification.
# Train and test residuals are kept in separate variables; previously the
# training residuals were overwritten by the test residuals before being used.
train_residuals = np.abs(y_train_smote - y_hat_train)
test_residuals = np.abs(y_test - y_hat_test)
residuals = test_residuals  # original variable name, kept for later reference

# how many times was the classifier correct on training?
print('Training residuals')
print(pd.Series(train_residuals).value_counts())
print('------------------------------------')
print(pd.Series(train_residuals).value_counts(normalize= True))
print('====================================')

# how many times correct on test set?
print('Test residuals')
print(pd.Series(residuals).value_counts())
print('------------------------------------')
print(pd.Series(residuals).value_counts(normalize= True))


# Author's recorded test-set output with default parameters:
# 0    55834
# 1     1128
# Name: Class, dtype: int64
# ------------------------------------
# 0    0.980197
# 1    0.019803
# Name: Class, dtype: float64
# array([[55746,  1118],
#        [   10,    88]], dtype=int64)

In [None]:
# Logistic Regression (balanced): confusion matrix and per-class metrics on the test set.
# The bare confusion_matrix(...) expression is only displayed because the notebook
# is configured to show every expression result (ast_node_interactivity = "all").
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_test, y_hat_test)
print('----------------------------')
print(classification_report(y_test, y_hat_test))

Logistic Regression ROC curve (balanced dataset)

In [None]:
# ROC curve for the logistic regression trained on the balanced (SMOTE) data

# Score the test rows with the already-fitted balanced model `logreg_bal`.
# This used to re-fit and score the default `logreg` model instead, so the
# curve did not describe the tuned model and the baseline model was
# overwritten as a side effect.
y_score = logreg_bal.decision_function(X_test)

fpr, tpr, thresholds = roc_curve(y_test, y_score)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(auc(fpr, tpr)))
plt.figure(figsize=(8, 6))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
# diagonal reference line of a no-skill classifier
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

LogisticRegression GridSearchCV

In [None]:
# # default values are first in each dict list
# param_grid = {
#     'penalty': ['l2', 'l1', 'elasticnet'],
#     'C': [1.0, 1e12, 2e12],
#     'solver': ['lbfgs', 'saga', 'liblinear'],
# }
# gs_logreg = GridSearchCV(logreg, param_grid, cv=5) # 5 K fold
# gs_logreg.fit(X_train_smote, y_train_smote)
# gs_logreg.best_params_

# # {'C': 1000000000000.0, 'penalty': 'l1', 'solver': 'liblinear'} results!

### RandomForest Classifier

In [None]:
# Instantiate and fit a RandomForestClassifier
# (parameters match the GridSearchCV result recorded further down in the notebook)
forest = RandomForestClassifier(criterion= 'entropy', n_estimators= 100, n_jobs= None)

# fit on the X_train_smote / y_train_smote training data
forest.fit(X_train_smote, y_train_smote)

# predict
forest_predict = forest.predict(X_test) # evaluating on the un-resampled test set is intended: SMOTE is applied to the training data only

# train accuracy score (shown because the notebook displays every expression result)
forest.score(X_train_smote, y_train_smote)

# test accuracy score
forest.score(X_test, y_test)      

# author's recorded result with default settings: 1.0

In [None]:
# RandomForest Classifier: confusion matrix and per-class metrics on the test set
# (y_test holds the true labels, forest_predict the model's predictions)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=forest_predict)
print('Confusion Matrix:\n', cnf_matrix)
print('---------')
forest_report = classification_report(y_test, forest_predict)
print(forest_report)

RandomForest GridSearchCV

In [None]:
# # default values are first in each dict list
# param_grid = {
#     'n_estimators': [10, 100],
#     'criterion': ['gini', 'entropy'],
#     'n_jobs': [None, -1]
# }
# gs_forest = GridSearchCV(forest, param_grid, cv=5) # 5 K fold
# gs_forest.fit(X_train_smote, y_train_smote)
# gs_forest.best_params_

# {'criterion': 'entropy', 'n_estimators': 100, 'n_jobs': None} results! 

Random Forest ROC curve

In [None]:
# # Random Forest ROC Curve
# ax= plt.gca()

# forest_disp = plot_roc_curve(forest, X_test, y_test, ax=ax, alpha=0.8)
# svc_disp.plot(ax=ax, alpha=0.8)
# plt.show()

### XGBoost Classifier

In [None]:
# instantiate XGBoost Classifier
# The model gets its own name: assigning it to `xgb` would overwrite the
# imported xgboost module, so re-running this cell would fail because the
# classifier instance has no `XGBClassifier` attribute.
# Use `xgb_clf` wherever the fitted model is needed.
xgb_clf = xgb.XGBClassifier(booster = 'gbtree')

# fit
xgb_clf.fit(X_train_smote, y_train_smote)

# predict on the training data and on the held-out test data
training_pred = xgb_clf.predict(X_train_smote)
val_pred = xgb_clf.predict(X_test)

# accuracy
training_accuracy = accuracy_score(y_train_smote, training_pred)
val_accuracy = accuracy_score(y_test, val_pred)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

In [None]:
# XGBoost: confusion matrix and per-class metrics on the test set
# (y_test holds the true labels, val_pred the model's predictions)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=val_pred)
print('Confusion Matrix:\n', cnf_matrix)
print('---------')
xgb_report = classification_report(y_test, val_pred)
print(xgb_report)

XGB GridSearchCV

In [None]:
# cell alarm when complete: plays a 440 Hz tone for 7 seconds so a long-running
# cell placed before it can be left unattended.
# NOTE(review): `winsound` is only available on Windows; on other platforms
# this cell (and the import at the top of the notebook) will fail.
duration = 7000  # millisecond
freq = 440  # Hz
winsound.Beep(freq, duration)

In [None]:
# # default values are first in each dict list
# param_grid = {
#     'booster': ['gbtree', 'gblinear', 'dart'],
# }
# gs_xgb = GridSearchCV(xgb, param_grid, cv=5) # 5 K fold
# gs_xgb.fit(X_train_smote, y_train_smote)
# gs_xgb.best_params_

# {'booster': 'gbtree'} # results! 

#### Abandoned! SMOTE via pipeline!

In [None]:
"""This would never complete when referencing the created balanced dataset, google search revealed that there 
maybe issues with smote running on a single core, could not find any workarounds. I'm not even using column_transformer 
for dummies or any type of scaler. There's an apparent bottle neck somewhere since it runs without issue on the imbalanced 
data! I suspect an issue with SMOTE"""

# # logistic Regression
# pipe_lr = Pipeline([('pca', PCA(n_components= 2)),
#             ('clf', LogisticRegression(random_state= 42))])
# # SVM
# pipe_svm = Pipeline([('pca', PCA(n_components= 2)),
#             ('clf', svm.SVC(random_state= 42))])
# # Decision Tree            
# pipe_dt = Pipeline([('pca', PCA(n_components= 2)),
#             ('clf', tree.DecisionTreeClassifier(random_state= 42))])


# # List of pipelines for ease of iteration
# pipelines = [pipe_lr, pipe_svm, pipe_dt]

# # Dictionary of pipelines and classifier types for ease of reference
# pipe_dict = {0: 'Logistic Regression', 1: 'Support Vector Machine', 2: 'Decision Tree'}

# # Fit the pipelines
# for pipe in pipelines:
#     pipe.fit(X_train_smote, y_train_smote) # smote taking extremely long

# # Compare accuracies
# for idx, val in enumerate(pipelines):
#     print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], val.score(X_test, y_test)))

# # Identify the most accurate model on test data
# best_acc = 0.0
# best_clf = 0
# best_pipe = ''
# for idx, val in enumerate(pipelines):
#     if val.score(X_test, y_test) > best_acc:
#         best_acc = val.score(X_test, y_test)
#         best_pipe = val
#         best_clf = idx
# print('Classifier with best accuracy: %s' % pipe_dict[best_clf])

# # Save pipeline to file
# joblib.dump(best_pipe, 'best_pipeline.pkl', compress=1)
# print('Saved %s pipeline to file' % pipe_dict[best_clf])

In [None]:
# # import pickle
# with open('best_pipeline.pkl', 'rb') as p_f:
#     data = pickle.load(p_f)