In [None]:
import numpy as np
import pandas as pd

from patsy import dmatrices, dmatrix
import re
import pickle
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model, svm
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_decision_regions
%matplotlib inline

# make prettier plots
%config InlineBackend.figure_format = 'svg' 

# Fixed seed for NumPy's global random generator so stochastic steps that rely on it repeat.
seed = 5
np.random.seed(seed)

# Show every column and every row when a DataFrame is displayed (no truncation),
# so displaying a large frame in full will produce very long output.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Load the cleaned credit-card default data set from its pickle file.
# The context manager guarantees the file handle is closed once loading finishes
# (the handle was previously left open for the lifetime of the kernel).
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load a pickle that you produced yourself or obtained from a trusted source.
with open('cleaned_cc_default_data', 'rb') as file:
    model_data = pickle.load(file)

In [None]:
model_data.head()

In [None]:
model_data.info()

# Splitting data into train/test & scaling

In [None]:
# Build the X design matrix with Patsy from the predictor columns listed below.
x_cols = [
    'age', 'sex', 'marital_status', 'education_level',
    'bill_amt_1', 'bill_amt_2', 'bill_amt_3', 'bill_amt_4', 'bill_amt_5', 'bill_amt_6',
    'pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6',
    'pay_amt_1', 'pay_amt_2', 'pay_amt_3', 'pay_amt_4', 'pay_amt_5', 'pay_amt_6',
    'limit_balance',
]

# Patsy formula: every predictor joined with ' + '.
# Extra terms (interactions, derived variables) can be appended to x_str here.
x_str = ' + '.join(x_cols)

x_patsy = dmatrix(x_str, model_data)
x_patsy.shape

In [None]:
# Stratified 80/20 train/test split of the design matrix and the target.
x_raw = x_patsy
y_raw = np.array(model_data['default_payment_next_month'])

sss1 = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=0)

# The splitter yields two index pairs; the last one generated is the one kept.
train_index, test_index = list(sss1.split(x_raw, y_raw))[-1]
x_train, x_test = x_raw[train_index, :], x_raw[test_index, :]
y_train, y_test = y_raw[train_index], y_raw[test_index]

# TODO: if a validation set is wanted later, run a second StratifiedShuffleSplit
# (test_size=0.25) over the training portion to carve out x_val / y_val.

In [None]:
x_test.shape

In [None]:
x_train.shape


In [None]:
y_train.shape

In [None]:
x_test.shape

In [None]:
y_test.shape

In [None]:
# test train split
# NOTE(review): `X` and `y` are not defined in any cell above, so this cell fails with
# NameError on a fresh kernel — confirm where they are meant to come from.
# NOTE(review): this also rebinds `y_train` / `y_test`, replacing the arrays produced by
# the stratified split earlier in the notebook.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# setting a scaler

scaler = StandardScaler()

In [None]:
# Columns that are left unscaled (listed explicitly below); whatever remains in
# X_train after dropping them is the subset handed to the scaler.
unscaled_train_cols = [
    'Intercept', 'RandD', 'accounting', 'hr',
    'management', 'marketing', 'product_mng', 'support', 'Work_accident',
    'promotion_last_5years', 'technical',
]

X_for_scaling = X_train.drop(columns=unscaled_train_cols)

In [None]:
# scale features that need to be scaled

X_train_scaled = scaler.fit_transform(X_for_scaling)

In [None]:
# convert those features to data frame
# NOTE(review): no `columns=` is passed, so the resulting frame presumably carries default
# integer column labels instead of the original feature names — confirm that is intended.

X_train_scaled = pd.DataFrame(X_train_scaled)

In [None]:
# Re-attach the scaled features to the columns that were left unscaled.
# The unscaled part gets a fresh 0..n-1 index so both frames join row-by-row on index.
# NOTE: X_train is rebuilt from itself here, so this cell is not safe to run twice.
scaled_feature_cols = [
    'satisfaction_level', 'last_evaluation', 'number_project',
    'average_monthly_hours', 'time_spend_company', 'salary', 'int_term_1',
]
X_train_unscaled = X_train.drop(columns=scaled_feature_cols).reset_index(drop=True)

X_train = pd.merge(X_train_scaled, X_train_unscaled, left_index=True, right_index=True)

In [None]:
# Same column selection as for the training data, applied to the test data:
# drop the columns that stay unscaled and keep the rest for the scaler.
unscaled_test_cols = [
    'Intercept', 'RandD', 'accounting', 'hr',
    'management', 'marketing', 'product_mng', 'support', 'Work_accident',
    'promotion_last_5years', 'technical',
]

X_for_scaling2 = X_test.drop(columns=unscaled_test_cols)

In [None]:
# scale test features
# Use transform (not fit_transform): the scaler must keep the mean/variance it learned
# from the training data. Re-fitting on the test set would scale train and test
# differently and leak test-set statistics into the evaluation.

X_test_scaled = scaler.transform(X_for_scaling2)

In [None]:
# convert those features to dataframe
# NOTE(review): no `columns=` is passed, so the resulting frame presumably carries default
# integer column labels instead of the original feature names — confirm that is intended.

X_test_scaled = pd.DataFrame(X_test_scaled)

In [None]:
# Re-attach the scaled test features to the test columns that were left unscaled.
# The unscaled part gets a fresh 0..n-1 index so both frames join row-by-row on index.
# NOTE: X_test is rebuilt from itself here, so this cell is not safe to run twice.
scaled_feature_cols = [
    'satisfaction_level', 'last_evaluation', 'number_project',
    'average_monthly_hours', 'time_spend_company', 'salary', 'int_term_1',
]
X_test_unscaled = X_test.drop(columns=scaled_feature_cols).reset_index(drop=True)

X_test = pd.merge(X_test_scaled, X_test_unscaled, left_index=True, right_index=True)

# Modeling Round I

- KNN
- Logistic Regression
- SVM
- Linear SVC
- Naive Bayes
- Decision Tree Classifier
- Random Forest
- XGBoost ?
- LightGBM ?
- Neural Net ?

In [None]:
# Helper function for printing confusion matrices (see: https://gist.github.com/shaypal5/94c53d765083101efc0240d776a23823)

def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=18):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The array returned by sklearn.metrics.confusion_matrix (or any similarly
        shaped array of integer counts). Rows are labelled as true classes and
        columns as predicted classes.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the outputted
        figure, the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for the tick labels on both axes. Defaults to 18.

    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure.

    Raises
    ------
    ValueError
        If the heatmap cannot be annotated with the integer format "d"
        (e.g. the matrix holds non-integer values). The original error is chained.
    """
    df_cm = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    # Create the axes explicitly so every later call targets this figure rather than
    # whatever pyplot currently considers the "current" axes.
    fig, ax = plt.subplots(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d", ax=ax)
    except ValueError as err:
        # Do not leave an empty figure behind when drawing fails.
        plt.close(fig)
        raise ValueError("Confusion matrix values must be integers.") from err
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    return fig

### KNN GridSearchCV

In [None]:
# 5-fold cross-validated grid search over the number of neighbours, ranked by ROC AUC.
# NOTE(review): x_norm_train is not created in any cell above — confirm where the
# normalised training matrix comes from.

ks = range(1, 301, 50)  # candidate k values: 1, 51, 101, 151, 201, 251
param_grid = [{'n_neighbors': ks}]

knn = KNeighborsClassifier()
knn_grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
knn_grid.fit(x_norm_train, y_train)

In [None]:
# this is my best model based on above
# can run .predict(x_test)

knn_grid.best_estimator_

In [None]:
# Confusion matrix of the tuned KNN on the same training data it was fitted on
# (not held-out data); `cm` receives the Figure returned by the helper.
cm = print_confusion_matrix(confusion_matrix(y_train, knn_grid.predict(x_norm_train)), ['Class 0', 'Class 1'], figsize=(5, 4), fontsize=15)

### Logistic Regression GridSearchCV

In [None]:
## NOTES

# After fitting, apply np.exp(coefficients) to convert the coefficients from log-odds to odds ratios.
# The coefficients also need to be un-scaled, because the features were scaled before fitting.
# Note: categorical variables do not need to be scaled.
# Approach: take a subset of the dataframe, scale it, then put it back into the dataframe.
# For the pay_# columns, use label encoding so the values keep their original ordering; do not scale them either.

In [None]:
# 5-fold cross-validated grid search for logistic regression, ranked by ROC AUC:
# L1 vs. L2 penalty crossed with 100 log-spaced values of C between 1e-3 and 1e1.
# TODO: look into elastic net to see which mix of l1 and l2 works best.

penalty = ['l1', 'l2']
C = np.logspace(-3, 1, 100)
param_grid = {'C': C, 'penalty': penalty}

logistic = linear_model.LogisticRegression(solver='liblinear', max_iter=10000)
logistic_grid = GridSearchCV(
    estimator=logistic,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
logistic_grid.fit(x_norm_train, y_train)

In [None]:
# best model for logistic regression (metric = C)

logistic_grid.best_estimator_

In [None]:
# Confusion matrix of the tuned logistic regression on the same training data it was
# fitted on (not held-out data); `cm` receives the Figure returned by the helper.
cm = print_confusion_matrix(confusion_matrix(y_train, logistic_grid.predict(x_norm_train)), ['Class 0', 'Class 1'], figsize=(5, 4), fontsize=15)

In [None]:
# NOTE(review): `rfe` is not defined in any cell above, so this fails with NameError on a
# fresh kernel — confirm whether the cell that created it was deleted.
rfe.support_

In [None]:
# Refit the selected logistic regression on its own so its coefficients can be inspected;
# the coefficients indicate which features contribute most (for interpretation / presentation).
#
# Only the non-default settings are passed. The previous call was a paste of the estimator
# repr and included multi_class='warn', an internal placeholder default that newer
# scikit-learn releases reject; every other dropped argument was already the default value.
# TODO: C is copied by hand from the grid-search result — update it if the search is re-run.

logistic2 = linear_model.LogisticRegression(C=0.02848035868435802, penalty='l1',
                                            solver='liblinear', max_iter=10000)
logistic2.fit(x_norm_train, y_train)

In [None]:
a = logistic2.coef_
# a_list = a.tolist()
# flat_list = [item for x in a_list for item in x]
# a_df = pd.DataFrame(flat_list)
# a_df

In [None]:
b = x_cols
# b_df = pd.DataFrame(b)
# b_df

In [None]:
# Pair every coefficient with a feature name.
# `a` is logistic2.coef_, which is 2-D (one row of coefficients), so it has to be
# flattened first — zipping the 2-D array directly yields a single pair holding the whole row.
# NOTE(review): `b` is x_cols; if the fitted matrix has extra columns (e.g. an intercept or
# dummy-coded levels) the names will not line up one-to-one — confirm against the columns
# actually used for fitting.

list(zip(np.ravel(a), b))

In [None]:
# because ROC-AUC scores between logit and RF are so close, can just opt to use logit since it will get rid of features (because it chose l1)
# or if I decide to use RF, can then do EDA on features of low importance to intuit which features are important vs. not and why

### SVM GridSearchCV

In [None]:
# 5-fold cross-validated grid search for an RBF-kernel SVM, ranked by ROC AUC.
# The grid crosses 25 values of C with 25 values of gamma (625 combinations).

C = np.logspace(-3, 1, 25)
gammas = np.logspace(-3, 0, 25)
param_grid = {'C': C, 'gamma': gammas}

# probability=True so the fitted model exposes predict_proba.
svm1 = svm.SVC(kernel='rbf', probability=True)
svm_grid = GridSearchCV(
    estimator=svm1,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
svm_grid.fit(x_norm_train, y_train)

In [None]:
svm_grid.best_estimator_

In [None]:
# Confusion matrix of the tuned SVM on the same training data it was fitted on
# (not held-out data); `cm` receives the Figure returned by the helper.
cm = print_confusion_matrix(confusion_matrix(y_train, svm_grid.predict(x_norm_train)), ['Class 0', 'Class 1'], figsize=(5, 4), fontsize=15)

### Linear SVC

IS THE CODE CORRECT?

HOW TO ADD TO ROC/AUC/SCORING?

In [None]:
# 5-fold cross-validated grid search for a linear SVC, ranked by ROC AUC.
# Only C is tuned (a linear model has no gamma); all other LinearSVC settings stay at
# their defaults.
# NOTE(review): LinearSVC has no predict_proba, so adding it to a ROC plot would need
# decision_function scores instead — verify before wiring it into the comparison.

C = np.logspace(-3, 1, 25)
param_grid = {'C': C}

svc1 = LinearSVC()
svc_grid = GridSearchCV(
    estimator=svc1,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
svc_grid.fit(x_norm_train, y_train)

In [None]:
svc_grid.best_estimator_

In [None]:
# Confusion matrix of the tuned linear SVC on the same training data it was fitted on
# (not held-out data); `cm` receives the Figure returned by the helper.
cm = print_confusion_matrix(confusion_matrix(y_train, svc_grid.predict(x_norm_train)), ['Class 0', 'Class 1'], figsize=(5, 4), fontsize=15)

### Naive Bayes Model

In [None]:
# Fit a Naive Bayes Model
# Gaussian Naive Bayes with default settings (no hyper-parameter search), fitted on the
# same x_norm_train / y_train used for the grid-searched models.

gnb = GaussianNB()
gnb_best = gnb.fit(x_norm_train, y_train)

In [None]:
# Confusion matrix of the Naive Bayes model on the same training data it was fitted on
# (not held-out data); `cm` receives the Figure returned by the helper.
cm = print_confusion_matrix(confusion_matrix(y_train, gnb_best.predict(x_norm_train)), ['Class 0', 'Class 1'], figsize=(5, 4), fontsize=15)

### Decision Tree Classifier GridSearchCV

CORRECT CODE BELOW?

ADD TO ROC-AUC & SCORING

In [None]:
# Run CV with 5 folds (Decision Tree Classifier)

# Parameter grid for a single decision tree.
# 'bootstrap' and 'n_estimators' are random-forest settings and are not accepted by
# DecisionTreeClassifier, so they are not part of this grid.
param_grid = {
    'max_depth': [50, 100, None],
    'max_features': ['sqrt'],  # consider sqrt(n_features) candidate features at each split
    'min_samples_leaf': [1, 2, 5, 10],
    'min_samples_split': [2, 3, 5, 10],
}

dectree = DecisionTreeClassifier()
# Search over the decision tree itself. The previous code passed `rf`, which is only
# created further down in the Random Forest section.
dectree_grid = GridSearchCV(estimator=dectree, param_grid=param_grid, cv=5, scoring='roc_auc', verbose=10, n_jobs=-1)
dectree_grid.fit(x_norm_train, y_train)

In [None]:
dectree_grid.best_estimator_

In [None]:
# Confusion matrix of the tuned decision-tree search on the same training data it was
# fitted on (not held-out data); `cm` receives the Figure returned by the helper.
cm = print_confusion_matrix(confusion_matrix(y_train, dectree_grid.predict(x_norm_train)), ['Class 0', 'Class 1'], figsize=(5, 4), fontsize=15)

### Random Forest GridSearchCV

In [None]:
# 5-fold cross-validated grid search for a random forest, ranked by ROC AUC.
# The grid has 3 x 4 x 4 x 4 = 192 combinations, each fitted 5 times.

param_grid = dict(
    bootstrap=[True],
    max_depth=[50, 100, None],
    max_features=['sqrt'],  # consider sqrt(n_features) candidate features at each split
    min_samples_leaf=[1, 2, 5, 10],
    min_samples_split=[2, 3, 5, 10],
    n_estimators=[100, 200, 400, 1000],
)

rf = RandomForestClassifier()
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
rf_grid.fit(x_norm_train, y_train)

In [None]:
rf_grid.best_estimator_

In [None]:
# Confusion matrix of the tuned random forest on the same training data it was fitted on
# (not held-out data); `cm` receives the Figure returned by the helper.
cm = print_confusion_matrix(confusion_matrix(y_train, rf_grid.predict(x_norm_train)), ['Class 0', 'Class 1'], figsize=(5, 4), fontsize=15)

In [None]:
# Random Forest Feature Importances
# The models in this notebook predict a class label, so the importances are taken from a
# classifier (a regressor would rank features by squared-error reduction instead).
# random_state reuses the notebook-wide seed so the importances are reproducible.

rf2 = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=seed)
rf2.fit(x_norm_train, y_train)
rf2.feature_importances_

In [None]:
# Pair each feature importance with the name of the design-matrix column it belongs to,
# most important first.
# The names come from the Patsy design matrix, not from model_data.columns: the raw frame
# also holds the target column and is not in design-matrix order, so zipping against it
# mislabels the importances.
# NOTE(review): assumes x_norm_train keeps the same columns, in the same order, as
# x_patsy — a length mismatch raises here instead of being silently truncated.

feature_names = x_patsy.design_info.column_names
pd.DataFrame({'importance': rf2.feature_importances_, 'feature': feature_names}).sort_values('importance', ascending=False)

### Gradient Boosting Classifier GridSearchCV

In [None]:
# 5-fold cross-validated grid search for gradient boosting, ranked by ROC AUC.
# - "loss" is left at the estimator default: the explicit name "deviance" is no longer
#   accepted by newer scikit-learn releases, while the default works on old and new versions.
# - "criterion" is limited to "friedman_mse": the "mae" option was deprecated and then
#   removed from scikit-learn, so keeping it makes the search fail on current versions.
parameter_grid = {
    "learning_rate": [0.05, 0.075, 0.1],
    "min_samples_split": np.linspace(0.01, 0.1, 6),  # fractions of the sample count
    "min_samples_leaf": np.linspace(0.1, 0.5, 6),    # fractions of the sample count
    "max_depth": [10, 20],
    "max_features": ["sqrt"],
    "criterion": ["friedman_mse"],
    "subsample": [1.0],
    "n_estimators": [125],
}
gbc_grid = GridSearchCV(GradientBoostingClassifier(), parameter_grid, cv=5, n_jobs=-1, scoring='roc_auc', verbose=True)
gbc_grid.fit(x_norm_train, y_train)

### XGBoost

In [None]:
# run later and do feature importances

### LightGBM

### Neural Net?

### Scoring the models

#### ROC_AUC

In [None]:
# Report the best mean cross-validated ROC AUC found by each grid search.
roc_auc_reports = [
    ('Best ROC_AUC for knn: %0.4f', knn_grid),
    ('Best ROC_AUC for logit: %0.4f', logistic_grid),
    ('Best ROC_AUC for svm: %0.4f', svm_grid),
    ('Best ROC_AUC for rf: %0.4f', rf_grid),
]
for template, grid in roc_auc_reports:
    print(template % grid.best_score_)

#### F1

In [None]:
# score on F1

# from sklearn.metrics import f1_score

# y_true = 
# y_pred = 

# f1_score(y_true, y_pred, average='macro')  

# f1_score(y_true, y_pred, average='micro')  

# f1_score(y_true, y_pred, average='weighted')  

# f1_score(y_true, y_pred, average=None)


#### Best parameters

In [None]:
# Report the winning hyper-parameter combination of each grid search.
# (Label typo "Patams" corrected to "Params" so all four lines read consistently.)
print('Best Params for knn: ', knn_grid.best_params_)
print('Best Params for logit: ', logistic_grid.best_params_)
print('Best Params for svm: ', svm_grid.best_params_)
print('Best Params for rf: ', rf_grid.best_params_)

#### NOTE QUESTIONS IN BELOW COMMENTED CODE

In [None]:
# ROC curves for all fitted models on the test data, plus a weighted blend ("ensemble").
#
# Open questions carried over from the original cell:
#   - Are x_norm_test / y_test the correct X/y variables to use here?
#   - How does the ensemble work?
#   - How should newer models (e.g. Linear SVC) be added to this comparison?

model_list = [
    knn_grid.best_estimator_,
    logistic_grid.best_estimator_,
    svm_grid.best_estimator_,
    gnb_best,
    rf_grid.best_estimator_,
    'ensemble',  # placeholder: handled as a weighted average of three models below
]
model_name = ['knn', 'logit', 'svm', 'n_bayes', 'random_forest', 'ensemble']

fig, ax = plt.subplots(figsize=(10,8))
for name, model in zip(model_name, model_list):
    if model != 'ensemble':
        # Probability of the positive class for a single fitted model.
        y_pred = list(model.predict_proba(x_norm_test)[:,1])
    else:
        # Weighted average of positive-class probabilities:
        # logit (w1), random forest (w2), Naive Bayes (remaining weight).
        w1 = 0.10
        w2 = 0.80
        proba_logit = logistic_grid.best_estimator_.predict_proba(x_norm_test)[:,1]
        proba_rf = rf_grid.best_estimator_.predict_proba(x_norm_test)[:,1]
        proba_nb = gnb_best.predict_proba(x_norm_test)[:,1]
        y_pred = (w1*proba_logit
                  + w2*proba_rf
                  + (1-w1-w2)*proba_nb)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    roc_auc = metrics.auc(fpr, tpr)
    ax.plot(fpr, tpr, label = (name + ' AUC = %0.4f' % roc_auc))

ax.legend(loc = 'lower right')
ax.set_title('Receiver Operating Characteristic')
ax.plot([0, 1], [0, 1],'r--')  # chance-level diagonal for reference
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()