In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import uniform
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import model_selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
import time

from keras import optimizers
from keras import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
# Location of the credit-card CSV.
# NOTE(review): absolute, machine-specific path — consider a path relative to the notebook.
DATA_PATH = r'C:\Users\tjowayne\Desktop\Model\credit_card.csv'

# Read the file and discard the 'ID' column before any further processing.
df = pd.read_csv(DATA_PATH).drop(columns=['ID'])
df.head()

# One-hot Encoding

In [None]:
# One-hot encode the listed columns; every dummy column is named '<column>_<value>'.
categorical_cols = ['EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
df1 = pd.get_dummies(df, columns=categorical_cols, prefix=categorical_cols)
df1.columns.values

In [None]:
# Target vector and predictor matrix taken from the encoded frame.
y = df1['default.payment.next.month']
X = df1.drop(columns='default.payment.next.month')

# Handling Imbalanced Data Distribution

In [None]:
# Class balance of the target in the raw frame, reported as percentages.
default_flag = df['default.payment.next.month']
count_no_default = len(df[default_flag == 0])
count_default = len(df[default_flag == 1])
n_labelled = count_no_default + count_default

pct_of_no_default = count_no_default / n_labelled
pct_of_default = count_default / n_labelled
print("percentage of no default is", pct_of_no_default*100)
print("percentage of default", pct_of_default*100)

In [None]:
# Row counts per class in the encoded frame: label 1 first, then label 0.
for label in (1, 0):
    print(len(df1.loc[df1['default.payment.next.month'] == label]))

In [None]:
# Create a class-balanced hold-out set (500 rows of each class); every row that is
# not sampled into it becomes the training set.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
test = pd.concat(
    [
        df1.loc[df1['default.payment.next.month'] == 1].sample(n=500, random_state=7),
        df1.loc[df1['default.payment.next.month'] == 0].sample(n=500, random_state=7),
    ],
    sort=False,
)
# Shuffle so the two classes are interleaved instead of stacked.
test = test.sample(frac=1, random_state=14)
train = df1.drop(test.index)

In [None]:
# Split each partition into predictors (all columns but the target) and the target.
# Selecting with a boolean column mask keeps the targets as one-column DataFrames.
train_target_mask = train.columns == 'default.payment.next.month'
test_target_mask = test.columns == 'default.payment.next.month'

X_train, y_train = train.loc[:, ~train_target_mask], train.loc[:, train_target_mask]
X_test, y_test = test.loc[:, ~test_target_mask], test.loc[:, test_target_mask]

In [None]:
# train = df1.loc[df1['default.payment.next.month'] == 1].sample(n=3680,random_state=7)
# train = train.append(df1.loc[df1['default.payment.next.month'] == 0].sample(n=3680,random_state=7),sort=False)
# train = train.sample(frac=1, random_state=7)
# test = df1.drop(train.index)

# SMOTE

In [None]:
# Random 70/30 split of the full encoded data; rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Report the shape of every partition.
for part_name, part in (("X_train", X_train), ("y_train", y_train),
                        ("X_test", X_test), ("y_test", y_test)):
    print("Number transactions {} dataset: ".format(part_name), part.shape)

In [None]:
# Convert the training split to plain arrays for SMOTE and remember the column
# names so the resampled data can be turned back into a DataFrame.
X_train_sm = np.array(X_train)
y_train_sm = np.array(y_train)
columns = X_train.columns

# Oversample the training split only.
# fit_resample is the supported imbalanced-learn API; the old fit_sample alias was
# deprecated and later removed, so calling it fails on current releases.
# NOTE(review): the name `os` would shadow the standard-library module if it is ever imported.
os = SMOTE(random_state=7)
X_train_res, y_train_res = os.fit_resample(X_train_sm, y_train_sm.ravel())

# we can Check the numbers of our data
print("Before OverSampling, counts of label '1': {}".format(sum(y_train_sm==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train_sm==0)))
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))

# Rebuild labelled frames from the resampled arrays.
X_train_res = pd.DataFrame(data=X_train_res, columns=columns )
y_train_res= pd.DataFrame(data=y_train_res, columns=['default.payment.next.month'])

In [None]:
# Give every resampled column the dtype the same column has in X_train.
X_train_res = X_train_res.astype({name: X_train[name].dtype for name in columns})

# Random Forest

In [None]:
''' Feature importance function '''
def rf_feat_importance(m, df):
    """Return a table of feature importances, strongest first.

    m  -- fitted model exposing `feature_importances_`
    df -- DataFrame whose columns name the features, in the model's order

    The result has two columns, 'cols' (feature name) and 'imp' (importance),
    sorted by 'imp' in descending order.
    """
    importance_table = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importance_table.sort_values(by='imp', ascending=False)

In [None]:
def fix_missing(df, col, name, na_dict):
    """Fill missing values of a numeric column in place and record the filler.

    Non-numeric columns are left alone. A numeric column is processed when it
    contains nulls or when `name` already has an entry in `na_dict`: a boolean
    '<name>_na' indicator column is added to `df`, nulls are replaced by the
    value stored in `na_dict` (or by the column median when there is none), and
    the filler is written back to `na_dict`.

    Returns the (possibly updated) `na_dict`.
    """
    if not is_numeric_dtype(col):
        return na_dict

    null_mask = pd.isnull(col)
    if not (null_mask.sum() or name in na_dict):
        return na_dict

    df[name + '_na'] = null_mask
    if name in na_dict:
        filler = na_dict[name]
    else:
        filler = col.median()
    df[name] = col.fillna(filler)
    na_dict[name] = filler
    return na_dict

def numericalize(df, col, name, max_n_cat):
    """Replace a non-numeric column of `df` with integer category codes (+1).

    Numeric columns are skipped. When `max_n_cat` is given, the column is only
    converted if it has more than `max_n_cat` categories (read through the
    `.cat` accessor). Codes are shifted by one, so a missing value (code -1)
    becomes 0.
    """
    if is_numeric_dtype(col):
        return
    if max_n_cat is not None and len(col.cat.categories) <= max_n_cat:
        return
    df[name] = pd.Categorical(col).codes + 1

def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    if not ignore_flds: ignore_flds=[]
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    else: df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = pd.Categorical(df[y_fld]).codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    else: na_dict = na_dict.copy()
    na_dict_initial = na_dict.copy()
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if len(na_dict_initial.keys()) > 0:
        df.drop([a + '_na' for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))], axis=1, inplace=True)
    if do_scale: mapper = scale_vars(df, mapper)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res

In [None]:
# Run proc_df on the encoded frame: feature frame, target values and the
# dictionary of fill values used for missing data.
df_trn, y_trn, nas = proc_df(df1, y_fld='default.payment.next.month')

In [None]:
# Random-forest names for the current split (aliases of the same objects, not copies).
X_train_rf, y_train_rf = X_train, y_train
X_test_rf, y_test_rf = X_test, y_test

In [None]:
# Random forest model creation
# Baseline forest with library defaults (no random_state, so results vary between runs).
rfc = RandomForestClassifier()
rfc.fit(X_train_rf, y_train_rf)
# Predictions
rfc_predict = rfc.predict(X_test_rf)
# 10-fold cross-validated ROC AUC. Note this is computed on the full X / y,
# not on the training split the model above was fitted on.
rfc_cv_score = cross_val_score(rfc, X, y, cv=10, scoring='roc_auc')

In [None]:
# Ten most important features of the baseline forest.
# NOTE(review): names are taken from df_trn while rfc was fitted on X_train_rf —
# assumes both have identical columns in the same order; confirm.
fi = rf_feat_importance(rfc, df_trn)
fi.head(10)

In [None]:
# Keep only the features whose importance exceeds MIN_IMPORTANCE and restrict
# both splits to them.
# NOTE(review): the original note on this cell said 0.005 while the code filtered
# on 0.05. The executed value (0.05) is kept here; lower it if 0.005 was intended.
MIN_IMPORTANCE = 0.05
to_keep = fi[fi.imp > MIN_IMPORTANCE].cols
X_train_rf = X_train_rf[to_keep.values]
X_test_rf = X_test_rf[to_keep.values]
print(len(to_keep))

In [None]:
# Search space for the random-forest grid search.

# number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
n_estimators = np.linspace(start=1400, stop=1800, num=3).astype(int).tolist()

# number of features at every split
max_features = ['sqrt'] # 'auto'

# max depth (three evenly spaced depths plus "unlimited")
# max_depth = [int(x) for x in np.linspace(100, 500, num = 11)]
# max_depth.append(None)
max_depth = np.linspace(420, 500, num=3).astype(int).tolist() + [None]

# minimum number of samples required to be at a leaf node
min_samples_leaf = np.linspace(3, 5, 3, endpoint=True).astype(int).tolist()

# Whether to use out-of-bag samples to estimate the generalization accuracy
oob_score = [True]

# minimum number of samples required to split an internal node
# min_samples_split = [x for x in np.linspace(0.1, 1.0, 10, endpoint=True)]

# create random grid
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    oob_score=oob_score,
)

# Shuffled, seeded 10-fold stratified splitter used by the search.
cv = StratifiedKFold(n_splits=10, random_state=14, shuffle=True)

In [None]:
# Time the whole search so its cost can be reported afterwards.
start_time = time.time()
# Grid search of parameters
# The folds are passed as the iterable produced by cv.split on the (feature-reduced)
# training split; every combination in random_grid is evaluated on all folds.
rfc_grid = GridSearchCV(estimator = rfc, param_grid = random_grid, cv = cv.split(X_train_rf, y_train_rf),
                        verbose=5, n_jobs=-1)
# Fit the model
rfc_grid.fit(X_train_rf, y_train_rf)

end_time = time.time()
# Elapsed wall-clock time in seconds.
time_taken = end_time - start_time

# print results
print(rfc_grid.best_params_)

In [None]:
# Duration of the random-forest grid search, converted from seconds to hours.
time_taken/3600

In [None]:
# Plug values back into the model to see if it improved our performance
# NOTE(review): the hyperparameters are typed in by hand rather than read from
# rfc_grid.best_params_ — confirm they match the search result.
rfc = RandomForestClassifier(n_estimators = 1400, min_samples_leaf = 5, max_features = 'sqrt', max_depth = 420,
                             oob_score = True)
rfc.fit(X_train_rf, y_train_rf)
rfc_predict = rfc.predict(X_test_rf)
# Cross-validated AUC is computed on the full X / y (all columns), whereas the
# fit above uses the feature-reduced training split.
rfc_cv_score = cross_val_score(rfc, X, y, cv=5, scoring='roc_auc')
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test_rf, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test_rf, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

# Logistic Regression

In [None]:
# Logistic-regression names for the SMOTE-balanced training data and the hold-out split.
X_train_lg = X_train_res
y_train_lg = y_train_res
X_test_lg = X_test
y_test_lg = y_test

# Separate class-balanced sample (5200 rows per class) used for feature selection.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
log = pd.concat(
    [
        df1.loc[df1['default.payment.next.month'] == 1].sample(n=5200, random_state=7),
        df1.loc[df1['default.payment.next.month'] == 0].sample(n=5200, random_state=7),
    ],
    sort=False,
)
# Shuffle so the two classes are interleaved.
log = log.sample(frac=1, random_state=7)
X_log = log.drop('default.payment.next.month', axis=1)
y_log = log['default.payment.next.month']

In [None]:
# Recursive feature elimination down to 20 features with a logistic-regression base model.
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 no longer accepts it
# positionally, and the keyword form works on older releases too.
logreg = LogisticRegression()
rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(X_log, y_log.values.ravel())
# support_ is the boolean keep-mask, ranking_ the elimination rank (1 = selected).
print(rfe.support_)
print(rfe.ranking_)

In [None]:
# Names of the features RFE kept. The mask must be applied to the columns RFE was
# fitted on (X_log). df1.columns also contains the target column, so indexing it
# with the mask shifts every name located after the target by one position.
cols = X_log.columns.values[rfe.support_]

In [None]:
# Fit a statsmodels logit on the RFE-selected columns and print its summary table.
X_log1 = X_log[cols]
logit_model = sm.Logit(y_log, X_log1)
result = logit_model.fit(method='bfgs', maxiter=1000)
print(result.summary2())

In [None]:
# Hand-picked columns left after removing variables with p-values above 0.05,
# followed by a refit of the logit on that reduced set.
cols = [
    'EDUCATION_3',
    'PAY_0_1', 'PAY_0_2', 'PAY_0_3', 'PAY_0_8',
    'PAY_2_2', 'PAY_2_3',
    'PAY_3_2', 'PAY_3_3',
    'PAY_5_2', 'PAY_5_3',
]
X_log2 = X_log[cols]
logit_model = sm.Logit(y_log, X_log2)
result = logit_model.fit(method='bfgs', maxiter=1000)
print(result.summary2())

In [None]:
# Restrict both splits to the selected columns and fit a default logistic regression.
X_train_lg = X_train_lg[cols]
X_test_lg = X_test_lg[cols]
logreg = LogisticRegression()
# Pass the target as a 1-D array: a one-column DataFrame makes scikit-learn emit a
# DataConversionWarning. This matches how y is passed to RFE in this notebook.
logreg.fit(X_train_lg, y_train_lg.values.ravel())

In [None]:
# Create regularization penalty space
penalty = ['l1', 'l2']

# Both penalties must be supported by the solver. scikit-learn's default solver
# (lbfgs since 0.22) rejects 'l1', so every l1 candidate would fail to fit.
# liblinear handles l1 and l2 and was the default on older releases.
solver = ['liblinear']

# Create regularization hyperparameter distribution using uniform distribution
# C = uniform(loc=0, scale=4)
# Seven inverse-regularization strengths, log-spaced from 1e-3 to 1e3.
C = np.logspace(-3,3,7)

fit_intercept = [True, False]

# Create hyperparameter options
hyperparameters = dict(C=C, fit_intercept=fit_intercept, penalty=penalty, solver=solver)

# Shuffled, seeded 10-fold stratified splitter used by the search.
cv = StratifiedKFold(n_splits=10, random_state=14, shuffle=True)

In [None]:
# Time the whole search so its cost can be reported afterwards.
start_time = time.time()
# Create grid search 10-fold cross validation
# The folds are passed as the iterable produced by cv.split on the training data.
clf = GridSearchCV(estimator = logreg, param_grid = hyperparameters, cv = cv.split(X_train_lg, y_train_lg),
                   verbose=5, n_jobs=-1)
# Fit the grid search (exhaustive over `hyperparameters`, despite the variable names)
best_model = clf.fit(X_train_lg, y_train_lg)

end_time = time.time()
# Elapsed wall-clock time in seconds.
time_taken = end_time - start_time

In [None]:
# Duration of the logistic-regression grid search, in seconds.
time_taken

In [None]:
# Report the hyperparameters of the refitted best estimator.
best_lg_params = best_model.best_estimator_.get_params()
print('Best Penalty:', best_lg_params['penalty'])
print('Best C:', best_lg_params['C'])
print('Best fit_intercept:', best_lg_params['fit_intercept'])

In [None]:
# Predict the hold-out split with the tuned model and report its score.
y_pred = best_model.predict(X_test_lg)
lg_test_accuracy = best_model.score(X_test_lg, y_test_lg)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(lg_test_accuracy))

In [None]:
# Store the matrix under its own name. Assigning it to `confusion_matrix` replaced
# the imported sklearn function, so re-running this cell (or any later call to
# confusion_matrix) failed with "'numpy.ndarray' object is not callable".
cm_lg = confusion_matrix(y_test_lg, y_pred)
print("=== Confusion Matrix ===")
print(cm_lg)
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test_lg, y_pred))
print('\n')

In [None]:
# ROC curve and AUC for the logistic regression on the hold-out split.
# The AUC has to be computed from the predicted probabilities. Feeding hard 0/1
# predictions collapses the curve to a single operating point, so the reported
# area disagreed with the curve that is actually plotted.
# NOTE(review): this evaluates `logreg` (the default fit), not the tuned
# `best_model` used for the accuracy report — confirm which one is intended.
logit_probs = logreg.predict_proba(X_test_lg)[:, 1]
logit_roc_auc = roc_auc_score(y_test_lg, logit_probs)
fpr, tpr, thresholds = roc_curve(y_test_lg, logit_probs)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line = performance of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

# Neural Network

In [None]:
# Neural-network names for the SMOTE-balanced training data and the hold-out split
# (aliases of the same objects, not copies).
X_train_nn, y_train_nn = X_train_res, y_train_res
X_test_nn, y_test_nn = X_test, y_test

# Columns passed to the scaler below.
cols = [
    'LIMIT_BAL', 'AGE',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]

In [None]:
# Standardize numerical columns to zero mean and unit variance
# (StandardScaler does not map values into the range -1..1).
scaler = StandardScaler()
def scale_numerical(df,columns):
    """Standardize the given columns of `df` in place and return `df`.

    All columns are fitted and transformed in a single call, which gives the same
    per-column result as scaling them one at a time, but leaves the module-level
    `scaler` fitted on every column instead of only on the last one.

    Note: each call re-fits `scaler` on the data it receives, and the frame that
    is passed in is modified.
    """
    columns = list(columns)
    if not columns:
        return df
    # Whole-column assignment lets integer columns become float without the
    # dtype-incompatibility warning/error that `.loc` item assignment raises
    # on recent pandas versions.
    df[columns] = scaler.fit_transform(df[columns])
    return df

In [None]:
#standardizing the input feature
# Fit the scaler on the training data only and reuse those statistics for the test
# data. Scaling the test split with its own mean/variance puts it on a different
# scale than the data the model is trained on.
# Copies are scaled so the shared X_test / X_train_res frames are not modified.
X_train_pca = X_train_nn.copy()
X_test_nn = X_test_nn.copy()
nn_scaler = StandardScaler().fit(X_train_pca[cols])
X_train_pca[cols] = nn_scaler.transform(X_train_pca[cols])
X_test_nn[cols] = nn_scaler.transform(X_test_nn[cols])

In [None]:
# PCA keeping 74 components, used to inspect how variance is distributed.
# NOTE(review): 74 presumably equals the number of feature columns in X_train_pca — confirm.
pca = PCA(n_components=74, random_state=7)
pca.fit(X_train_pca)

In [None]:
# Share of the total variance explained by each principal component, in percent.
var = pca.explained_variance_ratio_
print(100 * var)

In [None]:
# Cumulative explained variance (percent), built from ratios rounded to 4 decimals.
rounded_pct = np.round(pca.explained_variance_ratio_, decimals=4) * 100
var1 = np.cumsum(rounded_pct)
print(var1)
plt.plot(var1)

In [None]:
# Project the training data onto 14 principal components.
# fit_transform already fits the model, so the separate fit call was redundant work.
# random_state matches the full PCA in this notebook so results are reproducible
# when a randomized SVD solver is selected.
pca = PCA(n_components=14, random_state=7)
X_train_pca1 = pca.fit_transform(X_train_pca)
X_train_pca1

In [None]:
# Absolute component loadings as a DataFrame (one row per principal component).
components_df = pd.DataFrame(np.abs(pca.components_))

In [None]:
# Map the positional column labels 0..n-1 to the matching feature names.
n_loading_cols = len(components_df.columns)
column_dict = dict(enumerate(X_train_nn.columns[:n_loading_cols]))
components_df.rename(columns=column_dict, inplace=True)

In [None]:
# Preview the first rows of the loading table.
components_df.head()

In [None]:
# Weight every component's loadings (one row each) by that component's explained-variance ratio.
var = pca.explained_variance_ratio_
components_df1 = components_df.mul(var, axis=0)

In [None]:
# Attach each component's explained-variance ratio as an extra column (modifies
# components_df), then show the rows of the transposed table with the largest
# values in the first component.
components_df['variance'] = var
components_df.T.sort_values(by=0,ascending=False).head()

In [None]:
# Sum the variance-weighted loadings per feature and rank the features by that total.
var_weight = (
    pd.DataFrame(components_df1.sum(axis=0))
    .reset_index()
    .rename(columns={'index': 'feature', 0: 'variance_weight'})
    .sort_values(by='variance_weight', ascending=False)
)
var_weight.head(14)

In [None]:
# Names of the 14 highest-ranked features; note this rebinds `cols`.
cols = var_weight.feature.head(14).values

In [None]:
# X_train_nn = pca.transform(X_train_nn)
# X_test_nn = pca.transform(X_test_nn)
# Keep the 14-feature subsets under their own names. The network in this notebook
# is built with input_dim=74 and trained on the full X_train_pca, so overwriting
# X_test_nn with only 14 columns made the later predict/score calls fail with a
# shape mismatch.
X_train_nn_top = X_train_pca[cols]
X_test_nn_top = X_test_nn[cols]

In [None]:
# Train the network on the full scaled feature set (rebinds X_train_nn).
X_train_nn = X_train_pca

In [None]:
import tensorflow as tf
import keras.backend as K

In [None]:
# def f1(y_true, y_pred):
#     y_pred = K.round(y_pred)
#     tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
#     tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
#     fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
#     fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

#     p = tp / (tp + fp + K.epsilon())
#     r = tp / (tp + fn + K.epsilon())

#     f1 = 2*p*r / (p+r+K.epsilon())
#     f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)
#     return K.mean(f1)

def f1(y_true, y_pred, beta=1, threshold=0.4):
    """F-beta metric on thresholded predictions (F1 when beta is 1).

    Predictions greater than `threshold` are counted as positive. True/false
    positives and false negatives are summed along axis 0, turned into precision
    and recall (with K.epsilon() guarding the denominators), combined into the
    F-beta score, and averaged.
    """
    truth = K.cast(y_true, 'float')
    hard_pred = K.cast(K.greater(K.cast(y_pred, 'float'), threshold), 'float')

    true_pos = K.sum(truth * hard_pred, axis=0)
    false_pos = K.sum((1 - truth) * hard_pred, axis=0)
    false_neg = K.sum(truth * (1 - hard_pred), axis=0)

    precision = true_pos / (true_pos + false_pos + K.epsilon())
    recall = true_pos / (true_pos + false_neg + K.epsilon())

    beta_sq = beta ** 2
    score = (1 + beta_sq) * precision * recall / (beta_sq * precision + recall + K.epsilon())
    # Replace any NaN entries with 0 before averaging.
    score = tf.where(tf.is_nan(score), tf.zeros_like(score), score)

    return K.mean(score)

def f1_loss(y_true, y_pred):
    """Loss equal to 1 minus a soft F1 score.

    Unlike the `f1` metric, predictions are not thresholded: the raw values of
    `y_pred` are multiplied with the labels, so the counts below are fractional.
    Sums run along axis 0 and the resulting F1 values are averaged.
    """
    # The true-negative count is not part of F1, so the unused `tn` term was removed.
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    # K.epsilon() keeps the divisions defined when a denominator would be zero.
    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    # Replace any NaN entries with 0 before averaging.
    f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

In [None]:
def build_classifier(learn_rate=0.01, neurons=10, input_dim=74):
    """Build and compile a one-hidden-layer binary classifier.

    learn_rate -- learning rate handed to the Adam optimizer
    neurons    -- number of units in the hidden ReLU layer
    input_dim  -- number of input features; defaults to 74, the value that was
                  previously hard-coded, so existing callers are unaffected

    The model ends in a single sigmoid unit and is compiled with `f1_loss` as the
    loss and 'accuracy' plus `f1` as metrics.
    """
    classifier = Sequential()
    #First Hidden Layer
    classifier.add(Dense(neurons, activation='relu', kernel_initializer='random_uniform', input_dim=input_dim))
    #Second  Hidden Layer
    # classifier.add(Dense(neurons, activation='relu', kernel_initializer='random_uniform'))
    #Output Layer
    classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_uniform'))
    #Compiling the neural network
    optimizer = optimizers.Adam(lr=learn_rate)
    classifier.compile(optimizer=optimizer, loss=f1_loss, metrics =['accuracy', f1])
    return classifier

In [None]:
# Wrap the Keras model builder in the scikit-learn estimator interface so it can be
# used with the cross-validated searches below.
classifier = KerasClassifier(build_fn=build_classifier, verbose=5)

In [None]:
# batch_size = [10, 20, 40, 60, 80, 100]
# epochs = [10, 50, 100]
# learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
# neurons = [1, 5, 10, 15, 20, 25, 30]

# Narrowed search space for the neural-network grid search.
batch_size = [40, 80, 100]
epochs = [10, 50, 100]
learn_rate = [0.01, 0.1, 0.3]
neurons = [15, 20, 25]
param_grid = {
    'batch_size': batch_size,
    'epochs': epochs,
    'learn_rate': learn_rate,
    'neurons': neurons,
}

# Shuffled, seeded 10-fold stratified splitter used by the search.
cv = StratifiedKFold(n_splits=10, random_state=14, shuffle=True)

In [None]:
# Earlier randomized-search variant, kept for reference:
# grid = RandomizedSearchCV(estimator=classifier, param_distributions=param_grid, n_iter=10, random_state=7,
#                           cv = cv.split(X_train_nn, y_train_nn), verbose=5, n_jobs=-1)
# grid_result = grid.fit(X_train_nn, y_train_nn)
# Exhaustive search over param_grid; the folds are passed as the iterable produced
# by cv.split on the training data.
grid = GridSearchCV(estimator=classifier, param_grid=param_grid, cv = cv.split(X_train_nn, y_train_nn),
                    verbose=10, n_jobs=-1)
grid_result = grid.fit(X_train_nn, y_train_nn)

In [None]:
# Summarize the search: best configuration first, then mean (std) test score of
# every parameter combination.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Predict the hold-out split with the best estimator found by the search and report its score.
best_nn = grid_result.best_estimator_
y_pred = best_nn.predict(X_test_nn)
nn_test_accuracy = best_nn.score(X_test_nn, y_test_nn)
print('Accuracy of neural net on test set: {:.2f}'.format(nn_test_accuracy))

In [None]:
# Call the function through the `metrics` module and store the result under its own
# name. The bare name `confusion_matrix` may already have been rebound to an array
# by the logistic-regression section, which made this call fail with
# "'numpy.ndarray' object is not callable".
cm_nn = metrics.confusion_matrix(y_test_nn, y_pred)
print("=== Confusion Matrix ===")
print(cm_nn)
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test_nn, y_pred))
print('\n')