In [None]:
import pandas as pd

In [None]:
# Load the credit-card dataset: the first CSV column becomes the index and
# empty strings are parsed as missing values.
df = pd.read_csv('UCI_Credit_Card.csv', index_col =0, na_values="")
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Collapse the EDUCATION codes 0, 5 and 6 into code 4.
df['EDUCATION'] = df['EDUCATION'].replace({0: 4, 5: 4, 6: 4})
print(df['EDUCATION'].value_counts())

# Recode MARRIAGE == 0 as 3.
fil = df['MARRIAGE'].eq(0)
df.loc[fil, 'MARRIAGE'] = 3
print(df['MARRIAGE'].value_counts())

# Rename PAY_0 to PAY_1 so it lines up with the PAY_2 ... PAY_6 columns used later.
df = df.rename(columns={'PAY_0': 'PAY_1'})

In [None]:
# Inspect the dtype pandas inferred for every column.
df.dtypes

In [None]:
def memory_usage(df, columns=5):
    """Print a short memory report for a DataFrame.

    Args:
        df: DataFrame to inspect.
        columns: how many of the largest entries (by memory) to list.

    Prints the per-entry deep memory usage in MB for the `columns` largest
    entries of `df.memory_usage(deep=True)` followed by the total; returns None.
    """
    print('Memory usage ----')
    bytes_per_mb = 1024 ** 2
    memory_per_column = df.memory_usage(deep=True) / bytes_per_mb
    print(f'Top {columns} columns by memory (MB):')
    largest_first = memory_per_column.sort_values(ascending=False)
    print(largest_first.head(columns))
    total_mb = memory_per_column.sum()
    print(f'Total size: {total_mb:.4f} MB')
    
# Report the memory footprint of the loaded frame (default: top 5 entries).
memory_usage(df)

In [None]:
%matplotlib inline
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Summary statistics, rounded to whole numbers and transposed so each column is shown as a row.
df.describe().round().T

In [None]:
# Overlay the kernel-density estimate of AGE for each SEX code on one shared axis.
# `sns.distplot` is deprecated, so the shaded density curves are drawn with
# `sns.kdeplot(..., fill=True)` instead.
fig, ax = plt.subplots()
sns.kdeplot(df.loc[df.SEX==1, 'AGE'].dropna(),
    fill=True, color='blue',
    ax=ax, label='Male')
sns.kdeplot(df.loc[df.SEX==2, 'AGE'].dropna(),
    fill=True, color='red',
    ax=ax, label='Female')
ax.set_title('Distribution of age')
ax.legend(title='Gender:')

In [None]:
# Count of each target class, split by SEX.
# The column is passed as the keyword `x=` because current seaborn releases no longer
# accept it positionally; the bars are vertical since the variable is bound to x.
ax = sns.countplot(x='default.payment.next.month', hue='SEX', data=df)
ax.set_title('Target variable distribution')

In [None]:
# Distribution of LIMIT_BAL per EDUCATION code; split=True draws the two SEX groups
# as the two halves of each violin.
ax = sns.violinplot(x='EDUCATION', y='LIMIT_BAL',hue='SEX', split=True, data=df)
ax.set_title('Limit balance per education level distribution')

In [None]:
# Share of each target class within every EDUCATION code, drawn as stacked horizontal bars.
# value_counts(normalize=True) yields per-group fractions; unstack() turns the target
# classes into columns so each bar stacks to 1.
ax = df.groupby("EDUCATION")['default.payment.next.month'] \
    .value_counts(normalize=True) \
    .unstack() \
    .plot(kind='barh', stacked=True)  # boolean flag, not the string 'True'
ax.set_title('Percentage of default per education level')
ax.legend(title='Default', bbox_to_anchor=(1,1))

In [None]:
# Use a wide figure and a smaller font so the annotated correlation matrix stays readable.
# Note: sns.set / sns.set_context change global plotting settings, so later figures are affected too.
sns.set(rc={'figure.figsize':(25,8)})
sns.set_context("talk", font_scale=0.7)
# Pairwise correlations of all columns, annotated with their values.
sns.heatmap(df.corr(), cmap='Oranges', annot=True)

In [None]:
# Pairwise scatter/distribution grid for EDUCATION, LIMIT_BAL and the target column.
pair_plot = sns.pairplot(df[['EDUCATION', 'LIMIT_BAL','default.payment.next.month']])
# y=1.05 places the title slightly above the top of the grid.
pair_plot.fig.suptitle('Pairplot', y=1.05)

In [None]:
#pip install pandas-profiling

In [None]:
#import pandas_profiling
#df.profile_report()

In [None]:
# One-hot encode every categorical column in a single pass.
# pd.get_dummies drops each encoded source column and appends its indicator columns
# (in the order of `categorical_cols`) after the untouched columns, so no merging or
# manual dropping is needed. Merging separately encoded frames on all shared columns
# would multiply rows whenever two records have identical values in those columns.
categorical_cols = ['EDUCATION', 'SEX', 'MARRIAGE',
                    'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
dummy_prefixes = ['Edu', 'SEX', 'MARRIAGE',
                  'p1', 'p2', 'p3', 'p4', 'p5', 'p6']

# dtype=int keeps the indicators numeric (0/1) regardless of the pandas default.
df_dum9 = pd.get_dummies(df, columns=categorical_cols, prefix=dummy_prefixes, dtype=int)
print(df_dum9)

In [None]:
from sklearn import preprocessing as prep

In [None]:
# Rescale every column of the one-hot encoded frame with a min-max scaler.
# NOTE(review): the scaler is fitted on all rows (target column included) before the
# train/test split performed later, so test-set ranges influence the scaling —
# consider fitting it on the training split only.
minmax_scale = prep.MinMaxScaler()
credit_minmax = pd.DataFrame(
    minmax_scale.fit_transform(df_dum9),
    columns=df_dum9.columns,
)
credit_minmax

In [None]:
# Separate the target column from the feature matrix.
y = credit_minmax["default.payment.next.month"]
x = credit_minmax.drop(columns=["default.payment.next.month"])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set; stratify on the target so both splits keep
# the same class ratio, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=21)

In [None]:
# Class proportions in each split — they should match because the split was stratified.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

In [None]:
import missingno

In [None]:
# Column dtypes and non-null counts of the feature matrix.
x.info()

In [None]:
# Visual check for missing values in the feature matrix.
missingno.matrix(x)

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import scikitplot as skplt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import plot_confusion_matrix
from sklearn import datasets
from matplotlib import pyplot as plt

In [None]:
# Baseline: shallow Gini decision tree trained on the original (imbalanced) training split.
clf_gini = DecisionTreeClassifier(
    criterion='gini',
    random_state=100,
    max_depth=3,
    min_samples_leaf=5,
)
clf_gini.fit(X_train, y_train)

# Score on the held-out test split.
prediction = clf_gini.predict(X_test)
print("Decision Tree Model Report")
report = classification_report(y_test, prediction)
print(report)

# Raw confusion matrix as numbers ...
confusion_matrix = cm(y_test, prediction)
print(confusion_matrix)

# ... and as plots: first with counts, then normalized.
for normalize in (False, True):
    skplt.metrics.plot_confusion_matrix(y_test, prediction, normalize=normalize)
    plt.show()

In [None]:
# Predicted probability of the second class (column 1 of predict_proba) for each test row.
y_pred_prob = clf_gini.predict_proba(X_test)[:, 1]
# Precision/recall at every probability threshold, then the area under that curve.
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_prob)
print(metrics.auc(recall, precision))

In [None]:
# Plot the precision-recall curve computed in the previous cell; the legend shows its AUC.
ax = plt.subplot()
ax.plot(recall, precision,
label=f'PR-AUC = {metrics.auc(recall, precision):.2f}')
ax.set(title='Precision-Recall Curve',
xlabel='Recall',
ylabel='Precision')
ax.legend()

In [None]:
# Re-attach the target to the training features (aligned on the shared index) so rows
# can be resampled together with their labels.
df_train = pd.concat([X_train, y_train],axis=1)
df_train

In [None]:
# Same for the test split: features and target side by side in one frame.
df_test = pd.concat([X_test, y_test],axis=1)
df_test

In [None]:
# Random oversampling of the training split only: class-1 rows are drawn with
# replacement until there are as many of them as class-0 rows.
df_majority = df_train[df_train['default.payment.next.month']==0]
df_minority = df_train[df_train['default.payment.next.month']==1]

# Take the class sizes from the filtered frames themselves rather than relying on the
# frequency ordering of value_counts().
count_class_0 = len(df_majority)
count_class_1 = len(df_minority)

# A fixed seed makes the resampled training set (and every model trained on it) reproducible.
df_minority_upsampled = df_minority.sample(count_class_0, replace=True, random_state=21)
df_upsampled = pd.concat([df_majority,df_minority_upsampled],axis=0)

print('Random Oversampling:')
print(df_upsampled['default.payment.next.month'].value_counts())

# Bar chart of the class counts after oversampling (trailing ';' hides the Axes repr).
df_upsampled['default.payment.next.month'].value_counts().plot(kind='bar', title='Count (default.payment.next.month)');

In [None]:
# Split the oversampled training frame back into target and features.
y_train_upsampled = df_upsampled["default.payment.next.month"]
X_train_upsampled = df_upsampled.drop(columns=["default.payment.next.month"])

In [None]:
# Same shallow Gini decision tree as before, now trained on the oversampled training set.
clf_gini = DecisionTreeClassifier(
    criterion='gini',
    random_state=100,
    max_depth=3,
    min_samples_leaf=5,
)
clf_gini.fit(X_train_upsampled, y_train_upsampled)

# Evaluation still uses the untouched test split.
prediction = clf_gini.predict(X_test)
print("Decision Tree Model Report")
report = classification_report(y_test, prediction)
print(report)

# Raw confusion matrix as numbers ...
confusion_matrix = cm(y_test, prediction)
print(confusion_matrix)

# ... and as plots: first with counts, then normalized.
for normalize in (False, True):
    skplt.metrics.plot_confusion_matrix(y_test, prediction, normalize=normalize)
    plt.show()

In [None]:
# Precision-recall analysis for the tree trained on the oversampled data.
# Column 1 of predict_proba is the predicted probability of the second class.
y_pred_prob = clf_gini.predict_proba(X_test)[:, 1]
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_prob)
print(metrics.auc(recall, precision))

# Plot the curve; the legend shows the area under it.
ax = plt.subplot()
ax.plot(recall, precision,
label=f'PR-AUC = {metrics.auc(recall, precision):.2f}')
ax.set(title='Precision-Recall Curve',
xlabel='Recall',
ylabel='Precision')
ax.legend()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest (11 trees) trained on the oversampled training set and scored on the
# untouched test split. n_jobs=-1 lets joblib use every available core; the fixed
# random_state keeps the fitted forest reproducible.
clf = RandomForestClassifier(n_jobs=-1,
                            random_state=9,
                            n_estimators=11,
                            verbose=False)

clf.fit(X_train_upsampled, y_train_upsampled)

prediction = clf.predict(X_test)
print("Random Forest Model Report")
report = classification_report(y_test, prediction)
print(report)

# Confusion matrix: raw numbers, then count and normalized plots.
confusion_matrix = cm(y_test, prediction)
print(confusion_matrix)

skplt.metrics.plot_confusion_matrix(y_test, prediction)
plt.show()
skplt.metrics.plot_confusion_matrix(y_test,prediction,normalize=True)
plt.show()

# Precision-recall curve from the predicted probability of the second class.
y_pred_prob = clf.predict_proba(X_test)[:, 1]
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_prob)
print(metrics.auc(recall, precision))

ax = plt.subplot()
ax.plot(recall, precision,
label=f'PR-AUC = {metrics.auc(recall, precision):.2f}')
ax.set(title='Precision-Recall Curve',
xlabel='Recall',
ylabel='Precision')
ax.legend()

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

In [None]:
# ROC curve of the random forest (`clf`) against a no-skill baseline that scores every row 0.
# The lr_* variable names are kept for compatibility with the rest of the notebook;
# here they hold the random-forest results.
ns_probs = [0 for _ in range(len(y_test))]
lr_probs = clf.predict_proba(X_test)

# Keep only the predicted probability of the second class.
lr_probs = lr_probs[:, 1]

ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)

print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Random Forest: ROC AUC=%.3f' % (lr_auc))

ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Random Forest')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours (k=5) trained on the oversampled training set, scored on the test split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_upsampled,y_train_upsampled)
pred = knn.predict(X_test)
print("Accuracy:")
response = accuracy_score(y_test,pred)
print(response)

# Confusion matrix: raw numbers, then count and normalized plots.
confusion_matrix = cm(y_test, pred)
print(confusion_matrix)

skplt.metrics.plot_confusion_matrix(y_test, pred)
plt.show()
skplt.metrics.plot_confusion_matrix(y_test,pred,normalize=True)
plt.show()

print(classification_report(y_test, pred))

# Precision-recall curve from the predicted probability of the second class.
y_pred_prob = knn.predict_proba(X_test)[:, 1]
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_prob)
print(metrics.auc(recall, precision))

ax = plt.subplot()
ax.plot(recall, precision,
label=f'PR-AUC = {metrics.auc(recall, precision):.2f}')
ax.set(title='Precision-Recall Curve',
xlabel='Recall',
ylabel='Precision')
ax.legend()

In [None]:
# ROC curve of the KNN model (`knn`) against a no-skill baseline that scores every row 0.
# The lr_* variable names are kept for compatibility with the rest of the notebook;
# here they hold the KNN results.
ns_probs = [0 for _ in range(len(y_test))]
lr_probs = knn.predict_proba(X_test)

# Keep only the predicted probability of the second class.
lr_probs = lr_probs[:, 1]

ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)

print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('KNN: ROC AUC=%.3f' % (lr_auc))

ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='KNN')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression (default settings) trained on the oversampled training set.
logreg = LogisticRegression()
logreg.fit(X_train_upsampled, y_train_upsampled)

# Predict once and reuse the result for every metric below. `prediction` is a dict
# keyed by model name from the start, so the name never changes type within the cell.
prediction = {'Logistic': logreg.predict(X_test)}
print("Accuracy:")
response = accuracy_score(y_test, prediction['Logistic'])
print(response)

print('f1 Score:' ,metrics.f1_score(y_test, prediction['Logistic']))

# Confusion matrix: raw numbers, then count and normalized plots.
confusion_matrix = cm(y_test, prediction['Logistic'])
print(confusion_matrix)


skplt.metrics.plot_confusion_matrix(y_test, prediction['Logistic'])
plt.show()
skplt.metrics.plot_confusion_matrix(y_test,prediction['Logistic'],normalize=True)
plt.show()

print(classification_report(y_test, prediction['Logistic']))

# Precision-recall curve from the predicted probability of the second class.
y_pred_prob = logreg.predict_proba(X_test)[:, 1]
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_prob)
print(metrics.auc(recall, precision))

ax = plt.subplot()
ax.plot(recall, precision,
label=f'PR-AUC = {metrics.auc(recall, precision):.2f}')
ax.set(title='Precision-Recall Curve',
xlabel='Recall',
ylabel='Precision')
ax.legend()

In [None]:
# ROC curve of the logistic-regression model against a no-skill baseline that scores every row 0.
ns_probs = [0 for _ in range(len(y_test))]
lr_probs = logreg.predict_proba(X_test)

# Keep only the predicted probability of the second class.
lr_probs = lr_probs[:, 1]

ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)

print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))

# False/true positive rates at every threshold for both score vectors.
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

In [None]:
from sklearn import model_selection

# Accumulators filled by the cross-validation loop in a later cell:
# per-model score arrays and the matching model names.
outcome = []
model_names = []
# (name, estimator) pairs to compare; every estimator uses its default hyper-parameters.
models = [  
          ('DecTree', DecisionTreeClassifier()),
          ('RandomForest', RandomForestClassifier()),
          ('KNN', KNeighborsClassifier()),
          ('LogReg', LogisticRegression()),]


In [None]:
from sklearn.model_selection import StratifiedKFold
# One seeded, shuffled 5-fold stratified splitter shared by all models, so every model
# is scored on exactly the same folds (no need to rebuild it inside the loop).
# NOTE(review): the folds are cut from the oversampled training set, which contains
# rows sampled with replacement; copies of the same row can land in both the training
# and validation part of a fold, so these accuracies are likely optimistic.
k_fold_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for model_name, model in models:
    results = model_selection.cross_val_score(model, X_train_upsampled, y_train_upsampled, cv=k_fold_validation, scoring='accuracy')
    outcome.append(results)
    model_names.append(model_name)
    output_message = "%s| Mean=%f STD=%f" % (model_name, results.mean(), results.std())
    print(output_message)

In [None]:
from pprint import pprint
# Show the hyper-parameters each candidate model is currently configured with.
for model_name, model in models:
    print('\n',model,'Parameters currently in use:\n')
    pprint(model.get_params())

In [None]:
# Hyper-parameter search space for the random forest.
# Each key appears exactly once (a repeated dict key silently overrides the earlier one),
# and 'auto' is not offered for max_features because recent scikit-learn releases
# no longer accept it; for classifiers it meant the same as 'sqrt'.
random_grid = {
    'n_estimators': [5, 10, 50, 100, 250],
    'max_depth': [2, 4, 8, 16, 32, None],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
}
pprint(random_grid)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized hyper-parameter search for the random forest: 150 sampled combinations from
# random_grid, each scored with 9-fold cross-validation, using all cores (n_jobs=-1).
# That is 150 * 9 model fits, so this cell can take a long time.
rfc = RandomForestClassifier()
rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 150, cv = 9, verbose=2, random_state=42, n_jobs = -1)
rfc_random.fit(X_train_upsampled, y_train_upsampled)

In [None]:
# Best parameter combination found by the randomized search.
rfc_random.best_params_

In [None]:
import numpy as np

In [None]:
# Reference random forest (11 trees) on the oversampled training set, scored on the
# untouched test split. n_jobs=-1 lets joblib use every available core; the fixed
# random_state keeps the fitted forest reproducible.
clf = RandomForestClassifier(n_jobs=-1,
                            random_state=9,
                            n_estimators=11,
                            verbose=False)

clf.fit(X_train_upsampled, y_train_upsampled)

prediction = clf.predict(X_test)
print("Random Forest Model Report")
report = classification_report(y_test, prediction)
print(report)

# Confusion matrix: raw numbers, then count and normalized plots.
confusion_matrix = cm(y_test, prediction)
print(confusion_matrix)

skplt.metrics.plot_confusion_matrix(y_test, prediction)
plt.show()
skplt.metrics.plot_confusion_matrix(y_test,prediction,normalize=True)
plt.show()

# Precision-recall curve from the predicted probability of the second class.
y_pred_prob = clf.predict_proba(X_test)[:, 1]
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_prob)
print(metrics.auc(recall, precision))

ax = plt.subplot()
ax.plot(recall, precision,
label=f'PR-AUC = {metrics.auc(recall, precision):.2f}')
ax.set(title='Precision-Recall Curve',
xlabel='Recall',
ylabel='Precision')
ax.legend()

In [None]:
# Random forest with explicitly chosen hyper-parameters, trained on the oversampled
# training set and scored on the untouched test split.
# random_state is fixed so the forest (and every metric, importance and SHAP value
# derived from it) is reproducible between runs.
clft = RandomForestClassifier(n_estimators = 100,
                             min_samples_split = 2,
                             min_samples_leaf = 1,
                             max_features = 'sqrt',
                             max_depth = None,
                             random_state = 9)

clft.fit(X_train_upsampled, y_train_upsampled)

prediction = clft.predict(X_test)
print("Random Forest Model Report")
report = classification_report(y_test, prediction)
print(report)

# Confusion matrix: raw numbers, then count and normalized plots.
confusion_matrix = cm(y_test, prediction)
print(confusion_matrix)

skplt.metrics.plot_confusion_matrix(y_test, prediction)
plt.show()
skplt.metrics.plot_confusion_matrix(y_test,prediction,normalize=True)
plt.show()

# Precision-recall curve from the predicted probability of the second class.
y_pred_prob = clft.predict_proba(X_test)[:, 1]
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_prob)
print(metrics.auc(recall, precision))

ax = plt.subplot()
ax.plot(recall, precision,
label=f'PR-AUC = {metrics.auc(recall, precision):.2f}')
ax.set(title='Precision-Recall Curve',
xlabel='Recall',
ylabel='Precision')
ax.legend()

In [None]:
# ROC curve of the tuned random forest (`clft`) against a no-skill baseline that scores
# every row 0. The lr_* variable names are kept for compatibility with the rest of the
# notebook; here they hold the tuned random-forest results.
ns_probs = [0 for _ in range(len(y_test))]
lr_probs = clft.predict_proba(X_test)

# Keep only the predicted probability of the second class.
lr_probs = lr_probs[:, 1]

ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)

print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Random Forest: ROC AUC=%.3f' % (lr_auc))

ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Random Forest')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

In [None]:
from sklearn.inspection import permutation_importance
import shap

In [None]:
# Column labels of the training features, in the same order as the model's feature importances.
feature_names = X_train_upsampled.columns

In [None]:
# Impurity-based feature importances of the tuned forest ('mdi' column), one row per
# feature, sorted from most to least important, plus their running total.
# The index uses `feature_names`, the variable defined from X_train_upsampled.columns.
rf_feat_imp = pd.DataFrame(clft.feature_importances_,
                            index=feature_names,
                            columns=['mdi'])
rf_feat_imp = rf_feat_imp.sort_values('mdi', ascending=False)
rf_feat_imp['cumul_importance_mdi'] = np.cumsum(rf_feat_imp.mdi)

In [None]:
# Raw importance array of the tuned forest, in training-column order.
clft.feature_importances_

In [None]:
from matplotlib import style
# Horizontal bar chart of the tuned forest's feature importances.
plt.style.use('ggplot')
# argsort is ascending, so the most important feature ends up as the top bar.
sorted_idx = clft.feature_importances_.argsort()
plt.figure(figsize=(10,15))
plt.barh(feature_names[sorted_idx], clft.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")

plt.show()

In [None]:
# SHAP explanation of the tuned forest on the test features.
explainer = shap.TreeExplainer(clft)
# NOTE(review): computing SHAP values for every test row of a 100-tree forest may be slow;
# the exact return shape (per-class list vs. array) depends on the shap version — confirm.
shap_values = explainer.shap_values(X_test)
# Bar-type summary plot of the SHAP values per feature.
shap.summary_plot(shap_values, X_test, plot_type="bar")