# Importing Data

In [1]:
import numpy as np
import os
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, RandomForestClassifier

# from xgboost import XGBClassifier
from sklearn.utils import resample

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from yellowbrick.datasets import load_concrete
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Python Scripts
from functions import *



## Encoding
Categorical variables are one-hot encoded so that each category remains an interpretable feature.

In [2]:
# Instantiating the encoder. drop="first" omits the first category of each
# feature, so a column with k categories yields k-1 indicator columns.
ohe = OneHotEncoder(drop="first")

In [3]:
# Separate the object-dtype (categorical) columns from all other columns and
# give each part a fresh 0..n-1 index in the same expression.
cat_var = bank_df.select_dtypes(include="object").reset_index(drop=True)
num_var = bank_df.select_dtypes(exclude="object").reset_index(drop=True)

In [4]:
# One-hot encode the categorical columns.
# `cat_var` (the re-indexed categorical slice) is encoded instead of re-selecting
# the columns from `bank_df`, so this cell can be re-run after `bank_df` has been
# overwritten at the bottom of the cell.
# NOTE(review): the recorded `bank_df.columns` output reports 59020 columns, which
# suggests a near-unique object column (an identifier?) is being encoded - confirm.
array_to_df = ohe.fit_transform(cat_var).toarray()  # Dense array of the indicator columns

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when it
# exists and fall back to the old name only on older releases.
feature_namer = getattr(ohe, "get_feature_names_out", None) or ohe.get_feature_names
encoded = pd.DataFrame(array_to_df, columns=feature_namer(cat_var.columns))  # Creating a pandas dataframe

# `num_var` was re-indexed to 0..n-1 and `encoded` has a default index, so the join
# lines the two frames up row by row.
bank_df = num_var.join(encoded, how="left")  # Combining the categorical variables and the numeric variables

In [5]:
# Preview the first rows of the combined numeric + one-hot encoded frame
bank_df.head()

Unnamed: 0,Current_Loan_Amount,Credit_Score,Annual_Income,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,...,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation,Purpose_wedding
0,99999999.0,741.0,2231892.0,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,217646.0,730.0,1184194.0,10855.08,19.6,10.0,13.0,1.0,122170.0,272052.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,548746.0,678.0,2559110.0,18660.28,22.6,33.0,4.0,0.0,437171.0,555038.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,99999999.0,728.0,714628.0,11851.06,16.0,76.0,16.0,0.0,203965.0,289784.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,99999999.0,740.0,776188.0,11578.22,8.5,25.0,6.0,0.0,134083.0,220220.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# List the column labels of the combined frame
bank_df.columns

Index(['Current_Loan_Amount', 'Credit_Score', 'Annual_Income', 'Monthly_Debt',
       'Years_of_Credit_History', 'Months_since_last_delinquent',
       'Number_of_Open_Accounts', 'Number_of_Credit_Problems',
       'Current_Credit_Balance', 'Maximum_Open_Credit',
       ...
       'Purpose_Medical Bills', 'Purpose_Other', 'Purpose_Take a Trip',
       'Purpose_major_purchase', 'Purpose_moving', 'Purpose_other',
       'Purpose_renewable_energy', 'Purpose_small_business',
       'Purpose_vacation', 'Purpose_wedding'],
      dtype='object', length=59020)

## Balancing Minority Data

In [None]:
# Fixing class imbalance
majority_df = bank_df[bank_df['Loan_Status_Fully Paid'] == 1]

minority_df = bank_df[bank_df['Loan_Status_Fully Paid'] == 0]
minority_df = resample(minority_df, replace=True, n_samples=13200, random_state=123)

bank_df = pd.concat([majority_df, minority_df])

In [None]:
# Preparing the X, y train test data
X = bank_df.drop(columns=['Loan_Status_Fully Paid'], axis=1)  # Independent Varaibles
y = bank_df['Loan_Status_Fully Paid']  # Dependent Variables

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=123)

## Scaling Data

In [None]:
# Preview the frame.
# NOTE(review): this section only displays the data; no scaler is applied here -
# confirm whether a scaling step was intended.
bank_df.head()

# Modeling

### <span style="color:blue">------------------------------------------Baseline Model: Logistic Regression------------------------------------------ </span>

Model can only do slightly better than random guesses.

In [None]:
# Baseline classifier: logistic regression with the library defaults
# NOTE(review): the features are passed in unscaled, and the recorded preview shows
# columns of very different magnitude - confirm whether scaling was intended, since
# it affects both convergence and how comparable the coefficients below are.
logr_clf = LogisticRegression()
logr_clf.fit(X_train, y_train)
y_pred = logr_clf.predict(X_train)

# Report accuracy on both splits
for split_label, split_X, split_y in (('Train Accuracy', X_train, y_train),
                                      ('Test Accuracy', X_test, y_test)):
    print(split_label, logr_clf.score(split_X, split_y))

# Horizontal bar per feature; bar length is the fitted coefficient
fig, ax = plt.subplots(figsize=(6, 15))
ax.barh(y=X_train.columns, width=logr_clf.coef_[0]);

In [None]:
# Flatten the fitted coefficient array to one dimension for plotting
log_coef = np.ravel(logr_clf.coef_)

# Draw the seaborn bar plot on an explicitly created 15x10 axes
coef_fig, coef_ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=log_coef, y=X_train.columns, ax=coef_ax)
plt.show()

### <span style="color:blue">------------------------------------------ Random Forest------------------------------------------</span>

In [None]:
# Random Forest Model
# - max_features='sqrt' is what 'auto' meant for classifiers; the 'auto' alias was
#   removed in scikit-learn 1.3.
# - random_state pins the bootstrap / feature sampling so the scores are
#   reproducible, matching the seeded boosting models in this notebook.
rf_clf = RandomForestClassifier(n_estimators=300,
                                criterion='gini',
                                max_depth=10,
                                max_features='sqrt',
                                oob_score=True,
                                random_state=42)
# The target is passed as-is: it is already one-dimensional, and Series.ravel() is deprecated
rf_clf.fit(X_train, y_train)

# Finding the accuracy of train and test
accuracy_train = rf_clf.score(X_train, y_train)
accuracy_test = rf_clf.score(X_test, y_test)
oob = rf_clf.oob_score_  # accuracy estimated on the out-of-bag samples

print('Train Accuracy', accuracy_train)
print('Test Accuracy', accuracy_test)
print('OOB Score', oob)

# Graphing
fig, ax = plt.subplots(figsize=(6, 10))
ax.barh(width=rf_clf.feature_importances_, y=X_train.columns)
ax.set(title='Random Forest feature importances', xlabel='Importance', ylabel='Feature');

In [None]:
# Seaborn bar plot of the random-forest feature importances on a 15x10 axes
rf_fig, rf_ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=rf_clf.feature_importances_, y=X_train.columns, ax=rf_ax)
plt.show()

### <span style="color:blue">------------------------------------------ GradientBoostingClassifier------------------------------------------</span>

In [None]:
# Gradient Boost Model
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)

# Accuracy of train and test
# The targets are passed exactly as they were for fit(): they are already
# one-dimensional, and Series.ravel() is deprecated in pandas.
accuracy_train = gbc.score(X_train, y_train)
accuracy_test = gbc.score(X_test, y_test)

print('Train Accuracy', accuracy_train)
print('Test Accuracy', accuracy_test)

# Graphing
fig, ax = plt.subplots(figsize=(6, 10))
ax.barh(width=gbc.feature_importances_, y=X_train.columns)
ax.set(title='Gradient Boosting feature importances', xlabel='Importance', ylabel='Feature');

In [None]:
# Seaborn bar plot of the gradient-boosting feature importances on a 15x10 axes
gbc_fig, gbc_ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=gbc.feature_importances_, y=X_train.columns, ax=gbc_ax)
plt.show()

### <span style="color:blue">------------------------------------------AdaBoostClassifier------------------------------------------</span>

In [None]:
# Adaptive Boosting (AdaBoost) Model
abc = AdaBoostClassifier(random_state=42)
# The target is passed as-is: it is already one-dimensional, and Series.ravel() is deprecated
abc.fit(X_train, y_train)

# Accuracy of train and test
accuracy_train = abc.score(X_train, y_train)
accuracy_test = abc.score(X_test, y_test)

print('Train Accuracy', accuracy_train)
print('Test Accuracy', accuracy_test)

# Graphing
fig, ax = plt.subplots(figsize=(6, 10))
ax.barh(width=abc.feature_importances_, y=X_train.columns)
ax.set(title='AdaBoost feature importances', xlabel='Importance', ylabel='Feature');

In [None]:
# Seaborn bar plot of the AdaBoost feature importances on a 15x10 axes
abc_fig, abc_ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=abc.feature_importances_, y=X_train.columns, ax=abc_ax)
plt.show()

### <span style="color:blue">------------------------------------------XGBoostClassifier------------------------------------------</span>

In [None]:
# Extreme Gradient Boost (XGBoost) Model
# NOTE(review): the `from xgboost import XGBClassifier` line in the imports cell is
# commented out. Unless `functions` re-exports XGBClassifier, this cell raises
# NameError on a fresh kernel - confirm and restore the import.
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)

# Accuracy of train and test
# The targets are passed exactly as they were for fit(); Series.ravel() is deprecated.
accuracy_train = xgbc.score(X_train, y_train)
accuracy_test = xgbc.score(X_test, y_test)

print('Train Accuracy', accuracy_train)
print('Test Accuracy', accuracy_test)

# Graphing
# `coef_` only exists for the linear booster; the default tree booster exposes
# `feature_importances_`, as the other tree ensembles in this notebook do.
fig, ax = plt.subplots(figsize=(6, 10))
ax.barh(width=xgbc.feature_importances_, y=X_train.columns)
ax.set(title='XGBoost feature importances', xlabel='Importance', ylabel='Feature');