# Lending Club - Loan Approval Process Optimization

# 4. Modeling

## 4.1 Imports

In [1]:
# Import the libraries necessary for the current task
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle
from sklearn import __version__ as sklearn_version
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# pandas Configuration
pd.set_option("max_rows", 120)
pd.set_option("max_columns", 120)
pd.set_option("display.max_colwidth", None)

## 4.2 Data

In [2]:
# Load the CSV data
LC_data = pd.read_csv(r"C:\Users\lastr\Desktop\GitHub\Lending_Club_Capstone\data\lending_club_loans_preprocessed.csv")

In [3]:
# Convert earliest_cr_line, last_credit_pull_d to datetime again
LC_data['earliest_cr_line'] = pd.to_datetime(LC_data['earliest_cr_line'])
LC_data['last_credit_pull_d'] = pd.to_datetime(LC_data['last_credit_pull_d'])

In [4]:
# Convert earliest_cr_line, last_credit_pull_d to ordinal numeric
LC_data['earliest_cr_line'] = LC_data['earliest_cr_line'].apply(lambda x: x.toordinal())
LC_data['last_credit_pull_d'] = LC_data['last_credit_pull_d'].apply(lambda x: x.toordinal())

In [5]:
# Summary of the data
LC_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39239 entries, 0 to 39238
Data columns (total 92 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   loan_amnt                            39239 non-null  float64
 1   installment                          39239 non-null  float64
 2   grade                                39239 non-null  int64  
 3   emp_length                           39239 non-null  int64  
 4   annual_inc                           39239 non-null  float64
 5   loan_type                            39239 non-null  int64  
 6   dti                                  39239 non-null  float64
 7   delinq_2yrs                          39239 non-null  float64
 8   earliest_cr_line                     39239 non-null  int64  
 9   inq_last_6mths                       39239 non-null  float64
 10  open_acc                             39239 non-null  float64
 11  pub_rec                     

In [6]:
# First 5 entries of the data
LC_data.head()

Unnamed: 0,loan_amnt,installment,grade,emp_length,annual_inc,loan_type,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,last_credit_pull_d,pub_rec_bankruptcies,fico_range_avg,term_ 36 months,term_ 60 months,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,addr_state_AK,addr_state_AL,addr_state_AR,addr_state_AZ,addr_state_CA,addr_state_CO,addr_state_CT,addr_state_DC,addr_state_DE,addr_state_FL,addr_state_GA,addr_state_HI,addr_state_IA,addr_state_ID,addr_state_IL,addr_state_IN,addr_state_KS,addr_state_KY,addr_state_LA,addr_state_MA,addr_state_MD,addr_state_ME,addr_state_MI,addr_state_MN,addr_state_MO,addr_state_MS,addr_state_MT,addr_state_NC,addr_state_NE,addr_state_NH,addr_state_NJ,addr_state_NM,addr_state_NV,addr_state_NY,addr_state_OH,addr_state_OK,addr_state_OR,addr_state_PA,addr_state_RI,addr_state_SC,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY
0,5000.0,162.87,1,10,24000.0,1,27.65,0.0,724642,1.0,3.0,0.0,13648.0,83.7,9.0,736208,0.0,737.0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2500.0,59.83,2,0,30000.0,0,1.0,0.0,729845,5.0,3.0,0.0,1687.0,9.4,4.0,736208,0.0,742.0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2400.0,84.33,2,10,12252.0,1,8.72,0.0,730790,2.0,2.0,0.0,2956.0,98.5,10.0,736208,0.0,737.0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,10000.0,339.31,2,10,49200.0,1,20.0,0.0,728690,1.0,10.0,0.0,5598.0,21.0,37.0,736055,0.0,692.0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5000.0,156.46,0,3,36000.0,1,11.2,0.0,731886,3.0,9.0,0.0,7963.0,28.3,12.0,735964,0.0,732.0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
LC_data["loan_type"].value_counts()

1    33586
0     5653
Name: loan_type, dtype: int64

## 4.3 Models

In [8]:
# Training, Test Split
X_train, X_test, y_train, y_test = train_test_split(
    LC_data.drop("loan_type", axis=1), LC_data["loan_type"], test_size=0.2, random_state=100)

In [9]:
# Standardize features
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [10]:
# Oversampling with SMOTE
smote = SMOTE(random_state=100)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [11]:
# A separate function for reporting results
def result(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        print("Training Results:\n")
        print(f"Accuracy Score: {accuracy_score(y_train, clf.predict(X_train))}\n")
        print(f"Classification Report:\n{classfication_report(y_train, clf.predict(X_train))}\n")
        print(f"Confusion Matrix:\n{confusion_matrix(y_train, clf.predict(X_train))}\n")
        
        cv_score = cross_val_score(clf, X_train, y_train, cv=10, scoring="accuracy")
        
        print(f"Average Accuracy: {np.mean(cv_score)}")
        print(f"Accuracy Standard Deviation: {np.std(cv_score)}")
    
    elif train == False:
        print("Test Results:\n")
        print(f"Accuracy Score: {accuracy_score(y_test, clf.predict(X_test))}\n")
        print(f"Classification Report:\n{classification_report(y_test, clf.predict(X_test))}\n")
        print(f"Confusion Matrix:\n{confusion_matrix(y_test, clf.predict(X_test))}\n")

### 4.3.1 Logistic Regression

#### 4.3.1.1 Hyperparameter Tuning Using Grid Search

In [12]:
# Parameter Grid for Grid Search
lr_param_grid = {
    "C": [0.0001, 0.001, 0.01, 1, 10, 100, 1000],
    "penalty": ["l1", "l2"],
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
}

# Perform Grid Search
lr = LogisticRegression()
lr_gs = GridSearchCV(estimator=lr, param_grid=lr_param_grid, n_jobs=-1, refit=True, verbose=3, cv=3)
lr_gs_result = lr_gs.fit(X_train, y_train)

print(f"Best score: {lr_gs_result.best_score_} using {lr_gs_result.best_params_}")

Fitting 3 folds for each of 70 candidates, totalling 210 fits


 0.75070721 0.74806485 0.7506886  0.75070721        nan        nan
 0.7565873         nan 0.77432079 0.78397849 0.78397849 0.77807972
 0.78397849 0.78397849        nan        nan 0.79564577        nan
 0.7970972  0.79331975 0.79333835 0.79197997 0.79333835 0.79331975
        nan        nan 0.79564575        nan 0.79560853 0.79558992
 0.79558992 0.79558992 0.79557131 0.79558992        nan        nan
 0.79557131        nan 0.79560853 0.79558992 0.79560853 0.79558992
 0.7955527  0.79558992        nan        nan 0.79557131        nan
 0.79558992 0.79558992 0.79557131 0.79560853 0.7955527  0.79560853
        nan        nan 0.79558992        nan 0.79558992 0.79560853
 0.79558992 0.79560853 0.7955527  0.79558992]


Best score: 0.7970972020761291 using {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}


#### 4.3.1.2 Performance on Test Set

In [13]:
# Perform Logistic Regression 
lr = LogisticRegression(penalty="l1", C=0.01, random_state=100, solver="saga")
lr.fit(X_train, y_train)
print(f"Logistic regression test results with the tuned hyperparameters:\n")
result(lr, X_train, y_train, X_test, y_test, train=False)

Logistic regression test results with the tuned hyperparameters:

Test Results:

Accuracy Score: 0.7097349643221202

Classification Report:
              precision    recall  f1-score   support

           0       0.32      0.88      0.47      1132
           1       0.97      0.68      0.80      6716

    accuracy                           0.71      7848
   macro avg       0.64      0.78      0.63      7848
weighted avg       0.88      0.71      0.75      7848


Confusion Matrix:
[[ 991  141]
 [2137 4579]]



### 4.3.2 Random Forest

#### 4.3.2.1 Hyperparameter Tuning Using Random Search

In [14]:
# Parameter Grid for Random Search
rf_param_grid = {
    "n_estimators": [20, 40, 60, 80, 100],
    "max_depth": [5, 10, 15, 25, 30],
    "min_samples_split": [2, 5, 10, 15, 25],
    "min_samples_leaf": [1, 2, 5, 10]
}

# Perform Random Search
rf = RandomForestClassifier()
rf_rs = RandomizedSearchCV(estimator=rf, param_distributions=rf_param_grid, n_jobs=-1, refit=True, verbose=3, cv=3)
rf_rs_result = rf_rs.fit(X_train, y_train)

print(f"Best score: {rf_rs_result.best_score_} using {rf_rs_result.best_params_}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best score: 0.8944186480339084 using {'n_estimators': 40, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 30}


#### 4.3.2.2 Performance on Test Set

In [15]:
# Perform Random Forest
rf = RandomForestClassifier(n_estimators=40, min_samples_split=5, min_samples_leaf=2, max_depth=30,  random_state=100)
rf.fit(X_train, y_train)
print(f"Random forest test results with the tuned hyperparameters:\n")
result(rf, X_train, y_train, X_test, y_test, train=False)

Random forest test results with the tuned hyperparameters:

Test Results:

Accuracy Score: 0.8527013251783894

Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.52      0.51      1132
           1       0.92      0.91      0.91      6716

    accuracy                           0.85      7848
   macro avg       0.70      0.72      0.71      7848
weighted avg       0.86      0.85      0.85      7848


Confusion Matrix:
[[ 592  540]
 [ 616 6100]]



### 4.3.3 Support Vector Machine

In [16]:
# Parameter Grid for Random Search
svm_param_grid = {
    "C": [0.0001, 0.001, 0.01, 1, 10],
    "gamma": [0.0001, 0.001, 0.01, 1]
}

# Perform Random Search
svm = SVC()
svm_rs = RandomizedSearchCV(estimator=svm, param_distributions=svm_param_grid, n_jobs=-1, refit=True, verbose=3, cv=3)
svm_rs_result = svm_rs.fit(X_train, y_train)

print(f"Best score: {svm_rs_result.best_score_} using {svm_rs_result.best_params_}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best score: 0.8800333210465978 using {'gamma': 1, 'C': 10}


In [17]:
# Perform SVM
svm = SVC(C=10, gamma=1, random_state=100)
svm.fit(X_train, y_train)
print(f"SVM test results with the tuned hyperparameters:\n")
result(svm, X_train, y_train, X_test, y_test, train=False)

SVM test results with the tuned hyperparameters:

Test Results:

Accuracy Score: 0.8541029561671764

Classification Report:
              precision    recall  f1-score   support

           0       0.31      0.01      0.02      1132
           1       0.86      1.00      0.92      6716

    accuracy                           0.85      7848
   macro avg       0.59      0.50      0.47      7848
weighted avg       0.78      0.85      0.79      7848


Confusion Matrix:
[[  11 1121]
 [  24 6692]]



### 4.3.4 XGBoost

In [18]:
import xgboost as xgb

D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=y_test)

xgb_param_grid = {
     "eta"    : [0.05, 0.10, 0.15, 0.20] ,
     "max_depth"        : [2, 4, 6, 8, 10],
     "min_child_weight" : [1, 3, 5, 7],
     "gamma"            : [0.0, 0.1, 0.2 , 0.3, 0.4],
     "colsample_bytree" : [0.1, 0.3, 0.5, 0.7]
     }

xgb = XGBClassifier(objective="binary:logistic")
xgb_rs = RandomizedSearchCV(estimator=xgb, param_distributions=xgb_param_grid, n_jobs=-1, refit=True, verbose=3, cv=3)
xgb_rs_result = xgb_rs.fit(X_train, y_train)

print(f"Best score: {xgb_rs_result.best_score_} using {xgb_rs_result.best_params_}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits




Best score: 0.8880562429124067 using {'min_child_weight': 1, 'max_depth': 8, 'gamma': 0.0, 'eta': 0.05, 'colsample_bytree': 0.3}


In [19]:
# Perform XGBoost
xgb = XGBClassifier(objective="binary:logistic", min_child_weight=1, max_depth=8, gamma=0.0,
                   eta=0.05, colsample_bytree=0.3)
xgb.fit(X_train, y_train)
print(f"XGBoost test results with the tuned hyperparameters:\n")
result(xgb, X_train, y_train, X_test, y_test, train=False)

XGBoost test results with the tuned hyperparameters:

Test Results:

Accuracy Score: 0.8682466870540265

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.32      0.41      1132
           1       0.89      0.96      0.93      6716

    accuracy                           0.87      7848
   macro avg       0.74      0.64      0.67      7848
weighted avg       0.85      0.87      0.85      7848


Confusion Matrix:
[[ 366  766]
 [ 268 6448]]

