## Lending Club Loans: Machine Learning 

#### Importing the packages and data



In [1]:
import pandas as pd
import pandas_profiling
import numpy as np

In [2]:
lc_loans = pd.read_csv("python_ml_data/ml_data.csv")
lc_loans.head()

Unnamed: 0,loan_status,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,pymnt_plan,desc,purpose,debt_to_income_ratio,derog_pub_rec,mean_fico_scores,grade
0,Fully Paid,5000,36 months,10.65,162.87,10 +,RENT,24000.0,Verified,0,1,credit_card,27.65,0,737,B
1,Charged Off,2500,60 months,15.27,59.83,More than 1,RENT,30000.0,Source Verified,0,1,car,1.0,0,742,C
2,Fully Paid,2400,36 months,15.96,84.33,10 +,RENT,12252.0,Not Verified,0,0,small_business,8.72,0,737,C
3,Fully Paid,10000,36 months,13.49,339.31,10 +,RENT,49200.0,Source Verified,0,1,other,20.0,0,692,C
4,Fully Paid,5000,36 months,7.9,156.46,3,RENT,36000.0,Source Verified,0,0,wedding,11.2,0,732,A


In [3]:
pandas_profiling.ProfileReport(lc_loans)

TypeError: concat() got an unexpected keyword argument 'join_axes'

### Data Cleaning

In [4]:
# Data has been cleaning in R
lc_loans.head()

Unnamed: 0,loan_status,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,pymnt_plan,desc,purpose,debt_to_income_ratio,derog_pub_rec,mean_fico_scores,grade
0,Fully Paid,5000,36 months,10.65,162.87,10 +,RENT,24000.0,Verified,0,1,credit_card,27.65,0,737,B
1,Charged Off,2500,60 months,15.27,59.83,More than 1,RENT,30000.0,Source Verified,0,1,car,1.0,0,742,C
2,Fully Paid,2400,36 months,15.96,84.33,10 +,RENT,12252.0,Not Verified,0,0,small_business,8.72,0,737,C
3,Fully Paid,10000,36 months,13.49,339.31,10 +,RENT,49200.0,Source Verified,0,1,other,20.0,0,692,C
4,Fully Paid,5000,36 months,7.9,156.46,3,RENT,36000.0,Source Verified,0,0,wedding,11.2,0,732,A


In [5]:
# No NA values remain in the data set
lc_loans.isna().sum()

loan_status             0
loan_amnt               0
term                    0
int_rate                0
installment             0
emp_length              0
home_ownership          0
annual_inc              0
verification_status     0
pymnt_plan              0
desc                    0
purpose                 0
debt_to_income_ratio    0
derog_pub_rec           0
mean_fico_scores        0
grade                   0
dtype: int64

In [6]:
# SciKit needs a numeric value, so convert response variable to numeric
lc_loans["paid"] = (
    np.where(lc_loans["loan_status"] == "Fully Paid", 1, 0)
)
lc_loans.head()

Unnamed: 0,loan_status,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,pymnt_plan,desc,purpose,debt_to_income_ratio,derog_pub_rec,mean_fico_scores,grade,paid
0,Fully Paid,5000,36 months,10.65,162.87,10 +,RENT,24000.0,Verified,0,1,credit_card,27.65,0,737,B,1
1,Charged Off,2500,60 months,15.27,59.83,More than 1,RENT,30000.0,Source Verified,0,1,car,1.0,0,742,C,0
2,Fully Paid,2400,36 months,15.96,84.33,10 +,RENT,12252.0,Not Verified,0,0,small_business,8.72,0,737,C,1
3,Fully Paid,10000,36 months,13.49,339.31,10 +,RENT,49200.0,Source Verified,0,1,other,20.0,0,692,C,1
4,Fully Paid,5000,36 months,7.9,156.46,3,RENT,36000.0,Source Verified,0,0,wedding,11.2,0,732,A,1


In [7]:
# Drop the loan_status column
lc_loans.drop(columns = "loan_status", inplace = True)

lc_loans.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,pymnt_plan,desc,purpose,debt_to_income_ratio,derog_pub_rec,mean_fico_scores,grade,paid
0,5000,36 months,10.65,162.87,10 +,RENT,24000.0,Verified,0,1,credit_card,27.65,0,737,B,1
1,2500,60 months,15.27,59.83,More than 1,RENT,30000.0,Source Verified,0,1,car,1.0,0,742,C,0
2,2400,36 months,15.96,84.33,10 +,RENT,12252.0,Not Verified,0,0,small_business,8.72,0,737,C,1
3,10000,36 months,13.49,339.31,10 +,RENT,49200.0,Source Verified,0,1,other,20.0,0,692,C,1
4,5000,36 months,7.9,156.46,3,RENT,36000.0,Source Verified,0,0,wedding,11.2,0,732,A,1


In [8]:
# Get Dummy Variables 
lc_loans = pd.get_dummies(lc_loans, drop_first = True)

lc_loans.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,pymnt_plan,desc,debt_to_income_ratio,derog_pub_rec,mean_fico_scores,paid,...,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G
0,5000,10.65,162.87,24000.0,0,1,27.65,0,737,1,...,0,0,0,0,1,0,0,0,0,0
1,2500,15.27,59.83,30000.0,0,1,1.0,0,742,0,...,0,0,0,0,0,1,0,0,0,0
2,2400,15.96,84.33,12252.0,0,0,8.72,0,737,1,...,0,1,0,0,0,1,0,0,0,0
3,10000,13.49,339.31,49200.0,0,1,20.0,0,692,1,...,0,0,0,0,0,1,0,0,0,0
4,5000,7.9,156.46,36000.0,0,0,11.2,0,732,1,...,0,0,0,1,0,0,0,0,0,0


In [9]:
# Split the data into predictors and target variables
loans_predictors = lc_loans.drop(columns = "paid")
loans_response = lc_loans["paid"]

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Syntactic Sugar from Python
loans_pred_train, loans_pred_test, loans_resp_train, loans_resp_test = (
    train_test_split(
        loans_predictors,
        loans_response,
        test_size = 0.1,
        random_state = 7
    )
)

# X_train = loans_pred_train,
# X_test = loans_pred_test,
# y_train = loans_resp_train
# y_test = loans_resp_train

In [None]:
loans_response

In [12]:
# K-fold cross validated logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

model = LogisticRegression()
model.fit(loans_pred_train, loans_resp_train)
scores = cross_val_score(
    model, loans_pred_train, loans_resp_train, scoring='accuracy', cv=10
)
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.85645527, 0.85532276, 0.85503964, 0.85702152, 0.85447339,
       0.85754744, 0.85726423, 0.85698103, 0.85669782, 0.85669782])

In [13]:
print(model.score(loans_pred_train, loans_resp_train))
print(np.mean(scores))

0.8564066260795696
0.8563500902698731


In [14]:
# Getting the Sensitivity and Specificity

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

predictions = model.predict(loans_pred_test)

cm = metrics.confusion_matrix(loans_resp_train, predictions)
print(cm)

# X_train = loans_pred_train,
# X_test = loans_pred_test,
# y_train = loans_resp_train
# y_test = loans_resp_train

ValueError: Found input variables with inconsistent numbers of samples: [35315, 3924]

In [15]:
model.score(loans_pred_test, loans_resp_test)

0.8478593272171254

In [16]:
pred_test = model.predict_proba(loans_pred_test)
pred_test

array([[0.18295406, 0.81704594],
       [0.11582387, 0.88417613],
       [0.14709556, 0.85290444],
       ...,
       [0.07410765, 0.92589235],
       [0.07173944, 0.92826056],
       [0.10860033, 0.89139967]])

In [17]:
pred_test = pred_test[:, 1]
pred_test

array([0.81704594, 0.88417613, 0.85290444, ..., 0.92589235, 0.92826056,
       0.89139967])

In [18]:
# AUC Score

from sklearn.metrics import roc_auc_score

roc_auc_score(y_true = loans_resp_test, y_score = pred_test)

0.6806767028207866

* AUC score is not all that great for this model, has a reasonably low sensitivity 
* Accuracy of 85% isn't that good when the data is already 85% in one response



# Random Forests

In [19]:
from sklearn.ensemble import RandomForestClassifier

# run the model with 100 trees
model = RandomForestClassifier(n_estimators = 100)

# train the model on the training data
model.fit(loans_pred_train, loans_resp_train)

RandomForestClassifier()

In [21]:
scores = cross_val_score(
    model, loans_pred_train, loans_resp_train, scoring='accuracy', cv=10
)
np.mean(scores)

0.8552456674790795

Great score, but still 85% of our results are in one category

In [22]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# create a list with names of predictors, and an array with target categories for plotting
loans_pred_names = list(loans_pred_test)
loans_resp_names = ['0', '1']

# set the figure size
fig = plt.figure(figsize=(15, 10))

# plot the decision tree
plot_tree(model.estimators_[0], # Pull out one tree from the forest
          max_depth = 2, # will freeze if we try plot all of it!
          feature_names = loans_pred_names,
          class_names = loans_resp_names, 
          filled=True);

In [23]:
# Get numerical feature importances
importances = list(model.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(loans_pred_names, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:40} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: annual_inc                               Importance: 0.13
Variable: debt_to_income_ratio                     Importance: 0.13
Variable: int_rate                                 Importance: 0.12
Variable: installment                              Importance: 0.12
Variable: loan_amnt                                Importance: 0.1
Variable: mean_fico_scores                         Importance: 0.09
Variable: desc                                     Importance: 0.02
Variable: term_60 months                           Importance: 0.02
Variable: home_ownership_RENT                      Importance: 0.02
Variable: verification_status_Source Verified      Importance: 0.02
Variable: verification_status_Verified             Importance: 0.02
Variable: purpose_debt_consolidation               Importance: 0.02
Variable: derog_pub_rec                            Importance: 0.01
Variable: emp_length_2                             Importance: 0.01
Variable: emp_length_3                           