## Lending Club Loans: Machine Learning 

#### Importing the packages and data



In [1]:
import pandas as pd
import pandas_profiling
import numpy as np

In [4]:
lc_loans = pd.read_csv("python_ml_data/ml_data.csv")
lc_loans.head()

Unnamed: 0,loan_status,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,pymnt_plan,desc,purpose,debt_to_income_ratio,derog_pub_rec,mean_fico_scores,grade
0,Fully Paid,5000,36 months,10.65,162.87,10 +,RENT,24000.0,Verified,0,1,credit_card,27.65,0,737,B
1,Charged Off,2500,60 months,15.27,59.83,More than 1,RENT,30000.0,Source Verified,0,1,car,1.0,0,742,C
2,Fully Paid,2400,36 months,15.96,84.33,10 +,RENT,12252.0,Not Verified,0,0,small_business,8.72,0,737,C
3,Fully Paid,10000,36 months,13.49,339.31,10 +,RENT,49200.0,Source Verified,0,1,other,20.0,0,692,C
4,Fully Paid,5000,36 months,7.9,156.46,3,RENT,36000.0,Source Verified,0,0,wedding,11.2,0,732,A


In [14]:
pandas_profiling.ProfileReport(lc_loans)

TypeError: concat() got an unexpected keyword argument 'join_axes'

### Data Cleaning

In [7]:
# Data has been cleaning in R
lc_loans.head()

Unnamed: 0,loan_status,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,pymnt_plan,desc,purpose,debt_to_income_ratio,derog_pub_rec,mean_fico_scores,grade
0,Fully Paid,5000,36 months,10.65,162.87,10 +,RENT,24000.0,Verified,0,1,credit_card,27.65,0,737,B
1,Charged Off,2500,60 months,15.27,59.83,More than 1,RENT,30000.0,Source Verified,0,1,car,1.0,0,742,C
2,Fully Paid,2400,36 months,15.96,84.33,10 +,RENT,12252.0,Not Verified,0,0,small_business,8.72,0,737,C
3,Fully Paid,10000,36 months,13.49,339.31,10 +,RENT,49200.0,Source Verified,0,1,other,20.0,0,692,C
4,Fully Paid,5000,36 months,7.9,156.46,3,RENT,36000.0,Source Verified,0,0,wedding,11.2,0,732,A


In [8]:
# No NA values remain in the data set
lc_loans.isna().sum()

loan_status             0
loan_amnt               0
term                    0
int_rate                0
installment             0
emp_length              0
home_ownership          0
annual_inc              0
verification_status     0
pymnt_plan              0
desc                    0
purpose                 0
debt_to_income_ratio    0
derog_pub_rec           0
mean_fico_scores        0
grade                   0
dtype: int64

In [10]:
# SciKit needs a numeric value, so convert response variable to numeric
lc_loans["paid"] = (
    np.where(lc_loans["loan_status"] == "Fully Paid", 1, 0)
)
lc_loans.head()

Unnamed: 0,loan_status,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,pymnt_plan,desc,purpose,debt_to_income_ratio,derog_pub_rec,mean_fico_scores,grade,paid
0,Fully Paid,5000,36 months,10.65,162.87,10 +,RENT,24000.0,Verified,0,1,credit_card,27.65,0,737,B,1
1,Charged Off,2500,60 months,15.27,59.83,More than 1,RENT,30000.0,Source Verified,0,1,car,1.0,0,742,C,0
2,Fully Paid,2400,36 months,15.96,84.33,10 +,RENT,12252.0,Not Verified,0,0,small_business,8.72,0,737,C,1
3,Fully Paid,10000,36 months,13.49,339.31,10 +,RENT,49200.0,Source Verified,0,1,other,20.0,0,692,C,1
4,Fully Paid,5000,36 months,7.9,156.46,3,RENT,36000.0,Source Verified,0,0,wedding,11.2,0,732,A,1


In [11]:
# Drop the loan_status column
lc_loans.drop(columns = "loan_status", inplace = True)

lc_loans.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,pymnt_plan,desc,purpose,debt_to_income_ratio,derog_pub_rec,mean_fico_scores,grade,paid
0,5000,36 months,10.65,162.87,10 +,RENT,24000.0,Verified,0,1,credit_card,27.65,0,737,B,1
1,2500,60 months,15.27,59.83,More than 1,RENT,30000.0,Source Verified,0,1,car,1.0,0,742,C,0
2,2400,36 months,15.96,84.33,10 +,RENT,12252.0,Not Verified,0,0,small_business,8.72,0,737,C,1
3,10000,36 months,13.49,339.31,10 +,RENT,49200.0,Source Verified,0,1,other,20.0,0,692,C,1
4,5000,36 months,7.9,156.46,3,RENT,36000.0,Source Verified,0,0,wedding,11.2,0,732,A,1


In [12]:
# Get Dummy Variables 
lc_loans = pd.get_dummies(lc_loans, drop_first = True)

lc_loans.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,pymnt_plan,desc,debt_to_income_ratio,derog_pub_rec,mean_fico_scores,paid,...,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G
0,5000,10.65,162.87,24000.0,0,1,27.65,0,737,1,...,0,0,0,0,1,0,0,0,0,0
1,2500,15.27,59.83,30000.0,0,1,1.0,0,742,0,...,0,0,0,0,0,1,0,0,0,0
2,2400,15.96,84.33,12252.0,0,0,8.72,0,737,1,...,0,1,0,0,0,1,0,0,0,0
3,10000,13.49,339.31,49200.0,0,1,20.0,0,692,1,...,0,0,0,0,0,1,0,0,0,0
4,5000,7.9,156.46,36000.0,0,0,11.2,0,732,1,...,0,0,0,1,0,0,0,0,0,0


In [13]:
# Split the data into predictors and target variables
loans_predictors = lc_loans.drop(columns = "paid")
loans_response = lc_loans["paid"]