In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [15]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [16]:
# https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-

columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

target = ["loan_status"]

In [17]:
# Load the data
file_path = Path('Resources/LoanStats_2019Q1.csv.zip')
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


# Split the Data into Training and Testing

In [18]:
# Extract the column names in need of binary coding
objects = df.select_dtypes(include='object')
columns_for_binary = objects.columns.tolist()
columns_for_binary

['home_ownership',
 'verification_status',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'initial_list_status',
 'next_pymnt_d',
 'application_type',
 'hardship_flag',
 'debt_settlement_flag']

In [19]:
# Create our features
X = df.copy()
X.drop('loan_status',axis=1,inplace=True)
# Drop loan_status from the above list
columns_for_binary.remove('loan_status')
# Use update list to perform get_dummies properly
X = pd.get_dummies(X,columns=columns_for_binary)

In [20]:
# Create our target
y = df['loan_status']

In [21]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,issue_d_Mar-2019,pymnt_plan_n,initial_list_status_f,initial_list_status_w,next_pymnt_d_Apr-2019,next_pymnt_d_May-2019,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.177238,1.0,0.123879,0.876121,0.383161,0.616839,0.86034,0.13966,1.0,1.0
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.381873,0.0,0.329446,0.329446,0.486161,0.486161,0.346637,0.346637,0.0,0.0
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Check the balance of our target values
y.value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [23]:
# Split the X and y into X_train, X_test, y_train, y_test
# YOUR CODE HERE
from sklearn.model_selection import train_test_split

# Set random_state for consistence throughout the notebook
random_state = 15

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_testing`).

In [24]:
# Create the StandardScaler instance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [25]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
# YOUR CODE HERE
X_scaler = scaler.fit(X_train)

In [27]:
# Scale the training and testing data
# YOUR CODE HERE
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [28]:
# Resample the training data with the BalancedRandomForestClassifier
# YOUR CODE HERE
# Import the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Create an instance of the BalancedRandomForestClassifier model
brf = BalancedRandomForestClassifier(n_estimators=1000, random_state=random_state)

# Fit the model using the X_train (inclusive of the scaled process) and y_train
brf.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(n_estimators=1000, random_state=15)

In [29]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
# First, find the y_predictions using the model and X_test inclusive of the scaled process
y_predictions = brf.predict(X_test_scaled)

# Then, using the predictions and the y_test numbers, find the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_predictions)

0.7879414708632884

In [30]:
# Display the confusion matrix
# YOUR CODE HERE
from sklearn.metrics import confusion_matrix

# Create the confusion matrix
cm = confusion_matrix(y_test,y_predictions)

# Visualize the Confusion Matrix using a DataFrame
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
print("Confusion Matrix")
display(cm_df)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,63,33
Actual 1,1375,15734


In [31]:
# Print the imbalanced classification report
# YOUR CODE HERE
# Import the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Display the imbalanced classification report
print('IMBALANCED CLASSIFICATION REPORT')
print(classification_report_imbalanced(y_test, y_predictions))

IMBALANCED CLASSIFICATION REPORT
                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.04      0.66      0.92      0.08      0.78      0.59        96
   low_risk       1.00      0.92      0.66      0.96      0.78      0.62     17109

avg / total       0.99      0.92      0.66      0.95      0.78      0.62     17205



In [32]:
# List the features sorted in descending order by feature importance
# YOUR CODE HERE
# Get the feature importance array
#importances = brf.feature_importances_

#importances_sorted = sorted(zip(X.columns,importances),reverse=True)
#importances_sorted

importances = brf.feature_importances_
std = np.std([tree.feature_importances_ for tree in brf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature #%d %f" % (f + 1, indices[f], importances[indices[f]]))


Feature ranking:
1. feature #15 0.066222
2. feature #13 0.064078
3. feature #14 0.062188
4. feature #20 0.061398
5. feature #16 0.058884
6. feature #84 0.028612
7. feature #1 0.026453
8. feature #4 0.017999
9. feature #2 0.016680
10. feature #35 0.016210
11. feature #11 0.016051
12. feature #48 0.015941
13. feature #12 0.015467
14. feature #3 0.015140
15. feature #85 0.015105
16. feature #47 0.015104
17. feature #42 0.014615
18. feature #72 0.014324
19. feature #53 0.014207
20. feature #9 0.014010
21. feature #74 0.013522
22. feature #44 0.013448
23. feature #32 0.013323
24. feature #25 0.013089
25. feature #43 0.012543
26. feature #36 0.012491
27. feature #31 0.012480
28. feature #75 0.012451
29. feature #0 0.012106
30. feature #73 0.012054
31. feature #37 0.011917
32. feature #52 0.011657
33. feature #30 0.011623
34. feature #10 0.010893
35. feature #49 0.010679
36. feature #61 0.010384
37. feature #7 0.010269
38. feature #59 0.009793
39. feature #50 0.009307
40. feature #63 0.009139

In [41]:
importances_sorted = sorted(zip(importances,X.columns),reverse=True)
importances_sorted

[(0.0662224543205517, 'total_rec_prncp'),
 (0.06407791460970243, 'total_pymnt'),
 (0.062187671916102555, 'total_pymnt_inv'),
 (0.0613980310304278, 'last_pymnt_amnt'),
 (0.058884227548521656, 'total_rec_int'),
 (0.028611757058435973, 'issue_d_Jan-2019'),
 (0.026452683194868324, 'int_rate'),
 (0.017999172076260735, 'dti'),
 (0.016679717829860247, 'installment'),
 (0.016209609836032937, 'max_bal_bc'),
 (0.016050817869101962, 'out_prncp'),
 (0.01594101382909823, 'mo_sin_old_rev_tl_op'),
 (0.015466718134062602, 'out_prncp_inv'),
 (0.015140007457977455, 'annual_inc'),
 (0.01510492673440736, 'issue_d_Mar-2019'),
 (0.015103716334832746, 'mo_sin_old_il_acct'),
 (0.014614744900767637, 'avg_cur_bal'),
 (0.014324227180754321, 'tot_hi_cred_lim'),
 (0.014206815835784061, 'mths_since_recent_inq'),
 (0.014010143689134124, 'revol_bal'),
 (0.013521599217843393, 'total_bc_limit'),
 (0.01344799997542837, 'bc_util'),
 (0.013322905502074808, 'il_util'),
 (0.013089324999954986, 'tot_cur_bal'),
 (0.0125434182

### Easy Ensemble Classifier

In [33]:
# Train the EasyEnsembleClassifier
# YOUR CODE HERE
# Import the EasyEnsembleClassifier model
from imblearn.ensemble import EasyEnsembleClassifier

# Create an instance of the EasyEnsembleClassifier model
eec = EasyEnsembleClassifier(random_state=random_state)

# Train the EasyEnsembleClassifier model with training data
eec.fit(X_train_scaled, y_train)

EasyEnsembleClassifier(random_state=15)

In [34]:
# Calculated the balanced accuracy score
# YOUR CODE HERE

# Find the y_predictions using the eec model above
y_predictions_1 = eec.predict(X_test_scaled)

# Return the balanced accuracy score
print('BALANCED ACCURACY SCORE')
balanced_accuracy_score(y_test, y_predictions_1)

BALANCED ACCURACY SCORE


0.9173415673037582

In [35]:
# Display the confusion matrix
# YOUR CODE HERE
# Create the confusion matrix
cm_1 = confusion_matrix(y_test,y_predictions_1)

# Visualize the Confusion Matrix using a DataFrame
cm_df_1 = pd.DataFrame(
    cm_1, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
print("Confusion Matrix")
display(cm_df_1)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,85,11
Actual 1,868,16241


In [36]:
# Print the imbalanced classification report
# YOUR CODE HERE
# Display the imbalanced classification report
print('IMBALANCED CLASSIFICATION REPORT')
print(classification_report_imbalanced(y_test, y_predictions))

IMBALANCED CLASSIFICATION REPORT
                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.04      0.66      0.92      0.08      0.78      0.59        96
   low_risk       1.00      0.92      0.66      0.96      0.78      0.62     17109

avg / total       0.99      0.92      0.66      0.95      0.78      0.62     17205



# Conclusions

1) The model with the best balanced accuracy score is the Easy Ensemble Classifier. This model uses boosting to train weak learners sequentially by learning from the mistakes of previous learners. 

2) Both models, the Easy Ensemble Classifier and the Balanced Random Forest Classifier, had identical recall scores between them. This takes into account both high and low risk loan applicants. 

3) Both models have identical geometric mean scores.

4) 
total_rec_prncp,
total_pymnt, and 
total_pymnt_inv are the three most import features.