# Ensemble Learning

## Initial Imports

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [4]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
import matplotlib.pyplot as plt
import pandas as pd

## Read the CSV and Perform Basic Data Cleaning

In [5]:
# Load the data
file_path = Path('Resources/LoanStats_2019Q1.csv')
df_LoanStats = pd.read_csv(file_path)

# Preview the data
df_LoanStats.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


## Split the Data into Training and Testing

In [6]:

# Create our features
#dropping loan status from our feature
X = df_LoanStats.drop(columns="loan_status")




In [7]:
# Create our target
#adding loan status to our target

y = df_LoanStats["loan_status"]

In [8]:
#The pandas function pd.get_dummies() allows you to transform your categorical into dummy indicator columns. 
#otherewise we will get an error when training our data because the category is an object
X = pd.get_dummies(X)

In [9]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,issue_d_Mar-2019,pymnt_plan_n,initial_list_status_f,initial_list_status_w,next_pymnt_d_Apr-2019,next_pymnt_d_May-2019,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.177238,1.0,0.123879,0.876121,0.383161,0.616839,0.86034,0.13966,1.0,1.0
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.381873,0.0,0.329446,0.329446,0.486161,0.486161,0.346637,0.346637,0.0,0.0
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Check the balance of our target values
y.value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
# Split the X and y into X_train, X_test, y_train, y_test
#We split this into two different datasets, one for the independent features — x, and one for the dependent 
# variable — y (which is the last column). 

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train, X_test, y_train, y_test)


       loan_amnt  int_rate  installment  annual_inc    dti  delinq_2yrs  \
3903      1600.0    0.0881        50.74     35964.0  24.12          0.0   
28390     9000.0    0.0756       280.21     41000.0  16.89          1.0   
15470    10000.0    0.1033       214.10    112000.0  17.75          0.0   
4279     36000.0    0.1033      1167.21    120000.0  19.95          0.0   
57514    18000.0    0.0881       570.81     51000.0  19.11          1.0   
...          ...       ...          ...         ...    ...          ...   
49100     8400.0    0.0646       257.30     37992.0  33.16          0.0   
20609    30000.0    0.0646       918.93    217000.0  19.56          0.0   
21440    20000.0    0.1180       442.88     35000.0  16.19          0.0   
50057    11625.0    0.1447       273.34     30000.0  18.96          0.0   
5192     20000.0    0.1033       428.20     60000.0  59.56          0.0   

       inq_last_6mths  open_acc  pub_rec  revol_bal  ...  issue_d_Mar-2019  \
3903              0.0

## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_testing`).

In [12]:
# Create the StandardScaler instance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [13]:
# Fit the Standard Scaler with the training data

# When fitting scaling functions, only train on the training dataset

#Data contains features of various dimensions different scales of the data 
#features affect the modeling of a dataset adversely.It leads to a biased outcome of predictions in terms of misclassification
#error and accuracy rates. Thus, it is necessary to Scale the data prior to modeling.

X_scaler = scaler.fit(X_train)

In [14]:
# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)

X_test_scaled = X_scaler.transform(X_test)

## Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Display the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [15]:
from imblearn.ensemble import BalancedRandomForestClassifier
#  Resample the training data with the BalancedRandomForestClassifier
balanced_rf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
balanced_rf.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(random_state=1)

In [16]:
# Calculated the balanced accuracy score
y_pred_balanced_rf = balanced_rf.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred_balanced_rf)

0.7887512850910909

In [17]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred_balanced_rf)

array([[   71,    30],
       [ 2146, 14958]], dtype=int64)

In [18]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_balanced_rf, digits=3))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk      0.032     0.703     0.875     0.061     0.784     0.604       101
   low_risk      0.998     0.875     0.703     0.932     0.784     0.625     17104

avg / total      0.992     0.874     0.704     0.927     0.784     0.625     17205



In [19]:
# List the features sorted in descending order by feature importance
importance = balanced_rf.feature_importances_ 

importance_df = pd.DataFrame(importance, index = X_train.columns, columns = ['importance'])
importance_df = importance_df.sort_values(by = 'importance', ascending=False)

importance_df.head(20)

Unnamed: 0,importance
total_rec_prncp,0.078768
total_pymnt,0.058838
total_pymnt_inv,0.056256
total_rec_int,0.053555
last_pymnt_amnt,0.050033
int_rate,0.02967
issue_d_Jan-2019,0.021129
installment,0.019802
dti,0.017471
out_prncp_inv,0.016858


### Easy Ensemble Classifier

In [20]:
from imblearn.ensemble import EasyEnsembleClassifier

In [21]:
# Train the Classifier
ez = EasyEnsembleClassifier(n_estimators=100,random_state=1)
ez.fit(X_train_scaled, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [22]:
# Calculated the balanced accuracy score
y_pred_ez = ez.predict(X_test_scaled) 
balanced_accuracy_score(y_test, y_pred_ez)

0.931601605553446

In [23]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_ez)

array([[   93,     8],
       [  985, 16119]], dtype=int64)

In [24]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_ez, digits=3))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk      0.086     0.921     0.942     0.158     0.932     0.866       101
   low_risk      1.000     0.942     0.921     0.970     0.932     0.870     17104

avg / total      0.994     0.942     0.921     0.965     0.932     0.870     17205



: 

### Final Questions

1. Which model had the best balanced accuracy score?

    The Easy Ensemble Classifier model was the most accurate with a score of .93

2. Which model had the best recall score?

   The best recall was the EasyEnsembleClassifier as well with a .94

3. Which model had the best geometric mean score?

 The best geometric mean was the EasyEnsembleClassifier as well with a .93

4. What are the top three features?

total_rec_prncp	0.078768
total_pymnt	0.058838
total_pymnt_inv	0.056256