In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn import tree
import numpy as np
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Load Dataframe
data = Path('Loan_Predictions/Loans_Data_ML.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,Loan_Amt,Loan_Term,Monthly_Payment,Credit_Grade,Employement_Length,Annual_Income,DTI_Ratio,Application_Type,Joint_Annual_Income,Joint_DTI_Ratio,...,Delinquent_Amount,#_Mortgage_Accounts,#_Bankruptcies,#_Tax_Liens,Balance_Exc_Mortgage,Joint_Revolving_Balance,Open_Acc_IL24m,Open_RevAcc_24m,Acc_Curr_PD30days,Loan_Status
0,3600,36,111.97,A4,10+ years,120000.0,18.9,INDIVIDUAL,0.0,0.0,...,0,0,1,0,36506,0.0,14,11,0.0,Issued
1,15000,60,356.78,C4,10+ years,125000.0,17.25,INDIVIDUAL,0.0,0.0,...,0,2,0,0,69364,0.0,3,3,0.0,Issued
2,8400,36,276.56,B3,8 years,50000.0,15.63,INDIVIDUAL,0.0,0.0,...,0,4,0,0,51591,0.0,5,12,0.0,Issued
3,4000,36,130.0,B2,2 years,50000.0,33.61,INDIVIDUAL,0.0,0.0,...,0,6,0,0,136208,0.0,7,4,0.0,Issued
4,6000,36,185.93,A3,3 years,125000.0,9.25,INDIVIDUAL,0.0,0.0,...,0,0,0,0,60622,0.0,15,3,0.0,Current


In [4]:
# Remove the `Issued` loan status
issued_mask = df['Loan_Status'] != 'Issued'
df = df.loc[issued_mask]
issued_mask = df['Loan_Status'] != 'Fully Paid'
df = df.loc[issued_mask]

# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,Loan_Amt,Loan_Term,Monthly_Payment,Credit_Grade,Employement_Length,Annual_Income,DTI_Ratio,Application_Type,Joint_Annual_Income,Joint_DTI_Ratio,...,Delinquent_Amount,#_Mortgage_Accounts,#_Bankruptcies,#_Tax_Liens,Balance_Exc_Mortgage,Joint_Revolving_Balance,Open_Acc_IL24m,Open_RevAcc_24m,Acc_Curr_PD30days,Loan_Status
0,6000,36,185.93,A3,3 years,125000.0,9.25,INDIVIDUAL,0.0,0.0,...,0,0,0,0,60622,0.0,15,3,0.0,low_risk
1,12000,36,370.48,A2,10+ years,56000.0,17.9,INDIVIDUAL,0.0,0.0,...,0,5,0,0,21540,0.0,5,0,0.0,low_risk
2,10500,60,345.15,G2,< 1 year,45000.0,17.25,INDIVIDUAL,0.0,0.0,...,0,0,1,0,22148,0.0,7,13,0.0,low_risk
3,18000,36,542.07,A1,< 1 year,155000.0,9.08,INDIVIDUAL,0.0,0.0,...,0,0,0,0,61257,0.0,4,1,0.0,low_risk
4,1600,36,53.72,C1,1 year,60000.0,12.62,INDIVIDUAL,0.0,0.0,...,0,0,0,0,27231,0.0,6,10,0.0,low_risk


In [5]:
# Create our features

X = df.drop(columns='Loan_Status', axis=1)
X = pd.get_dummies(X)

# Create our target
y = df["Loan_Status"]

In [6]:
# Define the target set.
y = df["Loan_Status"].values
y[:5]


array(['low_risk', 'low_risk', 'low_risk', 'low_risk', 'low_risk'],
      dtype=object)

In [7]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
print(X_train)

       Loan_Amt  Loan_Term  Monthly_Payment  Annual_Income  DTI_Ratio  \
26619     10000         60           219.38        60000.0      21.28   
19327     15000         36           494.22        60000.0       6.40   
63470      2000         36            62.90       100000.0      12.08   
67923      4000         36           130.00        43000.0      16.80   
8976      40000         36          1244.07       145000.0      22.53   
...         ...        ...              ...            ...        ...   
69240     21650         60           561.50        49218.0      13.29   
52748     15000         36           557.38        70000.0      17.01   
3476      17000         60           476.71        72000.0      29.10   
37405     34000         60           826.64       140000.0      16.83   
16854     20000         60           575.25       122000.0       8.14   

       Joint_Annual_Income  Joint_DTI_Ratio  #_Open_Accounts  \
26619                  0.0              0.0               1

In [8]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(59592, 72)
(19865, 72)
(59592,)
(19865,)


In [9]:
# Splitting into Train and Test sets into an 80/20 split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)


In [10]:
# Determine the shape of our training and testing sets.
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(63565, 72)
(15892, 72)
(63565,)
(15892,)


In [11]:
# Creating a StandardScaler instance.
scaler = StandardScaler()

# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)


In [13]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)


In [14]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm)

cm_df

Unnamed: 0,0,1
0,1,141
1,173,19550


In [15]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)


In [16]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,0,1
0,1,141
1,173,19550


Accuracy Score : 0.9841933048074503
Classification Report
              precision    recall  f1-score   support

   high_risk       0.01      0.01      0.01       142
    low_risk       0.99      0.99      0.99     19723

    accuracy                           0.98     19865
   macro avg       0.50      0.50      0.50     19865
weighted avg       0.99      0.98      0.98     19865



In [17]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators =100, random_state=1)
brfc.fit(X_train, y_train)

In [18]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5659014677223204

In [19]:
# List the features sorted in descending order by feature importance
feature_importances = brfc.feature_importances_
features = sorted(zip(feature_importances, X.columns), reverse=True)
for feature in features:
    print(f'{feature[1]}:  {feature[0]*100:.1f}%')


DTI_Ratio:  7.8%
Monthly_Payment:  7.4%
Balance_Exc_Mortgage:  7.4%
Total_Current_Balance_All:  7.3%
Annual_Income:  7.0%
Average_Current_Balance:  6.8%
Loan_Amt:  6.1%
#_Open_Accounts:  5.6%
Open_Acc_IL24m:  5.4%
Open_RevAcc_24m:  5.3%
#_Mortgage_Accounts:  3.3%
Total_Amount_in_Collection:  2.2%
Employement_Length_10+ years:  1.2%
Loan_Term:  1.1%
#_Bankruptcies:  1.1%
Employement_Length_2 years:  1.0%
Credit_Grade_D3:  1.0%
Acc_Curr_PD30days:  0.9%
Employement_Length_3 years:  0.9%
Credit_Grade_C1:  0.8%
#_Tax_Liens:  0.8%
Employement_Length_< 1 year:  0.8%
Credit_Grade_C3:  0.8%
Credit_Grade_C2:  0.8%
Credit_Grade_C5:  0.8%
Credit_Grade_B5:  0.7%
Credit_Grade_D2:  0.7%
Credit_Grade_C4:  0.7%
Employement_Length_5 years:  0.7%
Credit_Grade_B1:  0.6%
Credit_Grade_B4:  0.6%
Employement_Length_1 year:  0.6%
Credit_Grade_A1:  0.6%
Employement_Length_4 years:  0.6%
Employement_Length_None:  0.5%
Credit_Grade_E4:  0.5%
Joint_Annual_Income:  0.5%
Credit_Grade_D1:  0.5%
Credit_Grade_F1:  0.5%