In [202]:
# library for mathematical functions and data preprocessing such as table, etc
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import skew

# Library for visualizing data
import matplotlib.pyplot as plt
import seaborn as sns

# Library for machine learning functions/algorithms
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score,  precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline


In [203]:
# data itself
url = 'lending_club_loan_two.csv'
# description of each column
url_info = 'lending_club_info.csv'
df_before = pd.read_csv(url)
df_info = pd.read_csv(url_info)
df_before.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address
0,10000.0,36 months,11.44,329.48,B,B4,Marketing,10+ years,RENT,117000.0,...,16.0,0.0,36369.0,41.8,25.0,w,INDIVIDUAL,0.0,0.0,"0174 Michelle Gateway\nMendozaberg, OK 22690"
1,8000.0,36 months,11.99,265.68,B,B5,Credit analyst,4 years,MORTGAGE,65000.0,...,17.0,0.0,20131.0,53.3,27.0,f,INDIVIDUAL,3.0,0.0,"1076 Carney Fort Apt. 347\nLoganmouth, SD 05113"
2,15600.0,36 months,10.49,506.97,B,B3,Statistician,< 1 year,RENT,43057.0,...,13.0,0.0,11987.0,92.2,26.0,f,INDIVIDUAL,0.0,0.0,"87025 Mark Dale Apt. 269\nNew Sabrina, WV 05113"
3,7200.0,36 months,6.49,220.65,A,A2,Client Advocate,6 years,RENT,54000.0,...,6.0,0.0,5472.0,21.5,13.0,f,INDIVIDUAL,0.0,0.0,"823 Reid Ford\nDelacruzside, MA 00813"
4,24375.0,60 months,17.27,609.33,C,C5,Destiny Management Inc.,9 years,MORTGAGE,55000.0,...,13.0,0.0,24584.0,69.8,43.0,f,INDIVIDUAL,1.0,0.0,"679 Luna Roads\nGreggshire, VA 11650"


In [204]:
df_before.isna().sum()

loan_amnt                   0
term                        0
int_rate                    0
installment                 0
grade                       0
sub_grade                   0
emp_title               22927
emp_length              18301
home_ownership              0
annual_inc                  0
verification_status         0
issue_d                     0
loan_status                 0
purpose                     0
title                    1756
dti                         0
earliest_cr_line            0
open_acc                    0
pub_rec                     0
revol_bal                   0
revol_util                276
total_acc                   0
initial_list_status         0
application_type            0
mort_acc                37795
pub_rec_bankruptcies      535
address                     0
dtype: int64

In [205]:
# label encoder and fit_transform is a function that changes/remap string into number based on alphabetical order
le = LabelEncoder() 
df_before['term'] = le.fit_transform(df_before['term'])
# 36 months = 0,  60 months = 1

df_before['emp_length'] = le.fit_transform(df_before['emp_length'])
# '1 year' = 0, '10+ years' = 1, '2 years' = 2, '3 years' = 3, '4 years' = 4, '5 years' = 5,   
# '6 years' = 6, '7 years' = 7, '8 years' = 8,  '9 years' = 9, '< 1 year' = 10

df_before['home_ownership'] = le.fit_transform(df_before['home_ownership'])
# 'ANY' = 0, 'MORTGAGE' = 1, 'NONE' = 2, 'OTHER' = 3, 'OWN' = 4, 'RENT' = 5

df_before['verification_status'] = le.fit_transform(df_before['verification_status'])
# 'Not Verified' = 0, 'Source Verified' = 1, 'Verified' = 2

df_before['application_type'] = le.fit_transform(df_before['application_type'])
# 'DIRECT_PAY' = 0, 'INDIVIDUAL' = 1, 'JOINT' = 2

df_before['grade'] = le.fit_transform(df_before['grade'])
df_before['sub_grade'] = le.fit_transform(df_before['sub_grade'])

# Assuming df is your DataFrame with 'issue_d' and 'earliest_cr_line' columns
# Replace df with your actual DataFrame name

# Create a new DataFrame for 'issue_d'
df_before_issue = df_before['issue_d'].str.split('-', expand=True)
df_before_issue.columns = ['issue_month', 'issue_year']

# Create a new DataFrame for 'earliest_cr_line'
df_before_cr_line = df_before['earliest_cr_line'].str.split('-', expand=True)
df_before_cr_line.columns = ['cr_line_month', 'cr_line_year']

# Concatenate the new DataFrames with the original DataFrame
df_before = pd.concat([df_before, df_before_issue, df_before_cr_line], axis=1)

# Drop the original 'issue_d' and 'earliest_cr_line' columns if needed
# df_before = df_before.drop(['issue_d', 'earliest_cr_line'], axis=1)

# Display the resulting DataFrame
df_before['cr_line_month'] = le.fit_transform(df_before['cr_line_month'])
df_before['issue_month'] = le.fit_transform(df_before['issue_month'])
df_before['initial_list_status'] = le.fit_transform(df_before['initial_list_status'])
df_before['purpose'] = le.fit_transform(df_before['purpose'])
# Convert interest rate and revolving utilization to decimals
df_before['int_rate'] = df_before['int_rate'] / 100.0
df_before['revol_util'] = df_before['revol_util'] / 100.0


In [206]:
df_before['emp_length'].fillna(df_before['emp_length'].median(), inplace=True)
df_before['pub_rec_bankruptcies'].fillna(df_before['pub_rec_bankruptcies'].median(), inplace=True)
df_before['mort_acc'].fillna(df_before['mort_acc'].median(), inplace=True)
df_before['revol_util'].fillna(df_before['revol_util'].mean(), inplace=True)

In [207]:
z_score_vars = ['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti', 'open_acc', 'revol_bal', 'total_acc']

no_outlier = df_before.copy()

# Store outlier counts and percentages for each column
outlier_info = {}

for col in z_score_vars:
    # Calculate lower and upper bounds using the 1st and 99th percentiles
    lower_limit = df_before[col].quantile(0.01)
    upper_limit = df_before[col].quantile(0.99)

    # Replace outliers with more reasonable values (clipping)
    no_outlier[col] = np.clip(df_before[col], lower_limit, upper_limit)

    # Count outliers before clipping
    outlier_count = ((df_before[col] < lower_limit) | (df_before[col] > upper_limit)).sum()
    total_count = len(df_before[col])
    outlier_percentage = outlier_count / total_count * 100

    # Store outlier information for the column
    outlier_info[col] = {'count': outlier_count, 'percentage': outlier_percentage}

# Print outlier information for each column
print("Outlier information:")
for col, info in outlier_info.items():
    print(f"- {col}: {info['count']} outliers ({info['percentage']:.2f}%)")


Outlier information:
- loan_amnt: 4089 outliers (1.03%)
- int_rate: 7823 outliers (1.98%)
- installment: 7916 outliers (2.00%)
- annual_inc: 7237 outliers (1.83%)
- dti: 7905 outliers (2.00%)
- open_acc: 5372 outliers (1.36%)
- revol_bal: 7918 outliers (2.00%)
- total_acc: 7553 outliers (1.91%)


In [208]:
# Reason:
# These features lack direct relevance to a person's ability to secure a loan. 
# grade and subgrade, summarizes the loan status, can be substituted with existing data like interest rates and annual income

columns_to_drop = ['emp_title', 'address', 'issue_d', 'title', 'earliest_cr_line','grade', 'sub_grade']
df_before = df_before.drop(columns=columns_to_drop)

In [209]:
df_before.isna().sum()

loan_amnt               0
term                    0
int_rate                0
installment             0
emp_length              0
home_ownership          0
annual_inc              0
verification_status     0
loan_status             0
purpose                 0
dti                     0
open_acc                0
pub_rec                 0
revol_bal               0
revol_util              0
total_acc               0
initial_list_status     0
application_type        0
mort_acc                0
pub_rec_bankruptcies    0
issue_month             0
issue_year              0
cr_line_month           0
cr_line_year            0
dtype: int64

In [210]:
# Check which column needs fixing
for column in df_before.columns:
    unique_values = df_before[column].unique()
    print(f"Unique values in '{column}':")
    print(unique_values)
    print()

Unique values in 'loan_amnt':
[10000.  8000. 15600. ... 36275. 36475.   725.]

Unique values in 'term':
[0 1]

Unique values in 'int_rate':
[0.1144 0.1199 0.1049 0.0649 0.1727 0.1333 0.0532 0.1114 0.1099 0.1629
 0.1311 0.1464 0.0917 0.1229 0.0662 0.0839 0.2198 0.079  0.0697 0.0699
 0.1561 0.1136 0.1335 0.1212 0.0999 0.0819 0.1875 0.0603 0.1499 0.1678
 0.1367 0.1398 0.1699 0.1991 0.1786 0.2149 0.1299 0.1854 0.0789 0.171
 0.1825 0.1167 0.0624 0.0818 0.1235 0.1416 0.1756 0.1855 0.2215 0.1039
 0.1599 0.1607 0.2499 0.0967 0.1919 0.21   0.1269 0.1074 0.0668 0.1922
 0.1149 0.1655 0.1997 0.247  0.1349 0.1824 0.1649 0.2578 0.2583 0.1864
 0.0751 0.1399 0.1522 0.1531 0.0769 0.1953 0.1016 0.0762 0.0975 0.1368
 0.1588 0.1465 0.0692 0.2383 0.1075 0.1849 0.2031 0.1757 0.2731 0.1999
 0.2299 0.1259 0.1037 0.1433 0.1353 0.2245 0.245  0.1799 0.0916 0.1249
 0.1155 0.1776 0.2899 0.231  0.2049 0.227  0.1015 0.0689 0.1952 0.089
 0.143  0.0949 0.2599 0.2408 0.1305 0.1498 0.1659 0.1126 0.2589 0.1448
 0.2199 0.

## Random Forest Classification (Model Generation)

In [211]:
#  SPLIT
target_column = 'loan_status'
feature_columns = df_before.columns[df_before.columns != target_column]

X = df_before[feature_columns]
y = df_before[target_column]

In [212]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [213]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
# Random Undersampling

undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
X_under, y_under = undersampler.fit_resample(X_train, y_train)

oversampler = RandomOverSampler(sampling_strategy=1, random_state=42)
X_over, y_over = oversampler.fit_resample(X_under, y_under)



In [214]:
data_counts = y_train.value_counts()
print(data_counts)

loan_status
Fully Paid     254546
Charged Off     62278
Name: count, dtype: int64


In [215]:
data = y_over.value_counts()
print(data)

loan_status
Charged Off    124556
Fully Paid     124556
Name: count, dtype: int64


In [223]:
# Create a cost-sensitive random forest classifier
clf = RandomForestClassifier(n_estimators=83, bootstrap=True, criterion='gini', max_depth=None,random_state=0)

# Train the classifier on the training set
clf.fit(X_over, y_over)


# Make predictions on the test set
y_pred_test = clf.predict(X_test)

# Evaluate the accuracy on the test set
accuracy_test = accuracy_score(y_test, y_pred_test)
print(f'Test Accuracy: {accuracy_test}')

Test Accuracy: 0.7536297755220565


In [226]:
cm = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix\n", cm)

Confusion Matrix
 [[ 6714  8681]
 [10833 52978]]


In [227]:
target_names = ['Charged Off', 'Fully Paid']
print(classification_report(y_test, y_pred_test, target_names=target_names))

              precision    recall  f1-score   support

 Charged Off       0.38      0.44      0.41     15395
  Fully Paid       0.86      0.83      0.84     63811

    accuracy                           0.75     79206
   macro avg       0.62      0.63      0.63     79206
weighted avg       0.77      0.75      0.76     79206



In [219]:
TN, FP, FN, TP = cm.ravel()

accuracy = accuracy_score(y_test, y_pred_test)
print("Accuracy:", accuracy)

# pos_label = 'Charged Off'
pos_label = 'Fully Paid'
precision = precision_score(y_test, y_pred_test, pos_label=pos_label)
print("Precision:", precision)

recall = recall_score(y_test, y_pred_test, pos_label=pos_label)
print("Recall:", recall)

specificity = TN / (TN + FP)
print("Specificity:", specificity)

f1 = f1_score(y_test, y_pred_test, pos_label=pos_label)
print("F1 Score:", f1)

Accuracy: 0.7514455975557407
Precision: 0.8605019771887971
Recall: 0.8252652364012474
Specificity: 0.4454693082169536
F1 Score: 0.8425153388955995


In [220]:
# Assuming you have the predicted probabilities for the positive class
y_prob_test = clf.predict_proba(X_test)[:, 1]

# Calculate AUC score
auc_score = roc_auc_score(y_test, y_prob_test)
print("AUC Score:", auc_score)

AUC Score: 0.7159246480511381


In [221]:
feature_importances = clf.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_over.columns,
    'Importance': feature_importances
})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

feature_importance_df

Unnamed: 0,Feature,Importance
2,int_rate,0.119435
9,dti,0.085017
13,revol_util,0.07527
12,revol_bal,0.073645
6,annual_inc,0.071664
3,installment,0.069121
14,total_acc,0.058517
0,loan_amnt,0.055118
22,cr_line_year,0.054085
10,open_acc,0.04891
