In [228]:
# library for mathematical functions and data preprocessing such as table, etc
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import skew

# Library for visualizing data
import matplotlib.pyplot as plt
import seaborn as sns

# Library for machine learning functions/algorithms
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score,  precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline


In [229]:
# data itself
url = 'lending_club_loan_two.csv'
# description of each column
url_info = 'lending_club_info.csv'
df_before = pd.read_csv(url)
df_info = pd.read_csv(url_info)
df_before.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address
0,10000.0,36 months,11.44,329.48,B,B4,Marketing,10+ years,RENT,117000.0,...,16.0,0.0,36369.0,41.8,25.0,w,INDIVIDUAL,0.0,0.0,"0174 Michelle Gateway\nMendozaberg, OK 22690"
1,8000.0,36 months,11.99,265.68,B,B5,Credit analyst,4 years,MORTGAGE,65000.0,...,17.0,0.0,20131.0,53.3,27.0,f,INDIVIDUAL,3.0,0.0,"1076 Carney Fort Apt. 347\nLoganmouth, SD 05113"
2,15600.0,36 months,10.49,506.97,B,B3,Statistician,< 1 year,RENT,43057.0,...,13.0,0.0,11987.0,92.2,26.0,f,INDIVIDUAL,0.0,0.0,"87025 Mark Dale Apt. 269\nNew Sabrina, WV 05113"
3,7200.0,36 months,6.49,220.65,A,A2,Client Advocate,6 years,RENT,54000.0,...,6.0,0.0,5472.0,21.5,13.0,f,INDIVIDUAL,0.0,0.0,"823 Reid Ford\nDelacruzside, MA 00813"
4,24375.0,60 months,17.27,609.33,C,C5,Destiny Management Inc.,9 years,MORTGAGE,55000.0,...,13.0,0.0,24584.0,69.8,43.0,f,INDIVIDUAL,1.0,0.0,"679 Luna Roads\nGreggshire, VA 11650"


In [230]:
df_before.isna().sum()

loan_amnt                   0
term                        0
int_rate                    0
installment                 0
grade                       0
sub_grade                   0
emp_title               22927
emp_length              18301
home_ownership              0
annual_inc                  0
verification_status         0
issue_d                     0
loan_status                 0
purpose                     0
title                    1756
dti                         0
earliest_cr_line            0
open_acc                    0
pub_rec                     0
revol_bal                   0
revol_util                276
total_acc                   0
initial_list_status         0
application_type            0
mort_acc                37795
pub_rec_bankruptcies      535
address                     0
dtype: int64

In [206]:
# label encoder and fit_transform is a function that changes/remap string into number based on alphabetical order
le = LabelEncoder() 
df_before['term'] = le.fit_transform(df_before['term'])
# 36 months = 0,  60 months = 1

df_before['emp_length'] = le.fit_transform(df_before['emp_length'])
# '1 year' = 0, '10+ years' = 1, '2 years' = 2, '3 years' = 3, '4 years' = 4, '5 years' = 5,   
# '6 years' = 6, '7 years' = 7, '8 years' = 8,  '9 years' = 9, '< 1 year' = 10

df_before['home_ownership'] = le.fit_transform(df_before['home_ownership'])
# 'ANY' = 0, 'MORTGAGE' = 1, 'NONE' = 2, 'OTHER' = 3, 'OWN' = 4, 'RENT' = 5

df_before['verification_status'] = le.fit_transform(df_before['verification_status'])
# 'Not Verified' = 0, 'Source Verified' = 1, 'Verified' = 2

df_before['application_type'] = le.fit_transform(df_before['application_type'])
# 'DIRECT_PAY' = 0, 'INDIVIDUAL' = 1, 'JOINT' = 2

df_before['grade'] = le.fit_transform(df_before['grade'])
df_before['sub_grade'] = le.fit_transform(df_before['sub_grade'])


In [207]:
df_before['emp_length'].fillna(df_before['emp_length'].median(), inplace=True)
df_before['pub_rec_bankruptcies'].fillna(df_before['pub_rec_bankruptcies'].median(), inplace=True)
df_before['mort_acc'].fillna(df_before['mort_acc'].median(), inplace=True)
df_before['revol_util'].fillna(df_before['revol_util'].mean(), inplace=True)

In [208]:
z_score_vars = ['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti', 'open_acc', 'revol_bal', 'total_acc']

no_outlier = df_before.copy()

# Store outlier counts and percentages for each column
outlier_info = {}

for col in z_score_vars:
    # since our dataset is skewed we use the 99 percentile formula to detect and delete outliers, only identifying extreme datas
    lower_limit = df_before[col].quantile(0.01)
    upper_limit = df_before[col].quantile(0.99)

    # Count outliers before clipping
    outlier_count = ((df_before[col] < lower_limit) | (df_before[col] > upper_limit)).sum()
    total_count = len(df_before[col])
    outlier_percentage = outlier_count / total_count * 100

    # Store outlier information for the column
    outlier_info[col] = {'count': outlier_count, 'percentage': outlier_percentage}

# Print outlier information for each column
print("Outlier information:")
for col, info in outlier_info.items():
    print(f"- {col}: {info['count']} outliers ({info['percentage']:.2f}%)")

# Drop rows with outliers
for col in z_score_vars:
    lower_limit = df_before[col].quantile(0.01)
    upper_limit = df_before[col].quantile(0.99)
    no_outlier = no_outlier[(no_outlier[col] >= lower_limit) & (no_outlier[col] <= upper_limit)]

Outlier information:
- loan_amnt: 4089 outliers (1.03%)
- int_rate: 7823 outliers (1.98%)
- installment: 7916 outliers (2.00%)
- annual_inc: 7237 outliers (1.83%)
- dti: 7905 outliers (2.00%)
- open_acc: 5372 outliers (1.36%)
- revol_bal: 7918 outliers (2.00%)
- total_acc: 7553 outliers (1.91%)


## Random Forest Classification (Model Generation)

In [209]:
#  SPLIT
target_column = 'loan_status'
feature_columns = no_outlier.columns[no_outlier.columns != target_column]

X = no_outlier[feature_columns]
y = no_outlier[target_column]

In [196]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 351668 entries, 0 to 396029
Data columns (total 19 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   loan_amnt             351668 non-null  float64
 1   term                  351668 non-null  int32  
 2   int_rate              351668 non-null  float64
 3   installment           351668 non-null  float64
 4   grade                 351668 non-null  object 
 5   sub_grade             351668 non-null  object 
 6   emp_length            351668 non-null  int32  
 7   home_ownership        351668 non-null  int32  
 8   annual_inc            351668 non-null  float64
 9   verification_status   351668 non-null  int32  
 10  dti                   351668 non-null  float64
 11  open_acc              351668 non-null  float64
 12  pub_rec               351668 non-null  float64
 13  revol_bal             351668 non-null  float64
 14  revol_util            351668 non-null  float64
 15  total

In [195]:

X_num = X.select_dtypes(include=['float64', 'int64'])  # Assuming numerical columns are of these types
X_cat = X.select_dtypes(include=['object'])  # Assuming categorical columns are of object type


In [193]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pandas as pd

# Assuming X_num and X_cat are already defined

# Step 1: Split numerical data
X_num_train, X_num_test, y_train, y_test = train_test_split(X_num, y, test_size=0.2, random_state=42)

# Step 2: Apply SMOTE to the training set of numerical data
smote = SMOTE(random_state=42)
X_num_train_resampled, y_train_resampled = smote.fit_resample(X_num_train, y_train)

# Now X_num_train_resampled and y_train_resampled contain the resampled numerical training data

# Step 3: Split categorical data
X_cat_train, X_cat_test = train_test_split(X_cat, test_size=0.2, random_state=42)

# Step 4: No need to apply SMOTE to categorical data, as SMOTE is typically used for imbalance in target classes in classification problems

# Step 5: Concatenate the resampled numerical and categorical training data
X_train_resampled = pd.concat([X_num_train_resampled, X_cat_train], axis=1)

# Step 6: Optionally, you can concatenate the numerical and categorical test data
X_test = pd.concat([X_num_test, X_cat_test], axis=1)

# Now, X_train_resampled, y_train_resampled, and X_test can be used to train and evaluate your model


In [194]:
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours

X_train_resampled, y_train_resampled = SMOTEENN(sampling_strategy='all', enn=EditedNearestNeighbours(sampling_strategy='all')).fit_resample(X_train, y_train)

ValueError: could not convert string to float: 'A'

In [211]:
data = y_train.value_counts()
print(data)

loan_status
Fully Paid     226747
Charged Off     54587
Name: count, dtype: int64


In [212]:
data_counts = y_train_resampled.value_counts()
print(data_counts)

loan_status
Fully Paid     226747
Charged Off    226747
Name: count, dtype: int64


In [214]:
# Create a cost-sensitive random forest classifier
clf = RandomForestClassifier(n_estimators=100, bootstrap=True, criterion='gini', max_depth=None,random_state=0)

# Train the classifier on the training set
clf.fit(X_train_resampled, y_train_resampled)

# Make predictions on the validation set
y_pred_validation = clf.predict(X_validation)

# Evaluate the accuracy on the validation set
accuracy_validation = accuracy_score(y_validation, y_pred_validation)
print(f'Validation Accuracy: {accuracy_validation}')

# Make predictions on the test set
y_pred_test = clf.predict(X_test)

# Evaluate the accuracy on the test set
accuracy_test = accuracy_score(y_test, y_pred_test)
print(f'Test Accuracy: {accuracy_test}')

Validation Accuracy: 0.7989308158216509
Test Accuracy: 0.7950066824011147


In [218]:
cm = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix\n", cm)

Confusion Matrix
 [[ 1168  5671]
 [ 1538 26790]]


In [219]:
target_names = ['Charged Off', 'Fully Paid']
print(classification_report(y_test, y_pred_test, target_names=target_names))

              precision    recall  f1-score   support

 Charged Off       0.43      0.17      0.24      6839
  Fully Paid       0.83      0.95      0.88     28328

    accuracy                           0.80     35167
   macro avg       0.63      0.56      0.56     35167
weighted avg       0.75      0.80      0.76     35167



In [225]:
TN, FP, FN, TP = cm.ravel()

accuracy = accuracy_score(y_test, y_pred_test)
print("Accuracy:", accuracy)

# pos_label = 'Charged Off'
pos_label = 'Fully Paid'
precision = precision_score(y_test, y_pred_test, pos_label=pos_label)
print("Precision:", precision)

recall = recall_score(y_test, y_pred_test, pos_label=pos_label)
print("Recall:", recall)

specificity = TN / (TN + FP)
print("Specificity:", specificity)

f1 = f1_score(y_test, y_pred_test, pos_label=pos_label)
print("F1 Score:", f1)

Accuracy: 0.7950066824011147
Precision: 0.8252980499676535
Recall: 0.9457074272804292
Specificity: 0.17078520251498758
F1 Score: 0.8814094655283029


In [226]:
# Assuming you have the predicted probabilities for the positive class
y_prob_test = clf.predict_proba(X_test)[:, 1]

# Calculate AUC score
auc_score = roc_auc_score(y_test, y_prob_test)
print("AUC Score:", auc_score)

AUC Score: 0.6959810352886221


In [227]:
feature_importances = clf.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_train_resampled.columns,
    'Importance': feature_importances
})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

feature_importance_df

Unnamed: 0,Feature,Importance
10,dti,0.10462
13,revol_bal,0.095938
14,revol_util,0.09528
8,annual_inc,0.087489
2,int_rate,0.086814
3,installment,0.084998
15,total_acc,0.074443
0,loan_amnt,0.066391
11,open_acc,0.061087
5,sub_grade,0.060688
