### Welcome to our Credit Default Risk Prediction Model
#### Team Members: Blythe Berlinger, Luke Brothers, Annie Peak, and Nathaniel Yee
#### Date: November 14th, 2024
#### Group Number:
#### Professor Yang

In [1]:
import scipy
import statsmodels as sm
import sklearn as sk
import numpy as np
import altair as alt
import seaborn as sns
import plotly.express as px
import pandas as pd
from scipy.stats import dunnett

In [2]:
# Load the raw credit-risk data with pandas (path is relative to the notebook)
cdr_file = 'credit_risk_dataset.csv'
cdr_data = pd.read_csv(cdr_file)

# Remove every row that has at least one missing value. The raw frame stays
# available as `cdr_data`, so re-running this cell always gives the same result.
cdr_data_clean = cdr_data.dropna()

# NOTE: a bare `cdr_data_clean.sort_values('loan_grade', ascending=True)` call
# used to sit here. sort_values returns a new frame and the result was never
# assigned, so it did nothing; it was removed and rows keep their CSV order.

# Preview the cleaned frame (pandas truncates the display automatically)
cdr_data_clean

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): the recorded output shows a max person_age of 144 and a max
# person_emp_length of 123 — these look like data-entry outliers; confirm
# whether they should be filtered before modelling.
cdr_data_clean.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,28638.0,28638.0,28638.0,28638.0,28638.0,28638.0,28638.0,28638.0
mean,27.727216,66649.37,4.788672,9656.493121,11.039867,0.2166,0.169488,5.793736
std,6.310441,62356.45,4.154627,6329.683361,3.229372,0.411935,0.106393,4.038483
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,39480.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55956.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,80000.0,7.0,12500.0,13.48,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [4]:
# Print the index range, per-column non-null counts, dtypes and memory usage
# of the cleaned frame.
cdr_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28638 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  28638 non-null  int64  
 1   person_income               28638 non-null  int64  
 2   person_home_ownership       28638 non-null  object 
 3   person_emp_length           28638 non-null  float64
 4   loan_intent                 28638 non-null  object 
 5   loan_grade                  28638 non-null  object 
 6   loan_amnt                   28638 non-null  int64  
 7   loan_int_rate               28638 non-null  float64
 8   loan_status                 28638 non-null  int64  
 9   loan_percent_income         28638 non-null  float64
 10  cb_person_default_on_file   28638 non-null  object 
 11  cb_person_cred_hist_length  28638 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 2.8+ MB


In [5]:
# Count the missing values left in each column; every count should be 0
# because the frame was produced by dropna().
cdr_data_clean.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [6]:
# Number of distinct values per column; low counts identify the categorical
# columns (e.g. loan_grade, loan_intent) that get one-hot encoded later.
cdr_data_clean.nunique()

person_age                      57
person_income                 3835
person_home_ownership            4
person_emp_length               36
loan_intent                      6
loan_grade                       7
loan_amnt                      728
loan_int_rate                  348
loan_status                      2
loan_percent_income             77
cb_person_default_on_file        2
cb_person_cred_hist_length      29
dtype: int64

In [28]:
# Visualize the types of loan grades by count.
# Both column names are set explicitly: value_counts() only labels its result
# 'count' (and the index 'loan_grade') on pandas >= 2.0. On older versions
# reset_index() would produce 'index'/'loan_grade' columns instead and the
# px.pie call below would fail to find 'count'.
cdr_data_clean_loan = (
    cdr_data_clean['loan_grade']
    .value_counts()
    .rename_axis('loan_grade')
    .reset_index(name='count')
)
# print() is kept so the table shows up alongside the figure in the same cell
print(cdr_data_clean_loan)

# One slice per loan grade, sized by the number of loans with that grade
fig = px.pie(
    cdr_data_clean_loan,
    values='count',
    names='loan_grade',
    title = 'Loan Grade Amounts',
    color_discrete_sequence=px.colors.sequential.RdBu
)
# Show the grade label, its share and the raw count on every slice
fig.update_traces(textinfo='label+percent+value', textfont_size=13)
fig.show()

  loan_grade  count
0          A   9402
1          B   9151
2          C   5699
3          D   3248
4          E    870
5          F    209
6          G     59


In [None]:
|

In [95]:
# One-hot encode the non-numerical columns: each category becomes its own
# column, cast to integer 0/1. The result is stored in `dummy`, which the
# following cells extend and train on.
dummy = pd.get_dummies(cdr_data_clean,dtype=int)
# Pairwise correlation matrix of all columns, including the new indicator
# columns. Indicators of a two-category column mirror each other (see
# cb_person_default_on_file_N / _Y in the output).
dummy.corr()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
person_age,1.0,0.178899,0.165787,0.054172,0.011019,-0.023333,-0.041065,0.859544,0.034111,-0.007484,...,-0.009734,-0.005919,-0.007165,0.003515,0.01157,0.000494,0.009037,0.004283,-0.007765,0.007765
person_income,0.178899,1.0,0.136825,0.265879,-0.001381,-0.139938,-0.251511,0.117076,0.20077,0.01144,...,0.002768,0.004253,0.001617,-0.006407,-0.013805,0.012844,0.018467,0.008996,0.002709,-0.002709
person_emp_length,0.165787,0.136825,1.0,0.110759,-0.056405,-0.082638,-0.055167,0.146336,0.22219,-0.013098,...,0.011221,0.056538,-0.00824,-0.043296,-0.006987,-0.016853,-0.009858,0.011579,0.027863,-0.027863
loan_amnt,0.054172,0.265879,0.110759,1.0,0.145904,0.11355,0.577399,0.045294,0.129524,0.01335,...,-0.000847,-0.119054,0.043321,-0.028026,0.069097,0.091518,0.076557,0.060912,-0.04032,0.04032
loan_int_rate,0.011019,-0.001381,-0.056405,0.145904,1.0,0.33935,0.1235,0.015371,-0.133444,0.017606,...,-0.008691,-0.799456,-0.007936,0.372469,0.47725,0.326998,0.200658,0.129607,-0.500254,0.500254
loan_status,-0.023333,-0.139938,-0.082638,0.11355,0.33935,1.0,0.379689,-0.015649,-0.184466,0.00984,...,-0.078644,-0.204428,-0.096187,-0.016435,0.325998,0.184503,0.10032,0.084541,-0.182019,0.182019
loan_percent_income,-0.041065,-0.251511,-0.055167,0.577399,0.1235,0.379689,1.0,-0.03008,-0.145138,0.010892,...,0.000821,-0.110316,0.03217,-5.1e-05,0.068357,0.058891,0.039898,0.036266,-0.03617,0.03617
cb_person_cred_hist_length,0.859544,0.117076,0.146336,0.045294,0.015371,-0.015649,-0.03008,1.0,0.022839,-0.006139,...,-0.006661,-0.010048,-0.004385,0.007678,0.009815,-0.002396,0.00824,0.006516,-0.008396,0.008396
person_home_ownership_MORTGAGE,0.034111,0.20077,0.22219,0.129524,-0.133444,-0.184466,-0.145138,0.022839,1.0,-0.048043,...,-0.019581,0.133196,-0.035889,-0.062088,-0.056696,-0.02129,0.004064,0.00577,0.061515,-0.061515
person_home_ownership_OTHER,-0.007484,0.01144,-0.013098,0.01335,0.017606,0.00984,0.010892,-0.006139,-0.048043,1.0,...,0.013805,-0.008919,-4.8e-05,-0.004138,0.010279,0.011185,0.009425,-0.002607,-0.009971,0.009971


In [96]:
# Add ratio features to `dummy` (new columns are written onto the existing frame).
# Loan amount divided by person_income.
# NOTE(review): in the recorded output this matches the existing
# loan_percent_income column up to rounding (0.593220 vs 0.59 in row 0) —
# confirm the extra column adds information.
dummy['loan_to_income_ratio'] = dummy['loan_amnt'] / dummy['person_income']
# NOTE(review): despite the column name, this is person_emp_length divided by
# loan_amnt (employment length per unit of loan), i.e. the inverse of a
# "loan-to-employment-length" ratio. Left as is because inverting it would
# divide by person_emp_length values of 0 and change the values later cells
# use; confirm the intended direction and rename if appropriate.
dummy['loan_to_emp_length_ratio'] =  dummy['person_emp_length']/ dummy['loan_amnt'] 

# Interest rate divided by loan amount
dummy['int_rate_to_loan_amt_ratio'] = dummy['loan_int_rate'] / dummy['loan_amnt']

# Display the frame with the three new ratio columns appended
dummy

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,...,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y,loan_to_income_ratio,loan_to_emp_length_ratio,int_rate_to_loan_amt_ratio
0,22,59000,123.0,35000,16.02,1,0.59,3,0,0,...,0,1,0,0,0,0,1,0.593220,0.003514,0.000458
1,21,9600,5.0,1000,11.14,0,0.10,2,0,0,...,0,0,0,0,0,1,0,0.104167,0.005000,0.011140
2,25,9600,1.0,5500,12.87,1,0.57,3,1,0,...,1,0,0,0,0,1,0,0.572917,0.000182,0.002340
3,23,65500,4.0,35000,15.23,1,0.53,2,0,0,...,1,0,0,0,0,1,0,0.534351,0.000114,0.000435
4,24,54400,8.0,35000,14.27,1,0.55,4,0,0,...,1,0,0,0,0,0,1,0.643382,0.000229,0.000408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,1.0,5800,13.16,0,0.11,30,1,0,...,1,0,0,0,0,1,0,0.109434,0.000172,0.002269
32577,54,120000,4.0,17625,7.49,0,0.15,19,1,0,...,0,0,0,0,0,1,0,0.146875,0.000227,0.000425
32578,65,76000,3.0,35000,10.99,1,0.46,28,0,0,...,0,0,0,0,0,1,0,0.460526,0.000086,0.000314
32579,56,150000,5.0,15000,11.48,0,0.10,26,1,0,...,0,0,0,0,0,1,0,0.100000,0.000333,0.000765


KeyError: 'loan_grade'

In [75]:
# Work on a copy of the engineered frame so `dummy` itself is never modified
# by the modelling steps.
train_dat = dummy.copy()

# Features: every column except the outcome. loan_status has to be excluded
# from X, otherwise the model would be given the answer it is supposed to
# predict (target leakage).
X = train_dat.drop(columns=['loan_status'])

# Target: taken from the SAME frame as X (previously read from
# cdr_data_clean). get_dummies leaves this numeric column unchanged, so the
# values are identical, but sharing one source guarantees that rows of X and Y
# stay aligned even if `dummy` is filtered or re-indexed later.
Y = train_dat['loan_status']

# Both must have the same number of rows before they can be used for training
X.shape, Y.shape

((28638, 29), (28638,))