In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import (confusion_matrix, classification_report)
 

# Loading the Data and Selecting the Features

In [2]:
x_train = pd.read_csv('loan_data_inputs_train.csv')
y_train = pd.read_csv('loan_data_targets_train.csv')
x_test = pd.read_csv('loan_data_inputs_test.csv')
y_test = pd.read_csv('loan_data_targets_test.csv')

In [3]:
print("Train data:", x_train.shape, y_train.shape)
print("Test data:", x_test.shape, y_test.shape)

Train data: (40423, 77) (40423, 1)
Test data: (13475, 77) (13475, 1)


In [4]:
y_train = y_train.values.reshape(-1,)
y_test = y_test.values.reshape(-1,)

In [5]:
print("Train data:", x_train.shape, y_train.shape)
print("Test data:", x_test.shape, y_test.shape)

Train data: (40423, 77) (40423,)
Test data: (13475, 77) (13475,)


In [6]:
x_train.head()

Unnamed: 0,IncomeRange,IncomeVerifiable,IsBorrowerHomeowner,DebtToIncomeRatio,AvailableBankcardCredit,EmploymentStatus,LoanStatus,Term,LoanOriginalAmount,ProsperRating (Alpha),...,DebtToIncomeRatio:0.244-0.296,DebtToIncomeRatio:0.296-0.4,DebtToIncomeRatio>0.4,AvailableBankcardCreditFactor,AvailableBankcardCredit:<8.5k,AvailableBankcardCredit:8.5k-15.5k,AvailableBankcardCredit:15.5k-21k,AvailableBankcardCredit:21k-25.5k,AvailableBankcardCredit:25.5k-30k,AvailableBankcardCredit>30k
0,"$25,000-49,999",True,False,0.07,1645.0,Employed,Chargedoff,60,3500,E,...,0,0,0,"(-498.374, 9967.48]",1,0,0,0,0,0
1,"$100,000+",True,True,0.23,10704.0,Employed,Current,60,15000,A,...,0,0,0,"(9967.48, 19934.96]",0,1,0,0,0,0
2,"$100,000+",True,False,0.18,24063.0,Employed,Current,60,25000,A,...,0,0,0,"(19934.96, 29902.44]",0,0,0,1,0,0
3,"$50,000-74,999",True,False,0.2,47304.0,Employed,Current,36,4000,D,...,0,0,0,"(39869.92, 49837.4]",0,0,0,0,0,1
4,"$50,000-74,999",True,False,0.12,2630.0,Employed,Current,60,15000,C,...,0,0,0,"(-498.374, 9967.48]",1,0,0,0,0,0


In [7]:
x_test.head()

Unnamed: 0,IncomeRange,IncomeVerifiable,IsBorrowerHomeowner,DebtToIncomeRatio,AvailableBankcardCredit,EmploymentStatus,LoanStatus,Term,LoanOriginalAmount,ProsperRating (Alpha),...,DebtToIncomeRatio:0.244-0.296,DebtToIncomeRatio:0.296-0.4,DebtToIncomeRatio>0.4,AvailableBankcardCreditFactor,AvailableBankcardCredit:<8.5k,AvailableBankcardCredit:8.5k-15.5k,AvailableBankcardCredit:15.5k-21k,AvailableBankcardCredit:21k-25.5k,AvailableBankcardCredit:25.5k-30k,AvailableBankcardCredit>30k
0,"$25,000-49,999",True,False,0.35,0.0,Employed,Current,36,2500,HR,...,0,1,0,"(-360.0, 7200.0]",1,0,0,0,0,0
1,"$1-24,999",False,False,,115.0,Self-employed,Current,60,12000,D,...,0,0,0,"(-360.0, 7200.0]",1,0,0,0,0,0
2,"$50,000-74,999",True,True,0.42,12145.0,Employed,Current,36,15000,C,...,0,0,1,"(7200.0, 14400.0]",0,1,0,0,0,0
3,"$75,000-99,999",True,False,0.13,200.0,Employed,Current,36,4000,B,...,0,0,0,"(-360.0, 7200.0]",1,0,0,0,0,0
4,"$75,000-99,999",True,False,0.18,378.0,Employed,Completed,36,4000,D,...,0,0,0,"(-360.0, 7200.0]",1,0,0,0,0,0


### Selecting the Features

In [8]:
x_train.columns.values

array(['IncomeRange', 'IncomeVerifiable', 'IsBorrowerHomeowner',
       'DebtToIncomeRatio', 'AvailableBankcardCredit', 'EmploymentStatus',
       'LoanStatus', 'Term', 'LoanOriginalAmount',
       'ProsperRating (Alpha)', 'ProsperScore', 'BorrowerRate',
       'CreditScoreRange', 'IncomeRange:$0', 'IncomeRange:$1-24,999',
       'IncomeRange:$100,000+', 'IncomeRange:$25,000-49,999',
       'IncomeRange:$50,000-74,999', 'IncomeRange:$75,000-99,999',
       'IncomeRange:Not employed', 'EmploymentStatus:Employed',
       'EmploymentStatus:Full-time', 'EmploymentStatus:Not employed',
       'EmploymentStatus:Other', 'EmploymentStatus:Part-time',
       'EmploymentStatus:Retired', 'EmploymentStatus:Self-employed',
       'CreditScoreRange:Exceptional (800+)',
       'CreditScoreRange:Fair (600-679)',
       'CreditScoreRange:Good (680-739)',
       'CreditScoreRange:Very Good (740-799)', 'IncomeVerifiable:False',
       'IncomeVerifiable:True', 'IsBorrowerHomeowner:False',
       'IsBorrow

In [9]:
x_train_with_ref_cat = x_train[['IncomeRange:$0', 'IncomeRange:$1-24,999',
       'IncomeRange:$100,000+', 'IncomeRange:$25,000-49,999',
       'IncomeRange:$50,000-74,999', 'IncomeRange:$75,000-99,999',
       'IncomeRange:Not employed', 'EmploymentStatus:Employed',
       'CreditScoreRange:Exceptional (800+)',
       'CreditScoreRange:Fair (600-679)',
       'CreditScoreRange:Good (680-739)',
       'CreditScoreRange:Very Good (740-799)', 'IncomeVerifiable:False',
       'IncomeVerifiable:True', 'IsBorrowerHomeowner:False',
       'IsBorrowerHomeowner:True', 'ProsperRating (Alpha):A',
       'ProsperRating (Alpha):AA', 'ProsperRating (Alpha):B',
       'ProsperRating (Alpha):C', 'ProsperRating (Alpha):D',
       'ProsperRating (Alpha):E', 'ProsperRating (Alpha):HR',
       'EmploymentStatus:Other__self_employed__full_time',
       'EmploymentStatus:Not_employed__part_time__retired', 'Term:12',
       'Term:36', 'Term:60', 'ProsperScore:1', 'ProsperScore:2-5',
       'ProsperScore:6-8', 'ProsperScore:9-11',
       'BorrowerRate:<0.111', 'BorrowerRate:0.111, 0.188',
       'BorrowerRate:0.188, 0.243', 'BorrowerRate:0.243-0.292',
       'BorrowerRate:>0.292',
       'LoanOriginalAmount:<7.5K', 'LoanOriginalAmount:7.5K-15K',
       'LoanOriginalAmount:15K-20.4K', 'LoanOriginalAmount:20.4K-24.3k',
       'LoanOriginalAmount>24.3k',
       'DebtToIncomeRatio:Missing', 'DebtToIncomeRatio:<0.088',
       'DebtToIncomeRatio:0.088-0.244', 'DebtToIncomeRatio:0.244-0.296',
       'DebtToIncomeRatio:0.296-0.4', 'DebtToIncomeRatio>0.4',
        'AvailableBankcardCredit:<8.5k',
       'AvailableBankcardCredit:8.5k-15.5k',
       'AvailableBankcardCredit:15.5k-21k',
       'AvailableBankcardCredit:21k-25.5k',
       'AvailableBankcardCredit:25.5k-30k', 'AvailableBankcardCredit>30k']]




ref_categories = ['IncomeRange:$0', 'EmploymentStatus:Not_employed__part_time__retired','CreditScoreRange:Fair (600-679)',
                 'Term:60','IsBorrowerHomeowner:False','IncomeVerifiable:False',
                  'BorrowerRate:<0.111','ProsperRating (Alpha):HR','LoanOriginalAmount:<7.5K',
                  'DebtToIncomeRatio:<0.088', 'AvailableBankcardCredit:<8.5k','ProsperScore:1',
                 'EmploymentStatus:Not_employed__part_time__retired']

In [10]:
inputs_train = x_train_with_ref_cat.drop(ref_categories, axis = 1)
inputs_train.head()

Unnamed: 0,"IncomeRange:$1-24,999","IncomeRange:$100,000+","IncomeRange:$25,000-49,999","IncomeRange:$50,000-74,999","IncomeRange:$75,000-99,999",IncomeRange:Not employed,EmploymentStatus:Employed,CreditScoreRange:Exceptional (800+),CreditScoreRange:Good (680-739),CreditScoreRange:Very Good (740-799),...,DebtToIncomeRatio:Missing,DebtToIncomeRatio:0.088-0.244,DebtToIncomeRatio:0.244-0.296,DebtToIncomeRatio:0.296-0.4,DebtToIncomeRatio>0.4,AvailableBankcardCredit:8.5k-15.5k,AvailableBankcardCredit:15.5k-21k,AvailableBankcardCredit:21k-25.5k,AvailableBankcardCredit:25.5k-30k,AvailableBankcardCredit>30k
0,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0,1,0,...,0,1,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,1,0,0,1,...,0,1,0,0,0,0,0,1,0,0
3,0,0,0,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [11]:
inputs_train.dtypes

IncomeRange:$1-24,999                               int64
IncomeRange:$100,000+                               int64
IncomeRange:$25,000-49,999                          int64
IncomeRange:$50,000-74,999                          int64
IncomeRange:$75,000-99,999                          int64
IncomeRange:Not employed                            int64
EmploymentStatus:Employed                           int64
CreditScoreRange:Exceptional (800+)                 int64
CreditScoreRange:Good (680-739)                     int64
CreditScoreRange:Very Good (740-799)                int64
IncomeVerifiable:True                               int64
IsBorrowerHomeowner:True                            int64
ProsperRating (Alpha):A                             int64
ProsperRating (Alpha):AA                            int64
ProsperRating (Alpha):B                             int64
ProsperRating (Alpha):C                             int64
ProsperRating (Alpha):D                             int64
ProsperRating 

# PD Model Estimation

## Logistic Regression

In [12]:
pd.options.display.max_rows = None

In [13]:
reg = LogisticRegression(max_iter=1000)

In [14]:
reg.fit(inputs_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
reg.intercept_

array([3.73856293])

In [16]:
reg.coef_

array([[-0.28137575,  0.62410333, -0.12600608,  0.23724793,  0.39538749,
        -0.05448987,  0.2548273 , -0.28312374, -0.05172125, -0.12580833,
        -0.4020771 , -0.01889747, -0.61510529, -0.12418433, -0.85494987,
        -0.52364589, -0.3855522 ,  0.40547315,  0.20630094, -0.29800285,
         0.12799154,  0.93148418,  0.51015126,  0.62441608, -0.37108272,
        -1.52114322, -2.4935476 , -3.5461717 , -0.10528762,  0.23134716,
        -0.14078701, -0.01424581, -0.42159445,  0.36625547,  0.45606472,
         0.52717141,  0.46164755, -0.23864509, -0.19829785, -0.30375529,
        -0.56554715, -0.28765929]])

In [17]:
feature_name = inputs_train.columns.values

In [18]:
summary_table = pd.DataFrame(columns = ['Feature name'], data = feature_name)
summary_table['Coefficients'] = np.transpose(reg.coef_)
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficients
0,Intercept,3.738563
1,"IncomeRange:$1-24,999",-0.281376
2,"IncomeRange:$100,000+",0.624103
3,"IncomeRange:$25,000-49,999",-0.126006
4,"IncomeRange:$50,000-74,999",0.237248
5,"IncomeRange:$75,000-99,999",0.395387
6,IncomeRange:Not employed,-0.05449
7,EmploymentStatus:Employed,0.254827
8,CreditScoreRange:Exceptional (800+),-0.283124
9,CreditScoreRange:Good (680-739),-0.051721


## Build a Logistic Regression Model with P-Values

Which independent variables contributes to predicting borrower default and which dont? The accepted approach is to check the statistical signficane of coeeficients of each dummy variables. We look at P-value. Alter the fit model

In [19]:
import statsmodels.api as sm

In [20]:
inputs_train['intercept']= 1
logit_mod =sm.Logit(y_train,inputs_train)

#fit the model
results = logit_mod.fit()

#get summary statistics
results.summary2() 

Optimization terminated successfully.
         Current function value: 0.222125
         Iterations 8


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.129
Dependent Variable:,y,AIC:,18043.9306
Date:,2022-03-03 13:51,BIC:,18414.0382
No. Observations:,40423,Log-Likelihood:,-8979.0
Df Model:,42,LL-Null:,-10308.0
Df Residuals:,40380,LLR p-value:,0.0
Converged:,1.0000,Scale:,1.0
No. Iterations:,8.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
"IncomeRange:$1-24,999",0.0694,0.7172,0.0968,0.9229,-1.3363,1.4751
"IncomeRange:$100,000+",0.9817,0.7177,1.3678,0.1714,-0.4250,2.3884
"IncomeRange:$25,000-49,999",0.2258,0.7153,0.3157,0.7522,-1.1762,1.6278
"IncomeRange:$50,000-74,999",0.5893,0.7159,0.8231,0.4105,-0.8139,1.9924
"IncomeRange:$75,000-99,999",0.7496,0.7172,1.0453,0.2959,-0.6560,2.1553
IncomeRange:Not employed,0.3682,0.8274,0.4450,0.6563,-1.2535,1.9899
EmploymentStatus:Employed,0.3258,0.3977,0.8194,0.4126,-0.4536,1.1053
CreditScoreRange:Exceptional (800+),-0.2824,0.1528,-1.8483,0.0646,-0.5819,0.0171
CreditScoreRange:Good (680-739),-0.0513,0.0490,-1.0477,0.2948,-0.1474,0.0447


In [22]:
pvalue = pd.DataFrame(results.pvalues,columns={'p_value'},)
pvalue

Unnamed: 0,p_value
"IncomeRange:$1-24,999",0.922897
"IncomeRange:$100,000+",0.1713741
"IncomeRange:$25,000-49,999",0.752232
"IncomeRange:$50,000-74,999",0.4104509
"IncomeRange:$75,000-99,999",0.2958955
IncomeRange:Not employed,0.6563073
EmploymentStatus:Employed,0.4125764
CreditScoreRange:Exceptional (800+),0.06455433
CreditScoreRange:Good (680-739),0.2947553
CreditScoreRange:Very Good (740-799),0.09909538


In [23]:
pvs=[]
for i in range (0, len(pvalue["p_value"])):
    if pvalue['p_value'][i] < 0.05:
        pvs.append(pvalue.index[i])
    if 'const' in pvs:
         pvs.remove('const')
    else:
         pvs 
print(pvs)
print(len(pvs))

['ProsperRating (Alpha):A', 'ProsperRating (Alpha):B', 'ProsperRating (Alpha):C', 'ProsperRating (Alpha):D', 'ProsperRating (Alpha):E', 'Term:12', 'ProsperScore:2-5', 'ProsperScore:6-8', 'ProsperScore:9-11', 'BorrowerRate:0.111, 0.188', 'BorrowerRate:0.188, 0.243', 'BorrowerRate:0.243-0.292', 'BorrowerRate:>0.292', 'DebtToIncomeRatio:0.088-0.244', 'DebtToIncomeRatio:0.244-0.296', 'DebtToIncomeRatio:0.296-0.4', 'DebtToIncomeRatio>0.4', 'AvailableBankcardCredit:8.5k-15.5k', 'AvailableBankcardCredit:15.5k-21k', 'AvailableBankcardCredit:21k-25.5k', 'AvailableBankcardCredit:25.5k-30k', 'AvailableBankcardCredit>30k', 'intercept']
23


### Fitting the Logistic Regression Again

In [24]:
imput_train2 = inputs_train[pvs]

In [25]:
reg2 = LogisticRegression()

In [26]:
reg2.fit(imput_train2, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
reg2.intercept_

array([3.85613908])

In [28]:
reg2.coef_

array([[-7.15974983e-01, -9.77716398e-01, -6.42021708e-01,
        -4.63262661e-01,  4.06650915e-01, -3.47375520e-01,
         8.04133961e-01,  3.69892671e-01,  5.44675412e-01,
        -2.93756041e-01, -1.48669750e+00, -2.54494931e+00,
        -3.59028516e+00,  4.96138765e-01,  5.17124089e-01,
         5.07855388e-01,  3.15931995e-01, -2.16332101e-01,
        -1.66559360e-01, -2.85053311e-01, -5.35630548e-01,
        -2.55943806e-01, -1.91997998e-03]])

In [29]:
feature_name2 = imput_train2.columns.values

In [30]:
summary_table = pd.DataFrame(columns = ['Feature name'], data = feature_name2)
summary_table['Coefficients'] = np.transpose(reg2.coef_)
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg2.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficients
0,Intercept,3.856139
1,ProsperRating (Alpha):A,-0.715975
2,ProsperRating (Alpha):B,-0.977716
3,ProsperRating (Alpha):C,-0.642022
4,ProsperRating (Alpha):D,-0.463263
5,ProsperRating (Alpha):E,0.406651
6,Term:12,-0.347376
7,ProsperScore:2-5,0.804134
8,ProsperScore:6-8,0.369893
9,ProsperScore:9-11,0.544675


In [31]:
imput_train2['intercept']= 1
logit_mod =sm.Logit(y_train,imput_train2)

#fit the model
results = logit_mod.fit()

#get summary statistics
results.summary2() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Optimization terminated successfully.
         Current function value: 0.224679
         Iterations 8


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.119
Dependent Variable:,y,AIC:,18210.4289
Date:,2022-03-03 14:12,BIC:,18408.3934
No. Observations:,40423,Log-Likelihood:,-9082.2
Df Model:,22,LL-Null:,-10308.0
Df Residuals:,40400,LLR p-value:,0.0
Converged:,1.0000,Scale:,1.0
No. Iterations:,8.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
ProsperRating (Alpha):A,-0.8500,0.1614,-5.2667,0.0000,-1.1663,-0.5337
ProsperRating (Alpha):B,-1.0682,0.1519,-7.0308,0.0000,-1.3660,-0.7705
ProsperRating (Alpha):C,-0.6914,0.1261,-5.4850,0.0000,-0.9385,-0.4444
ProsperRating (Alpha):D,-0.4936,0.1136,-4.3457,0.0000,-0.7162,-0.2710
ProsperRating (Alpha):E,0.4027,0.0758,5.3157,0.0000,0.2542,0.5512
Term:12,-0.4053,0.1856,-2.1833,0.0290,-0.7691,-0.0415
ProsperScore:2-5,0.8265,0.1309,6.3142,0.0000,0.5699,1.0830
ProsperScore:6-8,0.3979,0.1357,2.9330,0.0034,0.1320,0.6637
ProsperScore:9-11,0.5335,0.1751,3.0475,0.0023,0.1904,0.8766


In [32]:
pvalue = pd.DataFrame(results.pvalues,columns={'p_value'},)
pvalue

Unnamed: 0,p_value
ProsperRating (Alpha):A,1.388803e-07
ProsperRating (Alpha):B,2.052925e-12
ProsperRating (Alpha):C,4.133659e-08
ProsperRating (Alpha):D,1.388356e-05
ProsperRating (Alpha):E,1.062505e-07
Term:12,0.02901139
ProsperScore:2-5,2.715057e-10
ProsperScore:6-8,0.003357346
ProsperScore:9-11,0.00230755
"BorrowerRate:0.111, 0.188",0.003860662


In [33]:
import pickle

In [34]:
pickle.dump(reg2, open('pd_model.sav', 'wb'))
# Here we export our model to a 'SAV' file with file name 'pd_model.sav'.