In [1]:
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd 
pd.set_option('display.max_columns', None)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFE

import statsmodels.api as sm 

import pickle

In [2]:
df = pd.read_csv('../../cleaned_accepted_2007_to_2015Q4.csv.gz')
df.drop(columns='Unnamed: 0', inplace=True)

In [3]:
#term 0 is 36 months term = 1 is 60 months
df = df[df['term'] == 0]

In [4]:
# 0 is fully paid 1 is defaulted
df.loan_status.value_counts()

0    534414
1     86751
Name: loan_status, dtype: int64

## Feature Selection

In [5]:
X = df[['issue_month', 'issue_year', 'annual_inc', 'dti',
       'emp_length', 'funded_amnt', 'inq_fi', 'inq_last_12m', 'installment',
       'int_rate', 'loan_amnt', 'open_acc', 'percent_bc_gt_75',
       'pub_rec', 'pub_rec_bankruptcies', 'tax_liens', 'total_acc',
       'revol_bal', 'delinq_amnt', 'delinq_2yrs', 'mths_since_last_delinq',
       'mths_since_last_major_derog', 'cr_yeargap', 'avg_fico_score']].copy()

# VIF dataframe 
vif_data = pd.DataFrame() 
vif_data["feature"] = X.columns 
  
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(X.values, i) 
                          for i in range(len(X.columns))] 
  
print(vif_data)

                        feature          VIF
0                   issue_month     5.265571
1                    issue_year  1101.792860
2                    annual_inc     2.831972
3                           dti     6.640145
4                    emp_length     4.328454
5                   funded_amnt  5514.609076
6                        inq_fi    28.506859
7                  inq_last_12m    52.508108
8                   installment  1076.321606
9                      int_rate    40.932553
10                    loan_amnt  4468.142384
11                     open_acc    11.943073
12             percent_bc_gt_75     3.907924
13                      pub_rec     9.324872
14         pub_rec_bankruptcies     4.536905
15                    tax_liens     5.072389
16                    total_acc    11.938409
17                    revol_bal     1.950430
18                  delinq_amnt     1.001628
19                  delinq_2yrs     1.267585
20       mths_since_last_delinq     2.256818
21  mths_s

In [6]:
# try model still including 'int_rate', 'avg_fico_score', 'loan_amnt', 'pub_rec'

X_trim= df[['issue_month', 'annual_inc', 'dti', 'emp_length',
       'int_rate', 'percent_bc_gt_75',
       'pub_rec_bankruptcies', 'tax_liens',
       'revol_bal', 'delinq_amnt', 'delinq_2yrs', 'mths_since_last_delinq',
       'mths_since_last_major_derog', 'cr_yeargap']].copy()

# VIF dataframe 
vif_data = pd.DataFrame() 
vif_data["feature"] = X_trim.columns 
  
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(X_trim.values, i) 
                          for i in range(len(X_trim.columns))] 
  
print(vif_data)

                        feature       VIF
0                   issue_month  4.385874
1                    annual_inc  2.422330
2                           dti  5.418240
3                    emp_length  4.142056
4                      int_rate  8.169003
5              percent_bc_gt_75  3.355864
6          pub_rec_bankruptcies  1.170883
7                     tax_liens  1.022001
8                     revol_bal  1.820326
9                   delinq_amnt  1.001499
10                  delinq_2yrs  1.179249
11       mths_since_last_delinq  2.182675
12  mths_since_last_major_derog  1.811953
13                   cr_yeargap  5.747544


In [7]:
X = df[['loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
       'int_rate', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 
       'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
       'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
       'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
       'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
       'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
       'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
       'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
       'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
       'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
       'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
       'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
       'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
       'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
y = df['loan_status']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27, shuffle=True)


In [8]:
logistic = LogisticRegression(C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic.fit(X_train, y_train)




LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [9]:
print('accuracy: ', logistic.score(X_test, y_test))

predictions = logistic.predict(X_test)
cm = confusion_matrix(y_test, predictions)
print(cm)


accuracy:  0.6544396416411099
[[74504 32258]
 [10672  6799]]


In [10]:
for i, c in enumerate(X.columns):
    print(c, logistic.coef_[:,i])

loan_amnt [1.52794305e-05]
issue_month [1.3052623e-07]
annual_inc [-3.09869119e-06]
dti [8.90213205e-07]
emp_length [1.00358843e-07]
int_rate [8.76849314e-07]
percent_bc_gt_75 [2.75103051e-06]
pub_rec_bankruptcies [3.59345094e-09]
tax_liens [0.]
revol_bal [-2.34768475e-06]
delinq_amnt [1.77719904e-06]
delinq_2yrs [1.29828641e-08]
mths_since_last_delinq [5.50186681e-07]
mths_since_last_major_derog [7.5089702e-07]
cr_yeargap [4.44763149e-08]
ownership_0 [-1.22993673e-08]
ownership_1 [2.69852906e-08]
ownership_2 [0.]
purpose_0 [1.07821747e-08]
purpose_1 [-3.31003588e-09]
purpose_10 [0.]
purpose_11 [0.]
purpose_12 [0.]
purpose_13 [0.]
purpose_2 [0.]
purpose_4 [0.]
purpose_5 [0.]
purpose_6 [0.]
purpose_7 [0.]
purpose_8 [0.]
purpose_9 [0.]
sub_grade_A1 [-2.39043523e-09]
sub_grade_A2 [-1.13257252e-09]
sub_grade_A3 [-6.04818444e-10]
sub_grade_A4 [-2.38903561e-09]
sub_grade_A5 [-2.17875409e-09]
sub_grade_B1 [0.]
sub_grade_B2 [0.]
sub_grade_B3 [0.]
sub_grade_B4 [0.]
sub_grade_B5 [0.]
sub_grade_C

In [11]:
r = permutation_importance(logistic, X_test, y_test,
                          n_repeats=30,
                          random_state=27)

for i in r.importances_mean.argsort()[::-1]:
    if abs(r.importances_mean[i]) - 2*r.importances_std[i] > 0:
        print(f"{X.columns[i]:}"
             f"{r.importances_mean[i]:.3f}"
             f" +/- {r.importances_std[i]:.3f}"
             )

annual_inc0.036 +/- 0.001
loan_amnt0.034 +/- 0.001
int_rate-0.000 +/- 0.000
revol_bal-0.004 +/- 0.001


In [12]:
# selector = RFE(logistic, n_features_to_select=11, step=3)
# selector.fit(X,y)

In [13]:
# print("Features sorted by their rank:")
# sorted(zip(map(lambda x: round(x, 4), selector.ranking_), X.columns))

In [14]:
X = df[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
       'percent_bc_gt_75', 'pub_rec_bankruptcies', 
       'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
       'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
       'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
       'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
       'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
       'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
       'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
       'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
       'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
       'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
       'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
       'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
       'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
y = df['loan_status']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27, shuffle=True)

In [15]:
logistic = LogisticRegression(C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic.fit(X_train, y_train)




LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [16]:
print('accuracy: ', logistic.score(X_test, y_test))

predictions = logistic.predict(X_test)
cm = confusion_matrix(y_test, predictions)
print(cm)


accuracy:  0.6544396416411099
[[74504 32258]
 [10672  6799]]


In [18]:
# # Grid search cross validation
# grid={'C':[1e-5, 1e-4, 1e-3, 1e-2, 1e-1], 'l1_ratio':[0, 0.2, 0.4, 0.6, 0.8, 1]}

# logistic = LogisticRegression(solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
# logistic_cv = GridSearchCV(logistic, grid, cv=5)
# logistic_cv.fit(X_train, y_train)

# print('Tuned hyperparameters : (best parameters) ', logistic_cv.best_params_)
# print('accuracy: ', logistic_cv.best_score_)



# no usable results as accuracy score does not give enough information. null model gets 80% accuracy.


## Model Loan Grades

In [19]:
df_AB = df[df['grade'].isin([0,1])]
df_AB.sample(8)

Unnamed: 0,term,grade,issue_month,issue_year,annual_inc,dti,emp_length,funded_amnt,inq_fi,inq_last_12m,installment,int_rate,loan_status,loan_amnt,open_acc,percent_bc_gt_75,pub_rec,pub_rec_bankruptcies,tax_liens,total_acc,revol_bal,delinq_amnt,delinq_2yrs,mths_since_last_delinq,mths_since_last_major_derog,cr_yeargap,avg_fico_score,ownership_0,ownership_1,ownership_2,purpose_0,purpose_1,purpose_10,purpose_11,purpose_12,purpose_13,purpose_2,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,sub_grade_A1,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,sub_grade_C1,sub_grade_C2,sub_grade_C3,sub_grade_C4,sub_grade_C5,sub_grade_D1,sub_grade_D2,sub_grade_D3,sub_grade_D4,sub_grade_D5,sub_grade_E1,sub_grade_E2,sub_grade_E3,sub_grade_E4,sub_grade_E5,sub_grade_F1,sub_grade_F2,sub_grade_F3,sub_grade_F4,sub_grade_F5,sub_grade_G1,sub_grade_G2,sub_grade_G3,sub_grade_G4,sub_grade_G5
461946,0,0,10.0,2014.0,80000.0,24.29,5,9000.0,0.943945,2.234091,283.65,8.39,0,9000.0,14.0,50.0,0.0,0.0,0.0,42.0,9746.0,0.0,0.0,0.0,0.0,14.0,742.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
771111,0,1,7.0,2013.0,60000.0,20.6,7,10000.0,0.943945,2.234091,321.32,9.71,0,10000.0,11.0,42.9,0.0,0.0,0.0,16.0,12972.0,0.0,0.0,75.0,75.0,24.0,672.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
139074,0,0,9.0,2015.0,65000.0,17.58,1,10000.0,0.943945,2.234091,305.31,6.24,1,10000.0,19.0,0.0,0.0,0.0,0.0,38.0,9962.0,0.0,0.0,26.0,40.0,28.0,727.0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
403094,0,1,1.0,2015.0,59400.0,15.15,2,14400.0,0.943945,2.234091,474.45,11.44,0,14400.0,41.0,44.4,0.0,0.0,0.0,66.0,13220.0,0.0,0.0,0.0,0.0,10.0,667.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
667412,0,0,7.0,2011.0,33000.0,14.95,7,14000.0,0.943945,2.234091,435.43,7.49,0,14000.0,6.0,49.74199,0.0,0.0,0.0,28.0,6768.0,0.0,0.0,0.0,0.0,15.0,737.0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
689950,0,1,1.0,2010.0,7200.0,9.5,1,1000.0,0.943945,2.234091,32.81,11.14,0,1000.0,5.0,49.74199,0.0,0.0,0.0,7.0,1552.0,0.0,0.0,0.0,0.0,14.0,722.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
450456,0,1,11.0,2014.0,87000.0,19.18,3,12000.0,0.943945,2.234091,379.76,8.67,0,12000.0,17.0,20.0,0.0,0.0,0.0,28.0,17179.0,0.0,0.0,0.0,0.0,5.0,692.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
668156,0,0,7.0,2011.0,110000.0,14.0,3,9000.0,0.943945,2.234091,279.92,7.49,0,9000.0,9.0,49.74199,0.0,0.0,0.0,24.0,19503.0,0.0,1.0,5.0,0.0,24.0,722.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
X_AB = df_AB[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
           'percent_bc_gt_75', 'pub_rec_bankruptcies', 
           'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
           'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
           'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
           'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
           'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
           'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
           'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
           'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
           'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
           'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
           'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
           'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
           'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
           'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
y_AB = df_AB['loan_status']

X_train_AB, X_test_AB, y_train_AB, y_test_AB = train_test_split(X_AB, y_AB, test_size=0.2, random_state=27, shuffle=True)

In [21]:
logistic_AB = LogisticRegression(max_iter=1000, C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)

logistic_AB.fit(X_train_AB, y_train_AB)




LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=1000, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [22]:
print('accuracy: ', logistic_AB.score(X_test_AB, y_test_AB))

predictions_AB = logistic_AB.predict(X_test_AB)
cm_AB = confusion_matrix(y_test_AB, predictions_AB)
print(cm_AB)

accuracy:  0.7117689306989254
[[48682 15976]
 [ 4491  1860]]


In [23]:
df_CD = df[df['grade'].isin([2,3])]

X_CD = df_CD[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
           'percent_bc_gt_75', 'pub_rec_bankruptcies', 
           'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
           'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
           'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
           'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
           'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
           'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
           'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
           'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
           'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
           'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
           'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
           'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
           'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
           'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
y_CD = df_CD['loan_status']

X_train_CD, X_test_CD, y_train_CD, y_test_CD = train_test_split(X_CD, y_CD, test_size=0.2, random_state=27, shuffle=True)

In [24]:
logistic_CD = LogisticRegression(max_iter=1000, C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)

logistic_CD.fit(X_train_CD, y_train_CD)




LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=1000, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [25]:
print('accuracy: ', logistic_CD.score(X_test_CD, y_test_CD))

predictions_CD = logistic_CD.predict(X_test_CD)
cm_CD = confusion_matrix(y_test_CD, predictions_CD)
print(cm_CD)

accuracy:  0.5664910299836908
[[22442 15928]
 [ 4805  4651]]


In [26]:
df_EFG = df[df['grade'].isin([4,5,6])]

X_EFG = df_EFG[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
               'percent_bc_gt_75', 'pub_rec_bankruptcies', 
               'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
               'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
               'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
               'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
               'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
               'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
               'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
               'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
               'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
               'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
               'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
               'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
               'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
               'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
y_EFG = df_EFG['loan_status']
    
X_train_EFG, X_test_EFG, y_train_EFG, y_test_EFG = train_test_split(X_EFG, y_EFG, test_size=0.2, random_state=27, shuffle=True)

In [27]:
logistic_EFG = LogisticRegression(max_iter=1000, C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)

logistic_EFG.fit(X_train_EFG, y_train_EFG)


LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=1000, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [28]:
print('accuracy: ', logistic_EFG.score(X_test_EFG, y_test_EFG))

predictions_EFG = logistic_EFG.predict(X_test_EFG)
cm_EFG = confusion_matrix(y_test_EFG, predictions_EFG)
print(cm_EFG)

accuracy:  0.505
[[1772 2047]
 [ 626  955]]


In [29]:
df_A = df[df['grade'].isin([0])]

X_A = df_A[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
           'percent_bc_gt_75', 'pub_rec_bankruptcies', 
           'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
           'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
           'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
           'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
           'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
           'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
           'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
           'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
           'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
           'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
           'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
           'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
           'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
           'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
y_A = df_A['loan_status']
    
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X_A, y_A, test_size=0.2, random_state=27, shuffle=True)

In [30]:
logistic_A = LogisticRegression(max_iter=1000, C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic_A.fit(X_train_A, y_train_A)



LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=1000, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [31]:
print('accuracy: ', logistic_A.score(X_test_A, y_test_A))

predictions_A = logistic_A.predict(X_test_A)
cm_A = confusion_matrix(y_test_A, predictions_A)
print(cm_A)

accuracy:  0.7364005034260942
[[20616  6441]
 [ 1099   448]]


In [32]:
df_B = df[df['grade'].isin([1])]

X_B = df_B[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
           'percent_bc_gt_75', 'pub_rec_bankruptcies', 
           'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
           'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
           'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
           'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
           'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
           'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
           'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
           'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
           'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
           'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
           'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
           'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
           'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
           'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
y_B = df_B['loan_status']
    
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(X_B, y_B, test_size=0.2, random_state=27, shuffle=True)

In [33]:
logistic_B = LogisticRegression(max_iter=1000, C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic_B.fit(X_train_B, y_train_B)



LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=1000, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [34]:
print('accuracy: ', logistic_B.score(X_test_B, y_test_B))

predictions_B = logistic_B.predict(X_test_B)
cm_B = confusion_matrix(y_test_B, predictions_B)
print(cm_B)

accuracy:  0.6494517155995755
[[25808 11887]
 [ 2978  1732]]


In [35]:
df_C = df[df['grade'].isin([2])]

X_C = df_C[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
           'percent_bc_gt_75', 'pub_rec_bankruptcies', 
           'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
           'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
           'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
           'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
           'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
           'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
           'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
           'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
           'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
           'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
           'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
           'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
           'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
           'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
y_C = df_C['loan_status']
    
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split(X_C, y_C, test_size=0.2, random_state=27, shuffle=True)

In [36]:
logistic_C = LogisticRegression(max_iter=1000, C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic_C.fit(X_train_C, y_train_C)



LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=1000, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [37]:
print('accuracy: ', logistic_C.score(X_test_C, y_test_C))

predictions_C = logistic_C.predict(X_test_C)
cm_C = confusion_matrix(y_test_C, predictions_C)
print(cm_C)

accuracy:  0.5900640236394976
[[16641  9914]
 [ 3404  2529]]


In [38]:
df_D = df[df['grade'].isin([3])]

X_D = df_D[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
           'percent_bc_gt_75', 'pub_rec_bankruptcies', 
           'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
           'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
           'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
           'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
           'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
           'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
           'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
           'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
           'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
           'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
           'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
           'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
           'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
           'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
y_D = df_D['loan_status']
    
X_train_D, X_test_D, y_train_D, y_test_D = train_test_split(X_D, y_D, test_size=0.2, random_state=27, shuffle=True)

In [39]:
logistic_D = LogisticRegression(max_iter=1000, C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic_D.fit(X_train_D, y_train_D)

LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=1000, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [40]:
print('accuracy: ', logistic_D.score(X_test_D, y_test_D))

predictions_D = logistic_D.predict(X_test_D)
cm_D = confusion_matrix(y_test_D, predictions_D)
print(cm_D)

accuracy:  0.5387574157376621
[[6228 5477]
 [1598 2036]]


# Save and export different models

In [41]:
PIK = "logistic_A.pickle"

with open(PIK, "wb") as f:
    pickle.dump([logistic_A, logistic_B, logistic_C, logistic_D, logistic_EFG], f)

## EFG loans are better predicted by group model instead of individual

In [42]:
# df_E = df[df['grade'].isin([4])]

# X_E = df_E[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
#        'percent_bc_gt_75', 'pub_rec_bankruptcies', 
#        'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
#        'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
#        'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
#        'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
#        'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
#        'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
#        'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
#        'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
#        'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
#        'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
#        'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
#        'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
#        'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
#        'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
# y_E = df_E['loan_status']

# X_train_E, X_test_E, y_train_E, y_test_E = train_test_split(X_E, y_E, test_size=0.2, random_state=27, shuffle=True)

In [43]:
# logistic_E = LogisticRegression(C=1e-05, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
# logistic_E.fit(X_train_E, y_train_E)

In [44]:
# print('accuracy: ', logistic_E.score(X_test_E, y_test_E))

# predictions_E = logistic_E.predict(X_test_E)
# cm_E = confusion_matrix(y_test_E, predictions_E)
# print(cm_E)

In [45]:
# df_F = df[df['grade'].isin([5])]

# X_F = df_F[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
#        'percent_bc_gt_75', 'pub_rec_bankruptcies', 
#        'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
#        'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
#        'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
#        'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
#        'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
#        'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
#        'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
#        'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
#        'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
#        'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
#        'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
#        'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
#        'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
#        'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
# y_F = df_F['loan_status']
    
# X_train_F, X_test_F, y_train_F, y_test_F = train_test_split(X_F, y_F, test_size=0.2, random_state=27, shuffle=True)

In [46]:
# logistic_F = LogisticRegression(C=1e-05, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
# logistic_F.fit(X_train_F, y_train_F)

In [47]:
# print('accuracy: ', logistic_F.score(X_test_F, y_test_F))

# predictions_F = logistic_F.predict(X_test_F)
# cm_F = confusion_matrix(y_test_F, predictions_F)
# print(cm_F)

In [48]:
# df_G = df[df['grade'].isin([6])]

# X_G = df_G[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
#        'percent_bc_gt_75', 'pub_rec_bankruptcies', 
#        'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
#        'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
#        'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
#        'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
#        'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
#        'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
#        'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
#        'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
#        'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
#        'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
#        'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
#        'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
#        'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
#        'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
# y_G = df_G['loan_status']
    
# X_train_G, X_test_G, y_train_G, y_test_G = train_test_split(X_G, y_G, test_size=0.2, random_state=27, shuffle=True)

In [49]:
# logistic_G = LogisticRegression(C=1e-05, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
# logistic_G.fit(X_train_G, y_train_G)

In [50]:
# print('accuracy: ', logistic_G.score(X_test_G, y_test_G))

# predictions_G = logistic_G.predict(X_test_G)
# cm_G = confusion_matrix(y_test_G, predictions_G)
# print(cm_G)