In [50]:
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd 
pd.set_option('display.max_columns', None)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFE

import statsmodels.api as sm 

In [51]:
df = pd.read_csv('../../cleaned_accepted_2007_to_2015Q4.csv.gz')
df.drop(columns='Unnamed: 0', inplace=True)

In [52]:
#term 0 is 36 months term = 1 is 60 months
df = df[df['term'] == 0]

In [53]:
df.head()

Unnamed: 0,term,grade,issue_month,issue_year,annual_inc,dti,emp_length,funded_amnt,inq_fi,inq_last_12m,installment,int_rate,loan_status,loan_amnt,open_acc,percent_bc_gt_75,pub_rec,pub_rec_bankruptcies,tax_liens,total_acc,revol_bal,delinq_amnt,delinq_2yrs,mths_since_last_delinq,mths_since_last_major_derog,cr_yeargap,avg_fico_score,ownership_0,ownership_1,ownership_2,purpose_0,purpose_1,purpose_10,purpose_11,purpose_12,purpose_13,purpose_2,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,sub_grade_A1,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,sub_grade_C1,sub_grade_C2,sub_grade_C3,sub_grade_C4,sub_grade_C5,sub_grade_D1,sub_grade_D2,sub_grade_D3,sub_grade_D4,sub_grade_D5,sub_grade_E1,sub_grade_E2,sub_grade_E3,sub_grade_E4,sub_grade_E5,sub_grade_F1,sub_grade_F2,sub_grade_F3,sub_grade_F4,sub_grade_F5,sub_grade_G1,sub_grade_G2,sub_grade_G3,sub_grade_G4,sub_grade_G5
0,0,2,12.0,2015.0,55000.0,5.91,10,3600.0,3.0,4.0,123.03,13.99,0,3600.0,7.0,0.0,0.0,0.0,0.0,13.0,2765.0,0.0,0.0,30.0,30.0,12.0,677.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,2,12.0,2015.0,65000.0,16.06,10,24700.0,0.0,6.0,820.28,11.99,0,24700.0,22.0,7.7,0.0,0.0,0.0,38.0,21470.0,0.0,1.0,6.0,0.0,16.0,717.0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,2,12.0,2015.0,34000.0,10.2,4,11950.0,0.0,0.0,405.18,13.44,0,11950.0,5.0,100.0,0.0,0.0,0.0,6.0,8822.0,0.0,0.0,0.0,0.0,28.0,692.0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,1,12.0,2015.0,180000.0,14.67,10,20000.0,1.0,1.0,637.58,9.17,0,20000.0,12.0,100.0,0.0,0.0,0.0,27.0,87329.0,0.0,0.0,49.0,0.0,26.0,682.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,1,12.0,2015.0,85000.0,17.61,10,20000.0,1.0,2.0,631.26,8.49,0,20000.0,8.0,0.0,0.0,0.0,0.0,15.0,826.0,0.0,1.0,3.0,3.0,17.0,707.0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [54]:
# 0 is fully paid 1 is defaulted
df.loan_status.value_counts()

0    534414
1     86751
Name: loan_status, dtype: int64

In [55]:
df.isnull().sum()

term            0
grade           0
issue_month     0
issue_year      0
annual_inc      0
               ..
sub_grade_G1    0
sub_grade_G2    0
sub_grade_G3    0
sub_grade_G4    0
sub_grade_G5    0
Length: 78, dtype: int64

In [56]:
df.columns

Index(['term', 'grade', 'issue_month', 'issue_year', 'annual_inc', 'dti',
       'emp_length', 'funded_amnt', 'inq_fi', 'inq_last_12m', 'installment',
       'int_rate', 'loan_status', 'loan_amnt', 'open_acc', 'percent_bc_gt_75',
       'pub_rec', 'pub_rec_bankruptcies', 'tax_liens', 'total_acc',
       'revol_bal', 'delinq_amnt', 'delinq_2yrs', 'mths_since_last_delinq',
       'mths_since_last_major_derog', 'cr_yeargap', 'avg_fico_score',
       'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
       'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
       'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
       'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
       'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
       'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
       'sub_grade_D1', 'sub_grade_D2', 'sub_grade

## Feature Selection

In [57]:
X = df[['issue_month', 'issue_year', 'annual_inc', 'dti',
       'emp_length', 'funded_amnt', 'inq_fi', 'inq_last_12m', 'installment',
       'int_rate', 'loan_amnt', 'open_acc', 'percent_bc_gt_75',
       'pub_rec', 'pub_rec_bankruptcies', 'tax_liens', 'total_acc',
       'revol_bal', 'delinq_amnt', 'delinq_2yrs', 'mths_since_last_delinq',
       'mths_since_last_major_derog', 'cr_yeargap', 'avg_fico_score']].copy()

# VIF dataframe 
vif_data = pd.DataFrame() 
vif_data["feature"] = X.columns 
  
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(X.values, i) 
                          for i in range(len(X.columns))] 
  
print(vif_data)

                        feature          VIF
0                   issue_month     5.265571
1                    issue_year  1101.792860
2                    annual_inc     2.831972
3                           dti     6.640145
4                    emp_length     4.328454
5                   funded_amnt  5514.609076
6                        inq_fi    28.506859
7                  inq_last_12m    52.508108
8                   installment  1076.321606
9                      int_rate    40.932553
10                    loan_amnt  4468.142384
11                     open_acc    11.943073
12             percent_bc_gt_75     3.907924
13                      pub_rec     9.324872
14         pub_rec_bankruptcies     4.536905
15                    tax_liens     5.072389
16                    total_acc    11.938409
17                    revol_bal     1.950430
18                  delinq_amnt     1.001628
19                  delinq_2yrs     1.267585
20       mths_since_last_delinq     2.256818
21  mths_s

In [58]:
# try model still including 'int_rate', 'avg_fico_score', 'loan_amnt', 'pub_rec'

X_trim= df[['issue_month', 'annual_inc', 'dti', 'emp_length',
       'int_rate', 'percent_bc_gt_75',
       'pub_rec_bankruptcies', 'tax_liens',
       'revol_bal', 'delinq_amnt', 'delinq_2yrs', 'mths_since_last_delinq',
       'mths_since_last_major_derog', 'cr_yeargap']].copy()

# VIF dataframe 
vif_data = pd.DataFrame() 
vif_data["feature"] = X_trim.columns 
  
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(X_trim.values, i) 
                          for i in range(len(X_trim.columns))] 
  
print(vif_data)

                        feature       VIF
0                   issue_month  4.385874
1                    annual_inc  2.422330
2                           dti  5.418240
3                    emp_length  4.142056
4                      int_rate  8.169003
5              percent_bc_gt_75  3.355864
6          pub_rec_bankruptcies  1.170883
7                     tax_liens  1.022001
8                     revol_bal  1.820326
9                   delinq_amnt  1.001499
10                  delinq_2yrs  1.179249
11       mths_since_last_delinq  2.182675
12  mths_since_last_major_derog  1.811953
13                   cr_yeargap  5.747544


In [59]:
X = df[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
       'int_rate', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 
       'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
       'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
       'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
       'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
       'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
       'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
       'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
       'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
       'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
       'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
       'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
       'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
       'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
y = df['loan_status']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27, shuffle=True)


In [11]:
logistic = LogisticRegression(C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic.fit(X_train, y_train)




LogisticRegression(C=0.001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [12]:
print('accuracy: ', logistic.score(X_test, y_test))

predictions = logistic.predict(X_test)
cm = confusion_matrix(y_test, predictions)
print(cm)


accuracy:  0.6544396416411099
[[74504 32258]
 [10672  6799]]


In [13]:
for i, c in enumerate(X.columns):
    print(c, logistic.coef_[:,i])

int_rate [8.83654999e-07]
loan_amnt [1.5279516e-05]
issue_month [1.37336824e-07]
annual_inc [-3.09880968e-06]
dti [8.97016237e-07]
emp_length [1.07169166e-07]
int_rate [8.83654999e-07]
percent_bc_gt_75 [2.75780934e-06]
pub_rec_bankruptcies [1.04105226e-08]
tax_liens [3.37642126e-09]
revol_bal [-2.34780041e-06]
delinq_amnt [1.7834945e-06]
delinq_2yrs [1.97991012e-08]
mths_since_last_delinq [5.56991639e-07]
mths_since_last_major_derog [7.57704903e-07]
cr_yeargap [5.12774379e-08]
ownership_0 [-1.91166883e-08]
ownership_1 [3.3802051e-08]
ownership_2 [3.37142381e-09]
purpose_0 [1.75989491e-08]
purpose_1 [-1.01273467e-08]
purpose_10 [0.]
purpose_11 [0.]
purpose_12 [0.]
purpose_13 [0.]
purpose_2 [0.]
purpose_4 [0.]
purpose_5 [2.54085637e-09]
purpose_6 [6.2395201e-10]
purpose_7 [0.]
purpose_8 [5.84256249e-10]
purpose_9 [0.]
sub_grade_A1 [-9.20762382e-09]
sub_grade_A2 [-7.94977007e-09]
sub_grade_A3 [-7.42201943e-09]
sub_grade_A4 [-9.20602137e-09]
sub_grade_A5 [-8.99571174e-09]
sub_grade_B1 [-6.

In [14]:
r = permutation_importance(logistic, X_test, y_test,
                          n_repeats=30,
                          random_state=27)

for i in r.importances_mean.argsort()[::-1]:
    if abs(r.importances_mean[i]) - 2*r.importances_std[i] > 0:
        print(f"{X.columns[i]:}"
             f"{r.importances_mean[i]:.3f}"
             f" +/- {r.importances_std[i]:.3f}"
             )

annual_inc0.036 +/- 0.001
loan_amnt0.034 +/- 0.001
revol_bal-0.004 +/- 0.001


In [15]:
# selector = RFE(logistic, n_features_to_select=11, step=3)
# selector.fit(X,y)



array([ 1,  1,  1,  1,  1,  2,  1,  1,  4, 10,  1,  1,  3,  1,  1,  2,  3,
        3, 11,  4,  4, 16, 16, 17, 17, 17, 19, 12, 14, 20, 14, 16,  5,  6,
        6,  5,  5,  8,  9, 10, 13, 14, 11,  9,  8,  7,  7,  6,  7,  8,  9,
       10, 11, 12, 12, 13, 13, 15, 15, 15, 18, 18, 18, 19, 19, 20, 20])

In [16]:
# selector.support_

array([ True,  True,  True,  True,  True, False,  True,  True, False,
       False,  True,  True, False,  True,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [17]:
# selector.ranking_

array([ 1,  1,  1,  1,  1,  2,  1,  1,  4, 10,  1,  1,  3,  1,  1,  2,  3,
        3, 11,  4,  4, 16, 16, 17, 17, 17, 19, 12, 14, 20, 14, 16,  5,  6,
        6,  5,  5,  8,  9, 10, 13, 14, 11,  9,  8,  7,  7,  6,  7,  8,  9,
       10, 11, 12, 12, 13, 13, 15, 15, 15, 18, 18, 18, 19, 19, 20, 20])

In [37]:
# print("Features sorted by their rank:")
# sorted(zip(map(lambda x: round(x, 4), selector.ranking_), X.columns))

Features sorted by their rank:


[(1, 'annual_inc'),
 (1, 'delinq_amnt'),
 (1, 'dti'),
 (1, 'int_rate'),
 (1, 'int_rate'),
 (1, 'issue_month'),
 (1, 'loan_amnt'),
 (1, 'mths_since_last_delinq'),
 (1, 'mths_since_last_major_derog'),
 (1, 'percent_bc_gt_75'),
 (1, 'revol_bal'),
 (2, 'cr_yeargap'),
 (2, 'emp_length'),
 (3, 'delinq_2yrs'),
 (3, 'ownership_0'),
 (3, 'ownership_1'),
 (4, 'pub_rec_bankruptcies'),
 (4, 'purpose_0'),
 (4, 'purpose_1'),
 (5, 'sub_grade_A1'),
 (5, 'sub_grade_A4'),
 (5, 'sub_grade_A5'),
 (6, 'sub_grade_A2'),
 (6, 'sub_grade_A3'),
 (6, 'sub_grade_D1'),
 (7, 'sub_grade_C4'),
 (7, 'sub_grade_C5'),
 (7, 'sub_grade_D2'),
 (8, 'sub_grade_B1'),
 (8, 'sub_grade_C3'),
 (8, 'sub_grade_D3'),
 (9, 'sub_grade_B2'),
 (9, 'sub_grade_C2'),
 (9, 'sub_grade_D4'),
 (10, 'sub_grade_B3'),
 (10, 'sub_grade_D5'),
 (10, 'tax_liens'),
 (11, 'ownership_2'),
 (11, 'sub_grade_C1'),
 (11, 'sub_grade_E1'),
 (12, 'purpose_5'),
 (12, 'sub_grade_E2'),
 (12, 'sub_grade_E3'),
 (13, 'sub_grade_B4'),
 (13, 'sub_grade_E4'),
 (13, 'su

In [101]:
X = df[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
       'int_rate', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 
       'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
       'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
       'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
       'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
       'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
       'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
       'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
       'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
       'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
       'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
       'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
       'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
       'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
y = df['loan_status']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27, shuffle=True)

In [102]:
logistic = LogisticRegression(C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic.fit(X_train, y_train)




LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [103]:
print('accuracy: ', logistic.score(X_test, y_test))

predictions = logistic.predict(X_test)
cm = confusion_matrix(y_test, predictions)
print(cm)


accuracy:  0.6544396416411099
[[74504 32258]
 [10672  6799]]


In [104]:
X_train.head()

Unnamed: 0,int_rate,loan_amnt,issue_month,annual_inc,dti,emp_length,int_rate.1,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,revol_bal,delinq_amnt,delinq_2yrs,mths_since_last_delinq,mths_since_last_major_derog,cr_yeargap,ownership_0,ownership_1,ownership_2,purpose_0,purpose_1,purpose_10,purpose_11,purpose_12,purpose_13,purpose_2,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,sub_grade_A1,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,sub_grade_C1,sub_grade_C2,sub_grade_C3,sub_grade_C4,sub_grade_C5,sub_grade_D1,sub_grade_D2,sub_grade_D3,sub_grade_D4,sub_grade_D5,sub_grade_E1,sub_grade_E2,sub_grade_E3,sub_grade_E4,sub_grade_E5,sub_grade_F1,sub_grade_F2,sub_grade_F3,sub_grade_F4,sub_grade_F5,sub_grade_G1,sub_grade_G2,sub_grade_G3,sub_grade_G4,sub_grade_G5
473187,10.99,8000.0,10.0,65000.0,29.17,10,10.99,50.0,1.0,0.0,5238.0,0.0,0.0,47.0,57.0,29.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
666752,8.49,2250.0,8.0,54000.0,20.73,8,8.49,49.74199,0.0,0.0,396.0,0.0,0.0,0.0,0.0,12.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
876157,9.76,6500.0,4.0,33695.0,13.18,10,9.76,66.7,0.0,0.0,14516.0,0.0,0.0,79.0,0.0,36.0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
153371,10.99,21000.0,8.0,135000.0,5.74,10,10.99,100.0,0.0,0.0,6547.0,0.0,1.0,14.0,51.0,20.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
846125,21.49,29175.0,11.0,65000.0,20.98,1,21.49,66.7,0.0,0.0,25440.0,0.0,0.0,0.0,0.0,12.0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [105]:
# # Grid search cross validation
# grid={'C':[1e-5, 1e-4, 1e-3, 1e-2, 1e-1], 'l1_ratio':[0, 0.2, 0.4, 0.6, 0.8, 1]}

# logistic = LogisticRegression(solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
# logistic_cv = GridSearchCV(logistic, grid, cv=5)
# logistic_cv.fit(X_train, y_train)

# print('Tuned hyperparameters : (best parameters) ', logistic_cv.best_params_)
# print('accuracy: ', logistic_cv.best_score_)

# not usable results


## Model Loan Grades

In [106]:
df_AB = df[df['grade'].isin([0,1])]
df_AB.sample(8)

Unnamed: 0,term,grade,issue_month,issue_year,annual_inc,dti,emp_length,funded_amnt,inq_fi,inq_last_12m,installment,int_rate,loan_status,loan_amnt,open_acc,percent_bc_gt_75,pub_rec,pub_rec_bankruptcies,tax_liens,total_acc,revol_bal,delinq_amnt,delinq_2yrs,mths_since_last_delinq,mths_since_last_major_derog,cr_yeargap,avg_fico_score,ownership_0,ownership_1,ownership_2,purpose_0,purpose_1,purpose_10,purpose_11,purpose_12,purpose_13,purpose_2,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,sub_grade_A1,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,sub_grade_C1,sub_grade_C2,sub_grade_C3,sub_grade_C4,sub_grade_C5,sub_grade_D1,sub_grade_D2,sub_grade_D3,sub_grade_D4,sub_grade_D5,sub_grade_E1,sub_grade_E2,sub_grade_E3,sub_grade_E4,sub_grade_E5,sub_grade_F1,sub_grade_F2,sub_grade_F3,sub_grade_F4,sub_grade_F5,sub_grade_G1,sub_grade_G2,sub_grade_G3,sub_grade_G4,sub_grade_G5
245053,0,1,6.0,2015.0,52865.0,18.09,1,1200.0,0.943945,2.234091,39.59,11.53,0,1200.0,11.0,62.5,0.0,0.0,0.0,21.0,8851.0,0.0,1.0,22.0,22.0,13.0,692.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
815099,0,1,3.0,2013.0,41000.0,14.94,4,18000.0,0.943945,2.234091,590.5,11.14,1,18000.0,10.0,80.0,0.0,0.0,0.0,17.0,24579.0,0.0,0.0,0.0,0.0,9.0,702.0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
377111,0,1,2.0,2015.0,66000.0,27.36,10,19050.0,0.943945,2.234091,614.61,9.99,0,19050.0,14.0,50.0,0.0,0.0,0.0,28.0,22787.0,0.0,0.0,46.0,46.0,21.0,702.0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
710113,0,0,12.0,2013.0,64800.0,4.91,10,8200.0,0.943945,2.234091,260.38,8.9,0,8200.0,5.0,0.0,0.0,0.0,0.0,16.0,8744.0,0.0,0.0,0.0,0.0,18.0,797.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
387945,0,1,1.0,2015.0,82000.0,9.4,10,15000.0,0.943945,2.234091,494.22,11.44,0,15000.0,9.0,100.0,0.0,0.0,0.0,19.0,21952.0,0.0,2.0,20.0,0.0,26.0,667.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
585402,0,0,5.0,2014.0,70000.0,25.0,3,13600.0,0.943945,2.234091,423.8,7.62,0,13600.0,13.0,28.6,0.0,0.0,0.0,40.0,23030.0,0.0,0.0,0.0,0.0,16.0,732.0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12977,0,1,12.0,2015.0,26400.0,22.27,7,5600.0,0.0,0.0,176.47,8.38,0,5600.0,3.0,100.0,0.0,0.0,0.0,10.0,6412.0,0.0,0.0,0.0,0.0,11.0,722.0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
718091,0,1,11.0,2013.0,42000.0,16.6,1,10000.0,0.943945,2.234091,336.9,12.99,0,10000.0,11.0,100.0,0.0,0.0,0.0,24.0,14134.0,0.0,3.0,13.0,13.0,10.0,667.0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [107]:
X_AB = df_AB[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
       'int_rate', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 
       'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
       'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
       'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
       'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
       'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
       'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
       'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5']].copy()
    
y_AB = df_AB['loan_status']

X_train_AB, X_test_AB, y_train_AB, y_test_AB = train_test_split(X_AB, y_AB, test_size=0.2, random_state=27, shuffle=True)

In [108]:
logistic_AB = LogisticRegression(C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)

logistic_AB.fit(X_train_AB, y_train_AB)




LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [109]:
print('accuracy: ', logistic_AB.score(X_test_AB, y_test_AB))

predictions_AB = logistic_AB.predict(X_test_AB)
cm_AB = confusion_matrix(y_test_AB, predictions_AB)
print(cm_AB)

accuracy:  0.7125434804038925
[[48745 15913]
 [ 4499  1852]]


In [110]:
df_CD = df[df['grade'].isin([2,3])]

X_CD = df_CD[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
       'int_rate', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 
       'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
       'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
       'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
       'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
       'purpose_9','sub_grade_C1', 'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
       'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4', 'sub_grade_D5']].copy()
    
y_CD = df_CD['loan_status']

X_train_CD, X_test_CD, y_train_CD, y_test_CD = train_test_split(X_CD, y_CD, test_size=0.2, random_state=27, shuffle=True)

In [111]:
logistic_CD = LogisticRegression(C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)

logistic_CD.fit(X_train_CD, y_train_CD)




LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [112]:
print('accuracy: ', logistic_CD.score(X_test_CD, y_test_CD))

predictions_CD = logistic_CD.predict(X_test_CD)
cm_CD = confusion_matrix(y_test_CD, predictions_CD)
print(cm_CD)

accuracy:  0.5670137582068331
[[22466 15904]
 [ 4804  4652]]


In [113]:
df_EFG = df[df['grade'].isin([4,5,6])]

X_EFG = df_EFG[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
       'int_rate', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 
       'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
       'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
       'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
       'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
       'purpose_9', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
       'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
       'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
       'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
y_EFG = df_EFG['loan_status']
    
X_train_EFG, X_test_EFG, y_train_EFG, y_test_EFG = train_test_split(X_EFG, y_EFG, test_size=0.2, random_state=27, shuffle=True)

In [114]:
logistic_EFG = LogisticRegression(C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)

logistic_EFG.fit(X_train_EFG, y_train_EFG)




LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [115]:
print('accuracy: ', logistic_EFG.score(X_test_EFG, y_test_EFG))

predictions_EFG = logistic_EFG.predict(X_test_EFG)
cm_EFG = confusion_matrix(y_test_EFG, predictions_EFG)
print(cm_EFG)

accuracy:  0.5670137582068331
[[22466 15904]
 [ 4804  4652]]


In [132]:
df_A = df[df['grade'].isin([0])]

X_A = df_A[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
       'int_rate', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 
       'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
       'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
       'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
       'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
       'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
       'sub_grade_A4', 'sub_grade_A5']].copy()
    
y_A = df_A['loan_status']
    
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X_A, y_A, test_size=0.2, random_state=27, shuffle=True)

In [133]:
logistic_A = LogisticRegression(C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic_A.fit(X_train_A, y_train_A)



LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [134]:
print('accuracy: ', logistic_A.score(X_test_A, y_test_A))

predictions_A = logistic_A.predict(X_test_A)
cm_A = confusion_matrix(y_test_A, predictions_A)
print(cm_A)

accuracy:  0.74010627884212
[[20731  6326]
 [ 1108   439]]


In [135]:
df_B = df[df['grade'].isin([1])]

X_B = df_B[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
       'int_rate', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 
       'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
       'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
       'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
       'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
       'purpose_9', 'sub_grade_B1', 'sub_grade_B2',
       'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', ]].copy()
    
y_B = df_B['loan_status']
    
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(X_B, y_B, test_size=0.2, random_state=27, shuffle=True)

In [136]:
logistic_B = LogisticRegression(C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic_B.fit(X_train_B, y_train_B)



LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [137]:
print('accuracy: ', logistic_B.score(X_test_B, y_test_B))

predictions_B = logistic_B.predict(X_test_B)
cm_B = confusion_matrix(y_test_B, predictions_B)
print(cm_B)

accuracy:  0.6502535078410565
[[25848 11847]
 [ 2984  1726]]


In [138]:
df_C = df[df['grade'].isin([2])]

X_C = df_C[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
       'int_rate', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 
       'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
       'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
       'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
       'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
       'purpose_9','sub_grade_C1', 'sub_grade_C2', 'sub_grade_C3', 
       'sub_grade_C4', 'sub_grade_C5',]].copy()
    
y_C = df_C['loan_status']
    
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split(X_C, y_C, test_size=0.2, random_state=27, shuffle=True)

In [139]:
logistic_C = LogisticRegression(C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic_C.fit(X_train_C, y_train_C)



LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [140]:
print('accuracy: ', logistic_C.score(X_test_C, y_test_C))

predictions_C = logistic_C.predict(X_test_C)
cm_C = confusion_matrix(y_test_C, predictions_C)
print(cm_C)

accuracy:  0.5912644668800788
[[16693  9862]
 [ 3417  2516]]


In [141]:
df_D = df[df['grade'].isin([3])]

X_D = df_D[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
       'int_rate', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 
       'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
       'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
       'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
       'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
       'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
       'purpose_9', 'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
       'sub_grade_D5']].copy()
    
y_D = df_D['loan_status']
    
X_train_D, X_test_D, y_train_D, y_test_D = train_test_split(X_D, y_D, test_size=0.2, random_state=27, shuffle=True)

In [142]:
logistic_D = LogisticRegression(C=1e-04, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic_D.fit(X_train_D, y_train_D)



LogisticRegression(C=0.0001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [143]:
print('accuracy: ', logistic_D.score(X_test_D, y_test_D))

predictions_D = logistic_D.predict(X_test_D)
cm_D = confusion_matrix(y_test_D, predictions_D)
print(cm_D)

accuracy:  0.5435165265010757
[[6337 5368]
 [1634 2000]]


## EFG loans are better predicted by group model instead of individual

In [144]:
# df_E = df[df['grade'].isin([4])]

# X_E = df_E[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
#        'int_rate', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 
#        'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
#        'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
#        'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
#        'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
#        'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
#        'purpose_9', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
#        'sub_grade_E4', 'sub_grade_E5']].copy()
    
# y_E = df_E['loan_status']

# X_train_E, X_test_E, y_train_E, y_test_E = train_test_split(X_E, y_E, test_size=0.2, random_state=27, shuffle=True)

In [151]:
# logistic_E = LogisticRegression(C=1e-05, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
# logistic_E.fit(X_train_E, y_train_E)



LogisticRegression(C=1e-05, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [152]:
# print('accuracy: ', logistic_E.score(X_test_E, y_test_E))

# predictions_E = logistic_E.predict(X_test_E)
# cm_E = confusion_matrix(y_test_E, predictions_E)
# print(cm_E)

accuracy:  0.4886701762417029
[[1372 1719]
 [ 515  763]]


In [153]:
# df_F = df[df['grade'].isin([5])]

# X_F = df_F[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
#        'int_rate', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 
#        'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
#        'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
#        'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
#        'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
#        'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
#        'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
#        'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
#        'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
#        'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
#        'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
#        'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
#        'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
#        'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
#        'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
# y_F = df_F['loan_status']
    
# X_train_F, X_test_F, y_train_F, y_test_F = train_test_split(X_F, y_F, test_size=0.2, random_state=27, shuffle=True)

In [154]:
# logistic_F = LogisticRegression(C=1e-05, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
# logistic_F.fit(X_train_F, y_train_F)



LogisticRegression(C=1e-05, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [155]:
# print('accuracy: ', logistic_F.score(X_test_E, y_test_E))

# predictions_F = logistic_F.predict(X_test_F)
# cm_F = confusion_matrix(y_test_F, predictions_F)
# print(cm_F)

accuracy:  0.4827191577019913
[[311 296]
 [109 178]]


In [156]:
# df_G = df[df['grade'].isin([6])]

# X_G = df_G[['int_rate', 'loan_amnt', 'issue_month', 'annual_inc', 'dti', 'emp_length',
#        'int_rate', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 
#        'tax_liens', 'revol_bal', 'delinq_amnt', 'delinq_2yrs', 
#        'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
#        'ownership_0', 'ownership_1', 'ownership_2', 'purpose_0', 'purpose_1',
#        'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_2',
#        'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8',
#        'purpose_9', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
#        'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
#        'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
#        'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
#        'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
#        'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
#        'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
#        'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
#        'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5']].copy()
    
# y_G = df_G['loan_status']
    
# X_train_G, X_test_G, y_train_G, y_test_G = train_test_split(X_G, y_G, test_size=0.2, random_state=27, shuffle=True)

In [157]:
# logistic_G = LogisticRegression(C=1e-05, l1_ratio=0.3, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
# logistic_G.fit(X_train_G, y_train_G)



LogisticRegression(C=1e-05, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.3,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [158]:
# print('accuracy: ', logistic_G.score(X_test_G, y_test_G))

# predictions_G = logistic_G.predict(X_test_G)
# cm_G = confusion_matrix(y_test_G, predictions_G)
# print(cm_G)

accuracy:  0.5869565217391305
[[70 16]
 [41 11]]
