In [1]:
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd 
pd.set_option('display.max_columns', None)

from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance

import statsmodels.api as sm 

In [2]:
df = pd.read_csv('../../cleaned_accepted_2007_to_2015Q4.csv.gz')
df.drop(columns='Unnamed: 0', inplace=True)

In [3]:
df = df[df['term'] = ' 36 months']

In [4]:
df.head()

Unnamed: 0,term,grade,issue_month,issue_year,purpose,annual_inc,dti,emp_length,funded_amnt,inq_fi,inq_last_12m,installment,int_rate,loan_status,loan_amnt,open_acc,percent_bc_gt_75,pub_rec,pub_rec_bankruptcies,tax_liens,total_acc,revol_bal,delinq_amnt,delinq_2yrs,mths_since_last_delinq,mths_since_last_major_derog,cr_yeargap,avg_fico_score,home_ownership_cat,application_type_cat,purpose_cat,sub_grade_cat,home_ownership
0,36 months,C,12.0,2015.0,0,55000.0,5.91,10,3600.0,3.0,4.0,123.03,13.99,1,3600.0,7.0,0.0,0.0,0.0,0.0,13.0,2765.0,0.0,0.0,30.0,30.0,12.0,677.0,1,0,2,13,0
1,36 months,C,12.0,2015.0,5,65000.0,16.06,10,24700.0,0.0,6.0,820.28,11.99,1,24700.0,22.0,7.7,0.0,0.0,0.0,38.0,21470.0,0.0,1.0,6.0,0.0,16.0,717.0,1,0,11,10,0
5,36 months,C,12.0,2015.0,0,34000.0,10.2,4,11950.0,0.0,0.0,405.18,13.44,1,11950.0,5.0,100.0,0.0,0.0,0.0,6.0,8822.0,0.0,0.0,0.0,0.0,28.0,692.0,5,0,2,12,1
6,36 months,B,12.0,2015.0,0,180000.0,14.67,10,20000.0,1.0,1.0,637.58,9.17,1,20000.0,12.0,100.0,0.0,0.0,0.0,27.0,87329.0,0.0,0.0,49.0,0.0,26.0,682.0,1,0,2,6,0
7,36 months,B,12.0,2015.0,4,85000.0,17.61,10,20000.0,1.0,2.0,631.26,8.49,1,20000.0,8.0,0.0,0.0,0.0,0.0,15.0,826.0,0.0,1.0,3.0,3.0,17.0,707.0,1,0,6,5,0


In [5]:
df.loan_status.value_counts()

1    534414
0     86751
Name: loan_status, dtype: int64

In [6]:
df.isnull().sum()

term                           0
grade                          0
issue_month                    0
issue_year                     0
purpose                        0
annual_inc                     0
dti                            0
emp_length                     0
funded_amnt                    0
inq_fi                         0
inq_last_12m                   0
installment                    0
int_rate                       0
loan_status                    0
loan_amnt                      0
open_acc                       0
percent_bc_gt_75               0
pub_rec                        0
pub_rec_bankruptcies           0
tax_liens                      0
total_acc                      0
revol_bal                      0
delinq_amnt                    0
delinq_2yrs                    0
mths_since_last_delinq         0
mths_since_last_major_derog    0
cr_yeargap                     0
avg_fico_score                 0
home_ownership_cat             0
application_type_cat           0
purpose_ca

In [7]:
df.columns

Index(['term', 'grade', 'issue_month', 'issue_year', 'purpose', 'annual_inc',
       'dti', 'emp_length', 'funded_amnt', 'inq_fi', 'inq_last_12m',
       'installment', 'int_rate', 'loan_status', 'loan_amnt', 'open_acc',
       'percent_bc_gt_75', 'pub_rec', 'pub_rec_bankruptcies', 'tax_liens',
       'total_acc', 'revol_bal', 'delinq_amnt', 'delinq_2yrs',
       'mths_since_last_delinq', 'mths_since_last_major_derog', 'cr_yeargap',
       'avg_fico_score', 'home_ownership_cat', 'application_type_cat',
       'purpose_cat', 'sub_grade_cat', 'home_ownership'],
      dtype='object')

In [8]:
# features annual_inc and revol_balance seemed to have a very bad effect

X = df[['grade', 'purpose', 'annual_inc', 'dti',
       'emp_length', 'funded_amnt', 'inq_fi', 'inq_last_12m', 'installment',
       'int_rate', 'loan_amnt', 'open_acc', 'percent_bc_gt_75',
       'pub_rec', 'pub_rec_bankruptcies', 'tax_liens', 'total_acc',
       'delinq_amnt', 'delinq_2yrs', 'mths_since_last_delinq',
       'mths_since_last_major_derog', 'cr_yeargap', 'avg_fico_score',
       'home_ownership_cat', 'application_type_cat', 'purpose_cat',
       'sub_grade_cat', 'home_ownership']].copy()
    
y = df['loan_status']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27, shuffle=True)

In [9]:
# # Grid search cross validation
# grid={'C':[1e-5, 1e-4, 1e-3, 1e-2], 'l1_ratio':[0, 0.1, 0.2]}

# logistic = LogisticRegression(solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
# logistic_cv = GridSearchCV(logistic, grid, cv=5)
# logistic_cv.fit(X_train, y_train)

# print('Tuned hyperparameters : (best parameters) ', logistic_cv.best_params_)
# print('accuracy: ', logistic_cv.best_score_)


In [10]:
# # Grid search cross validation
# grid={'C':[1e-5, 1e-4, 1e-3, 1e-2], 'l1_ratio':[0, 0.1, 0.2]}

# logistic = LogisticRegression(solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
# logistic_cv = GridSearchCV(logistic, grid, cv=5)
# logistic_cv.fit(X_train, y_train)

# print('Tuned hyperparameters : (best parameters) ', logistic_cv.best_params_)
# print('accuracy: ', logistic_cv.best_score_)


In [11]:
logistic = LogisticRegression(C=1e-05, l1_ratio=0.2, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic.fit(X_train, y_train)




LogisticRegression(C=1e-05, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.2,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [12]:
print('accuracy: ', logistic.score(X_test, y_test))

predictions = logistic.predict(X_test)
cm = confusion_matrix(y_test, predictions)
print(cm)


accuracy:  0.6551399386636401
[[ 6860 10611]
 [32232 74530]]


In [13]:
print(logistic.coef_)

[[ 0.00000000e+00  1.70077421e-06 -4.11037485e-07 -2.70527363e-08
  -6.87116253e-06  0.00000000e+00  0.00000000e+00 -2.24062459e-06
  -4.16917709e-07 -7.00355420e-06 -1.15661055e-07 -1.32476067e-06
   0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.28455890e-07
  -9.29631205e-07  0.00000000e+00 -2.62751143e-07 -3.65794151e-07
   0.00000000e+00 -4.74790881e-06 -6.01944711e-08  0.00000000e+00
  -4.29736538e-08 -5.61897345e-07  0.00000000e+00  1.70077421e-06]]


In [14]:
for i, c in enumerate(X.columns):
    print(c, logistic.coef_[:,i])
    

purpose [0.]
annual_inc [1.70077421e-06]
dti [-4.11037485e-07]
emp_length [-2.70527363e-08]
funded_amnt [-6.87116253e-06]
inq_fi [0.]
inq_last_12m [0.]
installment [-2.24062459e-06]
int_rate [-4.16917709e-07]
loan_amnt [-7.0035542e-06]
open_acc [-1.15661055e-07]
percent_bc_gt_75 [-1.32476067e-06]
pub_rec [0.]
pub_rec_bankruptcies [0.]
tax_liens [0.]
total_acc [-1.2845589e-07]
delinq_amnt [-9.29631205e-07]
delinq_2yrs [0.]
mths_since_last_delinq [-2.62751143e-07]
mths_since_last_major_derog [-3.65794151e-07]
cr_yeargap [0.]
avg_fico_score [-4.74790881e-06]
home_ownership_cat [-6.01944711e-08]
application_type_cat [0.]
purpose_cat [-4.29736538e-08]
sub_grade_cat [-5.61897345e-07]
home_ownership [0.]
annual_inc [1.70077421e-06]


In [15]:
r = permutation_importance(logistic, X_test, y_test,
                          n_repeats=30,
                          random_state=27)

for i in r.importances_mean.argsort()[::-1]:
    if abs(r.importances_mean[i]) - 2*r.importances_std[i] > 0:
        print(f"{X.columns[i]:}"
             f"{r.importances_mean[i]:.3f}"
             f" +/- {r.importances_std[i]:.3f}"
             )

annual_inc0.047 +/- 0.001
annual_inc0.047 +/- 0.001
loan_amnt0.011 +/- 0.001
funded_amnt0.011 +/- 0.001
purpose_cat0.000 +/- 0.000
emp_length0.000 +/- 0.000


### scale features

In [16]:
# # Scale X
# scaler_X = StandardScaler().fit(X)
# X_scaled = scaler_X.transform(X)
# X_sca = pd.DataFrame(data=X_scaled, index=X.index, columns=X.columns)


In [17]:
# X_sca_train, X_sca_test, y_sca_train, y_sca_test = train_test_split(X_sca, y, test_size=0.2, random_state=27, shuffle=True)

# logistic_sca = LogisticRegression(C=1e-05, l1_ratio=0.2, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
# logistic_sca.fit(X_sca_train, y_sca_train)


In [18]:
# print('accuracy: ', logistic_sca.score(X_test, y_test))

# predictions = logistic_sca.predict(X_test)
# cm = confusion_matrix(y_sca_test, predictions)
# print(cm)

In [19]:
# print(logistic_sca.coef_)


## Cut some features

In [None]:
# X_cut = df[['issue_year', 'loan_amnt', 'funded_amnt', 'installment', 'sub_grade_cat', 'int_rate',
#             'percent_bc_gt_75', 'avg_fico_score', 'mths_since_last_delinq', 'mths_since_last_major_derog', 
#             'cr_yeargap', 'dti', 'emp_length', 'inq_last_12m', 'open_acc', 'total_acc', 'delinq_amnt']]


In [23]:
#dti
X_cut = df[['loan_amnt', 'funded_amnt', 'installment', 'sub_grade_cat', 'int_rate',
            'avg_fico_score', 'mths_since_last_delinq', 'mths_since_last_major_derog', 
            'cr_yeargap', 'dti', 'emp_length', 'inq_last_12m']]

y = df['loan_status']
    
X_cut_train, X_cut_test, y_cut_train, y_cut_test = train_test_split(X_cut, y, test_size=0.2, random_state=27, shuffle=True)

In [24]:
logistic_cut = LogisticRegression(C=1e-05, l1_ratio=0.2, solver='saga', multi_class='auto', penalty='elasticnet', class_weight='balanced', random_state=27, n_jobs=7)
logistic_cut.fit(X_cut_train, y_cut_train)



LogisticRegression(C=1e-05, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=0.2,
                   max_iter=100, multi_class='auto', n_jobs=7,
                   penalty='elasticnet', random_state=27, solver='saga',
                   tol=0.0001, verbose=0, warm_start=False)

In [25]:
print('train accuracy: ', logistic_cut.score(X_cut_train, y_cut_train))
print('test accuracy: ', logistic_cut.score(X_cut_test, y_cut_test))

predictions_cut = logistic_cut.predict(X_cut_test)
cm_cut = confusion_matrix(y_cut_test, predictions_cut)
print(cm_cut)

train accuracy:  0.6091014464755741
test accuracy:  0.6036640828121352
[[10921  6550]
 [42688 64074]]
