# Coding and interpreting logistic regression

In [8]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import roc_auc_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
import math

# Load the credit data and give the two predictors readable names.
column_labels = {'VAR1_A14': 'No_account', 'VAR2': 'Duration'}
credit = pd.read_csv("credit_regress.csv").rename(columns=column_labels)

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(credit, test_size=0.3, random_state=2)

# GLM with a Binomial family: Bad_2 modelled on the two predictors.
formula = 'Bad_2 ~ No_account + Duration'
logreg = smf.glm(formula=formula, data=train, family=sm.families.Binomial()).fit()

# Print the fitted coefficients, then their exponentials (the odds ratios).
for heading, values in (('Parameters', logreg.params),
                        ('Odds Ratios', np.exp(logreg.params))):
    print(heading)
    print(values)

Parameters
Intercept    -1.112136
No_account   -1.578525
Duration      0.033985
dtype: float64
Odds Ratios
Intercept     0.328856
No_account    0.206279
Duration      1.034569
dtype: float64


Parameters give the effects on logit. Odds ratios give the effects on odds. Intercept is of little interest usually.

Not having a checking account decreases the logit by 1.58 compared with having one. Equivalently, the odds are multiplied by a factor of 0.21. In other words, the odds of Default/Bad are almost 5 times lower for those without a checking account.

For Duration, an increase of one month (one unit) means an increase in the logit of 0.034, or equivalently the odds are multiplied by a factor of 1.035. A convenient way of interpreting the effect on odds for a numeric variable is to think of the percentage deviation from 1, i.e. (exp(beta) - 1) x 100 gives the percentage increase or decrease in odds due to a one-unit change in the predictor. So for Duration, a one-month increase leads to an increase in odds of about 3.5%. Therefore, loans with longer duration are higher risks.

In [15]:
# Check the predictive accuracy on the held-out test sample:
# AUC, recall (correctly predicted Bads out of all true Bads),
# precision (correctly predicted Bads out of all predicted as Bads).
# In credit scoring, we are more interested in correctly predicting Bads, hence the choice of measures.

# Model predictions for the test rows, used as scores for AUC and thresholded below.
prob_test = logreg.predict(test[['No_account', 'Duration']])

# Classify as Bad (1) when the prediction exceeds the conventional 0.5 cutoff.
# Kept as a plain list of ints so any later use of `pred` sees the same type as before.
pred = (prob_test > 0.5).astype(int).tolist()

roc_auc = roc_auc_score(y_true=test['Bad_2'], y_score=prob_test)
recall = recall_score(y_true=test['Bad_2'], y_pred=pred)
precision = precision_score(y_true=test['Bad_2'], y_pred=pred)

# Derive the holdout share from the actual split sizes so the printed label
# cannot drift away from the test_size used in train_test_split.
holdout_share = '{:.0%}'.format(len(test) / (len(train) + len(test)))

print('AUC for ' + holdout_share + ' holdout:' + str(roc_auc))
print('Recall for ' + holdout_share + ' holdout:' + str(recall))
print('Precision for ' + holdout_share + ' holdout:' + str(precision))

AUC for 30% holdout:0.7712950399517564
Recall for 30% holdout:0.31313131313131315
Precision for 30% holdout:0.6458333333333334


In [16]:
# Gauge how predictive accuracy varies across samples using k-fold
# cross-validation (sklearn's cross_validate function).
# The model is refit with sklearn's LogisticRegression because cross_validate
# does not accept statsmodels models.
# In credit modelling, one often compares predictive accuracy between the
# train and test sets; large differences would indicate overfitting.
# cv=3 is passed explicitly (the library default may differ between
# scikit-learn versions) because the sample is relatively small,
# with only 300 events (Bads).
# An integer cv sets the number of folds in a (Stratified)KFold.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X = credit[['No_account', 'Duration']]
Y = credit['Bad_2']

metrics = ['roc_auc', 'recall', 'precision']
classifier = LogisticRegression(solver='liblinear')

outcomes = cross_validate(
    classifier,
    X,
    Y,
    cv=3,
    scoring=metrics,
    return_train_score=True,
)

# One line per result key: timings first, then test/train scores per fold.
for metric, scores in outcomes.items():
    print(metric, 'value:', scores)

fit_time value: [0.01500297 0.00300193 0.0030005 ]
score_time value: [0.01400065 0.00699854 0.00699854]
test_roc_auc value: [0.74431624 0.73100858 0.7366309 ]
train_roc_auc value: [0.73497854 0.74066381 0.73784797]
test_recall value: [0.28 0.17 0.26]
train_recall value: [0.215 0.27  0.225]
test_precision value: [0.60869565 0.5862069  0.65      ]
train_precision value: [0.62318841 0.62790698 0.6       ]
