## Mortgage Loans: Logistic Regression Example

In [36]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
from math import sqrt
import pickle

In [117]:
df = pd.read_csv('../data/loan_data_set.csv')
df.shape

(614, 13)

In [118]:
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

## Exploratory Data Analysis

In [122]:
# declare the list of features
features = ['Credit_History',
            'LoanAmount',
            'Property_Area',
            'Education',
            'Loan_Amount_Term',
            'ApplicantIncome']

In [123]:
# recode missing values
print(df[features].isnull().sum())
for feature in ['LoanAmount','Loan_Amount_Term','ApplicantIncome']:
    df[feature].fillna(value=df[feature].mean(), inplace=True)
print(df[features].isnull().sum())

Credit_History      50
LoanAmount          22
Property_Area        0
Education            0
Loan_Amount_Term    14
ApplicantIncome      0
dtype: int64
Credit_History      50
LoanAmount           0
Property_Area        0
Education            0
Loan_Amount_Term     0
ApplicantIncome      0
dtype: int64


In [124]:
for feature in ['LoanAmount','Loan_Amount_Term','ApplicantIncome']:
    print(df[feature].agg(['mean', 'median', 'min', 'max']))

mean      146.412162
median    129.000000
min         9.000000
max       700.000000
Name: LoanAmount, dtype: float64
mean      342.0
median    360.0
min        12.0
max       480.0
Name: Loan_Amount_Term, dtype: float64
mean       5403.459283
median     3812.500000
min         150.000000
max       81000.000000
Name: ApplicantIncome, dtype: float64


In [125]:
# credit: missing data
print(df.shape)
print(df['Credit_History'].value_counts(dropna=False))
df.dropna(subset=['Credit_History'], inplace=True)
print(df.shape)

(614, 13)
1.0    475
0.0     89
NaN     50
Name: Credit_History, dtype: int64
(564, 13)


In [126]:
# recode the target variable as numeric
df['Loan_Approval']=np.where(df['Loan_Status']=="Y", 1, 0)
df['Loan_Approval'].value_counts(dropna=False)

1    385
0    179
Name: Loan_Approval, dtype: int64

In [130]:
df = pd.get_dummies(df, columns=['Property_Area', 'Education'], drop_first=True)

## Model Building

In [140]:
# specify X and y
features = ['Credit_History',
            'LoanAmount',
            'Loan_Amount_Term',
            'ApplicantIncome',
            'Property_Area_Semiurban',
            'Property_Area_Urban',
            'Education_Not Graduate']

y = df['Loan_Approval']
X = df[features]

In [141]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state=12)

In [142]:
# Fit the model to the training dataset
mymodel = LogisticRegression()
mymodel.fit(X_train, y_train)

In [143]:
# coefficients and intercept
mymodel.intercept_
mymodel.coef_

array([[ 2.84408411e+00, -3.76393001e-03, -4.13917828e-03,
         2.69171217e-05,  8.42101310e-01, -1.12250636e-02,
        -5.89143423e-01]])

In [144]:
# Predict the y-values on the testing dataset
y_preds = mymodel.predict(X_test)
y_probs = mymodel.predict_proba(X_test)

## Model Evalution

In [145]:
# Evaluate the model
print(metrics.classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       1.00      0.52      0.68        48
           1       0.80      1.00      0.89        93

    accuracy                           0.84       141
   macro avg       0.90      0.76      0.79       141
weighted avg       0.87      0.84      0.82       141



In [146]:
# save your eval report as an html file
report = metrics.classification_report(y_test, y_preds, output_dict=True)
evalreport = pd.DataFrame(report).transpose()
evalreport.to_html('../assets/evalreport.html')
evalreport

Unnamed: 0,precision,recall,f1-score,support
0,1.0,0.520833,0.684932,48.0
1,0.801724,1.0,0.889952,93.0
accuracy,0.836879,0.836879,0.836879,0.836879
macro avg,0.900862,0.760417,0.787442,141.0
weighted avg,0.869222,0.836879,0.820158,141.0


In [147]:
# true positives, etc.
y_score = mymodel.predict_proba(X_test)[:, 1]
fpr, tpr, thresh=metrics.roc_curve(y_test, y_score)
roc_df=pd.DataFrame(zip(fpr, tpr, thresh), columns=['FPR','TPR','Threshold'])
roc_df.head()

Unnamed: 0,FPR,TPR,Threshold
0,0.0,0.0,1.950803
1,0.0,0.010753,0.950803
2,0.0,0.064516,0.921108
3,0.020833,0.064516,0.907111
4,0.020833,0.086022,0.899468


In [148]:
# pickle dataframe

filename = open('roc_df.pkl', 'wb')
pickle.dump(roc_df, filename)
filename.close()

roc_df.to_csv('roc_df.csv', index=False)

In [149]:
import plotly.express as px

def make_rocauc(i):
    nearest=roc_df.iloc[(roc_df['Threshold']-i).abs().argsort()[:1]]['Threshold'].values[0]
    q=roc_df[roc_df['Threshold']==nearest].index[0]
    print(nearest, q)
    fig = px.area(roc_df, x="FPR", y="TPR",
                  title=f'ROC Curve (AUC={metrics.auc(fpr, tpr):.3f})',
                  hover_data={'Threshold':':.2f',
                              'FPR':':.2f',
                              'TPR':':.2f',
                             },width=800, height=700)

    fig.add_annotation(x=roc_df.iloc[q][0], y=roc_df.iloc[q][1],
            text=f"Threshold nearest {i*100:.0f}% = {roc_df.iloc[q][2]:.2f}",
                showarrow=True,
                arrowhead=1)

    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )
    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_xaxes(constrain='domain')
    return fig
make_rocauc(.60)

0.6063159268252124 26


In [150]:
# display with plotly
import plotly.express as px
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={metrics.auc(fpr, tpr):.3f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.write_json('../assets/rocauc.json')
fig.show()

## Make predictions on new data

In [151]:
# check out one row of the test data
X_test.iloc[0]

Credit_History                1.0
LoanAmount                   17.0
Loan_Amount_Term            120.0
ApplicantIncome            1299.0
Property_Area_Semiurban       0.0
Property_Area_Urban           1.0
Education_Not Graduate        0.0
Name: 14, dtype: float64

In [152]:
# show a prediction & probability for that value
print(mymodel.predict([X_test.iloc[0].values])[0])
print(mymodel.predict_proba([X_test.iloc[0].values]).max())

1
0.9211080060461824



X does not have valid feature names, but LogisticRegression was fitted with feature names


X does not have valid feature names, but LogisticRegression was fitted with feature names



In [153]:
# pickle your model
import pickle
filename = open('loan_approval_logistic_model.pkl', 'wb')
pickle.dump(mymodel, filename)
filename.close()

In [154]:
# read in our pickle file
filename = open('loan_approval_logistic_model.pkl', 'rb')
unpickled_model = pickle.load(filename)
filename.close()

In [161]:
# make predictions on new data
fake1=[[1, 1000, 180, 100, 0,0,0]]
fake2=[[1, 300, 360, 4500, 0,0,0]]
fake3=[[0, 100, 360, 1000, 0,0,0]]

In [162]:
# make predictions
for data in [fake1, fake2, fake3]:
    y = unpickled_model.predict(data)
    func = lambda y: 'approved' if y[0]==1 else 'denied'
    formatted_y = func(y)
    prob=unpickled_model.predict_proba(data).max()*100
    formatted_prob = "{:,.2f}%".format(prob)
    print(y[0]==1)
    print(formatted_y)
    print(formatted_prob)

False
denied
81.93%
True
approved
62.16%
False
denied
84.41%



X does not have valid feature names, but LogisticRegression was fitted with feature names


X does not have valid feature names, but LogisticRegression was fitted with feature names


X does not have valid feature names, but LogisticRegression was fitted with feature names


X does not have valid feature names, but LogisticRegression was fitted with feature names


X does not have valid feature names, but LogisticRegression was fitted with feature names


X does not have valid feature names, but LogisticRegression was fitted with feature names



In [163]:
# change the threshold
Threshold=50
for data in [fake1, fake2, fake3]:
        rawprob=100*unpickled_model.predict_proba(data)[0][0]
        func = lambda y: 'Denied' if int(rawprob)>Threshold else 'Approved'
        formatted_y = func(y)
        print(rawprob)
        print(formatted_y)

81.93344615671408
Denied
37.84130831110328
Approved
84.41213831015475
Denied



X does not have valid feature names, but LogisticRegression was fitted with feature names


X does not have valid feature names, but LogisticRegression was fitted with feature names


X does not have valid feature names, but LogisticRegression was fitted with feature names



In [164]:
# probability of 'denied'
print(unpickled_model.predict_proba(data)[0][0])
# probability of 'denied'
unpickled_model.predict_proba(data)[0][1]

0.8441213831015474



X does not have valid feature names, but LogisticRegression was fitted with feature names


X does not have valid feature names, but LogisticRegression was fitted with feature names



0.15587861689845264