## Mortgage loan default using Logistic Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.metrics import classification_report
# Load the dataset
data_v1=pd.read_excel(r"C:\Users\Jesvika\OneDrive\Mortgage as of Dec'22_230823.xlsx")
data=data_v1.drop(['DEFAULT', 'GENDER_1',"JANGKA_WKT"], axis=1)

# Display basic information about the dataset
print(data.info())

# Check for missing values
print(data.isnull().sum())

# Explore the distribution of the target variable
print(data['MORTGAGE_COLLECTABILITY'].value_counts())

# Explore the distribution of numerical features
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21861 entries, 0 to 21860
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   MORTGAGE_COLLECTABILITY  21861 non-null  int64  
 1   GENDER                   21861 non-null  int64  
 2   AGE                      21861 non-null  int64  
 3   OCCUPATION               21861 non-null  int64  
 4   MARITAL_STATUS           21861 non-null  int64  
 5   INTEREST_RATE            21861 non-null  float64
 6   LOAN_TENURE              21861 non-null  int64  
 7   LOAN_PLAFOND             21861 non-null  float64
 8   DTI                      21861 non-null  float64
 9   LTV_RATIO                21861 non-null  float64
dtypes: float64(4), int64(6)
memory usage: 1.7 MB
None
MORTGAGE_COLLECTABILITY    0
GENDER                     0
AGE                        0
OCCUPATION                 0
MARITAL_STATUS             0
INTEREST_RATE              0
LOAN_TENURE   

In [2]:
data.head()

Unnamed: 0,MORTGAGE_COLLECTABILITY,GENDER,AGE,OCCUPATION,MARITAL_STATUS,INTEREST_RATE,LOAN_TENURE,LOAN_PLAFOND,DTI,LTV_RATIO
0,0,0,35,1,0,0.07,60,4948290000.0,0.5234,0.7
1,0,1,38,1,1,0.06,36,2820144000.0,0.1583,0.7
2,0,1,38,1,1,0.0862,36,2066854000.0,0.3784,0.7
3,0,1,43,0,1,0.07,180,2625150000.0,0.6638,0.8
4,0,1,56,1,1,0.06,36,3196770000.0,0.2947,0.7


In [3]:
borrower = data[["GENDER","AGE","OCCUPATION","MARITAL_STATUS","DTI"]]
loan = data[["INTEREST_RATE","LOAN_TENURE","LOAN_PLAFOND","LTV_RATIO"]]

In [4]:
#calculate Variance Inflation Factor
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_scores = pd.DataFrame() 
vif_scores["Variables"] = data.columns 
  
# calculating VIF for each feature 
vif_scores["VIF Scores"] = [variance_inflation_factor(data.values, i) for i in range(len(data.columns))] 
  
display(vif_scores)

Unnamed: 0,Variables,VIF Scores
0,MORTGAGE_COLLECTABILITY,1.054859
1,GENDER,3.131424
2,AGE,22.26399
3,OCCUPATION,1.751756
4,MARITAL_STATUS,4.586601
5,INTEREST_RATE,12.689077
6,LOAN_TENURE,8.238051
7,LOAN_PLAFOND,2.550284
8,DTI,7.856322
9,LTV_RATIO,14.352124


#### Borrower & Loan

In [5]:
# Transform AGE and LTV_RATIO using z-score
scaler = StandardScaler()
data["AGE_ZSCORE"] = scaler.fit_transform(data[["AGE"]])
data["LTV_RATIO_ZSCORE"] = scaler.fit_transform(data[["LTV_RATIO"]])
data["INTEREST_RATE_ZSCORE"] = scaler.fit_transform(data[["INTEREST_RATE"]])

# Separate features and target variable
X = data.drop(["MORTGAGE_COLLECTABILITY", "AGE", "LTV_RATIO","INTEREST_RATE"], axis=1)
y = data["MORTGAGE_COLLECTABILITY"]

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply oversampling to the training set
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Create and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.97      1.00      0.98      4241
           1       0.00      0.00      0.00       132

    accuracy                           0.97      4373
   macro avg       0.48      0.50      0.49      4373
weighted avg       0.94      0.97      0.95      4373



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# Add constant to the features matrix
X_train_with_const = sm.add_constant(X_train_resampled)

# Create a logistic regression model using statsmodels
stats_model = sm.Logit(y_train_resampled, X_train_with_const)
stats_results = stats_model.fit()

# Print summary of stats model
print(stats_results.summary())

Optimization terminated successfully.
         Current function value: 0.596742
         Iterations 5
                              Logit Regression Results                             
Dep. Variable:     MORTGAGE_COLLECTABILITY   No. Observations:                33948
Model:                               Logit   Df Residuals:                    33938
Method:                                MLE   Df Model:                            9
Date:                     Sun, 10 Sep 2023   Pseudo R-squ.:                  0.1391
Time:                             01:13:01   Log-Likelihood:                -20258.
converged:                            True   LL-Null:                       -23531.
Covariance Type:                 nonrobust   LLR p-value:                     0.000
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.8778      0.052     1

In [7]:
# Uji Wald (F-test) untuk signifikansi keseluruhan model
wald_test = stats_results.wald_test(np.eye(X_train_with_const.shape[1]))
print("Wald Test (F-test) for Model Significance:")
print(wald_test)

# Hitung R-squared (McFadden's Pseudo R-squared)
rsquared = stats_results.prsquared
print("\nMcFadden's Pseudo R-squared:")
print(rsquared)

Wald Test (F-test) for Model Significance:
<Wald test (chi2): statistic=[[5069.30205426]], p-value=0.0, df_denom=9>

McFadden's Pseudo R-squared:
0.13908255973196892




In [8]:
#calculate Variance Inflation Factor
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_scores = pd.DataFrame() 
vif_scores["Variables"] = X.columns 
  
# calculating VIF for each feature 
vif_scores["VIF Scores"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))] 
  
display(vif_scores)

Unnamed: 0,Variables,VIF Scores
0,GENDER,2.984131
1,OCCUPATION,1.634855
2,MARITAL_STATUS,4.136503
3,LOAN_TENURE,5.963656
4,LOAN_PLAFOND,2.490896
5,DTI,6.036411
6,AGE_ZSCORE,1.36593
7,LTV_RATIO_ZSCORE,1.152799
8,INTEREST_RATE_ZSCORE,1.126261


#### Borrower

In [9]:
# Transform AGE and LTV_RATIO using z-score
scaler = StandardScaler()
data["AGE_ZSCORE"] = scaler.fit_transform(data[["AGE"]])

# Separate features and target variable
X = data[["GENDER", "AGE_ZSCORE", "OCCUPATION","MARITAL_STATUS","DTI"]]
y = data["MORTGAGE_COLLECTABILITY"]

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply oversampling to the training set
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Create and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.97      0.61      0.75      4241
           1       0.03      0.43      0.06       132

    accuracy                           0.60      4373
   macro avg       0.50      0.52      0.40      4373
weighted avg       0.94      0.60      0.73      4373



In [10]:
# Add constant to the features matrix
X_train_with_const = sm.add_constant(X_train_resampled)

# Create a logistic regression model using statsmodels
stats_model = sm.Logit(y_train_resampled, X_train_with_const)
stats_results = stats_model.fit()

# Print summary of stats model
print(stats_results.summary())

Optimization terminated successfully.
         Current function value: 0.674210
         Iterations 4
                              Logit Regression Results                             
Dep. Variable:     MORTGAGE_COLLECTABILITY   No. Observations:                33948
Model:                               Logit   Df Residuals:                    33942
Method:                                MLE   Df Model:                            5
Date:                     Sun, 10 Sep 2023   Pseudo R-squ.:                 0.02732
Time:                             01:13:01   Log-Likelihood:                -22888.
converged:                            True   LL-Null:                       -23531.
Covariance Type:                 nonrobust   LLR p-value:                7.720e-276
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const              0.1228      0.040      3.089      0.002  

In [11]:
# Uji Wald (F-test) untuk signifikansi keseluruhan model
wald_test = stats_results.wald_test(np.eye(X_train_with_const.shape[1]))
print("Wald Test (F-test) for Model Significance:")
print(wald_test)

# Hitung R-squared (McFadden's Pseudo R-squared)
rsquared = stats_results.prsquared
print("\nMcFadden's Pseudo R-squared:")
print(rsquared)

Wald Test (F-test) for Model Significance:
<Wald test (chi2): statistic=[[1217.76926277]], p-value=6.827017636812101e-260, df_denom=6>

McFadden's Pseudo R-squared:
0.027320869649393065




#### Loan

In [12]:
# Transform AGE and LTV_RATIO using z-score
scaler = StandardScaler()
data["LTV_RATIO_ZSCORE"] = scaler.fit_transform(data[["LTV_RATIO"]])
data["INTEREST_RATE_ZSCORE"] = scaler.fit_transform(data[["INTEREST_RATE"]])

# Separate features and target variable
X = data[["INTEREST_RATE_ZSCORE", "LOAN_TENURE", "LOAN_PLAFOND","LTV_RATIO_ZSCORE"]]
y = data["MORTGAGE_COLLECTABILITY"]

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply oversampling to the training set
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Create and train the logistic regression model
model = LogisticRegression()
model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.97      1.00      0.98      4241
           1       0.00      0.00      0.00       132

    accuracy                           0.97      4373
   macro avg       0.48      0.50      0.49      4373
weighted avg       0.94      0.97      0.95      4373



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Add constant to the features matrix
X_train_with_const = sm.add_constant(X_train_resampled)

# Create a logistic regression model using statsmodels
stats_model = sm.Logit(y_train_resampled, X_train_with_const)
stats_results = stats_model.fit()

# Print summary of stats model
print(stats_results.summary())

Optimization terminated successfully.
         Current function value: 0.601520
         Iterations 5
                              Logit Regression Results                             
Dep. Variable:     MORTGAGE_COLLECTABILITY   No. Observations:                33948
Model:                               Logit   Df Residuals:                    33943
Method:                                MLE   Df Model:                            4
Date:                     Sun, 10 Sep 2023   Pseudo R-squ.:                  0.1322
Time:                             01:13:01   Log-Likelihood:                -20420.
converged:                            True   LL-Null:                       -23531.
Covariance Type:                 nonrobust   LLR p-value:                     0.000
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.9614      0.033     2

In [14]:
# Uji Wald (F-test) untuk signifikansi keseluruhan model
wald_test = stats_results.wald_test(np.eye(X_train_with_const.shape[1]))
print("Wald Test (F-test) for Model Significance:")
print(wald_test)

# Hitung R-squared (McFadden's Pseudo R-squared)
rsquared = stats_results.prsquared
print("\nMcFadden's Pseudo R-squared:")
print(rsquared)

Wald Test (F-test) for Model Significance:
<Wald test (chi2): statistic=[[4856.69945142]], p-value=0.0, df_denom=4>

McFadden's Pseudo R-squared:
0.1321907251750487




## SMOTE

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, roc_auc_score

# Load your dataset
data_v1=pd.read_excel(r"C:\Users\Jesvika\OneDrive\Mortgage as of Dec'22_230823.xlsx")
data=data_v1.drop(['DEFAULT', 'GENDER_1',"JANGKA_WKT"], axis=1)

# Transform AGE and LTV_RATIO using z-score
scaler = StandardScaler()
data["AGE_ZSCORE"] = scaler.fit_transform(data[["AGE"]])
data["LTV_RATIO_ZSCORE"] = scaler.fit_transform(data[["LTV_RATIO"]])
data["INTEREST_RATE_ZSCORE"] = scaler.fit_transform(data[["INTEREST_RATE"]])

# Separate features and target variable
X = data[["GENDER", "AGE_ZSCORE", "OCCUPATION", "MARITAL_STATUS", "DTI", "INTEREST_RATE_ZSCORE", "LOAN_TENURE", "LOAN_PLAFOND", "LTV_RATIO_ZSCORE"]]
y = data["MORTGAGE_COLLECTABILITY"]

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Create and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calculate AUC-ROC
y_pred_prob = model.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_pred_prob)
print("AUC-ROC:", auc_roc)


Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      6367
           1       0.00      0.00      0.00       192

    accuracy                           0.97      6559
   macro avg       0.49      0.50      0.49      6559
weighted avg       0.94      0.97      0.96      6559

AUC-ROC: 0.5618353587508507


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Add constant to the features matrix
X_train_with_const = sm.add_constant(X_train_resampled)

# Create a logistic regression model using statsmodels
stats_model = sm.Logit(y_train_resampled, X_train_with_const)
stats_results = stats_model.fit()

# Print summary of stats model
print(stats_results.summary())

Optimization terminated successfully.
         Current function value: 0.524227
         Iterations 6
                              Logit Regression Results                             
Dep. Variable:     MORTGAGE_COLLECTABILITY   No. Observations:                29696
Model:                               Logit   Df Residuals:                    29686
Method:                                MLE   Df Model:                            9
Date:                     Sun, 10 Sep 2023   Pseudo R-squ.:                  0.2437
Time:                             01:13:04   Log-Likelihood:                -15567.
converged:                            True   LL-Null:                       -20584.
Covariance Type:                 nonrobust   LLR p-value:                     0.000
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    2.5099      0.062     4

## GridSearchCV

In [17]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Define the parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l2']
}

# Create a logistic regression model
model = LogisticRegression(max_iter=1000)

# Create a stratified k-fold cross-validation generator
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='roc_auc')

# Fit the model
grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best parameters and best AUC-ROC score
print("Best Parameters:", grid_search.best_params_)
print("Best AUC-ROC Score:", grid_search.best_score_)


Best Parameters: {'C': 0.001, 'penalty': 'l2'}
Best AUC-ROC Score: 0.5655640675741176


In [18]:
# Add constant to the features matrix
X_train_with_const = sm.add_constant(X_train_resampled)

# Create a logistic regression model using statsmodels
stats_model = sm.Logit(y_train_resampled, X_train_with_const)
stats_results = stats_model.fit()

# Print summary of stats model
print(stats_results.summary())

Optimization terminated successfully.
         Current function value: 0.524227
         Iterations 6
                              Logit Regression Results                             
Dep. Variable:     MORTGAGE_COLLECTABILITY   No. Observations:                29696
Model:                               Logit   Df Residuals:                    29686
Method:                                MLE   Df Model:                            9
Date:                     Sun, 10 Sep 2023   Pseudo R-squ.:                  0.2437
Time:                             01:13:06   Log-Likelihood:                -15567.
converged:                            True   LL-Null:                       -20584.
Covariance Type:                 nonrobust   LLR p-value:                     0.000
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    2.5099      0.062     4

In [19]:
from sklearn.linear_model import LogisticRegression

# Create a logistic regression model with class weights
# Set the 'class_weight' parameter to 'balanced' to automatically adjust weights
model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Fit the model
model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      6367
           1       0.00      0.00      0.00       192

    accuracy                           0.97      6559
   macro avg       0.49      0.50      0.49      6559
weighted avg       0.94      0.97      0.96      6559



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Add constant to the features matrix
X_train_with_const = sm.add_constant(X_train_resampled)

# Create a logistic regression model using statsmodels
stats_model = sm.Logit(y_train_resampled, X_train_with_const)
stats_results = stats_model.fit()

# Print summary of stats model
print(stats_results.summary())

Optimization terminated successfully.
         Current function value: 0.524227
         Iterations 6
                              Logit Regression Results                             
Dep. Variable:     MORTGAGE_COLLECTABILITY   No. Observations:                29696
Model:                               Logit   Df Residuals:                    29686
Method:                                MLE   Df Model:                            9
Date:                     Sun, 10 Sep 2023   Pseudo R-squ.:                  0.2437
Time:                             01:13:06   Log-Likelihood:                -15567.
converged:                            True   LL-Null:                       -20584.
Covariance Type:                 nonrobust   LLR p-value:                     0.000
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    2.5099      0.062     4