# Setup

## Library imports

In [107]:
# !pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 


## Read in data

### Define columns considered

In [90]:
REQUIRED_COLUMNS = [
    "ORIG_RATE",
    "ORIG_AMOUNT",
    "ORIG_TERM",
    "OCLTV",
    "NUM_BO",
    "DTI",
    "CSCORE_B",
    "CSCORE_C",
    "FTHB_FLAG",
    "PUR_Cash_out",
    "PUR_Refinance",
    "PUR_Purchase",
    "PRO_Condominium",
    "PRO_Co_operative",
    "PRO_Planned_Urban",
    "PRO_Manufact_Home",
    "PRO_Single_Family",
    "NUM_UNIT",
    "OCC_Principal",
    "OCC_Second",
    "OCC_Investor",
    "MI_PCT",
    "MI_Borrower",
    "MI_Lender",
    "MI_Investor",
    "DLQ_FLAG"
]

#### Read and process dataframe 

In [96]:
#### Read in processed dataset
complete_processed_dataset = pd.read_csv("../loan_data/preprocessed_2023Q4_stat.csv")
model_data = complete_processed_dataset[REQUIRED_COLUMNS]

#### Deal with datetime cols "ORIG_DTE","FRST_DTE"
model_data["ORIG_DTE_YR"] = pd.to_datetime(complete_processed_dataset["ORIG_DTE"]).dt.year - 2000
model_data["ORIG_DTE_MONTH"] = pd.to_datetime(complete_processed_dataset["ORIG_DTE"]).dt.month
model_data["FRST_DTE_YR"] = pd.to_datetime(complete_processed_dataset["FRST_DTE"]).dt.year - 2000
model_data["FRST_DTE_MONTH"] = pd.to_datetime(complete_processed_dataset["FRST_DTE"]).dt.month

#### Fill in missing data
print(model_data.isna().sum())
for col in ["DTI", "CSCORE_B", "CSCORE_C"]: #  this should be dealt in the pre-processing
    col_median=model_data[col].median()
    model_data[col] = model_data[col].fillna(col_median)
#### Check that no more NA values are remaining
print(sum(model_data.isna().sum()))

#### Set X, Y matrices
Y = model_data['DLQ_FLAG']
X = model_data.drop(columns=['DLQ_FLAG'])

ORIG_RATE              0
ORIG_AMOUNT            0
ORIG_TERM              0
OCLTV                  0
NUM_BO                 0
DTI                    3
CSCORE_B             278
CSCORE_C             189
FTHB_FLAG              0
PUR_Cash_out           0
PUR_Refinance          0
PUR_Purchase           0
PRO_Condominium        0
PRO_Co_operative       0
PRO_Planned_Urban      0
PRO_Manufact_Home      0
PRO_Single_Family      0
NUM_UNIT               0
OCC_Principal          0
OCC_Second             0
OCC_Investor           0
MI_PCT                 0
MI_Borrower            0
MI_Lender              0
MI_Investor            0
DLQ_FLAG               0
ORIG_DTE_YR            0
ORIG_DTE_MONTH         0
FRST_DTE_YR            0
FRST_DTE_MONTH         0
dtype: int64
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data["ORIG_DTE_YR"] = pd.to_datetime(complete_processed_dataset["ORIG_DTE"]).dt.year - 2000
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data["ORIG_DTE_MONTH"] = pd.to_datetime(complete_processed_dataset["ORIG_DTE"]).dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data["

# Train/Test Split

In [100]:
split = 0.7
X_train, Y_train = X[:int(len(X)*split)], Y[:int(len(X)*split)]
X_test, Y_test = X[int(len(X)*split):], Y[int(len(X)*split):]

# Model fitting

In [124]:
def fit_logistic_regression(
    train: list[pd.DataFrame, pd.DataFrame],
    test: list[pd.DataFrame, pd.DataFrame],
    model_config: dict = {}
):
    logistic_model = linear_model.LogisticRegression(
        C = model_config.get("C", 1),
        solver=model_config.get("solver", "liblinear"),
        random_state=model_config.get("random_state", 0),
        class_weight=model_config.get("class_weight", None)
    )
    logistic_model.fit(train[0], train[1])

    print("Training accuracy:", logistic_model.score(train[0], train[1]))
    print("Testing accuracy:", logistic_model.score(test[0], test[1]))
    print(
        classification_report(
            y_pred=logistic_model.predict(test[0]),
            y_true=test[1]
        )
    )

    return logistic_model
    

## Sanity checking

In [125]:
X_small, Y_small = X[:int(0.001*len(X))], Y[:int(0.001*len(Y))]
fit_logistic_regression(
    [X_small, Y_small],
    [X_small, Y_small],
    model_config={
        "class_weight": {
            0 :  len(Y_small)/(2*(len(Y_small)-sum(Y_small))),
            1 :  len(Y_small)/(2* sum(Y_small))
        }
    }
)

Training accuracy: 0.9924812030075187
Testing accuracy: 0.9924812030075187
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       132
           1       0.50      1.00      0.67         1

    accuracy                           0.99       133
   macro avg       0.75      1.00      0.83       133
weighted avg       1.00      0.99      0.99       133



## Evaluate model

In [126]:
# Weighted classes
fit_logistic_regression(
    [X, Y],
    [X_test, Y_test],
    model_config={
        "class_weight": {
            0 :  len(Y_train)/(2*(len(Y_train)-sum(Y_train))),
            1 :  len(Y_train)/(2* sum(Y_train))
        }
    }
)

Training accuracy: 0.6824335467234375
Testing accuracy: 0.729142200344819
              precision    recall  f1-score   support

           0       0.99      0.73      0.84     39466
           1       0.03      0.54      0.05       555

    accuracy                           0.73     40021
   macro avg       0.51      0.63      0.45     40021
weighted avg       0.98      0.73      0.83     40021



In [127]:
## SMOTE
oversample = SMOTE()
X_train_smote, Y_train_smote = oversample.fit_resample(X_train, Y_train)
fit_logistic_regression(
    [X_train_smote, Y_train_smote],
    [X_test, Y_test],
)

Training accuracy: 0.9067306750269423
Testing accuracy: 0.9560230878788636
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     39466
           1       0.03      0.06      0.04       555

    accuracy                           0.96     40021
   macro avg       0.51      0.51      0.51     40021
weighted avg       0.97      0.96      0.96     40021

