# Setup

## Library imports

In [11]:
# !pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 


## Read in data

### Define columns considered

In [8]:
REQUIRED_COLUMNS = [
    "ORIG_RATE",
    "ORIG_AMOUNT",
    "ORIG_TERM",
    "OCLTV",
    "NUM_BO",
    "DTI",
    "CSCORE_B",
    "CSCORE_C",
    "FTHB_FLAG",
    "PUR_Cash_out",
    "PUR_Refinance",
    "PUR_Purchase",
    "PRO_Condominium",
    "PRO_Co_operative",
    "PRO_Planned_Urban",
    "PRO_Manufact_Home",
    "PRO_Single_Family",
    "NUM_UNIT",
    "OCC_Principal",
    "OCC_Second",
    "OCC_Investor",
    "MI_PCT",
    "MI_Borrower",
    "MI_Lender",
    "MI_Investor",
    "DLQ_FLAG"
]

#### Read and process dataframe 

In [9]:
#### Read in processed dataset
complete_processed_dataset = pd.read_csv("../../../2023_stat.csv")
model_data = complete_processed_dataset[REQUIRED_COLUMNS]

#### Deal with datetime cols "ORIG_DTE","FRST_DTE"
model_data["ORIG_DTE_YR"] = pd.to_datetime(complete_processed_dataset["ORIG_DTE"]).dt.year - 2000
model_data["ORIG_DTE_MONTH"] = pd.to_datetime(complete_processed_dataset["ORIG_DTE"]).dt.month
model_data["FRST_DTE_YR"] = pd.to_datetime(complete_processed_dataset["FRST_DTE"]).dt.year - 2000
model_data["FRST_DTE_MONTH"] = pd.to_datetime(complete_processed_dataset["FRST_DTE"]).dt.month

#### Fill in missing data
print(model_data.isna().sum())
for col in ["DTI", "CSCORE_B", "CSCORE_C"]: #  this should be dealt in the pre-processing
    col_median=model_data[col].median()
    model_data[col] = model_data[col].fillna(col_median)
#### Check that no more NA values are remaining
print(sum(model_data.isna().sum()))

#### Set X, Y matrices
Y = model_data['DLQ_FLAG']
X = model_data.drop(columns=['DLQ_FLAG'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data["ORIG_DTE_YR"] = pd.to_datetime(complete_processed_dataset["ORIG_DTE"]).dt.year - 2000
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data["ORIG_DTE_MONTH"] = pd.to_datetime(complete_processed_dataset["ORIG_DTE"]).dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data["

ORIG_RATE               0
ORIG_AMOUNT             0
ORIG_TERM               0
OCLTV                   0
NUM_BO                  0
DTI                    37
CSCORE_B             1552
CSCORE_C              971
FTHB_FLAG               0
PUR_Cash_out            0
PUR_Refinance           0
PUR_Purchase            0
PRO_Condominium         0
PRO_Co_operative        0
PRO_Planned_Urban       0
PRO_Manufact_Home       0
PRO_Single_Family       0
NUM_UNIT                0
OCC_Principal           0
OCC_Second              0
OCC_Investor            0
MI_PCT                  0
MI_Borrower             0
MI_Lender               0
MI_Investor             0
DLQ_FLAG                0
ORIG_DTE_YR             0
ORIG_DTE_MONTH          0
FRST_DTE_YR             0
FRST_DTE_MONTH          0
dtype: int64
0


# Train/Test Split

In [13]:
COLUMNS_TO_STANDARDIZE = [
    "ORIG_RATE",
    "ORIG_AMOUNT",
    "ORIG_TERM",
    "OCLTV",
    "NUM_BO",
    "DTI",
    "CSCORE_B",
    "CSCORE_C",
    "NUM_UNIT",
    "ORIG_DTE_YR",
    "ORIG_DTE_MONTH",
    "FRST_DTE_YR",
    "FRST_DTE_MONTH"
]
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X


Unnamed: 0,ORIG_RATE,ORIG_AMOUNT,ORIG_TERM,OCLTV,NUM_BO,DTI,CSCORE_B,CSCORE_C,FTHB_FLAG,PUR_Cash_out,...,OCC_Second,OCC_Investor,MI_PCT,MI_Borrower,MI_Lender,MI_Investor,ORIG_DTE_YR,ORIG_DTE_MONTH,FRST_DTE_YR,FRST_DTE_MONTH
0,0.618632,0.093135,1.0,0.563107,0.00,0.564516,0.891008,0.880240,0.0,0.0,...,0.0,1.0,0.000,0.0,0.0,0.0,1.000000,0.000000,0.5,0.181818
1,0.601164,0.072012,1.0,0.466019,0.00,0.645161,0.752044,0.727545,0.0,1.0,...,0.0,0.0,0.000,0.0,0.0,0.0,0.666667,1.000000,0.5,0.090909
2,0.781659,0.093135,1.0,0.708738,0.25,0.403226,0.896458,0.907186,0.0,0.0,...,0.0,1.0,0.000,0.0,0.0,0.0,0.666667,1.000000,0.5,0.090909
3,0.655022,0.167547,1.0,0.805825,0.00,0.758065,0.866485,0.853293,1.0,0.0,...,0.0,0.0,0.300,1.0,0.0,0.0,0.666667,1.000000,0.5,0.090909
4,0.691412,0.162266,1.0,0.466019,0.25,0.580645,0.405995,0.502994,0.0,0.0,...,0.0,0.0,0.000,0.0,0.0,0.0,0.666667,1.000000,0.5,0.090909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970181,0.344978,0.193471,1.0,0.757282,0.25,0.580645,0.754768,0.799401,1.0,0.0,...,0.0,0.0,0.000,0.0,0.0,0.0,1.000000,0.909091,1.0,0.000000
970182,0.708879,0.070571,1.0,0.640777,0.00,0.483871,0.910082,0.901198,1.0,0.0,...,0.0,0.0,0.000,0.0,0.0,0.0,1.000000,0.909091,1.0,0.000000
970183,0.764192,0.342775,1.0,0.582524,0.00,0.790323,0.891008,0.880240,0.0,0.0,...,0.0,0.0,0.000,0.0,0.0,0.0,1.000000,0.818182,0.5,1.000000
970184,0.746725,0.152664,1.0,0.922330,0.25,0.709677,0.678474,0.685629,1.0,0.0,...,0.0,0.0,0.875,1.0,0.0,0.0,1.000000,0.909091,1.0,0.000000


In [14]:
split = 0.7
X_train, Y_train = X[:int(len(X)*split)], Y[:int(len(X)*split)]
X_test, Y_test = X[int(len(X)*split):], Y[int(len(X)*split):]

# Model fitting

In [15]:
def fit_logistic_regression(
    train: list[pd.DataFrame, pd.DataFrame],
    test: list[pd.DataFrame, pd.DataFrame],
    model_config: dict = {}
):
    logistic_model = linear_model.LogisticRegression(
        C = model_config.get("C", 1),
        solver=model_config.get("solver", "liblinear"),
        random_state=model_config.get("random_state", 0),
        class_weight=model_config.get("class_weight", None)
    )
    logistic_model.fit(train[0], train[1])

    print("Training accuracy:", logistic_model.score(train[0], train[1]))
    print("Testing accuracy:", logistic_model.score(test[0], test[1]))
    print(
        classification_report(
            y_pred=logistic_model.predict(test[0]),
            y_true=test[1]
        )
    )

    return logistic_model
    

## Sanity checking

In [16]:
X_small, Y_small = X[:int(0.001*len(X))], Y[:int(0.001*len(Y))]
fit_logistic_regression(
    [X_small, Y_small],
    [X_small, Y_small],
    model_config={
        "class_weight": {
            0 :  len(Y_small)/(2*(len(Y_small)-sum(Y_small))),
            1 :  len(Y_small)/(2* sum(Y_small))
        }
    }
)

Training accuracy: 0.6731958762886598
Testing accuracy: 0.6731958762886598
              precision    recall  f1-score   support

           0       0.97      0.67      0.80       915
           1       0.11      0.65      0.19        55

    accuracy                           0.67       970
   macro avg       0.54      0.66      0.49       970
weighted avg       0.92      0.67      0.76       970



## Evaluate model

In [17]:
# Weighted classes
fit_logistic_regression(
    [X, Y],
    [X_test, Y_test],
    model_config={
        "class_weight": {
            0 :  len(Y_train)/(2*(len(Y_train)-sum(Y_train))),
            1 :  len(Y_train)/(2* sum(Y_train))
        }
    }
)

Training accuracy: 0.7386037316555795
Testing accuracy: 0.8699322467154087
              precision    recall  f1-score   support

           0       0.99      0.88      0.93    285599
           1       0.05      0.32      0.08      5457

    accuracy                           0.87    291056
   macro avg       0.52      0.60      0.51    291056
weighted avg       0.97      0.87      0.91    291056



In [18]:
## SMOTE
oversample = SMOTE()
X_train_smote, Y_train_smote = oversample.fit_resample(X_train, Y_train)
fit_logistic_regression(
    [X_train_smote, Y_train_smote],
    [X_test, Y_test],
    # model_config={"C" : 100,}
)

Training accuracy: 0.6613473960359041
Testing accuracy: 0.8378456379528338
              precision    recall  f1-score   support

           0       0.99      0.85      0.91    285599
           1       0.04      0.37      0.08      5457

    accuracy                           0.84    291056
   macro avg       0.51      0.61      0.49    291056
weighted avg       0.97      0.84      0.90    291056

