# Setup

## Library imports

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


## Read in data

In [18]:
file_name = './../../../dataset/2021_stat.csv'
table = pd.read_csv(file_name, low_memory=False).dropna()
model_data = table[[ 'ORIG_RATE', 'ORIG_AMOUNT', 'ORIG_TERM', 'OLTV', 'NUM_BO', 'DTI', 'CSCORE_B','FTHB_FLAG', 'PUR_Cash_out', 'PUR_Refinance', 'PUR_Purchase', 'NUM_UNIT', 'OCC_Principal', 'OCC_Second', 'OCC_Investor', 'MI_PCT', 'DLQ_30_FLAG', 'DLQ_90_FLAG', 'Ongoing', 'Current_DLQ', 'Prepaid_Matured']]

# Definition of Bad Loans: once had a 30-day delinquency in performance history
# Definition of Good Loans: no delinquency and continuous payments up to current
model_data = model_data[ (model_data['DLQ_90_FLAG'] == 1) | ( (model_data['DLQ_90_FLAG'] == 0) & (model_data['Ongoing'] == 1) ) ]
model_data = model_data.astype(float)
X = model_data.drop(columns=['DLQ_30_FLAG', 'DLQ_90_FLAG', 'Ongoing', 'Current_DLQ', 'Prepaid_Matured'])
y = model_data['DLQ_90_FLAG']


num_col = ['ORIG_RATE', 'ORIG_AMOUNT', 'ORIG_TERM', 'OLTV', 'NUM_BO', 'DTI', 'CSCORE_B', # 'CSCORE_C',
           'NUM_UNIT', 'MI_PCT']
cat_col = ['FTHB_FLAG', 'PUR_Cash_out', 'PUR_Refinance', 'PUR_Purchase', 'OCC_Principal', 'OCC_Second', 'OCC_Investor']
scaler = StandardScaler()
X[num_col] = scaler.fit_transform(X[num_col])
X = pd.concat([X[num_col], X[cat_col]], axis=1)

X

Unnamed: 0,ORIG_RATE,ORIG_AMOUNT,ORIG_TERM,OLTV,NUM_BO,DTI,CSCORE_B,NUM_UNIT,MI_PCT,FTHB_FLAG,PUR_Cash_out,PUR_Refinance,PUR_Purchase,OCC_Principal,OCC_Second,OCC_Investor
0,-1.859686,2.297163,-1.684629,0.077201,-0.913967,1.449199,0.839393,-0.110612,-0.468491,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-0.679706,2.413375,0.613636,0.664640,1.019903,-0.059510,0.700780,-0.110612,-0.468491,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,-0.372911,2.953421,0.613636,1.305482,-0.913967,0.141651,0.908700,-0.110612,2.372512,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,-0.679706,3.192681,0.613636,-2.058940,1.019903,-0.361252,0.723882,-0.110612,-0.468491,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5,-0.962901,2.201459,0.613636,-0.350027,-0.913967,-0.160091,0.192532,-0.110612,-0.468491,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4782086,2.293844,-0.669667,0.613636,0.397622,1.019903,-0.562413,-0.361920,-0.110612,-0.468491,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4782088,0.193479,1.141877,0.613636,1.198675,-0.913967,0.242232,-0.500533,-0.110612,1.899012,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4782089,-0.962901,-0.888420,-1.684629,0.130604,1.019903,-0.260671,0.192532,-0.110612,-0.468491,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4782090,-0.962901,-0.225326,-1.684629,-0.884063,1.019903,0.644554,-0.154000,-0.110612,-0.468491,0.0,1.0,0.0,0.0,1.0,0.0,0.0


# Train/Test Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99)


# Model fitting

In [11]:
def fit_logistic_regression(
    train: list[pd.DataFrame, pd.DataFrame],
    test: list[pd.DataFrame, pd.DataFrame],
    model_config: dict = {}
):
    logistic_model = LogisticRegression(
        C = model_config.get("C", 1),
        solver=model_config.get("solver", "liblinear"),
        random_state=model_config.get("random_state", 0),
        class_weight=model_config.get("class_weight", None)
    )
    logistic_model.fit(train[0], train[1])

    print("Training accuracy:", logistic_model.score(train[0], train[1]))
    print("Testing accuracy:", logistic_model.score(test[0], test[1]))
    print(
        classification_report(
            y_pred=logistic_model.predict(test[0]),
            y_true=test[1]
        )
    )
    # Training AUC
    fpr, tpr, thresholds = roc_curve(train[1], logistic_model.predict(train[0]), pos_label=1)
    print("Training AUC:", auc(fpr, tpr))

    # Testing AUC
    fpr, tpr, thresholds = roc_curve(test[1], logistic_model.predict(test[0]), pos_label=1)
    print("Testing AUC:", auc(fpr, tpr))
    return logistic_model

## Sanity checking

In [20]:
X_small, Y_small = X[:int(0.001*len(X))], y[:int(0.001*len(y))]
model = fit_logistic_regression(
    [X_small, Y_small],
    [X_small, Y_small],
    model_config={
        "class_weight": {
            0 :  len(Y_small)/(2*(len(Y_small)-sum(Y_small))),
            1 :  len(Y_small)/(2* sum(Y_small))
        }
    }
)

Training accuracy: 0.78113750899928
Testing accuracy: 0.78113750899928
              precision    recall  f1-score   support

         0.0       1.00      0.78      0.88      4107
         1.0       0.05      0.75      0.09        60

    accuracy                           0.78      4167
   macro avg       0.52      0.77      0.48      4167
weighted avg       0.98      0.78      0.86      4167

Training AUC: 0.7657962016070126
Testing AUC: 0.7657962016070126


## Evaluate model

In [21]:
# Weighted classes
model = fit_logistic_regression(
    [X_train, y_train],
    [X_test, y_test],
    model_config={
        "class_weight": {
            0 :  len(y_train)/(2*(len(y_train)-sum(y_train))),
            1 :  len(y_train)/(2* sum(y_train))
        }
    }
)

Training accuracy: 0.7283768770616528
Testing accuracy: 0.7292104410700218
              precision    recall  f1-score   support

         0.0       1.00      0.73      0.84   1233343
         1.0       0.04      0.76      0.07     16912

    accuracy                           0.73   1250255
   macro avg       0.52      0.75      0.46   1250255
weighted avg       0.98      0.73      0.83   1250255

Training AUC: 0.7465429496059077
Testing AUC: 0.7457902745715727


In [19]:
test_file = './../../../dataset/2023_stat.csv'

test = pd.read_csv(test_file, low_memory=False).dropna()
test_data = test[[ 'ORIG_RATE', 'ORIG_AMOUNT', 'ORIG_TERM', 'OLTV', 'NUM_BO', 'DTI', 'CSCORE_B','FTHB_FLAG', 'PUR_Cash_out', 'PUR_Refinance', 'PUR_Purchase', 'NUM_UNIT', 'OCC_Principal', 'OCC_Second', 'OCC_Investor', 'MI_PCT', 'DLQ_30_FLAG', 'DLQ_90_FLAG', 'Ongoing', 'Current_DLQ', 'Prepaid_Matured']]



# Definition of Bad Loans: once had a 30-day delinquency in performance history
# Definition of Good Loans: no delinquency and continuous payments up to current
test_data = test_data[ (test_data['DLQ_90_FLAG'] == 1) | ( (test_data['DLQ_90_FLAG'] == 0) & (test_data['Ongoing'] == 1) ) ]
test_data = test_data.astype(float)
X_test_23 = test_data.drop(columns=['DLQ_30_FLAG', 'DLQ_90_FLAG', 'Ongoing', 'Current_DLQ', 'Prepaid_Matured'])
y_test_23 = test_data['DLQ_90_FLAG']


scaler = StandardScaler()
X_test_23[num_col] = scaler.fit_transform(X_test_23[num_col])
X_test_23 = pd.concat([X_test_23[num_col], X_test_23[cat_col]], axis=1)
X_test_23

Unnamed: 0,ORIG_RATE,ORIG_AMOUNT,ORIG_TERM,OLTV,NUM_BO,DTI,CSCORE_B,NUM_UNIT,MI_PCT,FTHB_FLAG,PUR_Cash_out,PUR_Refinance,PUR_Purchase,OCC_Principal,OCC_Second,OCC_Investor
0,-0.141676,-0.654521,0.2496,-0.782075,-0.884357,-0.129848,0.917961,-0.112731,-0.737650,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,-0.303820,-0.905238,0.2496,-1.297059,-0.884357,0.413103,-0.307562,-0.112731,-0.737650,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.371669,-0.654521,0.2496,-0.009599,0.970912,-1.215750,0.966020,-0.112731,-0.737650,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.196124,0.228690,0.2496,0.505385,-0.884357,1.173235,0.701692,-0.112731,0.156940,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0.533924,0.166010,0.2496,-1.297059,0.970912,-0.021258,-3.359353,-0.112731,-0.737650,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970181,-2.681934,0.536389,0.2496,0.247893,0.970912,-0.021258,-0.283532,-0.112731,-0.737650,1.0,0.0,0.0,1.0,1.0,0.0,0.0
970182,0.696068,-0.922333,0.2496,-0.370088,-0.884357,-0.672799,1.086170,-0.112731,-0.737650,1.0,0.0,0.0,1.0,1.0,0.0,0.0
970183,1.209525,2.308507,0.2496,-0.679078,-0.884357,1.390415,0.917961,-0.112731,-0.737650,0.0,0.0,0.0,1.0,1.0,0.0,0.0
970184,1.047381,0.052047,0.2496,1.123365,0.970912,0.847464,-0.956368,-0.112731,1.871571,1.0,0.0,0.0,1.0,1.0,0.0,0.0


In [22]:

print("Testing accuracy:", model.score(X_test_23, y_test_23))
print(
    classification_report(
        y_pred=model.predict(X_test_23),
        y_true=y_test_23
    )
)

fpr, tpr, thresholds = roc_curve(y_test_23, model.predict(X_test_23), pos_label=1)
print("Testing AUC:", auc(fpr, tpr))


Testing accuracy: 0.7492100761917663
              precision    recall  f1-score   support

         0.0       1.00      0.75      0.86    927209
         1.0       0.01      0.68      0.02      2944

    accuracy                           0.75    930153
   macro avg       0.50      0.72      0.44    930153
weighted avg       1.00      0.75      0.85    930153

Testing AUC: 0.7150670524742627
