In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [2]:
file_name = '2020_stat.csv'
table = pd.read_csv(file_name, low_memory=False).dropna()

table = table[[
    'ORIG_RATE', 'ORIG_AMOUNT', 'ORIG_TERM', 'OLTV', 'NUM_BO', 'DTI', 'CSCORE_B', # 'CSCORE_C', 
    'FTHB_FLAG', 'PUR_Cash_out', 'PUR_Refinance', 'PUR_Purchase', 'NUM_UNIT', 
    'OCC_Principal', 'OCC_Second', 'OCC_Investor', 'MI_PCT', 'DLQ_30_FLAG', 'DLQ_90_FLAG', 
    'Ongoing', 'Current_DLQ', 'Prepaid_Matured']]

# Available delinquency flags: 30-day, 60-day, 90-day, and 120-day

# Definition of Bad Loans: once had a 90-day delinquency in performance history
# Definition of Good Loans: no delinquency and continuous payments up to current
table = table[ (table['DLQ_90_FLAG'] == 1) | ( (table['DLQ_90_FLAG'] == 0) & (table['Ongoing'] == 1) ) ]

X = table.drop(columns=['DLQ_30_FLAG', 'DLQ_90_FLAG', 'Ongoing', 'Current_DLQ', 'Prepaid_Matured'])
y = table['DLQ_90_FLAG']

In [3]:
# CSCORE_B and CSCORE_C are highly correlated
num_col = ['ORIG_RATE', 'ORIG_AMOUNT', 'ORIG_TERM', 'OLTV', 'NUM_BO', 'DTI', 'CSCORE_B', # 'CSCORE_C', 
           'NUM_UNIT', 'MI_PCT']
cat_col = ['FTHB_FLAG', 'PUR_Cash_out', 'PUR_Refinance', 'PUR_Purchase', 'OCC_Principal', 'OCC_Second', 'OCC_Investor']
scaler = StandardScaler()
X[num_col] = scaler.fit_transform(X[num_col])
X = pd.concat([X[num_col], X[cat_col]], axis=1)

X

Unnamed: 0,ORIG_RATE,ORIG_AMOUNT,ORIG_TERM,OLTV,NUM_BO,DTI,CSCORE_B,NUM_UNIT,MI_PCT,FTHB_FLAG,PUR_Cash_out,PUR_Refinance,PUR_Purchase,OCC_Principal,OCC_Second,OCC_Investor
5,5.967578,-0.041970,0.583235,0.558688,-0.961138,1.209533,-2.728868,-0.110778,-0.514104,0,1,0,0,1,0,0
10,2.077307,0.221725,0.583235,-0.005569,-0.961138,-0.404655,-2.285486,-0.110778,-0.514104,0,1,0,0,1,0,0
14,1.311669,1.306641,0.583235,0.558688,0.975991,-0.101995,-0.709015,-0.110778,-0.514104,0,0,1,0,1,0,0
18,1.580677,-1.013873,0.583235,1.405074,-0.961138,-0.909089,0.596500,-0.110778,2.219113,1,0,0,1,1,0,0
19,1.808299,-1.435785,0.583235,1.122945,0.975991,-2.422391,0.744294,-0.110778,1.763577,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4817638,-2.557908,-0.689905,-1.827635,-2.939707,-0.961138,-0.909089,0.399441,-0.110778,-0.514104,1,0,0,1,1,0,0
4817639,-1.274947,0.372408,0.583235,-0.795529,-0.961138,-1.817070,0.547235,-0.110778,-0.514104,0,0,1,0,1,0,0
4817642,-0.488616,-1.021407,0.583235,-1.359787,-0.961138,1.209533,0.941353,-0.110778,-0.514104,0,0,1,0,1,0,0
4817645,0.546031,-1.157022,0.583235,0.558688,-0.961138,1.007759,-2.605706,-0.110778,-0.514104,1,0,0,1,1,0,0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99)

svm_model = LinearSVC(random_state=99, max_iter=19999, class_weight='balanced')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f"Accuracy: {round(accuracy, 4)}")
print(f"AUC-ROC: {round(roc_auc, 4)}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN)

Accuracy: 0.7614
AUC-ROC: 0.7623
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.76      0.86    950239
           1       0.09      0.76      0.15     27789

    accuracy                           0.76    978028
   macro avg       0.54      0.76      0.51    978028
weighted avg       0.97      0.76      0.84    978028



In [5]:
print(round(y_test.sum() / len(y_test), 4))
print(round(y_pred.sum() / len(y_pred), 4))

0.0284
0.2536
