In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [2]:
file_name = '2021_stat.csv'
table = pd.read_csv(file_name, low_memory=False).dropna()

table = table[[
    'ORIG_RATE', 'ORIG_AMOUNT', 'ORIG_TERM', 'OLTV', 'NUM_BO', 'DTI', 'CSCORE_B', # 'CSCORE_C', 
    'FTHB_FLAG', 'PUR_Cash_out', 'PUR_Refinance', 'PUR_Purchase', 'NUM_UNIT', 
    'OCC_Principal', 'OCC_Second', 'OCC_Investor', 'MI_PCT', 'DLQ_FLAG', 'Ongoing', 
    'Current_DLQ', 'Prepaid_Matured']]

# Definition of Bad Loans: once had a 30-day delinquency in performance history
# Definition of Good Loans: no delinquency and continuous payments up to current
table = table[ (table['DLQ_FLAG'] == 1) | ( (table['DLQ_FLAG'] == 0) & (table['Ongoing'] == 1) ) ]

X = table.drop(columns=['DLQ_FLAG', 'Ongoing', 'Current_DLQ', 'Prepaid_Matured'])
y = table['DLQ_FLAG']

In [3]:
num_col = ['ORIG_RATE', 'ORIG_AMOUNT', 'ORIG_TERM', 'OLTV', 'NUM_BO', 'DTI', 'CSCORE_B', # 'CSCORE_C', 
           'NUM_UNIT', 'MI_PCT']
cat_col = ['FTHB_FLAG', 'PUR_Cash_out', 'PUR_Refinance', 'PUR_Purchase', 'OCC_Principal', 'OCC_Second', 'OCC_Investor']
scaler = StandardScaler()
X[num_col] = scaler.fit_transform(X[num_col])
X = pd.concat([X[num_col], X[cat_col]], axis=1)

X

Unnamed: 0,ORIG_RATE,ORIG_AMOUNT,ORIG_TERM,OLTV,NUM_BO,DTI,CSCORE_B,NUM_UNIT,MI_PCT,FTHB_FLAG,PUR_Cash_out,PUR_Refinance,PUR_Purchase,OCC_Principal,OCC_Second,OCC_Investor
0,-1.861114,2.298143,-1.690565,0.073896,-0.910714,1.446056,0.845006,-0.110638,-0.469892,0,0,1,0,1,0,0
1,-0.684411,2.414368,0.611493,0.661563,1.023411,-0.063434,0.707429,-0.110638,-0.469892,1,0,0,1,1,0,0
3,-0.378469,2.954470,0.611493,1.302654,-0.910714,0.137831,0.913795,-0.110638,2.365326,1,0,0,1,1,0,0
4,-0.684411,3.193756,0.611493,-2.063074,1.023411,-0.365332,0.730358,-0.110638,-0.469892,0,0,1,0,1,0,0
5,-0.966820,2.202429,0.611493,-0.353498,-0.910714,-0.164067,0.202979,-0.110638,-0.469892,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4782086,2.280880,-0.668999,0.611493,0.394442,1.023411,-0.566597,-0.347331,-0.110638,-0.469892,0,1,0,0,0,0,1
4782088,0.186349,1.142735,0.611493,1.195806,-0.910714,0.238464,-0.484908,-0.110638,1.892790,1,0,0,1,1,0,0
4782089,-0.966820,-0.887774,-1.690565,0.127321,1.023411,-0.264699,0.202979,-0.110638,-0.469892,0,1,0,0,1,0,0
4782090,-0.966820,-0.224611,-1.690565,-0.887741,1.023411,0.640994,-0.140965,-0.110638,-0.469892,0,1,0,0,1,0,0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99)

svm_model = LinearSVC(random_state=99, max_iter=8192, class_weight='balanced')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"AUC-ROC: {roc_auc}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN)

Accuracy: 0.659788831619591
AUC-ROC: 0.6313363944682062
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.67      0.78   1151850
           1       0.15      0.60      0.24    116427

    accuracy                           0.66   1268277
   macro avg       0.55      0.63      0.51   1268277
weighted avg       0.87      0.66      0.73   1268277



In [8]:
print(y_test.sum() / len(y_test))
print(y_pred.sum() / len(y_pred))

0.09179934667269059
0.3579257528126742
