In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
file_name = '2023Q4_stat.csv'
baseTable = pd.read_csv(file_name, low_memory=False)

n_row, n_col = baseTable.shape
print(f"Number of rows: {n_row}")
print(f"Number of columns: {n_col}")


Number of rows: 215934
Number of columns: 74


In [3]:
# LOAN_ID is kept as an identifier, will be removed later
# AQSN_DTE is the same throughout the file

bt1 = baseTable[[
    'LOAN_ID', 'orig_rt', 'orig_amt', 'orig_trm', 'ocltv', 'num_bo', 'dti',
    'CSCORE_B', 'FTHB_FLG', 'purpose', 'PROP_TYP', 'NUM_UNIT', 'occ_stat',
    'mi_pct', 'CSCORE_C', 'MI_TYPE', 'AQSN_DTE', 'ORIG_DTE', 'FRST_DTE',
    'F30_DTE', 'F60_DTE', 'F90_DTE', 'F120_DTE', 'F180_DTE', 'FCE_DTE'
]].rename(columns={
    'orig_rt': 'ORIG_RATE', 'orig_amt': 'ORIG_AMOUNT', 'orig_trm': 'ORIG_TERM',
    'ocltv': 'OCLTV', 'num_bo': 'NUM_BO', 'dti': 'DTI', 'FTHB_FLG': 'FTHB_FLAG',
    'purpose': 'PURPOSE', 'PROP_TYP': 'PROP_TYPE', 'occ_stat': 'OCC_STAT', 'mi_pct': 'MI_PCT'
})


bt1['AQSN_DTE'] = pd.to_datetime(bt1['AQSN_DTE'])
bt1['ORIG_DTE'] = pd.to_datetime(bt1['ORIG_DTE'])
bt1['FRST_DTE'] = pd.to_datetime(bt1['FRST_DTE'])
bt1['F30_DTE'] = pd.to_datetime(bt1['F30_DTE'])
bt1['F60_DTE'] = pd.to_datetime(bt1['F60_DTE'])
bt1['F90_DTE'] = pd.to_datetime(bt1['F90_DTE'])
bt1['F120_DTE'] = pd.to_datetime(bt1['F120_DTE'])
bt1['F180_DTE'] = pd.to_datetime(bt1['F180_DTE'])
bt1['FCE_DTE'] = pd.to_datetime(bt1['FCE_DTE'])

# Origination date is at most two months earlier than acquisition date
bt1['date_diff'] = (bt1['AQSN_DTE'] - bt1['ORIG_DTE']).dt.days
bt1 = bt1[bt1['date_diff'] <= 70]
n_row, n_col = bt1.shape
print(f"Number of rows: {n_row}")
print(f"Number of columns: {n_col}")


Number of rows: 133402
Number of columns: 26


In [4]:
bt2 = bt1
del baseTable

bt2['FTHB_FLAG'] = bt2['FTHB_FLAG'].replace({'Y': 1, 'N': 0})

# Check Glossary #27
bt2['PUR_Cash_out'] = (bt2['PURPOSE'] == 'C').astype(int)
bt2['PUR_Refinance'] = (bt2['PURPOSE'] == 'R').astype(int)
bt2['PUR_Purchase'] = (bt2['PURPOSE'] == 'P').astype(int)
# Check Glossary #28
bt2['PRO_Condominium'] = (bt2['PROP_TYPE'] == 'CO').astype(int)
bt2['PRO_Co_operative'] = (bt2['PROP_TYPE'] == 'CP').astype(int)
bt2['PRO_Planned_Urban'] = (bt2['PROP_TYPE'] == 'PU').astype(int)
bt2['PRO_Manufact_Home'] = (bt2['PROP_TYPE'] == 'MH').astype(int)
bt2['PRO_Single_Family'] = (bt2['PROP_TYPE'] == 'SF').astype(int)
# Check Glossary #30
bt2['OCC_Principal'] = (bt2['OCC_STAT'] == 'P').astype(int)
bt2['OCC_Second'] = (bt2['OCC_STAT'] == 'S').astype(int)
bt2['OCC_Investor'] = (bt2['OCC_STAT'] == 'I').astype(int)

bt2['MI_PCT'] = bt2['MI_PCT'].fillna(0)
# Check Glossary #73
bt2['MI_Borrower'] = (bt2['MI_TYPE'] == 'BPMI').astype(int)
bt2['MI_Lender'] = (bt2['MI_TYPE'] == 'LPMI').astype(int)
bt2['MI_Investor'] = (bt2['MI_TYPE'] == 'IPMI').astype(int)  # seems trivial

# If there is no co-borrower, CSCORE_C is set to be same as CSCORE_B
bt2['CSCORE_C'] = bt2['CSCORE_C'].fillna(bt2['CSCORE_B'])

n_row, n_col = bt2.shape
print(f"Number of rows: {n_row}")
print(f"Number of columns: {n_col}")

Number of rows: 133402
Number of columns: 40


  bt2['FTHB_FLAG'] = bt2['FTHB_FLAG'].replace({'Y': 1, 'N': 0})


In [5]:
bt2['DLQ_FLAG'] = bt2[['F30_DTE', 'F60_DTE', 'F90_DTE', 'F120_DTE', 'F180_DTE', 'FCE_DTE']].notna().any(axis=1).astype(int)
print(bt2['DLQ_FLAG'])

2         0
4         0
10        0
11        0
14        0
         ..
215929    0
215930    0
215931    0
215932    0
215933    0
Name: DLQ_FLAG, Length: 133402, dtype: int64


In [9]:
# Select and reorder columns
bt3 = bt2[[
    'LOAN_ID', 'ORIG_RATE', 'ORIG_AMOUNT', 'ORIG_TERM', 'OCLTV', 'NUM_BO', 'DTI', 'CSCORE_B', 'CSCORE_C',
    'FTHB_FLAG', 'PUR_Cash_out', 'PUR_Refinance', 'PUR_Purchase', 'PRO_Condominium', 'PRO_Co_operative',
    'PRO_Planned_Urban', 'PRO_Manufact_Home', 'PRO_Single_Family', 'NUM_UNIT', 'OCC_Principal', 'OCC_Second', 
    'OCC_Investor', 'MI_PCT', 'MI_Borrower', 'MI_Lender', 'MI_Investor', 'AQSN_DTE', 'ORIG_DTE', 'FRST_DTE',
    'DLQ_FLAG'
]]

del bt1

output = f"Preprocessed_{file_name}"
bt3.to_csv(output, sep=",", na_rep="NULL", float_format='%.2f', index=False, quoting=1)
