In [3]:
# from pyspark.sql import SparkSession
import math
import sys
import warnings

import pandas as pd

# '../src' must be on sys.path before the project-local import below can resolve.
sys.path.append('../src')
from data import load_and_concat_csv, analyze_dataframe, visualize_NA_percentage

warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)
print("Package Imported")

Package Imported


In [4]:
# Loan Performance table schema: column name -> pandas dtype string.
#
# This mapping is the single source of truth for the table layout. Dict
# insertion order is preserved, so the header list below is derived from it
# instead of being maintained by hand as a second 110-entry literal; the two
# can therefore never drift apart. Keep the entries in the order the header
# list is expected to have (presumably the positional order of the fields in
# the raw files -- confirm against the loader).
lppub_column_classes = {
    "POOL_ID": "string",
    "LOAN_ID": "string",
    "ACT_PERIOD": "string",
    "CHANNEL": "string",
    "SELLER": "string",
    "SERVICER": "string",
    "MASTER_SERVICER": "string",
    "ORIG_RATE": "float64",
    "CURR_RATE": "float64",
    "ORIG_UPB": "float64",
    "ISSUANCE_UPB": "float64",
    "CURRENT_UPB": "float64",
    "ORIG_TERM": "float64",
    "ORIG_DATE": "string",
    "FIRST_PAY": "string",
    "LOAN_AGE": "float64",
    "REM_MONTHS": "float64",
    "ADJ_REM_MONTHS": "float64",
    "MATR_DT": "string",
    "OLTV": "float64",
    "OCLTV": "float64",
    "NUM_BO": "string",
    "DTI": "float64",
    "CSCORE_B": "float64",
    "CSCORE_C": "float64",
    "FIRST_FLAG": "string",
    "PURPOSE": "string",
    "PROP": "string",
    "NO_UNITS": "float64",
    "OCC_STAT": "string",
    "STATE": "string",
    "MSA": "string",
    "ZIP": "string",
    "MI_PCT": "float64",
    "PRODUCT": "string",
    "PPMT_FLG": "string",
    "IO": "string",
    "FIRST_PAY_IO": "string",
    "MNTHS_TO_AMTZ_IO": "float64",
    "DLQ_STATUS": "string",
    "PMT_HISTORY": "string",
    "MOD_FLAG": "string",
    "MI_CANCEL_FLAG": "string",
    "Zero_Bal_Code": "string",
    "ZB_DTE": "string",
    "LAST_UPB": "float64",
    "RPRCH_DTE": "string",
    "CURR_SCHD_PRNCPL": "float64",
    "TOT_SCHD_PRNCPL": "float64",
    "UNSCHD_PRNCPL_CURR": "float64",
    "LAST_PAID_INSTALLMENT_DATE": "string",
    "FORECLOSURE_DATE": "string",
    "DISPOSITION_DATE": "string",
    "FORECLOSURE_COSTS": "float64",
    "PROPERTY_PRESERVATION_AND_REPAIR_COSTS": "float64",
    "ASSET_RECOVERY_COSTS": "float64",
    "MISCELLANEOUS_HOLDING_EXPENSES_AND_CREDITS": "float64",
    "ASSOCIATED_TAXES_FOR_HOLDING_PROPERTY": "float64",
    "NET_SALES_PROCEEDS": "float64",
    "CREDIT_ENHANCEMENT_PROCEEDS": "float64",
    "REPURCHASES_MAKE_WHOLE_PROCEEDS": "float64",
    "OTHER_FORECLOSURE_PROCEEDS": "float64",
    "NON_INTEREST_BEARING_UPB": "float64",
    "PRINCIPAL_FORGIVENESS_AMOUNT": "float64",
    "ORIGINAL_LIST_START_DATE": "string",
    "ORIGINAL_LIST_PRICE": "float64",
    "CURRENT_LIST_START_DATE": "string",
    "CURRENT_LIST_PRICE": "float64",
    "ISSUE_SCOREB": "float64",
    "ISSUE_SCOREC": "float64",
    "CURR_SCOREB": "float64",
    "CURR_SCOREC": "float64",
    "MI_TYPE": "string",
    "SERV_IND": "string",
    "CURRENT_PERIOD_MODIFICATION_LOSS_AMOUNT": "float64",
    "CUMULATIVE_MODIFICATION_LOSS_AMOUNT": "float64",
    "CURRENT_PERIOD_CREDIT_EVENT_NET_GAIN_OR_LOSS": "float64",
    "CUMULATIVE_CREDIT_EVENT_NET_GAIN_OR_LOSS": "float64",
    "HOMEREADY_PROGRAM_INDICATOR": "string",
    "FORECLOSURE_PRINCIPAL_WRITE_OFF_AMOUNT": "float64",
    "RELOCATION_MORTGAGE_INDICATOR": "string",
    "ZERO_BALANCE_CODE_CHANGE_DATE": "string",
    "LOAN_HOLDBACK_INDICATOR": "string",
    "LOAN_HOLDBACK_EFFECTIVE_DATE": "string",
    "DELINQUENT_ACCRUED_INTEREST": "float64",
    "PROPERTY_INSPECTION_WAIVER_INDICATOR": "string",
    "HIGH_BALANCE_LOAN_INDICATOR": "string",
    "ARM_5_YR_INDICATOR": "string",
    "ARM_PRODUCT_TYPE": "string",
    "MONTHS_UNTIL_FIRST_PAYMENT_RESET": "float64",
    "MONTHS_BETWEEN_SUBSEQUENT_PAYMENT_RESET": "float64",
    "INTEREST_RATE_CHANGE_DATE": "string",
    "PAYMENT_CHANGE_DATE": "string",
    "ARM_INDEX": "string",
    "ARM_CAP_STRUCTURE": "string",
    "INITIAL_INTEREST_RATE_CAP": "float64",
    "PERIODIC_INTEREST_RATE_CAP": "float64",
    "LIFETIME_INTEREST_RATE_CAP": "float64",
    "MARGIN": "float64",
    "BALLOON_INDICATOR": "string",
    "PLAN_NUMBER": "string",
    "FORBEARANCE_INDICATOR": "string",
    "HIGH_LOAN_TO_VALUE_HLTV_REFINANCE_OPTION_INDICATOR": "string",
    "DEAL_NAME": "string",
    "RE_PROCS_FLAG": "string",
    "ADR_TYPE": "string",
    "ADR_COUNT": "float64",
    "ADR_UPB": "float64",
    "PAYMENT_DEFERRAL_MOD_EVENT_FLAG": "string",
    "INTEREST_BEARING_UPB": "float64",
}

# Loan Performance table headers, in schema order (derived, never edited by hand).
lppub_column_names = list(lppub_column_classes)

In [5]:
# Raw quarterly extract to load; the path is relative to the working directory.
file_path = '../data/raw/2016Q1.csv'
# Chunk size forwarded to the loader (presumably rows per chunk -- confirm in src/data).
chunk_size = 100_000

# Load the file with the declared headers and dtypes into a single frame.
df = load_and_concat_csv(
    file_path,
    lppub_column_names,
    lppub_column_classes,
    chunk_size=chunk_size,
)

# Preview the first ten rows.
df.head(10)

Unnamed: 0,POOL_ID,LOAN_ID,ACT_PERIOD,CHANNEL,SELLER,SERVICER,MASTER_SERVICER,ORIG_RATE,CURR_RATE,ORIG_UPB,...,PLAN_NUMBER,FORBEARANCE_INDICATOR,HIGH_LOAN_TO_VALUE_HLTV_REFINANCE_OPTION_INDICATOR,DEAL_NAME,RE_PROCS_FLAG,ADR_TYPE,ADR_COUNT,ADR_UPB,PAYMENT_DEFERRAL_MOD_EVENT_FLAG,INTEREST_BEARING_UPB
0,,100000512540,22016,C,Franklin American Mortgage Company,Other,,3.75,3.75,65000.0,...,,,N,,,,,,7,
1,,100000512540,32016,C,Franklin American Mortgage Company,Other,,3.75,3.75,65000.0,...,,,N,,,,,,7,
2,,100000512540,42016,C,Franklin American Mortgage Company,Other,,3.75,3.75,65000.0,...,,,N,,,,,,7,
3,,100000512540,52016,C,Franklin American Mortgage Company,Other,,3.75,3.75,65000.0,...,,,N,,,,,,7,
4,,100000512540,62016,C,Franklin American Mortgage Company,Other,,3.75,3.75,65000.0,...,,,N,,,,,,7,
5,,100000512540,72016,C,Franklin American Mortgage Company,Other,,3.75,3.75,65000.0,...,,,N,,,,,,7,
6,,100000512540,82016,C,Franklin American Mortgage Company,Other,,3.75,3.75,65000.0,...,,,N,,,,,,7,
7,,100000512540,92016,C,Franklin American Mortgage Company,Other,,3.75,3.75,65000.0,...,,,N,,,,,,7,
8,,100000512540,102016,C,Franklin American Mortgage Company,Other,,3.75,3.75,65000.0,...,,,N,,,,,,7,
9,,100000512540,112016,C,Franklin American Mortgage Company,Other,,3.75,3.75,65000.0,...,,,N,,,,,,7,


In [None]:
# Minimum share of rows in which a column must be non-null to be kept.
MIN_NON_NULL_FRACTION = 0.8

# `thresh` is documented as an integer count of required non-NA values, so the
# fractional row count is rounded up. Because per-column counts are integers,
# `count >= ceil(x)` keeps exactly the same columns as `count >= x`.
threshold = math.ceil(MIN_NON_NULL_FRACTION * len(df))

# Drop every column that has fewer than `threshold` non-null entries.
final_df = df.dropna(axis="columns", thresh=threshold)
final_df.head(10)

In [None]:
# Columns that survived the sparsity filter; a bare last expression lets the
# notebook display the value instead of routing it through print().
final_df.columns

In [None]:
# Names of the columns stored with the pandas "string" dtype.
string_columns = final_df.select_dtypes(include='string').columns

# Collect the distinct values of each string column and report them as we go.
unique_values = {}
for column_name in string_columns:
    distinct = final_df[column_name].unique()
    unique_values[column_name] = distinct
    print(f"Unique values in column '{column_name}': {distinct}")