## Import

In [None]:
# from pyspark.sql import SparkSession
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)
print("Package Imported")

In [2]:
import pandas as pd

# Loader for a single Loan Performance file
def load_lppub_file(filename, col_names, col_types):
    """Read one pipe-delimited Loan Performance file into a DataFrame.

    Parameters
    ----------
    filename
        Path or file-like object, handed straight to ``pd.read_csv``.
    col_names
        Column labels, forwarded as ``names``. Because labels are supplied
        and no ``header`` is given, pandas treats the first line as data.
    col_types
        Mapping of column label to dtype, forwarded as ``dtype``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    read_options = {"sep": "|", "names": col_names, "dtype": col_types}
    return pd.read_csv(filename, **read_options)

# Column labels for the Loan Performance file, in positional order (110 entries).
# The list is passed as `names=` to pd.read_csv together with sep="|", so the
# i-th label here is attached to the i-th "|"-separated field of every row.
# Do not reorder or insert entries without checking the file layout.
lppub_column_names = [
    "POOL_ID", "LOAN_ID", "ACT_PERIOD", "CHANNEL", "SELLER", "SERVICER",
    "MASTER_SERVICER", "ORIG_RATE", "CURR_RATE", "ORIG_UPB", "ISSUANCE_UPB",
    "CURRENT_UPB", "ORIG_TERM", "ORIG_DATE", "FIRST_PAY", "LOAN_AGE",
    "REM_MONTHS", "ADJ_REM_MONTHS", "MATR_DT", "OLTV", "OCLTV",
    "NUM_BO", "DTI", "CSCORE_B", "CSCORE_C", "FIRST_FLAG", "PURPOSE",
    "PROP", "NO_UNITS", "OCC_STAT", "STATE", "MSA", "ZIP", "MI_PCT",
    "PRODUCT", "PPMT_FLG", "IO", "FIRST_PAY_IO", "MNTHS_TO_AMTZ_IO",
    "DLQ_STATUS", "PMT_HISTORY", "MOD_FLAG", "MI_CANCEL_FLAG", "Zero_Bal_Code",
    "ZB_DTE", "LAST_UPB", "RPRCH_DTE", "CURR_SCHD_PRNCPL", "TOT_SCHD_PRNCPL",
    "UNSCHD_PRNCPL_CURR", "LAST_PAID_INSTALLMENT_DATE", "FORECLOSURE_DATE",
    "DISPOSITION_DATE", "FORECLOSURE_COSTS", "PROPERTY_PRESERVATION_AND_REPAIR_COSTS",
    "ASSET_RECOVERY_COSTS", "MISCELLANEOUS_HOLDING_EXPENSES_AND_CREDITS",
    "ASSOCIATED_TAXES_FOR_HOLDING_PROPERTY", "NET_SALES_PROCEEDS",
    "CREDIT_ENHANCEMENT_PROCEEDS", "REPURCHASES_MAKE_WHOLE_PROCEEDS",
    "OTHER_FORECLOSURE_PROCEEDS", "NON_INTEREST_BEARING_UPB", "PRINCIPAL_FORGIVENESS_AMOUNT",
    "ORIGINAL_LIST_START_DATE", "ORIGINAL_LIST_PRICE", "CURRENT_LIST_START_DATE",
    "CURRENT_LIST_PRICE", "ISSUE_SCOREB", "ISSUE_SCOREC", "CURR_SCOREB",
    "CURR_SCOREC", "MI_TYPE", "SERV_IND", "CURRENT_PERIOD_MODIFICATION_LOSS_AMOUNT",
    "CUMULATIVE_MODIFICATION_LOSS_AMOUNT", "CURRENT_PERIOD_CREDIT_EVENT_NET_GAIN_OR_LOSS",
    "CUMULATIVE_CREDIT_EVENT_NET_GAIN_OR_LOSS", "HOMEREADY_PROGRAM_INDICATOR",
    "FORECLOSURE_PRINCIPAL_WRITE_OFF_AMOUNT", "RELOCATION_MORTGAGE_INDICATOR",
    "ZERO_BALANCE_CODE_CHANGE_DATE", "LOAN_HOLDBACK_INDICATOR", "LOAN_HOLDBACK_EFFECTIVE_DATE",
    "DELINQUENT_ACCRUED_INTEREST", "PROPERTY_INSPECTION_WAIVER_INDICATOR",
    "HIGH_BALANCE_LOAN_INDICATOR", "ARM_5_YR_INDICATOR", "ARM_PRODUCT_TYPE",
    "MONTHS_UNTIL_FIRST_PAYMENT_RESET", "MONTHS_BETWEEN_SUBSEQUENT_PAYMENT_RESET",
    "INTEREST_RATE_CHANGE_DATE", "PAYMENT_CHANGE_DATE", "ARM_INDEX",
    "ARM_CAP_STRUCTURE", "INITIAL_INTEREST_RATE_CAP", "PERIODIC_INTEREST_RATE_CAP",
    "LIFETIME_INTEREST_RATE_CAP", "MARGIN", "BALLOON_INDICATOR",
    "PLAN_NUMBER", "FORBEARANCE_INDICATOR", "HIGH_LOAN_TO_VALUE_HLTV_REFINANCE_OPTION_INDICATOR",
    "DEAL_NAME", "RE_PROCS_FLAG", "ADR_TYPE", "ADR_COUNT", "ADR_UPB", "PAYMENT_DEFERRAL_MOD_EVENT_FLAG", "INTEREST_BEARING_UPB"
]

# dtype for each column label, passed as `dtype=` to pd.read_csv.
# Every entry is either "string" or "float64". Fields such as ACT_PERIOD,
# ORIG_DATE and ZIP are read as "string"; ACT_PERIOD is parsed into a date
# later in the notebook with format '%m%Y'.
# NOTE(review): count-like columns (ORIG_TERM, LOAN_AGE, NO_UNITS, ADR_COUNT)
# are float64, presumably so missing values can be NaN — confirm.
lppub_column_classes = {
    "POOL_ID": "string", "LOAN_ID": "string", "ACT_PERIOD": "string", "CHANNEL": "string", "SELLER": "string", 
    "SERVICER": "string", "MASTER_SERVICER": "string", "ORIG_RATE": "float64", "CURR_RATE": "float64",
    "ORIG_UPB": "float64", "ISSUANCE_UPB": "float64", "CURRENT_UPB": "float64", "ORIG_TERM": "float64", 
    "ORIG_DATE": "string", "FIRST_PAY": "string", "LOAN_AGE": "float64", "REM_MONTHS": "float64", 
    "ADJ_REM_MONTHS": "float64", "MATR_DT": "string", "OLTV": "float64", "OCLTV": "float64", 
    "NUM_BO": "string", "DTI": "float64", "CSCORE_B": "float64", "CSCORE_C": "float64", "FIRST_FLAG": "string", 
    "PURPOSE": "string", "PROP": "string", "NO_UNITS": "float64", "OCC_STAT": "string", "STATE": "string", 
    "MSA": "string", "ZIP": "string", "MI_PCT": "float64", "PRODUCT": "string", "PPMT_FLG": "string", 
    "IO": "string", "FIRST_PAY_IO": "string", "MNTHS_TO_AMTZ_IO": "float64", "DLQ_STATUS": "string", 
    "PMT_HISTORY": "string", "MOD_FLAG": "string", "MI_CANCEL_FLAG": "string", "Zero_Bal_Code": "string", 
    "ZB_DTE": "string", "LAST_UPB": "float64", "RPRCH_DTE": "string", "CURR_SCHD_PRNCPL": "float64", 
    "TOT_SCHD_PRNCPL": "float64", "UNSCHD_PRNCPL_CURR": "float64", "LAST_PAID_INSTALLMENT_DATE": "string", 
    "FORECLOSURE_DATE": "string", "DISPOSITION_DATE": "string", "FORECLOSURE_COSTS": "float64", 
    "PROPERTY_PRESERVATION_AND_REPAIR_COSTS": "float64", "ASSET_RECOVERY_COSTS": "float64", 
    "MISCELLANEOUS_HOLDING_EXPENSES_AND_CREDITS": "float64", "ASSOCIATED_TAXES_FOR_HOLDING_PROPERTY": "float64", 
    "NET_SALES_PROCEEDS": "float64", "CREDIT_ENHANCEMENT_PROCEEDS": "float64", 
    "REPURCHASES_MAKE_WHOLE_PROCEEDS": "float64", "OTHER_FORECLOSURE_PROCEEDS": "float64", 
    "NON_INTEREST_BEARING_UPB": "float64", "PRINCIPAL_FORGIVENESS_AMOUNT": "float64", 
    "ORIGINAL_LIST_START_DATE": "string", "ORIGINAL_LIST_PRICE": "float64", "CURRENT_LIST_START_DATE": "string", 
    "CURRENT_LIST_PRICE": "float64", "ISSUE_SCOREB": "float64", "ISSUE_SCOREC": "float64", "CURR_SCOREB": "float64", 
    "CURR_SCOREC": "float64", "MI_TYPE": "string", "SERV_IND": "string", 
    "CURRENT_PERIOD_MODIFICATION_LOSS_AMOUNT": "float64", "CUMULATIVE_MODIFICATION_LOSS_AMOUNT": "float64", 
    "CURRENT_PERIOD_CREDIT_EVENT_NET_GAIN_OR_LOSS": "float64", "CUMULATIVE_CREDIT_EVENT_NET_GAIN_OR_LOSS": "float64", 
    "HOMEREADY_PROGRAM_INDICATOR": "string", "FORECLOSURE_PRINCIPAL_WRITE_OFF_AMOUNT": "float64", 
    "RELOCATION_MORTGAGE_INDICATOR": "string", "ZERO_BALANCE_CODE_CHANGE_DATE": "string", 
    "LOAN_HOLDBACK_INDICATOR": "string", "LOAN_HOLDBACK_EFFECTIVE_DATE": "string", "DELINQUENT_ACCRUED_INTEREST": "float64", 
    "PROPERTY_INSPECTION_WAIVER_INDICATOR": "string", "HIGH_BALANCE_LOAN_INDICATOR": "string", 
    "ARM_5_YR_INDICATOR": "string", "ARM_PRODUCT_TYPE": "string", "MONTHS_UNTIL_FIRST_PAYMENT_RESET": "float64", 
    "MONTHS_BETWEEN_SUBSEQUENT_PAYMENT_RESET": "float64", "INTEREST_RATE_CHANGE_DATE": "string", 
    "PAYMENT_CHANGE_DATE": "string", "ARM_INDEX": "string", "ARM_CAP_STRUCTURE": "string", 
    "INITIAL_INTEREST_RATE_CAP": "float64", "PERIODIC_INTEREST_RATE_CAP": "float64", 
    "LIFETIME_INTEREST_RATE_CAP": "float64", "MARGIN": "float64", "BALLOON_INDICATOR": "string", 
    "PLAN_NUMBER": "string", "FORBEARANCE_INDICATOR": "string", "HIGH_LOAN_TO_VALUE_HLTV_REFINANCE_OPTION_INDICATOR": "string", 
    "DEAL_NAME": "string", "RE_PROCS_FLAG": "string", "ADR_TYPE": "string", "ADR_COUNT": "float64", 
    "ADR_UPB": "float64", "PAYMENT_DEFERRAL_MOD_EVENT_FLAG": "string", "INTEREST_BEARING_UPB": "float64"
}

In [6]:
# Source file and the number of rows pulled per read
file_path = '../data/raw/2016Q1.csv'
chunk_size = 100000

# Read the "|"-separated file piece by piece; `result` holds every piece in order
result = list(
    pd.read_csv(
        file_path,
        sep='|',
        names=lppub_column_names,
        dtype=lppub_column_classes,
        chunksize=chunk_size,
    )
)

# Stitch the pieces into a single frame with a fresh 0..n-1 index
final_df = pd.concat(result, ignore_index=True)

In [None]:
# Preview the first 20 rows of the loaded frame
final_df.head(20)

In [None]:
# A column is kept only if its non-null count reaches 80% of the row count;
# sparser columns are dropped. Note that final_df is rebound to the result.
threshold = len(final_df) * 0.8
final_df = final_df.dropna(axis="columns", thresh=threshold)
final_df.head(10)

In [None]:
# Show the column labels currently present in final_df
print(final_df.columns)

In [None]:
# Distinct values found in the DLQ_STATUS column (read as "string" dtype)
unique_dlq_status = final_df['DLQ_STATUS'].unique()
print(unique_dlq_status)

In [14]:
# ACT_PERIOD is read as text; parse it here as month+year ('%m%Y').
act_period_ts = pd.to_datetime(final_df['ACT_PERIOD'], format='%m%Y')

# Human-readable 'YYYY-MM' label stored on the frame (strings, not datetimes).
final_df['date_time'] = act_period_ts.dt.strftime('%Y-%m')

# Select the row of the latest period from the parsed timestamps rather than
# from the formatted strings: idxmax on an object-dtype text column is rejected
# by pandas (TypeError), and where it is accepted it compares text, not dates.
# act_period_ts shares final_df's index, so the label it returns is valid for .loc.
max_date_row = final_df.loc[act_period_ts.idxmax()]


In [None]:
# DLQ_STATUS value on the row selected as max_date_row
print(max_date_row["DLQ_STATUS"])

In [None]:
# Display labels for a hand-picked subset of loan fields.
# NOTE(review): "Remaing" and "Orginal" look like typos. They are left as-is
# because these strings may be used as keys elsewhere — confirm before fixing.
custome_headers = [
    "Loan ID",
    "Channel",
    "Servicer Name",
    "Original Interest Rate",
    "Current Interest Rate",
    "Original UPB",
    "UPB at Issuance",
    "Current Actual UPB",
    "Loan Age",
    "Remaing Months To Maturity",
    "Orginal Loan to Value Ratio",
    "Original Combined Loan to Value Ratio (CLTV)",
    "Number of Borrowers",
    "Debt-To-Income (DTI)",
    "Borrower Credit Score at Origination",
    "Co-Borrower Credit Score at Origination",
    "First Time Home Buyer",
    "Loan Purpose",
    "Property Type",
    "Occupancy Status",
    "Mortgage Insurance Percentage",
    "Amortization Type",
    "Current Loan Delinquency Status",
    "Repurchase Date",
    "Property Preservation and Repair Cost",
    "Cumulative Modification Loss Amount",
    "High Balance Loan Indicator",
    "Lifetime Interest Rate Cap Up Percent",
    "High Loan to Value (HLTV) Refinance Option Indicator",
]
print(len(custome_headers))

# Positions of the chosen fields.
# NOTE(review): this list has 31 entries but custome_headers has 29 labels, so
# the two cannot be paired one-to-one as written. Presumably these are 0-based
# positions into lppub_column_names — confirm the intended pairing.
selected_indices = [
    1, 3, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19, 20, 21, 22, 23,
    24, 25, 26, 27, 29, 33, 34, 39, 40, 47, 54, 75, 86, 97, 102,
]


In [None]:
# Prints the literal text "update"; no data is read or modified here
print("update")