In [93]:
import pandas as pd

# Load the raw survey extract into `df`.
# The path is relative to the notebook's working directory rather than a
# machine-specific absolute path, so the notebook runs from any checkout of
# the repository (assumes the kernel is started in the notebook's folder, the
# same assumption the "../1_datasets/..." paths elsewhere in this notebook make).
# NOTE(review): the saved output of this cell shows a warning pointing at the
# read_csv call; presumably a mixed-dtype DtypeWarning -- confirm, and if so
# consider passing explicit dtypes for the affected columns.
RAW_DATA_PATH = "../1_datasets/raw_data/public2024.csv"
df = pd.read_csv(
    RAW_DATA_PATH,
    encoding="utf-8",
)

  df = pd.read_csv(


Relevant Codes/Columns
## 1. Demographic Features
- ppage, ppagecat, ppagect4 – Age
- ppeduc5, ppeducat, educ_4cat – Education
- ppemploy – Employment status
- ppethm, ppracem, race_5cat – Race/Ethnicity
- ppgender – Gender
- pphhsize, pphhsize5 – Household size
- ppmarit5 – Marital status
- ppinc7, inc_4cat_50k, I40 – Household income
- pphouse4, pprent – Housing type/ownership
- ppreg4, ppreg9, ppstaten – Region/State
- ppkid017, ppt18ov – Household composition (children/adults)
- Status – Armed Forces status
- pphispan – Hispanic/Latino status
  
## 2. Financial Behavior Features
- B2, B3, B3A_*, B3B_* – Financial well-being and changes
- X12_* – Financial challenges/concerns
- BK1 – Bank account ownership
- BK2_* – Use of alternative financial services (payday, pawn, etc.)
- A0, A7_*, A8_* – Credit applications and denials
- C2A, C3P, C4A – Credit card usage and payment behavior
- EF1, EF2, EF3_*, EF5C, EF6C_*, EF7 – Emergency funds, bill payment, financial resilience
- SL1, SL3, SL4A, SL4, SL6 – Student loan status, payment, and delinquency
- R11 – Behind on rent
- M4 – Mortgage payment
- ND0 – Financial impact of disasters
- pay_casheqv, atleast_okay – Financial health indicators
## 3. Default/Delinquency Indicators
- BNPL3 – Late payment on BNPL
- BNPL3A – Charged extra for late BNPL payment
- SL6 – Behind on student loan payments
- R11 – Behind on rent
- BK2_c, BK2_d, BK2_f – Use of payday/pawn/overdraft (proxy for financial distress)
- EF5C – Paid all bills in full last month
## 4. BNPL-Specific Variables
- BNPL1 – Used BNPL in past year
- BNPL3, BNPL3A – Late/extra charges on BNPL
- BNPL4_* – Reasons for using BNPL
## 5. Traditional Loan Variables
- A7_*, A8_* – Applications/denials for credit card, auto, student, mortgage, home equity, other loans
- SL1, SL3, SL4A, SL4, SL6 – Student loan status and delinquency
- M4 – Mortgage payment
- R11 – Rent delinquency

In [94]:
# Survey variables kept for the analysis, grouped by theme.
# The themed lists are concatenated below in the order the columns should
# appear in the subset.
demographic_cols = [
    "ppage", "ppeduc5", "ppemploy", "ppethm", "ppgender",
    "pphhsize", "ppmarit5", "ppinc7", "pphouse4", "pprent",
]

financial_behavior_cols = [
    "B2", "B3", "BK1",
    *[f"BK2_{suffix}" for suffix in "abcdf"],  # BK2_e is intentionally not listed
    "A0", "C2A", "C3P", "C4A",
    "EF1", "EF2",
    *[f"EF3_{suffix}" for suffix in "abcdefgh"],
    "EF5C",
    *[f"EF6C_{suffix}" for suffix in "abcd"],
    "EF7",
]

bnpl_cols = ["BNPL1", "BNPL3", "BNPL3A"]

# Default / delinquency indicators
delinquency_cols = ["SL6", "R11"]

traditional_loan_cols = ["SL1", "SL3", "SL4A", "SL4", "M4"]

# Respondent identifier, kept for reference
id_cols = ["CaseID"]

relevant_columns = (
    demographic_cols
    + financial_behavior_cols
    + bnpl_cols
    + delinquency_cols
    + traditional_loan_cols
    + id_cols
)

# Restrict the raw frame to the selected variables and report its shape
df_relevant = df[relevant_columns]
df_relevant.shape

(12295, 49)

In [95]:
# Count the missing values per column and lay the result out as a
# two-column table (variable name, number of nulls).
null_totals = df_relevant.isna().sum()
df_nulls = null_totals.rename_axis("Column").reset_index(name="Null_Count")

# Only the variables that actually contain missing values are worth showing
has_missing = df_nulls["Null_Count"].gt(0)
df_nulls_nonzero = df_nulls.loc[has_missing]
display(df_nulls_nonzero)

Unnamed: 0,Column,Null_Count
17,BK2_f,648
20,C3P,1946
21,C4A,1946
23,EF2,7151
33,EF6C_a,10628
34,EF6C_b,10628
35,EF6C_c,10628
36,EF6C_d,10628
39,BNPL3,10625
40,BNPL3A,11937


In [96]:
# Keep only the columns whose share of missing values is at most 70%;
# anything sparser is removed from df_relevant.
null_share = df_relevant.isna().mean()
keep_mask = null_share.le(0.7)
df_relevant = df_relevant.loc[:, keep_mask]
print(f"Columns remaining after dropping: {df_relevant.shape[1]}")

Columns remaining after dropping: 38


In [97]:
# Tabulate the number of missing values for every remaining column.
# The counts are computed once and reused; a bare expression that is not the
# last statement of a cell is never displayed, so no separate call is needed.
null_counts = df_relevant.isnull().sum()
null_counts_table = pd.DataFrame(
    {
        "Column": null_counts.index,
        "Null Count": null_counts.values,
    }
)
display(null_counts_table)

Unnamed: 0,Column,Null Count
0,ppage,0
1,ppeduc5,0
2,ppemploy,0
3,ppethm,0
4,ppgender,0
5,pphhsize,0
6,ppmarit5,0
7,ppinc7,0
8,pphouse4,0
9,pprent,0


In [98]:
# Impute missing values in df_relevant, one fill value per column:
#   - text-like columns (object, string or categorical dtype) -> mode
#   - every other dtype                                       -> median
# Columns without nulls are left untouched, and a text-like column that is
# entirely null has no mode and is skipped.
#
# NOTE(review): nulls in this survey may come from skip patterns (question not
# asked) rather than non-response. The preview later in this notebook shows a
# respondent with Home_Ownership "Rented for cash" and Mortgage_Payment 1500.0,
# which looks like an imputed value -- confirm that imputation is appropriate
# for such conditional questions.
fill_values = {}
for col in df_relevant.columns:
    column = df_relevant[col]
    if not column.isnull().any():
        continue

    dtype = column.dtype
    # Comparing against 'object' alone would send string-dtype and categorical
    # columns to .median(), which fails on text data.
    is_text_like = (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )

    if is_text_like:
        modes = column.mode(dropna=True)
        if modes.empty:
            continue
        fill_values[col] = modes.iloc[0]  # first mode when several values tie
    else:
        fill_values[col] = column.median()

# A single fillna with a {column: value} mapping returns a new frame, so no
# column is assigned back into a frame that may be a slice of another one.
if fill_values:
    df_relevant = df_relevant.fillna(value=fill_values)

In [99]:
# Map the survey variable codes to descriptive column names.
# The mapping is assembled from themed pieces; codes that are not present in
# df_relevant are simply ignored by DataFrame.rename.
demographic_names = {
    "ppage": "Age",
    "ppeduc5": "Education_Level",
    "ppemploy": "Employment_Status",
    "ppethm": "Race_Ethnicity",
    "ppgender": "Gender",
    "pphhsize": "Household_Size",
    "ppmarit5": "Marital_Status",
    "ppinc7": "Household_Income",
    "pphouse4": "Housing_Type",
    "ppreg4": "Region",
    "pprent": "Home_Ownership",
}

financial_behavior_names = {
    "B2": "Financial_Management",
    "B3": "Financial_Change_12mo",
    "BK1": "Has_Bank_Account",
    "BK2_a": "Used_Money_Order",
    "BK2_b": "Cashed_Check_Outside_Bank",
    "BK2_c": "Used_Payday_Loan",
    "BK2_d": "Used_Pawn_or_Auto_Title_Loan",
    "BK2_f": "Paid_Overdraft_Fee",
    "A0": "Applied_Any_Credit",
    "C2A": "Has_Credit_Card",
    "C3P": "Credit_Card_Payment_Method",
    "C4A": "Credit_Card_Balance_Frequency",
    "EF1": "Has_Emergency_Fund",
    "EF2": "Can_Cover_3mo_Expenses",
    "EF3_a": "Pay_Emergency_Cash",
    "EF3_b": "Pay_Emergency_Card_Full",
    "EF3_c": "Pay_Emergency_Card_OverTime",
    "EF3_d": "Pay_Emergency_Bank_Loan",
    "EF3_e": "Pay_Emergency_Family",
    "EF3_f": "Pay_Emergency_Payday",
    "EF3_g": "Pay_Emergency_Sell_Something",
    "EF3_h": "Cannot_Pay_Emergency",
    "EF5C": "Paid_All_Bills_Last_Month",
    "EF6C_a": "Paid_Rent_Mortgage_Last_Month",
    "EF6C_b": "Paid_Utilities_Last_Month",
    "EF6C_c": "Paid_Phone_Internet_Last_Month",
    "EF6C_d": "Paid_Car_Payment_Last_Month",
    "EF7": "Max_Emergency_Expense_Covered",
}

bnpl_usage_names = {
    "BNPL1": "Used_BNPL_Past_Year",
    "BNPL3": "Late_BNPL_Payment",
    "BNPL3A": "Charged_Late_BNPL",
}

# Respondent identifier keeps its name
identifier_names = {"CaseID": "CaseID"}

# Reasons for using BNPL
bnpl_reason_names = {
    "BNPL4_a": "BNPL_Reason_Avoid_Interest",
    "BNPL4_b": "BNPL_Reason_Spread_Payments",
    "BNPL4_c": "BNPL_Reason_Fixed_Payments",
    "BNPL4_d": "BNPL_Reason_Convenience",
    "BNPL4_e": "BNPL_Reason_Only_Afford_Way",
    "BNPL4_f": "BNPL_Reason_Only_Method",
    "BNPL4_g": "BNPL_Reason_No_Credit_Card",
}

# Delinquency indicators and traditional loan variables
loan_names = {
    "SL6": "Student_Loan_Delinquent",
    "R11": "Behind_On_Rent",
    "SL1": "Has_Student_Loan",
    "SL3": "Student_Loan_Amount",
    "SL4A": "Student_Loan_Required_Payment",
    "SL4": "Student_Loan_Monthly_Payment",
    "M4": "Mortgage_Payment",
}

column_rename_dict = {
    **demographic_names,
    **financial_behavior_names,
    **bnpl_usage_names,
    **identifier_names,
    **bnpl_reason_names,
    **loan_names,
}

df_relevant = df_relevant.rename(column_rename_dict, axis="columns")

# Preview the frame to confirm the new column names
df_relevant.head()


Unnamed: 0,Age,Education_Level,Employment_Status,Race_Ethnicity,Gender,Household_Size,Marital_Status,Household_Income,Housing_Type,Home_Ownership,...,Pay_Emergency_Family,Pay_Emergency_Payday,Pay_Emergency_Sell_Something,Cannot_Pay_Emergency,Paid_All_Bills_Last_Month,Max_Emergency_Expense_Covered,Used_BNPL_Past_Year,Has_Student_Loan,Mortgage_Payment,CaseID
0,82,Master’s degree or higher,Working part-time,"White, Non-Hispanic",Female,1,Never married,"$25,000 to $49,999",Building with 2 or more apartments,Rented for cash,...,No,No,No,No,Yes,"$2,000 or more",No,No,1500.0,1
1,79,Master’s degree or higher,Not working,"White, Non-Hispanic",Female,1,Never married,"$100,000 to $149,999",A one-family house detached from any other house,Owned or being bought by you or someone in you...,...,No,No,No,No,Yes,"$2,000 or more",No,No,1500.0,2
2,61,Bachelor's degree,Working full-time,"White, Non-Hispanic",Male,1,Divorced,"$50,000 to $74,999",Building with 2 or more apartments,Owned or being bought by you or someone in you...,...,No,No,No,No,Yes,"$1,000 to $1,999",No,No,700.0,3
3,67,High school graduate (high school diploma or t...,Not working,"White, Non-Hispanic",Female,2,Now married,"$75,000 to $99,999",A one-family house detached from any other house,Owned or being bought by you or someone in you...,...,No,No,No,No,Yes,"$2,000 or more",No,No,1500.0,4
4,59,Some college or Associate's degree,Not working,"White, Non-Hispanic",Female,2,Now married,"$50,000 to $74,999",A one-family house detached from any other house,Owned or being bought by you or someone in you...,...,No,No,No,No,Yes,Under $100,No,No,2000.0,5


In [101]:
# import os

# # 💾 Save cleaned CSV for use in MS4
# output_dir = "../1_datasets/processed_datasets"
# os.makedirs(output_dir, exist_ok=True)
# df_relevant.to_csv(os.path.join(output_dir, "public2024.csv"), index=False)

# print("✅ Cleaned file saved in /1_datasets/processed_datasets/")


In [102]:
# # 💾 Save cleaned CSV for use in MS4
# df_relevant.to_csv("../1_datasets/processed_datasets/public2024.csv", index=False)

# print("✅ Cleaned file saved in /1_datasets/processed_datasets/")
