# Cleaning SCE Credit Access Microdata (2013–2017)

This notebook cleans and prepares the **Survey of Consumer Expectations 
(SCE) Credit Access** microdata from the Federal Reserve Bank of New York 
for analysis.

## Relevance to Our Project
This dataset is highly relevant for modeling credit access patterns, 
credit usage behavior, and the borrowing intentions of individuals 
from 2013 to 2017. It helps us understand borrower characteristics, 
credit barriers, and loan outcomes — critical for assessing **Buy Now 
Pay Later (BNPL) over-indebtedness risk** among Gen Z and other consumers.

---

In [2]:
# Import necessary libraries
from pathlib import Path

import pandas as pd

In [3]:
# Relative path to the raw SCE Credit Access workbook
RAW_DATA_PATH = "../1_datasets/raw_data/FRBNY-SCE-Credit-Access-complete_microdata.xlsx"

# Read the "Data" sheet with header=None so that every spreadsheet row,
# including the source note and the variable-name row, arrives as plain data.
# The real header row is picked out in a later cell.
df = pd.read_excel(RAW_DATA_PATH, sheet_name="Data", header=None)

In [4]:
# Preview the raw frame: row 0 holds the source note, row 1 the variable names
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130,131,132,133,134,135,136,137,138,139
0,"Source: Survey of Consumer Expectations, © 20...",,,,,,,,,,...,,,,,,,,,,
1,userid,date,weight,N1_1,N1_2,N1_3,N1_4,N1_5,N1_6,N1_7,...,N21_2,N21_3,N21_4,N21_5,N21_6,N21_7,N22,N23,N24,N25
2,70000337,201310,5.382,0,0,1,0,0,0,0,...,,,,,,,,,,
3,70000341,201310,0.557,1,1,1,0,1,0,0,...,75,90,95,,85,98,,,,
4,70003202,201310,0.868,0,1,0,0,0,0,0,...,60,95,,,95,,,,,


In [5]:
def _clean_column_name(col):
    """Return a lowercase, identifier-like version of a raw column header.

    Spaces and hyphens become underscores, every other character that is not
    alphanumeric or an underscore is removed, and names that do not start
    with a letter get a "col_" prefix. Missing headers, and headers with
    nothing left after cleaning, become "unnamed_column".
    """
    if pd.isna(col):
        return "unnamed_column"
    text = str(col).strip().replace(" ", "_").replace("-", "_")
    text = "".join(ch for ch in text if ch.isalnum() or ch == "_")
    if not text:
        # A header made only of special characters would otherwise end up
        # as an empty column label.
        return "unnamed_column"
    if not text[0].isalpha():
        text = "col_" + text
    return text.lower()


def _make_unique(names):
    """Return `names` with repeats suffixed (_2, _3, ...) so labels are distinct."""
    used = set()
    unique_names = []
    for name in names:
        candidate, suffix = name, 1
        while candidate in used:
            suffix += 1
            candidate = f"{name}_{suffix}"
        used.add(candidate)
        unique_names.append(candidate)
    return unique_names


# Row 1 of the raw sheet holds the variable names; the data starts at row 2
column_names = df.iloc[1].tolist()
survey_data = df.iloc[2:].reset_index(drop=True)
survey_data.columns = column_names

# Remove completely empty rows and columns, then exact duplicate rows
survey_data = survey_data.dropna(how="all")
survey_data = survey_data.dropna(axis=1, how="all")
survey_data = survey_data.drop_duplicates()

# Normalise the headers. Labels must stay unique because later cells select
# columns one at a time with survey_data[col].
cleaned_column_names = _make_unique(
    [_clean_column_name(col) for col in survey_data.columns]
)
survey_data.columns = cleaned_column_names

In [6]:
# Check the cleaned frame: the headers are now the lowercase variable names
survey_data.head()

Unnamed: 0,userid,date,weight,n1_1,n1_2,n1_3,n1_4,n1_5,n1_6,n1_7,...,n21_2,n21_3,n21_4,n21_5,n21_6,n21_7,n22,n23,n24,n25
0,70000337,201310,5.382,0,0,1,0,0,0,0,...,,,,,,,,,,
1,70000341,201310,0.557,1,1,1,0,1,0,0,...,75.0,90.0,95.0,,85.0,98.0,,,,
2,70003202,201310,0.868,0,1,0,0,0,0,0,...,60.0,95.0,,,95.0,,,,,
3,70003205,201310,0.422,1,1,0,0,0,0,0,...,,,,,,,,,,
4,70003238,201310,0.638,0,1,0,0,0,0,0,...,100.0,100.0,,,100.0,,,,,


In [9]:
# The survey date is stored as YYYYMM; work on its text form
date_text = survey_data["date"].astype(str)

# Keep 'date' as a string and derive integer Year / Month from its digits
survey_data = survey_data.assign(
    date=date_text,
    Year=date_text.str[:4].astype(int),
    Month=date_text.str[4:6].astype(int),
)

# Put 'Year' and 'Month' first, leaving the other columns in their order
leading_columns = ["Year", "Month"]
remaining_columns = [
    name for name in survey_data.columns if name not in leading_columns
]
survey_data = survey_data[leading_columns + remaining_columns]

survey_data.head()

Unnamed: 0,Year,Month,userid,date,weight,n1_1,n1_2,n1_3,n1_4,n1_5,...,n21_2,n21_3,n21_4,n21_5,n21_6,n21_7,n22,n23,n24,n25
0,2013,10,70000337,201310,5.382,0,0,1,0,0,...,,,,,,,,,,
1,2013,10,70000341,201310,0.557,1,1,1,0,1,...,75.0,90.0,95.0,,85.0,98.0,,,,
2,2013,10,70003202,201310,0.868,0,1,0,0,0,...,60.0,95.0,,,95.0,,,,,
3,2013,10,70003205,201310,0.422,1,1,0,0,0,...,,,,,,,,,,
4,2013,10,70003238,201310,0.638,0,1,0,0,0,...,100.0,100.0,,,100.0,,,,,


In [10]:
# Descriptive names for the survey variables, grouped by question number.
# The groupings below follow the target names only; the questionnaire wording
# is not shown in this notebook.

# Respondent / survey identifiers
_identifier_names = {
    "userid": "respondent_id",
    "date": "survey_date",
    "weight": "sampling_weight",
}

# n1_*: credit product indicator flags
_holding_names = {
    "n1_1": "has_credit_card",
    "n1_2": "has_mortgage",
    "n1_3": "has_student_loan",
    "n1_4": "has_home_based_loan",
    "n1_5": "has_auto_loan",
    "n1_6": "has_other_personal_loan",
    "n1_7": "has_no_credit_products",
}

# n2_*: balances in USD
_balance_usd_names = {
    "n2_1": "balance_credit_card_usd",
    "n2_2": "balance_mortgage_usd",
    "n2_3": "balance_student_loan_usd",
    "n2_4": "balance_home_loan_usd",
    "n2_5": "balance_auto_loan_usd",
    "n2_6": "balance_other_loans_usd",
}

# n2b_*: balances as categories
_balance_category_names = {
    "n2b_1": "balance_credit_card_category",
    "n2b_2": "balance_mortgage_category",
    "n2b_3": "balance_student_loan_category",
    "n2b_4": "balance_home_loan_category",
    "n2b_5": "balance_auto_loan_category",
    "n2b_6": "balance_other_loan_category",
}

# n3 and n4_*: credit usage and applications/requests in the past year
_application_names = {
    "n3": "maxed_out_credit_card_past_year",
    "n4_1": "applied_credit_card_past_year",
    "n4_2": "applied_mortgage_past_year",
    "n4_3": "applied_auto_loan_past_year",
    "n4_4": "requested_credit_card_limit_increase",
    "n4_5": "requested_existing_loan_limit_increase",
    "n4_6": "requested_mortgage_refinance",
    "n4_7": "applied_student_loan_past_year",
}

# n5_*: reasons for not applying
_no_application_reason_names = {
    "n5_1": "did_not_apply_satisfied_financial",
    "n5_2": "did_not_apply_time_consuming",
    "n5_3": "did_not_apply_rates_too_high",
    "n5_4": "did_not_apply_dont_know_how",
    "n5_5": "did_not_apply_expected_denial",
}

# n6_*: products not applied for because a denial was expected
_expected_denial_names = {
    "n6_1": "did_not_apply_credit_card_expected_denial",
    "n6_2": "did_not_apply_mortgage_expected_denial",
    "n6_3": "did_not_apply_auto_loan_expected_denial",
    "n6_4": "did_not_apply_cc_limit_increase_expected_denial",
    "n6_5": "did_not_apply_loan_limit_increase_expected_denial",
    "n6_6": "did_not_apply_refinance_expected_denial",
    "n6_7": "did_not_apply_student_loan_expected_denial",
    "n6_8": "none_n6_options_applicable",
}

# n7_*: labelled here as redundant with the n6_* items
_redundant_expected_denial_names = {
    "n7_1": "redundant_did_not_apply_cc_expected_denial",
    "n7_2": "redundant_did_not_apply_mortgage_expected_denial",
    "n7_3": "redundant_did_not_apply_auto_loan_expected_denial",
    "n7_4": "redundant_did_not_apply_cc_limit_increase_expected_denial",
    "n7_5": "redundant_did_not_apply_loan_limit_increase_expected_denial",
    "n7_6": "redundant_did_not_apply_refinance_expected_denial",
    "n7_7": "redundant_did_not_apply_student_loan_expected_denial",
}

# Merge the groups into the single mapping used for the rename
column_rename_map = {
    **_identifier_names,
    **_holding_names,
    **_balance_usd_names,
    **_balance_category_names,
    **_application_names,
    **_no_application_reason_names,
    **_expected_denial_names,
    **_redundant_expected_denial_names,
}

# Columns without an entry in the mapping keep their cleaned names
survey_data = survey_data.rename(columns=column_rename_map)

In [11]:
# Count missing values per column, then keep only the columns that have none.
# NOTE(review): this also discards partially observed variables. In the saved
# run, sampling_weight is not among the 14 columns kept. Confirm this is intended.
missing_per_column = survey_data.isnull().sum()
complete_columns = missing_per_column[missing_per_column == 0].index.tolist()
complete_cols_data = survey_data[complete_columns]

print(f"Shape with only complete columns: {complete_cols_data.shape}")

# Safeguard: drop any rows that are entirely empty (none are expected here)
complete_cols_data = complete_cols_data.dropna(how="all")

Shape with only complete columns: (34362, 14)


In [12]:
# Preview the columns that survived the no-missing-values filter
complete_cols_data.head()

Unnamed: 0,Year,Month,respondent_id,survey_date,has_credit_card,has_mortgage,has_student_loan,has_home_based_loan,has_auto_loan,has_other_personal_loan,has_no_credit_products,n14_1,n14_2,n14_3
0,2013,10,70000337,201310,0,0,1,0,0,0,0,0,1,0
1,2013,10,70000341,201310,1,1,1,0,1,0,0,0,0,1
2,2013,10,70003202,201310,0,1,0,0,0,0,0,0,0,1
3,2013,10,70003205,201310,1,1,0,0,0,0,0,0,0,1
4,2013,10,70003238,201310,0,1,0,0,0,0,0,0,0,1


## Save Cleaned Data

We now export the cleaned dataset to the 
`1_datasets/processed_data` folder. This cleaned version will be 
used in Milestone 4 for further exploration and 
comparative modeling.

In [13]:
# Destination of the cleaned CSV inside the processed_data folder
PROCESSED_DATA_PATH = Path(
    "../1_datasets/processed_data/FRBNY_SCE_Credit_Access_cleaned.csv"
)

# to_csv does not create missing folders, so make sure the target folder
# exists before writing (e.g. on a fresh checkout).
PROCESSED_DATA_PATH.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the file
complete_cols_data.to_csv(PROCESSED_DATA_PATH, index=False)

print("Saved cleaned dataset.")

Saved cleaned dataset.
