In [9]:
import zipfile
from pathlib import Path

import pandas as pd

# Path to your ZIP file
zip_path = 'input_data1/hmda_2007_nationwide_all-records_labels.zip'

# Columns to keep (based on what exists in 2007 data)
keep_cols = [
    'state_code',
    'county_code',
    'census_tract_number',
    'loan_amount_000s',
    'loan_purpose_name',
    'owner_occupancy_name',
    'action_taken_name',
    'applicant_income_000s',
    'property_type_name'
]

# Create the output folder up front so a missing directory fails (or is fixed)
# before the expensive read instead of after it.
output_path = Path('cleaned_data/hmda_2007_filtered.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# List to hold processed chunks
chunks = []

# Open and process file. `usecols` makes the parser skip every column that is
# not in keep_cols, so the unused columns are never materialised in memory.
with zipfile.ZipFile(zip_path) as z:
    with z.open('hmda_2007_nationwide_all-records_labels.csv') as f:
        reader = pd.read_csv(
            f,
            usecols=keep_cols,
            chunksize=500_000,
            low_memory=False,
        )
        for chunk in reader:
            # usecols keeps the file's column order, not the list's order;
            # re-select so the output columns follow keep_cols as before.
            chunks.append(chunk[keep_cols])

# Combine all chunks into one DataFrame
hmda_filtered = pd.concat(chunks, ignore_index=True)

# Save the cleaned data
hmda_filtered.to_csv(output_path, index=False)

print("Cleaned file saved to 'cleaned_data/hmda_2007_filtered.csv'")


Cleaned file saved to 'cleaned_data/hmda_2007_filtered.csv'


In [18]:
# Archive holding the raw HMDA 2007 export
zip_path = 'input_data1/hmda_2007_nationwide_all-records_labels.zip'

# Peek inside the archive: list its members, then read only the first few rows
# of the first member to discover which columns the CSV provides.
with zipfile.ZipFile(zip_path) as archive:
    members = archive.namelist()
    print("Files in zip:", members)

    first_member = members[0]
    with archive.open(first_member) as handle:
        df_sample = pd.read_csv(handle, nrows=5)

        # One header per line
        print("\nColumn Names:\n")
        for column_name in df_sample.columns:
            print(column_name)

Files in zip: ['hmda_2007_nationwide_all-records_labels.csv']

Column Names:

as_of_year
respondent_id
agency_name
agency_abbr
agency_code
loan_type_name
loan_type
property_type_name
property_type
loan_purpose_name
loan_purpose
owner_occupancy_name
owner_occupancy
loan_amount_000s
preapproval_name
preapproval
action_taken_name
action_taken
msamd_name
msamd
state_name
state_abbr
state_code
county_name
county_code
census_tract_number
applicant_ethnicity_name
applicant_ethnicity
co_applicant_ethnicity_name
co_applicant_ethnicity
applicant_race_name_1
applicant_race_1
applicant_race_name_2
applicant_race_2
applicant_race_name_3
applicant_race_3
applicant_race_name_4
applicant_race_4
applicant_race_name_5
applicant_race_5
co_applicant_race_name_1
co_applicant_race_1
co_applicant_race_name_2
co_applicant_race_2
co_applicant_race_name_3
co_applicant_race_3
co_applicant_race_name_4
co_applicant_race_4
co_applicant_race_name_5
co_applicant_race_5
applicant_sex_name
applicant_sex
co_applicant_se

In [20]:
# Archive holding the 2020 Census Gazetteer tract file
zip_path = 'input_data1/2020_Gaz_tracts_national.zip'

# Peek inside the archive: list its members, then read only the first few rows
# of the first member (a tab-separated text file) to discover its columns.
with zipfile.ZipFile(zip_path) as archive:
    members = archive.namelist()
    print("Files in zip:", members)

    first_member = members[0]
    with archive.open(first_member) as handle:
        df_sample = pd.read_csv(handle, sep='\t', nrows=5)

        # One header per line
        print("\nColumn Names:\n")
        for column_name in df_sample.columns:
            print(column_name)

Files in zip: ['2020_Gaz_tracts_national.txt']

Column Names:

USPS
GEOID
ALAND
AWATER
ALAND_SQMI
AWATER_SQMI
INTPTLAT
INTPTLONG                                                                                                                             


In [None]:
import pandas as pd
import zipfile

# Step 1: Load the Gazetteer tract file from ZIP.
# GEOID is an identifier, not a quantity, so it is read as text; parsed as an
# integer it would lose any leading zero.
zip_path = 'input_data1/2020_Gaz_tracts_national.zip'
with zipfile.ZipFile(zip_path) as z:
    with z.open('2020_Gaz_tracts_national.txt') as f:
        tracts = pd.read_csv(f, delimiter='\t', dtype={'GEOID': str})

# Some headers in this file carry trailing whitespace, so normalise them first.
tracts.columns = tracts.columns.str.strip()
tracts = tracts[['GEOID', 'INTPTLAT', 'INTPTLONG']].rename(columns={
    'GEOID': 'tract_geoid',
    'INTPTLAT': 'latitude',
    'INTPTLONG': 'longitude'
})
# Defensive normalisation: GEOID is expected to be 11 characters
# (2-digit state + 3-digit county + 6-digit tract) per the Gazetteer layout.
tracts['tract_geoid'] = (
    tracts['tract_geoid'].astype(str).str.strip().str.zfill(11)
)

# Step 2: Load your cleaned HMDA data
hmda = pd.read_csv('cleaned_data/hmda_2007_filtered.csv')

# Step 3: Build an 11-character GEOID for every HMDA row.
# census_tract_number alone cannot match the Gazetteer key: it is only the
# tract part, so it must be combined with the state and county codes.
# NOTE(review): assumes state_code / county_code are the 2- and 3-digit FIPS
# codes and census_tract_number is in decimal form (e.g. 201.02 -> "020102");
# confirm against the HMDA file specification.
state_num = pd.to_numeric(hmda['state_code'], errors='coerce')
county_num = pd.to_numeric(hmda['county_code'], errors='coerce')
tract_num = pd.to_numeric(hmda['census_tract_number'], errors='coerce')

# Only rows with all three parts present can be geocoded.
has_geography = state_num.notna() & county_num.notna() & tract_num.notna()


def _zero_padded(numbers, width):
    """Return the complete rows of `numbers` as zero-padded integer strings."""
    whole = numbers[has_geography].round().astype('int64')
    return whole.astype(str).str.zfill(width)


# Tract numbers carry two implied decimals, hence the factor of 100.
# Rows without full geography are left as NaN by the reindex.
hmda['tract_geoid'] = (
    _zero_padded(state_num, 2)
    + _zero_padded(county_num, 3)
    + _zero_padded(tract_num * 100, 6)
).reindex(hmda.index)

# Step 4: Merge HMDA + lat/long.
# validate='many_to_one' raises if the tract table ever contains duplicate
# GEOIDs, which would otherwise silently duplicate loan records.
merged = hmda.merge(tracts, on='tract_geoid', how='left', validate='many_to_one')

# A left join never fails loudly, so report how many rows found coordinates.
# NOTE(review): the HMDA file is from 2007 and the tract file is the 2020
# vintage; tracts that were renumbered or split in between will not match.
# Confirm whether a tract file of the matching vintage should be used instead.
match_rate = merged['latitude'].notna().mean()
print(f"Rows matched to a tract centroid: {match_rate:.1%}")

# Step 5: Save final merged dataset
merged.to_csv('cleaned_data/hmda_2007_with_latlong.csv', index=False)

print("Merged file saved as: cleaned_data/hmda_2007_with_latlong.csv")