In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import us
import os
import re
from census import Census
import time
import sys

In [None]:
# -----------------------------------------------------------
# PHASE 0: LOAD THE CITIES FILE
# -----------------------------------------------------------

df = pd.read_csv("cities.csv")

# Split the combined "City,State" field into separate columns.
#
# str.partition splits on the FIRST comma only and always returns exactly
# three columns (text before, the separator, text after) with one row per
# input row. The result therefore stays aligned with df even when a value has
# no comma or is missing. The previous row-by-row loop skipped such rows, which
# made the new columns shorter than df and broke the assignment below.
city_state_parts = df['City'].str.partition(',')

# True only where a comma was actually found (missing values count as False).
has_state = city_state_parts[1].eq(',').fillna(False)

df['City'] = city_state_parts[0].str.strip()
# Rows without a comma get a missing State rather than an empty string, so
# they show up as unmatched in the FIPS merge instead of vanishing silently.
df['State'] = city_state_parts[2].str.strip().where(has_state)

df.head(5)

In [None]:
# -----------------------------------------------------------
# PHASE 1: FIPS MERGE SETUP
# -----------------------------------------------------------

# The Census API key is a credential, so it is read from the environment
# instead of being stored in the notebook. Set it before starting Jupyter:
#   export CENSUS_API_KEY="<your key>"
# NOTE(review): a key was previously committed in this cell; treat it as
# exposed and rotate it.
CENSUS_API_KEY = os.environ.get("CENSUS_API_KEY")
if not CENSUS_API_KEY:
    raise RuntimeError(
        "CENSUS_API_KEY environment variable is not set; "
        "export your Census API key before running this notebook."
    )

# Default data year handed to the Census client.
VINTAGE_YEAR = 2023

# Tab-separated Gazetteer place file used to look up state/place FIPS codes.
GAZETTEER_FILE_PATH = "2024_Gaz_place_national.txt"

# Load Census client
c = Census(CENSUS_API_KEY, year=VINTAGE_YEAR)

def get_fips_lookup_df(file_path):
    """Build a place-name -> FIPS lookup table from a Gazetteer place file.

    The file is read as tab-separated text with a header row. Every column is
    kept as a string so leading zeros in GEOID are preserved.

    Parameters
    ----------
    file_path : str
        Path to the Gazetteer place file. It must provide the columns
        ``USPS``, ``GEOID`` and ``NAME``.

    Returns
    -------
    pandas.DataFrame
        One row per place in the file, with columns:

        * ``Merge_Key``  - "<lower-cased cleaned name>,<USPS value>"
        * ``State_FIPS`` - first two characters of GEOID
        * ``Place_FIPS`` - last five characters of GEOID
        * ``NAME``       - the original, unmodified place name

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    KeyError
        If a required column is missing from the file.
    """

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Gazetteer file NOT found at: {file_path}")

    # Read file
    fips_df = pd.read_csv(file_path, sep="\t", dtype=str)

    # Header cells can carry padding whitespace; strip it so that column
    # lookups by name are reliable.
    fips_df.columns = fips_df.columns.str.strip()

    missing_columns = [
        column for column in ("USPS", "GEOID", "NAME")
        if column not in fips_df.columns
    ]
    if missing_columns:
        raise KeyError(
            f"Gazetteer file {file_path} is missing required column(s): "
            f"{missing_columns}"
        )

    # Extract the 5-digit place FIPS (last 5 digits of GEOID)
    fips_df["Place_FIPS"] = fips_df["GEOID"].str[-5:].str.zfill(5)

    # Extract the two-digit state FIPS (first 2 digits)
    fips_df["State_FIPS"] = fips_df["GEOID"].str[:2].str.zfill(2)

    # Clean NAME: remove a trailing city/town/CDP designation. The optional
    # "(balance)" part covers names such as "Indianapolis city (balance)",
    # which would otherwise keep their suffix and never match a plain city
    # name. Any other designation (e.g. "village") is left in place.
    fips_df["Clean_Name"] = (
        fips_df["NAME"]
        .str.replace(r"\s+(?:city|town|CDP)(?:\s+\(balance\))?$", "", regex=True)
        .str.strip()
        .str.lower()
    )

    # Build merge key: city lowercase + state abbr
    fips_df["Merge_Key"] = fips_df["Clean_Name"] + "," + fips_df["USPS"]

    return fips_df[["Merge_Key", "State_FIPS", "Place_FIPS", "NAME"]]

# Load lookup table
fips_lookup_df = get_fips_lookup_df(GAZETTEER_FILE_PATH)

# A left merge emits one output row per matching lookup row, so a Merge_Key
# that occurs more than once in the lookup would duplicate that city (and
# inflate the counts printed below). Missing keys are removed as well, because
# pandas matches missing keys to each other. The first place listed for a key
# is kept; which duplicate is "right" cannot be decided from the key alone.
fips_lookup_unique = fips_lookup_df.dropna(subset=["Merge_Key"])
duplicate_key_count = int(fips_lookup_unique["Merge_Key"].duplicated().sum())
if duplicate_key_count:
    print(
        f"Note: {duplicate_key_count} duplicate lookup key(s) found; "
        "keeping the first place listed for each."
    )
fips_lookup_unique = fips_lookup_unique.drop_duplicates(subset="Merge_Key", keep="first")

# Prepare city DF for merge.
# The lookup key is "<lower-case name>,<USPS value>"; the state is upper-cased
# here so that inputs such as "ca" still line up with "CA".
df["Merge_Key"] = df["City"].str.lower() + "," + df["State"].str.upper()

# Merge (validate guards the one-row-per-city guarantee established above)
df_merged = df.merge(
    fips_lookup_unique,
    on="Merge_Key",
    how="left",
    validate="many_to_one",
)

# Summary
print("--------------------------------------------------")
print("MERGE SUMMARY")
print("--------------------------------------------------")
print(f"Total cities: {len(df_merged)}")
print(f"Matched to FIPS: {df_merged['Place_FIPS'].notnull().sum()}")
print(f"Missing FIPS: {df_merged['Place_FIPS'].isnull().sum()}")
print("--------------------------------------------------")

# Only cities with a place FIPS code can be queried; .copy() detaches the
# subset from df_merged.
df_data_ready = df_merged.dropna(subset=["Place_FIPS"]).copy()
print("\nData is ready for Census API queries!")

In [None]:
# -----------------------------------------------------------
# PHASE 2: ROBUST CENSUS DATA COLLECTION
# -----------------------------------------------------------

# Variable codes requested for every place. Each code is paired, by position,
# with the friendly column name at the same index in VARIABLE_NAMES.
CENSUS_VARIABLES = [
    "B01003_001E",      # Total Population (Estimate)
    "B19013_001E",      # Median Household Income (Estimate)
    "B25077_001E",      # Median Value (Median Housing Price) - TARGET
    "B25003_002E",      # Owner-Occupied Housing Units (Count)
    "B15003_022E",      # Population with Bachelor's Degree
    "B15003_023E",      # Population with Master's Degree
    "B23025_005E",      # Unemployed population (Estimate)
]

VARIABLE_NAMES = [
    'Total_Population',
    'Median_Household_Income',
    'Median_Housing_Value',
    'Owner_Occupied_Units',
    'Bachelors_Degree_Count',
    'Masters_Degree_Count',
    'Unemployed_Count',
]

# One dict per successfully queried city; turned into a DataFrame at the end.
city_demographics = []
print(f"\nStarting robust Census data collection for {len(df_data_ready)} matched cities...")

for row_label, place_row in df_data_ready.iterrows():
    city_name, state_abbr = place_row['City'], place_row['State']
    state_fips, place_fips = place_row['State_FIPS'], place_row['Place_FIPS']

    # Both halves of the place identifier are needed to build the query.
    if not (state_fips and place_fips):
        continue

    geography = {'for': f'place:{place_fips}', 'in': f'state:{state_fips}'}

    try:
        response = c.acs5.get(CENSUS_VARIABLES, geography)

        # A usable response is a non-empty list whose first item is a dict.
        if not response or not isinstance(response, list) or not isinstance(response[0], dict):
            print(f"Warning: Unexpected API response for {city_name}, {state_abbr}. Skipping.")
        else:
            first_result = response[0]
            record = {'City': city_name, 'State': state_abbr}
            record.update(
                (friendly_name, first_result.get(var_code, None))
                for var_code, friendly_name in zip(CENSUS_VARIABLES, VARIABLE_NAMES)
            )
            city_demographics.append(record)

        # Short pause after every completed request.
        time.sleep(0.5)

    except Exception as exc:
        # Best effort: report the failure, wait longer, and move on to the next city.
        print(f"Critical API error for {city_name}, {state_abbr}: {type(exc).__name__}: {exc}")
        time.sleep(2)

demographics_df = pd.DataFrame(city_demographics)
print(f"\nSuccessfully collected demographic data for {len(demographics_df)} cities.")

In [None]:
# -----------------------------------------------------------
# PHASE 3: FINAL DATAFRAME MERGE AND CLEANUP
# -----------------------------------------------------------

# 1. Merge the new demographic data back onto the original merged DataFrame
#
# reindex guarantees the key and measure columns exist even when no city was
# collected (an empty demographics_df has no columns at all, which would make
# the merge fail). Dropping repeated City/State records keeps the merge
# many-to-one, so no row of df_merged is multiplied.
demographics_unique = (
    demographics_df
    .reindex(columns=['City', 'State'] + VARIABLE_NAMES)
    .drop_duplicates(subset=['City', 'State'], keep='first')
)
df_final = df_merged.merge(
    demographics_unique,
    on=['City', 'State'],
    how='left',
    validate='many_to_one',
)

# 2. Convert all measure columns to numeric
#
# The Census API reports unavailable estimates as large negative sentinel
# numbers (for example -666666666). Every measure collected here is a count
# or a dollar amount, so any negative value is treated as missing rather
# than kept as data.
for col in VARIABLE_NAMES:
    numeric_values = pd.to_numeric(df_final[col], errors='coerce')
    df_final[col] = numeric_values.where(numeric_values >= 0)

# 3. Calculate Derived Features
# A zero population becomes missing so the ratios below are NaN, not inf.
population = df_final['Total_Population'].where(df_final['Total_Population'] > 0)

# Unemployment Rate = Unemployed / Total Population
# NOTE(review): the denominator is the whole population, not the labor force;
# confirm this is the intended definition.
df_final['Unemployment_Rate'] = df_final['Unemployed_Count'] / population

# Educational Attainment = Bachelors or higher / Total Population
# NOTE(review): only the Bachelor's and Master's counts are collected, so
# higher degrees are not part of this sum despite the column name.
df_final['Bachelors_Or_Higher_Rate'] = (
    df_final['Bachelors_Degree_Count'] + df_final['Masters_Degree_Count']
) / population

# 4. Quick Summary
print("\n--- FINAL CENSUS DATA COLLECTION COMPLETE ---")
print(f"Final DataFrame shape: {df_final.shape}. It now contains all original and feature columns.")
print("Feature columns added:", VARIABLE_NAMES + ['Unemployment_Rate', 'Bachelors_Or_Higher_Rate'])

# 5. Show a few key columns for verification
print("\nSample of final DataFrame (key features):")
print(df_final[['City', 'State', 'Median_Housing_Value', 'Median_Household_Income', 
                'Unemployment_Rate', 'Bachelors_Or_Higher_Rate']].head())

In [None]:
# Preview the first five rows of the final table (head() defaults to 5 rows).
df_final.head()

In [None]:
# Write the final table to CSV; index=False leaves the row index out of the file.
OUTPUT_CSV_PATH = 'census2023-2.csv'
df_final.to_csv(OUTPUT_CSV_PATH, index=False)