# Bus Need Classifier: Frontend_Data_Creation_V2

*Updates from V1: Condenses all the code into one block for easy copy-paste; properly generates one pandas dataframe row with 7 columns for a home-school address pair.*

This notebook experiments with APIs to figure out how to gather data to populate all columns of an entry based on the home address and school address.

Coordinates from address.

In [23]:
import requests

def geocode_address_census(address):
    """
    Geocode a one-line street address with the U.S. Census Bureau geocoder.

    Parameters:
        address (str): Free-form address, e.g. "7535 Northland Avenue, San Ramon, CA".

    Returns:
        tuple: (lat, lon), read from the `y` and `x` fields of the first match's
        `coordinates` entry.

    Raises:
        ValueError: if the geocoder returns no match for the address.
        requests.HTTPError: if the geocoder answers with an HTTP error status.
    """
    url = "https://geocoding.geo.census.gov/geocoder/locations/onelineaddress"

    params = {
        "address": address,
        "benchmark": "Public_AR_Current",
        "format": "json"
    }

    # Bound the wait and surface HTTP failures directly instead of letting an
    # error page fail later inside .json() or the key lookups below.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    matches = response.json()["result"]["addressMatches"]

    if len(matches) == 0:
        raise ValueError("Address not found.")

    coords = matches[0]["coordinates"]
    return coords["y"], coords["x"]   # (lat, lon)


State, county, tract, block group identifiers based on coordinates.

In [None]:
def get_fips_from_coords(lat, lon):
    """
    Given lat/lon, return:
    - state FIPS
    - county FIPS
    - tract code
    - block group code

    The values are sliced out of the 15-digit census block FIPS code returned by
    the FCC block-find API.

    Raises:
        ValueError: if the response carries no block FIPS code for the point
            (presumably coordinates outside U.S. census geography -- confirm
            against the FCC API documentation).
        requests.HTTPError: if the API answers with an HTTP error status.
    """
    url = "https://geo.fcc.gov/api/census/block/find"
    params = {"latitude": lat, "longitude": lon, "format": "json"}
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()

    # Guard against a missing/null FIPS so the caller gets a clear error instead
    # of a TypeError from slicing None.
    block_fips = (response.json().get("Block") or {}).get("FIPS")
    if not block_fips:
        raise ValueError(f"No census block found for coordinates ({lat}, {lon}).")

    state_fips = block_fips[:2]       # 2 digits
    county_fips = block_fips[2:5]     # 3 digits
    tract = block_fips[5:11]          # 6 digits
    block = block_fips[11:]           # 4 digits
    block_group = block[0]            # 1 digit: first digit of the block code

    return state_fips, county_fips, tract, block_group


Census division.

In [None]:
# Census division name keyed by 2-digit state FIPS code (50 states plus DC),
# built from one (division, member FIPS codes) group per division.
STATE_TO_DIVISION = {
    fips_code: division_name
    for division_name, member_codes in (
        ("New England", ("09", "23", "25", "33", "44", "50")),
        ("Middle Atlantic", ("34", "36", "42")),
        ("East North Central", ("17", "18", "26", "39", "55")),
        ("West North Central", ("19", "20", "27", "29", "31", "38", "46")),
        ("South Atlantic", ("10", "11", "12", "13", "24", "37", "45", "51", "54")),
        ("East South Central", ("01", "21", "28", "47")),
        ("West South Central", ("05", "22", "40", "48")),
        ("Mountain", ("04", "08", "16", "30", "32", "35", "49", "56")),
        ("Pacific", ("02", "06", "15", "41", "53")),
    )
    for fips_code in member_codes
}

def get_division(state_fips):
    """Look up the Census division name for a 2-digit state FIPS code.

    Raises KeyError when the code is not a key of STATE_TO_DIVISION.
    """
    division_name = STATE_TO_DIVISION[state_fips]
    return division_name

Median income by block group.

In [None]:
def get_blockgroup_median_income(state_fips, county_fips, tract, block_group, api_key=None):
    """
    Get median household income for a block group from ACS5 (2022).

    Parameters:
        state_fips (str): 2-digit state FIPS
        county_fips (str): 3-digit county FIPS
        tract (str): 6-digit census tract
        block_group (str): 1-digit block group number
        api_key (str, optional): Your Census API key. Default None.

    Returns:
        int or str: median household income estimate, or "No data" when the API
        returns an empty/null value or a negative placeholder instead of an estimate.

    Raises:
        requests.HTTPError: if the Census API answers with an HTTP error status.
    """
    base_url = "https://api.census.gov/data/2022/acs/acs5"
    var = "B19013_001E"  # median household income estimate
    url = f"{base_url}?get={var}&for=block group:{block_group}&in=state:{state_fips}+county:{county_fips}+tract:{tract}"

    if api_key:
        url += f"&key={api_key}"

    response = requests.get(url, timeout=30)
    response.raise_for_status()  # raise exception if bad response

    data = response.json()
    # data[0] is header, data[1] is the actual value
    raw_value = data[1][0]

    # Check for a missing value BEFORE converting: int(None) / int("") would raise.
    if raw_value in [None, "", "null"]:
        return "No data"

    median_income = int(raw_value)
    # ACS reports unavailable estimates as large negative placeholders
    # (e.g. -666666666); a real median income is never negative.
    if median_income < 0:
        return "No data"

    return median_income


Race and Hispanic-origin counts by block group.

In [None]:
def get_race_hisp_counts(state_fips, county_fips, tract, block_group, api_key=None):
    """
    Get ACS5 (2022) race and Hispanic counts for a block group.

    Parameters:
        state_fips (str): 2-digit state FIPS
        county_fips (str): 3-digit county FIPS
        tract (str): 6-digit census tract
        block_group (str): 1-digit block group number
        api_key (str, optional): Your Census API key. Default None.

    Returns:
        dict: response column name -> int value. Besides the requested variables
        this includes the geography columns the API echoes back (e.g. "state",
        "county"), also converted to int. Columns whose value is null are omitted.

    Raises:
        requests.HTTPError: if the Census API answers with an HTTP error status.
    """
    # ACS5 table variables
    race_vars = [
        "B02001_002E",  # White alone
        "B02001_003E",  # Black or African American alone
        "B02001_004E",  # American Indian and Alaska Native alone
        "B02001_005E",  # Asian alone
        "B02001_006E",  # Native Hawaiian and Other Pacific Islander alone
        "B02001_007E",  # Some other race alone
        "B02001_008E",  # Two or more races
    ]
    hisp_vars = [
        "B03003_002E",  # Not Hispanic
        "B03003_003E"   # Hispanic
    ]

    all_vars = race_vars + hisp_vars
    var_str = ",".join(all_vars)

    url = "https://api.census.gov/data/2022/acs/acs5"
    params = {
        "get": var_str,
        "for": f"block group:{block_group}",
        "in": f"state:{state_fips}+county:{county_fips}+tract:{tract}",
    }
    if api_key:
        params["key"] = api_key

    response = requests.get(url, params=params, timeout=30)
    # Fail on HTTP errors here rather than with an opaque JSON/Index error below.
    response.raise_for_status()
    data = response.json()

    # First row is headers, second row is values
    counts = dict(zip(data[0], data[1]))
    # Convert string counts to int; skip nulls, which int() cannot convert.
    counts = {k: int(v) for k, v in counts.items() if v is not None}

    return counts


MSA status and code.

In [91]:
import pandas as pd

from functools import lru_cache


@lru_cache(maxsize=None)
def _load_cbsa_crosswalk(url):
    """Download and parse the CBSA delineation workbook once per URL, then reuse it."""
    return pd.read_excel(url, dtype=str, header=2)


def get_MSA_status(state_fips, county_fips):
    """
    Look up the CBSA that contains a county.

    Parameters:
        state_fips (str): 2-digit state FIPS
        county_fips (str): 3-digit county FIPS

    Returns:
        tuple: (CBSA code, value of the "Metropolitan/Micropolitan Statistical
        Area" column) from the first delineation-file row matching the county.

    Raises:
        IndexError: if no row of the delineation file matches the county.
    """
    url = "https://www2.census.gov/programs-surveys/metro-micro/geographies/reference-files/2023/delineation-files/list1_2023.xlsx"
    # Cached: the workbook is identical for every lookup, so download it only once.
    cbsa_crosswalk = _load_cbsa_crosswalk(url)
    val = cbsa_crosswalk[
        (cbsa_crosswalk["FIPS State Code"] == state_fips)
        & (cbsa_crosswalk["FIPS County Code"] == county_fips)
    ]
    if val.empty:
        # Same exception type the unguarded [0] lookup produced, with a usable message.
        raise IndexError(f"County {state_fips}{county_fips} is not listed in any CBSA.")
    return val["CBSA Code"].tolist()[0], val["Metropolitan/Micropolitan Statistical Area"].tolist()[0]


MSA Size

In [None]:
def get_population_by_cbsa(cbsa_code):
    """
    Fetch the ACS5 (2022) total population estimate (B01003_001E) for a CBSA.

    Parameters:
        cbsa_code (str): CBSA code of the metropolitan/micropolitan area.

    Returns:
        int: the population value from the first data row of the response.

    Raises:
        requests.HTTPError: if the request failed.
    """
    endpoint = "https://api.census.gov/data/2022/acs/acs5"
    geography = f"metropolitan statistical area/micropolitan statistical area:{cbsa_code}"
    query = {"get": "B01003_001E", "for": geography}

    reply = requests.get(endpoint, params=query)
    reply.raise_for_status()

    # Row 0 holds the column names; row 1 holds the values.
    rows = reply.json()
    return int(rows[1][0])


Urban/Rural classification by coordinates.

In [None]:
import geopandas as gpd
import requests
import zipfile
import io
from shapely.geometry import Point

# ----------------------------------------------------
# Configuration: URL of the shapefile ZIP; used as the default `url` argument
# of load_urban_areas_gdf below.
UAC20_URL = "https://www2.census.gov/geo/tiger/TIGER2020/UAC/tl_2020_us_uac20.zip"

# ----------------------------------------------------
# Helper to load shapefile into GeoDataFrame
def load_urban_areas_gdf(url=UAC20_URL):
    """
    Download a zipped shapefile and load it as a GeoDataFrame in WGS84 (EPSG:4326).

    The archive is downloaded into memory, extracted to /tmp/tl_uac20 on disk,
    and the first .shp member is read with GeoPandas.

    Parameters:
        url (str): URL of the shapefile ZIP. Defaults to UAC20_URL.

    Returns:
        GeoDataFrame: the shapefile contents reprojected to EPSG:4326.

    Raises:
        requests.HTTPError: if the download answers with an HTTP error status.
        IndexError: if the archive contains no .shp file.
    """
    # Download zip into bytes
    r = requests.get(url, timeout=60)
    r.raise_for_status()

    extract_dir = "/tmp/tl_uac20"
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        # Find the .shp file name inside the zip
        shapefile_name = [f for f in z.namelist() if f.endswith(".shp")][0]

        # Extract every member to disk: the .shp is read together with its
        # sidecar files from the same directory.
        z.extractall(extract_dir)

    # Load with GeoPandas
    gdf = gpd.read_file(f"{extract_dir}/{shapefile_name}")
    # Ensure it's in WGS84 lat/lon
    gdf = gdf.to_crs(epsg=4326)
    return gdf

# Load once when this cell runs; classify_urban below binds this GeoDataFrame as
# its default `gdf` argument, so the shapefile is not re-downloaded per lookup.
urban_gdf = load_urban_areas_gdf()

# ----------------------------------------------------
def classify_urban(lat, lon, gdf=urban_gdf):
    """
    Classify a coordinate as urban or rural against a set of urban-area polygons.

    Returns a 2-tuple:
      - ("Urban", <NAME20 of the first polygon containing the point>) when the
        point lies inside at least one polygon of `gdf`
      - ("Rural", None) otherwise
    """
    location = Point(lon, lat)  # shapely points are (x, y) = (lon, lat)
    containing = gdf[gdf.contains(location)]

    # No polygon contains the point: not in an urban area.
    if containing.empty:
        return "Rural", None

    return "Urban", containing.iloc[0]["NAME20"]

Miles to school.

In [26]:

# Get a free key from https://openrouteservice.org/sign-up/
# Supply it through the ORS_API_KEY environment variable rather than editing the notebook.
# FIXME(security): the fallback literal below is a credential committed to this
# notebook. It is kept only so existing runs keep working; rotate the key and
# delete the fallback.
import os

API_KEY = os.environ.get(
    "ORS_API_KEY",
    "eyJvcmciOiI1YjNjZTM1OTc4NTExMTAwMDFjZjYyNDgiLCJpZCI6ImFiMTU3YmFjMzYxNzQ3MGRhZGY5ZWQ4MTFmOTE0ZGZiIiwiaCI6Im11cm11cjY0In0=",
)

def get_driving_distance_ors(address1, address2):
    """
    Returns driving distance in miles and duration in minutes using OpenRouteService.

    Both addresses are geocoded with the ORS geocoder, then a driving-car route
    is requested between the two points. The route summary's distance is
    divided by 1000 and multiplied by 0.621371 (miles per km); its duration is
    divided by 60.

    Parameters:
        address1 (str): start address
        address2 (str): destination address

    Returns:
        tuple: (distance_mi, duration_min)

    Raises:
        ValueError: if either address cannot be geocoded or no route is returned.
        requests.HTTPError: if ORS answers with an HTTP error status.
    """
    # First, geocode addresses using ORS
    def geocode(address):
        url = "https://api.openrouteservice.org/geocode/search"
        params = {"api_key": API_KEY, "text": address, "size": 1}
        resp = requests.get(url, params=params, timeout=30)
        resp.raise_for_status()
        features = resp.json().get("features") or []
        if len(features) == 0:
            raise ValueError(f"Address not found: {address}")
        coords = features[0]["geometry"]["coordinates"]  # [lon, lat]
        return coords

    start_coords = geocode(address1)
    end_coords = geocode(address2)

    # Call directions endpoint
    url = "https://api.openrouteservice.org/v2/directions/driving-car"
    headers = {"Authorization": API_KEY, "Content-Type": "application/json"}
    body = {
        "coordinates": [start_coords, end_coords]
    }
    resp = requests.post(url, json=body, headers=headers, timeout=30)
    resp.raise_for_status()
    routes = resp.json().get("routes") or []
    if len(routes) == 0:
        # Without this guard a payload lacking routes surfaces as a bare KeyError/IndexError.
        raise ValueError(f"No driving route found between {address1!r} and {address2!r}")

    route = routes[0]["summary"]
    distance_mi = route["distance"] / 1000 * 0.621371
    duration_min = route["duration"] / 60

    return distance_mi, duration_min

We need to minimize our API calls. We can combine our three calls to the Census API into two calls and keep everything else the same.

In [95]:
def get_census_data(state_fips, county_fips, tract, block_group, api_key=None):
    """
    Fetch block group median income, race/Hispanic counts, and CBSA total population
    in one function call.

    Parameters:
        state_fips (str): 2-digit state FIPS
        county_fips (str): 3-digit county FIPS
        tract (str): 6-digit tract code
        block_group (str): 1-digit block group code
        api_key (str, optional): Your Census API key

    Returns:
        dict: {
            "median_income": int, or "No data" when the estimate is missing or a
                negative placeholder,
            "race_counts": dict of race/Hispanic counts (plus the geography
                columns echoed by the API), all as int,
            "cbsa_population": int, or "Not in MSA/CMSA" when the county is in a
                Micropolitan area or in no CBSA at all
        }

    Raises:
        requests.HTTPError: if a Census API request answers with an HTTP error status.
    """
    # ---------- Block group: median income + race/Hispanic ----------
    base_bg = "https://api.census.gov/data/2022/acs/acs5"

    race_vars = [
        "B02001_002E",  # White alone
        "B02001_003E",  # Black or African American alone
        "B02001_004E",  # American Indian and Alaska Native alone
        "B02001_005E",  # Asian alone
        "B02001_006E",  # Native Hawaiian and Other Pacific Islander alone
        "B02001_007E",  # Some other race alone
        "B02001_008E",  # Two or more races
    ]
    hisp_vars = [
        "B03003_002E",  # Not Hispanic
        "B03003_003E"   # Hispanic
    ]

    all_vars = ["B19013_001E"] + race_vars + hisp_vars
    var_str = ",".join(all_vars)

    params_bg = {
        "get": var_str,
        "for": f"block group:{block_group}",
        "in": f"state:{state_fips}+county:{county_fips}+tract:{tract}"
    }
    if api_key:
        params_bg["key"] = api_key

    response_bg = requests.get(base_bg, params=params_bg, timeout=30)
    response_bg.raise_for_status()
    data_bg = response_bg.json()

    # Median income: ACS reports unavailable estimates as large negative
    # placeholders (e.g. -666666666); treat those like a missing value so they
    # are not later binned as a very low income.
    median_income_raw = data_bg[1][0]
    median_income = "No data"
    if median_income_raw not in [None, "", "null"]:
        parsed_income = int(median_income_raw)
        if parsed_income >= 0:
            median_income = parsed_income

    # Race/Hispanic counts (skip the median income column; skip null values,
    # which int() cannot convert)
    counts_raw = dict(zip(data_bg[0][1:], data_bg[1][1:]))
    race_counts = {k: int(v) for k, v in counts_raw.items() if v is not None}

    # ---------- CBSA population ----------
    try:
        cbsa_code, msa_status = get_MSA_status(state_fips, county_fips)
    except IndexError:
        # get_MSA_status finds no row for counties outside every CBSA.
        cbsa_code, msa_status = None, None

    if not cbsa_code or not isinstance(msa_status, str) or "Micropolitan" in msa_status:
        return {
            "median_income": median_income,
            "race_counts": race_counts,
            "cbsa_population": "Not in MSA/CMSA"
        }

    base_cbsa = "https://api.census.gov/data/2022/acs/acs5"
    params_cbsa = {
        "get": "B01003_001E",
        "for": f"metropolitan statistical area/micropolitan statistical area:{cbsa_code}"
    }
    if api_key:
        params_cbsa["key"] = api_key

    response_cbsa = requests.get(base_cbsa, params=params_cbsa, timeout=30)
    response_cbsa.raise_for_status()
    data_cbsa = response_cbsa.json()
    cbsa_population = int(data_cbsa[1][0])

    # ---------- Combine results ----------
    return {
        "median_income": median_income,
        "race_counts": race_counts,
        "cbsa_population": cbsa_population
    }



{'median_income': 204552, 'race_counts': {'B02001_002E': 1028, 'B02001_003E': 70, 'B02001_004E': 32, 'B02001_005E': 394, 'B02001_006E': 0, 'B02001_007E': 82, 'B02001_008E': 374, 'B03003_002E': 1459, 'B03003_003E': 521, 'state': 6, 'county': 13, 'tract': 345101, 'block group': 2}, 'cbsa_population': 4692242}


Now we have everything we need. Let's create some logic to map everything properly. Our columns are ['SCHOOL_DISTANCE', 'CENSUS_REGION', 'HHFAMINC',
       'HH_RACE', 'HH_HISP', 'MSASIZE', 'URBRUR']. We need to map all of our API call results to the right values in these columns.

In [98]:
# (exclusive upper bound, label) pairs in ascending order; values at or above
# the last bound get the open-ended top label passed to _bin_label.
_INCOME_BINS = (
    (10000, "Less than $10,000"),
    (15000, "$10,000 to $14,999"),
    (25000, "$15,000 to $24,999"),
    (35000, "$25,000 to $34,999"),
    (50000, "$35,000 to $49,999"),
    (75000, "$50,000 to $74,999"),
    (100000, "$75,000 to $99,999"),
    (125000, "$100,000 to $124,999"),
    (150000, "$125,000 to $149,999"),
    (200000, "$150,000 to $199,999"),
)
_MSA_SIZE_BINS = (
    (250000, "In an MSA of Less than 250,000"),
    (500000, "In an MSA of 250,000 - 499,999"),
    (1000000, "In an MSA of 500,000 - 999,999"),
    (3000000, "In an MSA or CMSA of 1,000,000 - 2,999,999"),
)


def _bin_label(value, bins, top_label):
    """Return the label of the first bin whose upper bound exceeds value, else top_label."""
    for upper_bound, label in bins:
        if value < upper_bound:
            return label
    return top_label


def vector_generator(home_address, school_address):
    """
    Build the 7-feature record for one home/school address pair.

    Parameters:
        home_address (str): home address; geocoded for all census lookups.
        school_address (str): school address; used only for the driving distance.

    Returns:
        dict with keys SCHOOL_DISTANCE (miles), CENSUS_REGION, HHFAMINC, HH_RACE,
        HH_HISP, MSASIZE and URBRUR. HHFAMINC, HH_RACE and HH_HISP are "No data"
        when the underlying census values are missing or all zero.
    """
    lat, lon = geocode_address_census(home_address)
    state_fips, county_fips, tract, block_group = get_fips_from_coords(lat, lon)

    # school distance (duration is returned too but is not a feature)
    distance_mi, _duration_min = get_driving_distance_ors(home_address, school_address)

    # census division
    census_region = STATE_TO_DIVISION[state_fips]

    # race, hisp, median income and CBSA population
    census_data = get_census_data(state_fips, county_fips, tract, block_group)
    counts = census_data['race_counts']

    # race data -- B02001_004E is American Indian/Alaska Native and B02001_005E
    # is Asian in ACS table B02001.
    race_mapping = {
        "White": counts.get("B02001_002E", 0),
        "Black or African American": counts.get("B02001_003E", 0),
        "Asian": counts.get("B02001_005E", 0),
        "American Indian or Alaska Native": counts.get("B02001_004E", 0),
        "Native Hawaiian or other Pacific Islander": counts.get("B02001_006E", 0),
        "Other": counts.get("B02001_007E", 0) + counts.get("B02001_008E", 0)
    }
    # Choose the race with the highest count; with no population at all there
    # is nothing to choose from (max() would otherwise default to "White").
    if any(race_mapping.values()):
        hh_race = max(race_mapping, key=race_mapping.get)
    else:
        hh_race = "No data"

    # hisp data: 003E = Hispanic, 002E = Not Hispanic
    hispanic = counts.get("B03003_003E", 0)
    not_hispanic = counts.get("B03003_002E", 0)
    if hispanic == 0 and not_hispanic == 0:
        hh_hisp = "No data"
    elif hispanic > not_hispanic:
        hh_hisp = "Yes"
    else:
        hh_hisp = "No"

    # median income; negative values are ACS placeholders, not incomes
    median_income = census_data['median_income']
    hhfaminc = "No data"
    if isinstance(median_income, int) and median_income >= 0:
        hhfaminc = _bin_label(median_income, _INCOME_BINS, "$200,000 or more")

    # MSA size; a non-int population means the home is not in an MSA
    cbsa_population = census_data['cbsa_population']
    msasize = "Not in MSA or CMSA"
    if isinstance(cbsa_population, int):
        msasize = _bin_label(cbsa_population, _MSA_SIZE_BINS, "In an MSA or CMSA of 3 million or more")

    # Urban/Rural (the urban area name is not a feature)
    urban_rural, _area_name = classify_urban(lat, lon)

    final_vector = {
        'SCHOOL_DISTANCE': distance_mi,
        'CENSUS_REGION': census_region,
        'HHFAMINC': hhfaminc,
        'HH_RACE': hh_race,
        'HH_HISP': hh_hisp,
        'MSASIZE': msasize,
        'URBRUR': urban_rural,
    }
    return final_vector



{'bbox': [-121.94851, 37.730525, -121.936961, 37.748871], 'routes': [{'summary': {'distance': 2632.0, 'duration': 248.1}, 'segments': [{'distance': 2632.0, 'duration': 248.1, 'steps': [{'distance': 348.8, 'duration': 83.7, 'type': 11, 'instruction': 'Head southeast on Northland Avenue', 'name': 'Northland Avenue', 'way_points': [0, 8]}, {'distance': 2283.3, 'duration': 164.4, 'type': 0, 'instruction': 'Turn left onto Broadmoor Drive', 'name': 'Broadmoor Drive', 'way_points': [8, 66]}, {'distance': 0.0, 'duration': 0.0, 'type': 10, 'instruction': 'Arrive at Broadmoor Drive, straight ahead', 'name': '-', 'way_points': [66, 66]}]}], 'bbox': [-121.94851, 37.730525, -121.936961, 37.748871], 'geometry': 'ykheF~mggV?AhCuHPi@BW?o@Ks@Yq@mBoC_A`A{@r@{@h@}@Zi@F[@m@Ei@Ks@Ga@?UD}@^cAl@k@`@OPQPWb@aAdBq@p@YTs@X_AToALgAb@g@Za@^_@Vk@Va@L_@D{ALw@LaA`@cAl@eCpAiAl@e@^e@l@mA`CsAvBeA~AwCvEi@n@W\\}BnBiEbC{C`Bk@Rq@Pu@Jc@BcB?k@Ho@Ps@Xw@d@aAh@cB~@', 'way_points': [0, 66]}], 'metadata': {'attribution': 'openrout

{'SCHOOL_DISTANCE': 1.635448472,
 'CENSUS_REGION': 'Pacific',
 'HHFAMINC': '$200,000 or more',
 'HH_RACE': 'White',
 'HH_HISP': 'No',
 'MSASIZE': 'In an MSA or CMSA of 3 million or more',
 'URBRUR': 'Urban'}

It works! Here is a single consolidated code cell containing every function above, ready to be pasted into the final product.

In [100]:
import requests
import pandas as pd
import geopandas as gpd
import requests
import zipfile
import io
from shapely.geometry import Point

import os

# ----------------------------------------------------
# Configuration: URL of shapefile ZIP
UAC20_URL = "https://www2.census.gov/geo/tiger/TIGER2020/UAC/tl_2020_us_uac20.zip"
# Get a free key from https://openrouteservice.org/sign-up/
# Supply it through the ORS_API_KEY environment variable rather than editing the notebook.
# FIXME(security): the fallback literal below is a credential committed to this
# notebook. It is kept only so existing runs keep working; rotate the key and
# delete the fallback.
API_KEY = os.environ.get(
    "ORS_API_KEY",
    "eyJvcmciOiI1YjNjZTM1OTc4NTExMTAwMDFjZjYyNDgiLCJpZCI6ImFiMTU3YmFjMzYxNzQ3MGRhZGY5ZWQ4MTFmOTE0ZGZiIiwiaCI6Im11cm11cjY0In0=",
)

# Census division name keyed by 2-digit state FIPS code (50 states plus DC),
# built from one (division, member FIPS codes) group per division.
STATE_TO_DIVISION = {
    fips_code: division_name
    for division_name, member_codes in (
        ("New England", ("09", "23", "25", "33", "44", "50")),
        ("Middle Atlantic", ("34", "36", "42")),
        ("East North Central", ("17", "18", "26", "39", "55")),
        ("West North Central", ("19", "20", "27", "29", "31", "38", "46")),
        ("South Atlantic", ("10", "11", "12", "13", "24", "37", "45", "51", "54")),
        ("East South Central", ("01", "21", "28", "47")),
        ("West South Central", ("05", "22", "40", "48")),
        ("Mountain", ("04", "08", "16", "30", "32", "35", "49", "56")),
        ("Pacific", ("02", "06", "15", "41", "53")),
    )
    for fips_code in member_codes
}


def geocode_address_census(address):
    """
    Geocode a one-line street address with the U.S. Census Bureau geocoder.

    Parameters:
        address (str): Free-form address, e.g. "7535 Northland Avenue, San Ramon, CA".

    Returns:
        tuple: (lat, lon), read from the `y` and `x` fields of the first match's
        `coordinates` entry.

    Raises:
        ValueError: if the geocoder returns no match for the address.
        requests.HTTPError: if the geocoder answers with an HTTP error status.
    """
    url = "https://geocoding.geo.census.gov/geocoder/locations/onelineaddress"

    params = {
        "address": address,
        "benchmark": "Public_AR_Current",
        "format": "json"
    }

    # Bound the wait and surface HTTP failures directly instead of letting an
    # error page fail later inside .json() or the key lookups below.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    matches = response.json()["result"]["addressMatches"]

    if len(matches) == 0:
        raise ValueError("Address not found.")

    coords = matches[0]["coordinates"]
    return coords["y"], coords["x"]   # (lat, lon)


def get_fips_from_coords(lat, lon):
    """
    Given lat/lon, return:
    - state FIPS
    - county FIPS
    - tract code
    - block group code

    The values are sliced out of the 15-digit census block FIPS code returned by
    the FCC block-find API.

    Raises:
        ValueError: if the response carries no block FIPS code for the point
            (presumably coordinates outside U.S. census geography -- confirm
            against the FCC API documentation).
        requests.HTTPError: if the API answers with an HTTP error status.
    """
    url = "https://geo.fcc.gov/api/census/block/find"
    params = {"latitude": lat, "longitude": lon, "format": "json"}
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()

    # Guard against a missing/null FIPS so the caller gets a clear error instead
    # of a TypeError from slicing None.
    block_fips = (response.json().get("Block") or {}).get("FIPS")
    if not block_fips:
        raise ValueError(f"No census block found for coordinates ({lat}, {lon}).")

    state_fips = block_fips[:2]       # 2 digits
    county_fips = block_fips[2:5]     # 3 digits
    tract = block_fips[5:11]          # 6 digits
    block = block_fips[11:]           # 4 digits
    block_group = block[0]            # 1 digit: first digit of the block code

    return state_fips, county_fips, tract, block_group


from functools import lru_cache


@lru_cache(maxsize=None)
def _load_cbsa_crosswalk(url):
    """Download and parse the CBSA delineation workbook once per URL, then reuse it."""
    return pd.read_excel(url, dtype=str, header=2)


def get_MSA_status(state_fips, county_fips):
    """
    Look up the CBSA that contains a county.

    Parameters:
        state_fips (str): 2-digit state FIPS
        county_fips (str): 3-digit county FIPS

    Returns:
        tuple: (CBSA code, value of the "Metropolitan/Micropolitan Statistical
        Area" column) from the first delineation-file row matching the county.

    Raises:
        IndexError: if no row of the delineation file matches the county.
    """
    url = "https://www2.census.gov/programs-surveys/metro-micro/geographies/reference-files/2023/delineation-files/list1_2023.xlsx"
    # Cached: the workbook is identical for every lookup, so download it only once.
    cbsa_crosswalk = _load_cbsa_crosswalk(url)
    val = cbsa_crosswalk[
        (cbsa_crosswalk["FIPS State Code"] == state_fips)
        & (cbsa_crosswalk["FIPS County Code"] == county_fips)
    ]
    if val.empty:
        # Same exception type the unguarded [0] lookup produced, with a usable message.
        raise IndexError(f"County {state_fips}{county_fips} is not listed in any CBSA.")
    return val["CBSA Code"].tolist()[0], val["Metropolitan/Micropolitan Statistical Area"].tolist()[0]

def get_population_by_cbsa(cbsa_code):
    """
    Fetch the ACS5 (2022) total population estimate (B01003_001E) for a CBSA.

    Parameters:
        cbsa_code (str): CBSA code of the metropolitan/micropolitan area.

    Returns:
        int: the population value from the first data row of the response.

    Raises:
        requests.HTTPError: if the request failed.
    """
    endpoint = "https://api.census.gov/data/2022/acs/acs5"
    geography = f"metropolitan statistical area/micropolitan statistical area:{cbsa_code}"
    query = {"get": "B01003_001E", "for": geography}

    reply = requests.get(endpoint, params=query)
    reply.raise_for_status()

    # Row 0 holds the column names; row 1 holds the values.
    rows = reply.json()
    return int(rows[1][0])


# ----------------------------------------------------
# Helper to load shapefile into GeoDataFrame
def load_urban_areas_gdf(url=UAC20_URL):
    """
    Download a zipped shapefile and load it as a GeoDataFrame in WGS84 (EPSG:4326).

    The archive is downloaded into memory, extracted to /tmp/tl_uac20 on disk,
    and the first .shp member is read with GeoPandas.

    Parameters:
        url (str): URL of the shapefile ZIP. Defaults to UAC20_URL.

    Returns:
        GeoDataFrame: the shapefile contents reprojected to EPSG:4326.

    Raises:
        requests.HTTPError: if the download answers with an HTTP error status.
        IndexError: if the archive contains no .shp file.
    """
    # Download zip into bytes
    r = requests.get(url, timeout=60)
    r.raise_for_status()

    extract_dir = "/tmp/tl_uac20"
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        # Find the .shp file name inside the zip
        shapefile_name = [f for f in z.namelist() if f.endswith(".shp")][0]

        # Extract every member to disk: the .shp is read together with its
        # sidecar files from the same directory.
        z.extractall(extract_dir)

    # Load with GeoPandas
    gdf = gpd.read_file(f"{extract_dir}/{shapefile_name}")
    # Ensure it's in WGS84 lat/lon
    gdf = gdf.to_crs(epsg=4326)
    return gdf

# Load once when this cell runs; classify_urban below binds this GeoDataFrame as
# its default `gdf` argument, so the shapefile is not re-downloaded per lookup.
urban_gdf = load_urban_areas_gdf()

# ----------------------------------------------------
def classify_urban(lat, lon, gdf=urban_gdf):
    """
    Classify a coordinate as urban or rural against a set of urban-area polygons.

    Returns a 2-tuple:
      - ("Urban", <NAME20 of the first polygon containing the point>) when the
        point lies inside at least one polygon of `gdf`
      - ("Rural", None) otherwise
    """
    location = Point(lon, lat)  # shapely points are (x, y) = (lon, lat)
    containing = gdf[gdf.contains(location)]

    # No polygon contains the point: not in an urban area.
    if containing.empty:
        return "Rural", None

    return "Urban", containing.iloc[0]["NAME20"]



def get_driving_distance_ors(address1, address2):
    """
    Returns driving distance in miles and duration in minutes using OpenRouteService.

    Both addresses are geocoded with the ORS geocoder, then a driving-car route
    is requested between the two points. The route summary's distance is
    divided by 1000 and multiplied by 0.621371 (miles per km); its duration is
    divided by 60.

    Parameters:
        address1 (str): start address
        address2 (str): destination address

    Returns:
        tuple: (distance_mi, duration_min)

    Raises:
        ValueError: if either address cannot be geocoded or no route is returned.
        requests.HTTPError: if ORS answers with an HTTP error status.
    """
    # First, geocode addresses using ORS
    def geocode(address):
        url = "https://api.openrouteservice.org/geocode/search"
        params = {"api_key": API_KEY, "text": address, "size": 1}
        resp = requests.get(url, params=params, timeout=30)
        resp.raise_for_status()
        features = resp.json().get("features") or []
        if len(features) == 0:
            raise ValueError(f"Address not found: {address}")
        coords = features[0]["geometry"]["coordinates"]  # [lon, lat]
        return coords

    start_coords = geocode(address1)
    end_coords = geocode(address2)

    # Call directions endpoint
    url = "https://api.openrouteservice.org/v2/directions/driving-car"
    headers = {"Authorization": API_KEY, "Content-Type": "application/json"}
    body = {
        "coordinates": [start_coords, end_coords]
    }
    resp = requests.post(url, json=body, headers=headers, timeout=30)
    resp.raise_for_status()
    routes = resp.json().get("routes") or []
    if len(routes) == 0:
        # Without this guard a payload lacking routes surfaces as a bare KeyError/IndexError.
        raise ValueError(f"No driving route found between {address1!r} and {address2!r}")

    route = routes[0]["summary"]
    distance_mi = route["distance"] / 1000 * 0.621371
    duration_min = route["duration"] / 60

    return distance_mi, duration_min


def get_census_data(state_fips, county_fips, tract, block_group, api_key=None):
    """
    Fetch block group median income, race/Hispanic counts, and CBSA total population
    in one function call.

    Parameters:
        state_fips (str): 2-digit state FIPS
        county_fips (str): 3-digit county FIPS
        tract (str): 6-digit tract code
        block_group (str): 1-digit block group code
        api_key (str, optional): Your Census API key

    Returns:
        dict: {
            "median_income": int, or "No data" when the estimate is missing or a
                negative placeholder,
            "race_counts": dict of race/Hispanic counts (plus the geography
                columns echoed by the API), all as int,
            "cbsa_population": int, or "Not in MSA/CMSA" when the county is in a
                Micropolitan area or in no CBSA at all
        }

    Raises:
        requests.HTTPError: if a Census API request answers with an HTTP error status.
    """
    # ---------- Block group: median income + race/Hispanic ----------
    base_bg = "https://api.census.gov/data/2022/acs/acs5"

    race_vars = [
        "B02001_002E",  # White alone
        "B02001_003E",  # Black or African American alone
        "B02001_004E",  # American Indian and Alaska Native alone
        "B02001_005E",  # Asian alone
        "B02001_006E",  # Native Hawaiian and Other Pacific Islander alone
        "B02001_007E",  # Some other race alone
        "B02001_008E",  # Two or more races
    ]
    hisp_vars = [
        "B03003_002E",  # Not Hispanic
        "B03003_003E"   # Hispanic
    ]

    all_vars = ["B19013_001E"] + race_vars + hisp_vars
    var_str = ",".join(all_vars)

    params_bg = {
        "get": var_str,
        "for": f"block group:{block_group}",
        "in": f"state:{state_fips}+county:{county_fips}+tract:{tract}"
    }
    if api_key:
        params_bg["key"] = api_key

    response_bg = requests.get(base_bg, params=params_bg, timeout=30)
    response_bg.raise_for_status()
    data_bg = response_bg.json()

    # Median income: ACS reports unavailable estimates as large negative
    # placeholders (e.g. -666666666); treat those like a missing value so they
    # are not later binned as a very low income.
    median_income_raw = data_bg[1][0]
    median_income = "No data"
    if median_income_raw not in [None, "", "null"]:
        parsed_income = int(median_income_raw)
        if parsed_income >= 0:
            median_income = parsed_income

    # Race/Hispanic counts (skip the median income column; skip null values,
    # which int() cannot convert)
    counts_raw = dict(zip(data_bg[0][1:], data_bg[1][1:]))
    race_counts = {k: int(v) for k, v in counts_raw.items() if v is not None}

    # ---------- CBSA population ----------
    try:
        cbsa_code, msa_status = get_MSA_status(state_fips, county_fips)
    except IndexError:
        # get_MSA_status finds no row for counties outside every CBSA.
        cbsa_code, msa_status = None, None

    if not cbsa_code or not isinstance(msa_status, str) or "Micropolitan" in msa_status:
        return {
            "median_income": median_income,
            "race_counts": race_counts,
            "cbsa_population": "Not in MSA/CMSA"
        }

    base_cbsa = "https://api.census.gov/data/2022/acs/acs5"
    params_cbsa = {
        "get": "B01003_001E",
        "for": f"metropolitan statistical area/micropolitan statistical area:{cbsa_code}"
    }
    if api_key:
        params_cbsa["key"] = api_key

    response_cbsa = requests.get(base_cbsa, params=params_cbsa, timeout=30)
    response_cbsa.raise_for_status()
    data_cbsa = response_cbsa.json()
    cbsa_population = int(data_cbsa[1][0])

    # ---------- Combine results ----------
    return {
        "median_income": median_income,
        "race_counts": race_counts,
        "cbsa_population": cbsa_population
    }

# (exclusive upper bound, label) pairs in ascending order; values at or above
# the last bound get the open-ended top label passed to _bin_label.
_INCOME_BINS = (
    (10000, "Less than $10,000"),
    (15000, "$10,000 to $14,999"),
    (25000, "$15,000 to $24,999"),
    (35000, "$25,000 to $34,999"),
    (50000, "$35,000 to $49,999"),
    (75000, "$50,000 to $74,999"),
    (100000, "$75,000 to $99,999"),
    (125000, "$100,000 to $124,999"),
    (150000, "$125,000 to $149,999"),
    (200000, "$150,000 to $199,999"),
)
_MSA_SIZE_BINS = (
    (250000, "In an MSA of Less than 250,000"),
    (500000, "In an MSA of 250,000 - 499,999"),
    (1000000, "In an MSA of 500,000 - 999,999"),
    (3000000, "In an MSA or CMSA of 1,000,000 - 2,999,999"),
)


def _bin_label(value, bins, top_label):
    """Return the label of the first bin whose upper bound exceeds value, else top_label."""
    for upper_bound, label in bins:
        if value < upper_bound:
            return label
    return top_label


def vector_generator(home_address, school_address):
    """
    Build the 7-feature record for one home/school address pair.

    Parameters:
        home_address (str): home address; geocoded for all census lookups.
        school_address (str): school address; used only for the driving distance.

    Returns:
        pandas.DataFrame: one row with columns SCHOOL_DISTANCE (miles),
        CENSUS_REGION, HHFAMINC, HH_RACE, HH_HISP, MSASIZE and URBRUR. HHFAMINC,
        HH_RACE and HH_HISP are "No data" when the underlying census values are
        missing or all zero.
    """
    lat, lon = geocode_address_census(home_address)
    state_fips, county_fips, tract, block_group = get_fips_from_coords(lat, lon)

    # school distance (duration is returned too but is not a feature)
    distance_mi, _duration_min = get_driving_distance_ors(home_address, school_address)

    # census division
    census_region = STATE_TO_DIVISION[state_fips]

    # race, hisp, median income and CBSA population
    census_data = get_census_data(state_fips, county_fips, tract, block_group)
    counts = census_data['race_counts']

    # race data -- B02001_004E is American Indian/Alaska Native and B02001_005E
    # is Asian in ACS table B02001.
    race_mapping = {
        "White": counts.get("B02001_002E", 0),
        "Black or African American": counts.get("B02001_003E", 0),
        "Asian": counts.get("B02001_005E", 0),
        "American Indian or Alaska Native": counts.get("B02001_004E", 0),
        "Native Hawaiian or other Pacific Islander": counts.get("B02001_006E", 0),
        "Other": counts.get("B02001_007E", 0) + counts.get("B02001_008E", 0)
    }
    # Choose the race with the highest count; with no population at all there
    # is nothing to choose from (max() would otherwise default to "White").
    if any(race_mapping.values()):
        hh_race = max(race_mapping, key=race_mapping.get)
    else:
        hh_race = "No data"

    # hisp data: 003E = Hispanic, 002E = Not Hispanic
    hispanic = counts.get("B03003_003E", 0)
    not_hispanic = counts.get("B03003_002E", 0)
    if hispanic == 0 and not_hispanic == 0:
        hh_hisp = "No data"
    elif hispanic > not_hispanic:
        hh_hisp = "Yes"
    else:
        hh_hisp = "No"

    # median income; negative values are ACS placeholders, not incomes
    median_income = census_data['median_income']
    hhfaminc = "No data"
    if isinstance(median_income, int) and median_income >= 0:
        hhfaminc = _bin_label(median_income, _INCOME_BINS, "$200,000 or more")

    # MSA size; a non-int population means the home is not in an MSA
    cbsa_population = census_data['cbsa_population']
    msasize = "Not in MSA or CMSA"
    if isinstance(cbsa_population, int):
        msasize = _bin_label(cbsa_population, _MSA_SIZE_BINS, "In an MSA or CMSA of 3 million or more")

    # Urban/Rural (the urban area name is not a feature)
    urban_rural, _area_name = classify_urban(lat, lon)

    final_vector = {
        'SCHOOL_DISTANCE': distance_mi,
        'CENSUS_REGION': census_region,
        'HHFAMINC': hhfaminc,
        'HH_RACE': hh_race,
        'HH_HISP': hh_hisp,
        'MSASIZE': msasize,
        'URBRUR': urban_rural,
    }

    # One-row frame: a single record per home/school pair.
    row_df = pd.DataFrame([final_vector])

    return row_df


Let's test.

In [101]:
# Example home/school address pair used as a smoke test.
home = "7535 Northland Avenue, San Ramon, CA"
school = "9870 Broadmoor Drive, San Ramon, CA"

# Last expression in the cell, so the notebook displays the returned one-row DataFrame.
vector_generator(home, school)

Unnamed: 0,SCHOOL_DISTANCE,CENSUS_REGION,HHFAMINC,HH_RACE,HH_HISP,MSASIZE,URBRUR
0,1.635448,Pacific,"$200,000 or more",White,No,In an MSA or CMSA of 3 million or more,Urban
