In [141]:
from IPython.display import HTML
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', None)

In [143]:
import pandas as pd
import numpy as np
import os
import requests
import censusgeocode as cg 
from typing import Optional

### Displays all columns of a dataframe
pd.set_option('display.max_columns', None)

### Displays all rows of a dataframe
pd.set_option('display.max_rows', None)


### Instructions

1. Replace PATH with the location of your data file, and DATA_FILE with the name of your file.
2. If there is a seperate column for zip code, replace ZIP_COL_NAME with your dataframe's zipcode column name. If not, remove "dtype = {ZIP_COL_NAME:str})".
3. Enter the names of the personal identifying information (PII) columns in the PII_COLS list.
4. In order to get the census tracts from the addresses, we need to have the address in this format, "street address, city, state zip code, country".

### Config

In [156]:
PATH = "C:/Users/john/Documents/GitHub_Data/address_sample_data"
DATA_FILE = "san_francisco_df.csv"
ZIP_COL_NAME = "ZIP Code"
API_KEY = "AIzaSyBS7ZMV28dg8OcVi6x7NDxAThGeOcUZL54"
PII_COLS = ["Address", "City", "State", "ZIP Code", "Country", "full_address", "lat", "lng"]
census_tract_join_dict = {"id" : [],
                           "full_census_tract" : []}

### Reading in Data

In [151]:
os.chdir(PATH)
address_sample = pd.read_csv("san_francisco_df.csv",
                            dtype = {ZIP_COL_NAME:str})

  address_sample = pd.read_csv("san_francisco_df.csv",


### Prepping Data

In [101]:
address_sample["City"] = "San Francisco"
address_sample["State"] = "CA"
address_sample["Country"] = "USA"
address_sample = address_sample.rename(columns = {"EAS BaseID":"id"})
new_address_df = address_sample[["id", "Address", "City", "State", "ZIP Code", "Country"]]
new_address_df["full_address"] = new_address_df["Address"] + ", " + new_address_df["City"] + ", " + new_address_df["State"] + " " + new_address_df["ZIP Code"] + ", " + new_address_df["Country"]
quick_sample = new_address_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_address_df["full_address"] = new_address_df["Address"] + ", " + new_address_df["City"] + ", " + new_address_df["State"] + " " + new_address_df["ZIP Code"] + ", " + new_address_df["Country"]


### Functions

In [135]:

def geocode_address(address: str, api_key: str):
    """
    Takes a street address and returns (lat, lng) using Google Geocoding API.
    """
    url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {"address": address, "key": api_key}
    
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()
    data = response.json()
    
    if data["status"] == "OK":
        location = data["results"][0]["geometry"]["location"]
        return location["lat"], location["lng"]
    else:
        print(f"Error: {data['status']}")
        return None, None


        
def add_census_tract(
    df: pd.DataFrame,
    lat_col: str = "lat",
    lng_col: str = "lng",
    out_col: str = "census_tract",
    include_geoid: bool = False,
    remove_pii: bool = False,
    geoid_col: str = "census_tract_geoid"
) -> pd.DataFrame:
    """
    Add a Census tract column to a DataFrame using latitude/longitude via censusgeocode.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe that contains latitude and longitude columns.
    lat_col : str
        Column name for latitude.
    lng_col : str
        Column name for longitude.
    out_col : str
        Output column name for the tract label (e.g., "Tract 1234.56; King County; Washington").
    include_geoid : bool
        If True, also add an 11-digit tract GEOID column.
    geoid_col : str
        Column name for the optional tract GEOID.

    Returns
    -------
    pd.DataFrame
        The same dataframe with a new tract column (and optional GEOID).
    """
    # Basic validation
    if lat_col not in df.columns or lng_col not in df.columns:
        raise ValueError(f"DataFrame must contain '{lat_col}' and '{lng_col}' columns.")

    # Local cache to avoid repeated API calls for the same coordinates
    cache = {}

    def lookup(lat, lng):
        key = (lat, lng)
        if pd.isna(lat) or pd.isna(lng):
            return ("Census Tract Not Found", None)

        if key in cache:
            return cache[key]

        try:
            res = cg.coordinates(x=float(lng), y=float(lat))
            # Expect lists; take the first match
            tract = res["Census Tracts"][0] if res.get("Census Tracts") else None
            county = res["Counties"][0] if res.get("Counties") else None
            state  = res["States"][0] if res.get("States") else None

            if tract and county and state:
                label = f"{tract['NAME']}; {county['NAME']}; {state['BASENAME']}"
                geoid = tract.get("GEOID")
                cache[key] = (label, geoid)
                return cache[key]
            else:
                cache[key] = ("Census Tract Not Found", None)
                return cache[key]

        except (ValueError, KeyError, IndexError, TypeError):
            cache[key] = ("Census Tract Not Found", None)
            return cache[key]

    # Apply lookup row-wise (using zip is slightly faster than df.apply here)
    labels = []
    geoids = []
    for lat, lng in zip(df[lat_col], df[lng_col]):
        label, geoid = lookup(lat, lng)
        labels.append(label)
        geoids.append(geoid)

    df = df.copy()
    df[out_col] = labels
    if include_geoid:
        df[geoid_col] = geoids

    if remove_pii:
        df = df[["id", "census_tract", "census_tract_geoid"]]

    return df


### Grabbing lat and long

In [106]:
quick_sample["lat"] = quick_sample["full_address"].apply(lambda x : geocode_address(x, API_KEY)[0])
quick_sample["lng"] = quick_sample["full_address"].apply(lambda x : geocode_address(x, API_KEY)[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quick_sample["lat"] = quick_sample["full_address"].apply(lambda x : geocode_address(x, API_KEY)[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quick_sample["lng"] = quick_sample["full_address"].apply(lambda x : geocode_address(x, API_KEY)[1])


### Grabbing County and Census Tracts for sample of apartments

In [153]:
quick_sample_1 = add_census_tract(quick_sample, include_geoid=True, remove_pii = False)
quick_sample_1

Unnamed: 0,id,Address,City,State,ZIP Code,Country,full_address,lat,lng,census_tract,census_tract_geoid
0,387437,1516 FELTON ST,San Francisco,CA,94134,USA,"1516 FELTON ST, San Francisco, CA 94134, USA",37.726243,-122.419832,Census Tract 256; San Francisco County; Califo...,6075025600
1,288062,964 EDDY ST,San Francisco,CA,94109,USA,"964 EDDY ST, San Francisco, CA 94109, USA",37.782788,-122.423668,Census Tract 160; San Francisco County; Califo...,6075016000
2,409398,206 HAROLD AVE,San Francisco,CA,94112,USA,"206 HAROLD AVE, San Francisco, CA 94112, USA",37.722315,-122.452951,Census Tract 312.01; San Francisco County; Cal...,6075031201
3,406986,179 MANGELS AVE,San Francisco,CA,94131,USA,"179 MANGELS AVE, San Francisco, CA 94131, USA",37.732927,-122.441573,Census Tract 311; San Francisco County; Califo...,6075031100
4,362645,2920 22ND ST,San Francisco,CA,94110,USA,"2920 22ND ST, San Francisco, CA 94110, USA",37.755974,-122.412872,Census Tract 228.03; San Francisco County; Cal...,6075022803
