# Climate Exposure Dataset

- Contains climate exposure variables: temperature, humidity, wildfire smoke PM2.5, and wind speed
- Contains population counts and population density
- Covers the year 2020
- Spatial coverage: California (census-tract level)

In [293]:
import numpy as np
import pandas as pd
import xarray as xr
import seaborn as sns
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from shapely.geometry import Polygon

## Load temperature and humidity

In [294]:
# temp data is from https://prism.oregonstate.edu
df = pd.read_parquet("data/esri_tmin_tmax_pm25_merged.parquet")

# Relative humidity: keep the FIPS id and the humidity variable, then take the
# first record of every (time, FIPS) pair.
rhum = (
    xr.open_dataset("data/esri/RelHum_2006_2021_Cali.nc")[
        ["FIPS", "RELATIVE_HUMIDITY_NONE_SPATIAL_NEIGHBORS"]
    ]
    .to_dataframe()
    .groupby(["time", "FIPS"])
    .first()
)

# Index-aligned join; assumes `df` is indexed compatibly with (time, FIPS) — TODO confirm.
df_ = df.join(rhum, rsuffix="_rhum").rename(
    columns={"RELATIVE_HUMIDITY_NONE_SPATIAL_NEIGHBORS": "rhum"}
)
df = df_.reset_index()

In [295]:
# Keep only 2020 observations, discard the pm25 column, and store FIPS as int
# so it can be matched against the FIPS mapping table.
df = (
    df[df["time"].dt.year == 2020]
    .drop(columns="pm25")
    .astype({"FIPS": int})
)

In [296]:
# Mapping table read from the file geodatabase; FIPS_1 is cast to int because
# it is later matched against the integer GEOID of the smoke PM table.
gdf = gpd.read_file("data/esri/FIPSMapping.gdb")
gdf["FIPS_1"] = gdf["FIPS_1"].astype(int)

In [297]:
# Inner join on the shared FIPS column attaches FIPS_1 and geometry to each row.
df = pd.merge(df, gdf, on="FIPS")

In [299]:
df.head()

Unnamed: 0,time,FIPS,tmax,tmin,rhum,lat,lon,FIPS_1,Shape_Length,Shape_Area,geometry
0,2020-01-01,1765,16.550013,7.649988,88.699997,37.867656,-122.231882,6001400100,14242.190056,6945876.0,"MULTIPOLYGON (((-2257301.793 356496.510, -2257..."
1,2020-01-02,1765,15.749994,7.649988,77.400002,37.867656,-122.231882,6001400100,14242.190056,6945876.0,"MULTIPOLYGON (((-2257301.793 356496.510, -2257..."
2,2020-01-03,1765,12.85,6.149988,100.0,37.867656,-122.231882,6001400100,14242.190056,6945876.0,"MULTIPOLYGON (((-2257301.793 356496.510, -2257..."
3,2020-01-04,1765,13.749994,5.249994,93.699997,37.867656,-122.231882,6001400100,14242.190056,6945876.0,"MULTIPOLYGON (((-2257301.793 356496.510, -2257..."
4,2020-01-05,1765,13.249994,6.550012,75.0,37.867656,-122.231882,6001400100,14242.190056,6945876.0,"MULTIPOLYGON (((-2257301.793 356496.510, -2257..."


In [300]:
# FIPS and the shape metrics are not used by any later cell.
df = df.drop(["FIPS", "Shape_Length", "Shape_Area"], axis="columns")

## Add wildfire smoke PM2.5

In [301]:
# data from https://github.com/echolab-stanford/daily-10km-smokePM
pspm25 = pd.read_csv("data/smoke_pm_2020.csv")
# Parse the date strings so the join key has the same datetime type as df["time"].
pspm25["date"] = pd.to_datetime(pspm25["date"])
# Inner join on (tract id, day): rows without a matching smoke record are dropped.
df = df.merge(pspm25, right_on=["GEOID", "date"], left_on=["FIPS_1", "time"])

In [302]:
# FIPS_1 and date duplicate GEOID and time after the merge; tmin is not used downstream.
df = df.drop(["FIPS_1", "date", "tmin"], axis="columns")

In [303]:
df.head()

Unnamed: 0,time,tmax,rhum,lat,lon,geometry,GEOID,smoke_pm
0,2020-01-01,16.550013,88.699997,37.867656,-122.231882,"MULTIPOLYGON (((-2257301.793 356496.510, -2257...",6001400100,0.0
1,2020-01-02,15.749994,77.400002,37.867656,-122.231882,"MULTIPOLYGON (((-2257301.793 356496.510, -2257...",6001400100,0.0
2,2020-01-03,12.85,100.0,37.867656,-122.231882,"MULTIPOLYGON (((-2257301.793 356496.510, -2257...",6001400100,0.0
3,2020-01-04,13.749994,93.699997,37.867656,-122.231882,"MULTIPOLYGON (((-2257301.793 356496.510, -2257...",6001400100,0.0
4,2020-01-05,13.249994,75.0,37.867656,-122.231882,"MULTIPOLYGON (((-2257301.793 356496.510, -2257...",6001400100,0.0


## Add wind

In [304]:
# wind data comes from: https://www.ncei.noaa.gov/access/monitoring/wind/
windu = xr.open_dataset("data/uwnd.10m.gauss.2020.nc")
windv = xr.open_dataset("data/vwnd.10m.gauss.2020.nc")

# Flatten each wind component into a long table with the coordinates as columns.
dfwindu = windu["uwnd"].to_dataframe().reset_index()
dfwindv = windv["vwnd"].to_dataframe().reset_index()

for wind_frame in (dfwindu, dfwindv):
    # Map longitudes into the [-180, 180) range.
    wind_frame["lon"] = ((wind_frame["lon"] + 180) % 360) - 180
    # String key per grid point: "<lon>_<lat>", both rounded to 3 decimals.
    wind_frame["GEOID"] = (
        wind_frame["lon"].round(3).astype(str)
        + "_"
        + wind_frame["lat"].round(3).astype(str)
    )

## Compute average monthly values (June–October)

In [305]:
# Add the calendar month of every observation; calculate_monthly_averages
# groups on this column.
for dated_frame in (df, dfwindu, dfwindv):
    dated_frame["month"] = dated_frame["time"].dt.month


# Function to calculate the monthly averages for each variable
def calculate_monthly_averages(df, variable):
    """Return per-GEOID monthly means of ``variable`` for June through October.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``GEOID``, ``month`` (calendar month number)
        and ``variable``.
    variable : str
        Name of the column to average.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``GEOID`` with the columns ``avg_<variable>_jun`` ...
        ``avg_<variable>_oct``, in that order. A month with no data for a
        GEOID (or for the whole input) is NaN.
    """
    month_names = {6: "jun", 7: "jul", 8: "aug", 9: "sep", 10: "oct"}

    grouped = df.groupby(["GEOID", "month"])[variable].mean().reset_index()
    pivoted = grouped.pivot_table(index="GEOID", columns="month", values=variable)

    # Select the target months explicitly instead of dropping the others:
    # dropping raises KeyError when a month is absent from the input, and
    # relabelling by position mislabels columns when a target month is missing.
    pivoted = pivoted.reindex(columns=list(month_names))
    pivoted.columns = [f"avg_{variable}_{name}" for name in month_names.values()]
    return pivoted

In [306]:
# June–October monthly means per GEOID for each exposure variable.
tmax_monthly, rhum_monthly, smokepm_monthly = (
    calculate_monthly_averages(df, exposure)
    for exposure in ("tmax", "rhum", "smoke_pm")
)

In [324]:
# Place the three GEOID-indexed tables side by side, then turn the index back
# into a regular GEOID column for the merges that follow.
result = pd.concat(
    [tmax_monthly, rhum_monthly, smokepm_monthly], axis=1
).reset_index()

In [307]:
# Give both wind components the same column name so the monthly tables share
# the avg_wnd_<month> column labels.
dfwindu = dfwindu.rename({"uwnd": "wnd"}, axis="columns")
dfwindv = dfwindv.rename({"vwnd": "wnd"}, axis="columns")
dfwindu_monthly, dfwindv_monthly = (
    calculate_monthly_averages(component_frame, "wnd")
    for component_frame in (dfwindu, dfwindv)
)

In [308]:
# One row per GEOID with the coordinates at which wind will be interpolated.
# Grouping by GEOID (the key used in the later merge) keeps every GEOID exactly
# once; grouping by (lon, lat) would keep only the first GEOID of any
# coordinate pair shared by several GEOIDs and would repeat a GEOID that has
# several coordinate pairs.
wnd = (
    df.groupby("GEOID")[["lon", "lat"]]
    .first()
    # Rows without coordinates cannot be used as prediction inputs.
    .dropna()
    .reset_index()[["lon", "lat", "GEOID"]]
)

In [309]:
# Grid-point coordinates keyed by the wind GEOID string, taken from the u table
# and attached to both monthly tables (assumes u and v share a grid — TODO confirm).
helper = dfwindu.groupby("GEOID")[["lon", "lat"]].first()

dfwindu_monthly = dfwindu_monthly.merge(helper, on="GEOID")
dfwindv_monthly = dfwindv_monthly.merge(helper, on="GEOID")

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

# Set number of neighbors to look for
neighbors = 8

for var in [f"avg_wnd_{month}" for month in ("jun", "jul", "aug", "sep", "oct")]:
    # Fit one distance-weighted KNN regressor per wind component on the
    # (lon, lat) of the wind grid points.
    knn_regressor_windu = KNeighborsRegressor(
        n_neighbors=neighbors, weights="distance"
    ).fit(dfwindu_monthly[["lon", "lat"]].values, dfwindu_monthly[[var]].values)
    knn_regressor_windv = KNeighborsRegressor(
        n_neighbors=neighbors, weights="distance"
    ).fit(dfwindv_monthly[["lon", "lat"]].values, dfwindv_monthly[[var]].values)

    # Interpolate both components at the coordinates stored in `wnd`.
    windu_pred = knn_regressor_windu.predict(wnd[["lon", "lat"]].values)
    windv_pred = knn_regressor_windv.predict(wnd[["lon", "lat"]].values)

    # NOTE(review): this is the magnitude of the monthly-mean (u, v) vector,
    # not the mean of daily wind speeds — confirm this is the intended measure.
    wnd[var] = np.sqrt(windu_pred**2 + windv_pred**2)

In [310]:
# The coordinates were only needed for the interpolation; keep GEOID and wind columns.
wnd = wnd.drop(["lon", "lat"], axis="columns")

In [311]:
wnd.shape

(8673, 6)

In [325]:
# Inner join: attach the interpolated wind columns to the exposure table by GEOID.
result = pd.merge(result, wnd, on="GEOID")

## Add confounders

In [313]:
# Population count and density columns from the crosswalk file.
# NOTE(review): the file name ends in ".csv.csv" — confirm this matches the file on disk.
pop_columns = ["FIPS", "POPULATION_2020", "POP20_SQMI"]
pop = pd.read_csv(
    "data/esri/crosswalk/FIPSMapping_gdb_censustract_2020_ca.csv.csv",
    usecols=pop_columns,
)

In [314]:
# Columns read from the SVI table; GEOID is read as a string.
# NOTE(review): this path is relative to "../data" while earlier cells read
# from "data/" — confirm the working directory.
svi_columns = [
    "GEOID",
    "EP_POV150",
    "EP_UNEMP",
    "EP_UNINSUR",
    "EP_AGE65",
    "EP_AGE17",
    "EP_LIMENG",
    "EP_MINRTY",
    "RPL_THEMES",
    "E_NOINT",
]
svi = pd.read_csv(
    "../data/svi_california.csv",
    usecols=svi_columns,
    dtype={"GEOID": str},
)

In [315]:
# Integer copy of the string GEOID, used as the join key against the exposure table.
svi = svi.assign(GEOID_=svi["GEOID"].astype(int))

In [316]:
# Keep only rows whose RPL_THEMES is greater than -1; NaN and values of -1 or
# below fail the comparison and are removed (presumably missing-data
# sentinels — confirm against the SVI documentation).
svi = svi.loc[svi["RPL_THEMES"].gt(-1)]

In [326]:
# Inner join: add the population columns by matching GEOID to the crosswalk FIPS.
result = pd.merge(result, pop, left_on="GEOID", right_on="FIPS")

In [327]:
# Inner join on the integer tract id. Both tables carry a GEOID column, so
# pandas suffixes them as GEOID_x (left) and GEOID_y (right).
result = pd.merge(result, svi, left_on="GEOID", right_on="GEOID_")

In [330]:
# Drop the integer join keys and keep the string GEOID read from the SVI table
# as the single identifier column.
result = result.drop(columns=["GEOID_x", "GEOID_", "FIPS"]).rename(
    columns={"GEOID_y": "GEOID"}
)

In [334]:
# Index the final table by tract identifier.
result = result.set_index(keys="GEOID")

In [335]:
# Write the final dataset; the GEOID index is written as the first column.
result.to_csv(path_or_buf="../data/climate_exposure_ca_2020.csv")