In [113]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import numpy as np
from pathlib import Path

## Intersect census tracts with the 10 km grid

In [114]:
# Tract polygons that define the output units. The "tl_2020_06_tract" name
# suggests the 2020 TIGER/Line tract file for state FIPS 06 — TODO confirm.
unit_gdf = gpd.read_file('../shapefiles/tl_2020_06_tract/tl_2020_06_tract.shp')

In [115]:
# Grid-cell polygons; per the file name this is a 10 km grid stored in WGS84.
grid_10km = gpd.read_file('10km_grid_wgs84/10km_grid_wgs84.shp')

In [116]:
# Reproject the grid into the tracts' CRS so both layers share one coordinate
# system before they are overlaid.
grid_10km = grid_10km.to_crs(unit_gdf.crs)

In [117]:
# Build the tract x grid-cell crosswalk: the intersection overlay yields one
# row per piece where a tract polygon overlaps a grid cell, keeping the
# attribute columns of both layers.
unit_cross = gpd.overlay(unit_gdf, grid_10km, how='intersection')

In [118]:
# Switch to a projected CRS before measuring areas (.area is computed in CRS
# units). NOTE(review): EPSG:2227 appears to be a single California State
# Plane zone in US survey feet — confirm it is adequate for statewide areas;
# an equal-area CRS may be a safer choice.
unit_cross = unit_cross.to_crs('EPSG:2227')

In [119]:
# Area of each tract/grid-cell piece, in squared units of the current CRS.
# It is multiplied by the population table's 'mean' later to form the weight.
unit_cross['area'] = unit_cross.geometry.area

In [120]:
# Keep only what the aggregation needs: GEOID (grouping key), ID (joined to
# the smoke table's grid_id_10km and to the population table) and the area.
unit_cross = unit_cross[['GEOID', 'ID', 'area']]

## Load population

In [121]:
# The population table is split across ten numbered subgrid files (1..10);
# generate the file names rather than listing each one by hand.
csv_files = [f"populationDensity_10km_subgrid_{part}.csv" for part in range(1, 11)]

# Load every part from the subgrid folder and stack them into a single table
# with a fresh 0..n-1 index.
dfs = [pd.read_csv(f"populationDensity_10km_subgrid/{name}") for name in csv_files]
pop = pd.concat(dfs, ignore_index=True)

## Load Smoke PM

In [158]:
# Daily smoke PM2.5 predictions per 10 km grid cell; per the file name the
# table spans 2006-01-01 through 2020-12-31.
df = pd.read_csv('smokePM2pt5_predictions_daily_10km_20060101-20201231.csv')
# 'date' is stored in YYYYMMDD form; convert it to datetime so it can be
# compared and grouped as a real date.
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')

## Optional: keep only 2020 to save space (filter currently disabled)

In [159]:
# Row count of the full smoke table, as a reference for the optional 2020 filter
df.shape[0]

51434138

In [160]:
# df = df[df["date"].dt.year==2020]

In [161]:
# Row count after the filter cell; it is unchanged while that filter stays
# commented out.
df.shape[0]

51434138

## Population-weighted average smoke PM2.5 per tract

In [162]:
# Distinct dates present in the smoke table. The aggregation below works on
# one date at a time and concatenates the per-date results.
unique_dates = df['date'].unique()

In [163]:
def weighted_mean(df, values_col, weights_col):
    """Return the weighted mean of one column, weighted by another.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both columns.
    values_col : str
        Name of the column to average.
    weights_col : str
        Name of the column providing the weights.

    Returns
    -------
    float
        ``sum(value * weight) / sum(weight)`` over the rows where both the
        value and the weight are present, or ``0.0`` when the total weight of
        those rows is zero (including an empty table).
    """
    values = df[values_col]
    weights = df[weights_col]

    # Use the same rows in the numerator and the denominator: a row with a
    # missing value must not contribute its weight, otherwise the mean is
    # biased toward zero.
    usable = values.notna() & weights.notna()
    total_weight = weights[usable].sum()

    # Guard the denominator (not the numerator) against division by zero.
    if total_weight == 0:
        return 0.0
    return (values[usable] * weights[usable]).sum() / total_weight

In [164]:
# Alias only: plain assignment makes no copy, so smoke_pm and df are the same
# DataFrame object.
smoke_pm = df

In [None]:
# The tract/grid-cell pieces and their weights do not depend on the date, so
# attach the population table and compute the weight once instead of redoing
# the merge on every iteration.
unit_pop = unit_cross.merge(pop, on='ID', how='left')
# Weight of each piece = the population table's 'mean' * area of the piece
unit_pop['pop'] = unit_pop['mean'] * unit_pop['area']

avg_unit_smoke_pm = []
# Split the smoke table by date in a single pass rather than building a
# full-table boolean mask for every date. sort=False keeps the dates in order
# of first appearance (the order of smoke_pm['date'].unique()); rows with a
# missing date are skipped by groupby.
for date, smoke_pm_date in smoke_pm.groupby('date', sort=False):
    smoke_pm_date_joined = unit_pop.merge(
        smoke_pm_date, left_on='ID', right_on='grid_id_10km', how='left')

    # Pieces whose grid cell has no smoke row on this date count as zero smoke
    smoke_pm_date_joined['smokePM_pred'] = smoke_pm_date_joined['smokePM_pred'].fillna(0)

    # Select the two needed columns before apply so the grouping column is not
    # passed to the function (deprecated in recent pandas). The result column
    # keeps the default label 0, which later cells rely on.
    smoke_pm_date_agg = (
        smoke_pm_date_joined
        .groupby('GEOID')[['smokePM_pred', 'pop']]
        .apply(lambda x: weighted_mean(x, 'smokePM_pred', 'pop'))
        .reset_index()
    )

    smoke_pm_date_agg["date"] = date
    avg_unit_smoke_pm.append(smoke_pm_date_agg)

# Stack the per-date tables into one long tract-by-date table
avg_unit_smoke_pm = pd.concat(avg_unit_smoke_pm, ignore_index=True)

In [None]:
# Show only the tract-date rows whose weighted mean (still in column 0 at this
# point) is not zero.
avg_unit_smoke_pm.loc[avg_unit_smoke_pm[0].ne(0)]

In [None]:
# Give the weighted-mean column (default label 0) a descriptive name
avg_unit_smoke_pm = avg_unit_smoke_pm.rename({0: "smoke_pm"}, axis="columns")

In [None]:
# Save the aggregated PM2.5 data
# avg_unit_smoke_pm.to_csv("smoke_pm_2020.csv", index=False)

In [None]:
# Write the tract-by-date table to CSV without the integer row index.
# NOTE(review): the 2020-only filter above is commented out, so this file
# holds every date in the smoke table; the "2020" in the file name may refer
# to the 2020 tract boundaries rather than the data year — confirm.
avg_unit_smoke_pm.to_csv("smoke_pm_all_time_2020_map.csv", index=False)

In [157]:
# Sanity check: latest date present in the aggregated table
avg_unit_smoke_pm['date'].max()

Timestamp('2020-12-31 00:00:00')