## Working with components

We need to figure out how to load all of the component as listed in the PR. Just following the code in aggregate_components.py 

In [1]:
import xarray
import rasterio
import pandas as pd
import geopandas as gpd
import numpy as np
import hydra
import logging  
import matplotlib.pyplot as plt

In [2]:
from hydra.core.hydra_config import HydraConfig

Get the utils and src packages ready

In [3]:
import os
import sys

sys.path.append("../")
from utils.faster_zonal_stats import polygon_to_raster_cells

In [4]:
# configure logger to print at info level
logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

In [5]:
def available_shapefile_year(year, shapefile_years_list: list):
    """
    Given a list of shapefile years,
    return the latest year in the shapefile_years_list that is less than or equal to the given year
    """
    for shapefile_year in sorted(shapefile_years_list, reverse=True):
        if year >= shapefile_year:
            return shapefile_year
 
    return min(shapefile_years_list)  # Returns the last element if year is greater than the last element


In [6]:
from hydra import initialize, compose
from omegaconf import OmegaConf

# unfortunately, we have to use the initialize function to load the config file
# this is because the @hydra decorator does not work with Notebooks very well
# this is a known issue with Hydra: https://gist.github.com/bdsaglam/586704a98336a0cf0a65a6e7c247d248
# 
# just use the relative path from the notebook to the config dir
with initialize(version_base=None, config_path="../conf"):
    cfg = compose(config_name='config.yaml')

hydra_cfg = cfg

Getting the list of shapefiles for debugging:

In [7]:
temporal_freq = cfg['temporal_freq']
polygon_name = cfg['polygon_name']
components = ['no3']

satellite_pm25_cfg = hydra_cfg.satellite_pm25
shapefiles_cfg = hydra_cfg.shapefiles

months_list = "01" if temporal_freq == 'yearly' else [str(i).zfill(2) for i in range(1, 12 + 1)]
years_list = list(range(1998, 2022 + 1))

In [8]:
LOGGER.info(f"Running for: {cfg.temporal_freq} {cfg.polygon_name} {cfg.year}")
#logging_dir = HydraConfig.get().runtime.output_dir

# == load shapefile
LOGGER.info("Loading shapefile.")
shapefile_years_list = list(cfg.shapefiles[cfg.polygon_name].keys())
#use previously available shapefile
shapefile_year = available_shapefile_year(cfg.year, shapefile_years_list)

shape_path = f'../data/input/shapefiles/shapefile_{cfg.polygon_name}_{shapefile_year}/shapefile.shp'
polygon = gpd.read_file(shape_path)
polygon_ids = polygon[cfg.shapefiles[cfg.polygon_name][shapefile_year].idvar].values


INFO:__main__:Running for: monthly zcta 2020
INFO:__main__:Loading shapefile.


In [9]:
# == filenames to be aggregated
if cfg.temporal_freq == "yearly":
    filenames = [
        f"{cfg.satellite_pm25[cfg.temporal_freq].file_prefix}.{cfg.year}01-{cfg.year}12.nc"
    ]
elif cfg.temporal_freq == "monthly": 
    # Note; will use the january file for obtaining the mapping from geometries to raster cells
    # the aggregation for the all the months will be done using the same mapping later
    filenames = []
    for m in range(1, 13):
        filenames.append(f"{cfg.satellite_pm25[cfg.temporal_freq].file_prefix}.{cfg.year}{m:02d}-{cfg.year}{m:02d}.nc")
else:
    raise ValueError(f"temporal_freq {cfg.temporal_freq} not supported")


In [18]:
print(filenames[12])
print(cfg.temporal_freq)

IndexError: list index out of range

In [11]:
#LOGGER.info("Mapping polygons to raster cells.")

ds = xarray.open_dataset(f"../data/input/pm25__washu__raw/{cfg.temporal_freq}/{filenames[0]}")

FileNotFoundError: [Errno 2] No such file or directory: '/net/rcstorenfs02/ifs/rc_labs/dominici_lab/lab/data_processing/tinashe_pm25_washu_raster2polygon/pm25_washu_raster2polygon/data/input/pm25__washu__raw/monthly/V5GL04.HybridPM25c_0p10.NorthAmerica.202001-202001.nc'

In [None]:
layer = getattr(ds, cfg.satellite_pm25.layer)

In [13]:
# obtain affine transform/boundaries
dims = layer.dims
assert len(dims) == 2, "netcdf coordinates must be 2d"
lon = layer[cfg.satellite_pm25.longitude_layer].values
lat = layer[cfg.satellite_pm25.latitude_layer].values
transform = rasterio.transform.from_origin(
    lon[0], lat[-1], lon[1] - lon[0], lat[1] - lat[0]
)

# compute mapping
poly2cells = polygon_to_raster_cells(
    polygon,
    layer.values[::-1],
    affine=transform,
    all_touched=True,
    nodata=np.nan,
    verbose=cfg.show_progress,
)


In [16]:
print(cfg)

{'temporal_freq': 'monthly', 'year': 2020, 'polygon_name': 'zcta', 'shapefile_year': 2020, 'show_progress': False, 'plot_output': False, 'component': 'no3', 'datapaths': {'input': {'pm25__washu__raw': {'yearly': '/n/netscratch/dominici_lab/Lab/pm25__washu__raw/yearly/', 'monthly': '/n/netscratch/dominici_lab/Lab/pm25__washu__raw/monthly/'}, 'shapefiles': '/n/dominici_lab/lab/data_processing/jonathan_pm25_washu_raster2polygon/pm25_washu_raster2polygon/data/input/shapefiles'}, 'output': {'pm25__washu': {'zcta_yearly': '/n/dominici_lab/lab/lego/environmental/pm25__washu/zcta_yearly', 'zcta_monthly': '/n/dominici_lab/lab/lego/environmental/pm25__washu/zcta_monthly', 'county_yearly': '/n/dominici_lab/lab/lego/environmental/pm25__washu/county_yearly', 'county_monthly': '/n/dominici_lab/lab/lego/environmental/pm25__washu/county_monthly'}}}, 'shapefiles': {'census_tract': {2020: {'url': 'https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_us_tract_500k.zip', 'idvar': 'GEOID'}, 2021: {'url':

In [15]:
for i, filename in enumerate(filenames):
    LOGGER.info(f"Aggregating {filename}")

INFO:__main__:Aggregating V5GL04.HybridPM25c_0p10.NorthAmerica.202001-202001.nc
INFO:__main__:Aggregating V5GL04.HybridPM25c_0p10.NorthAmerica.202002-202002.nc
INFO:__main__:Aggregating V5GL04.HybridPM25c_0p10.NorthAmerica.202003-202003.nc
INFO:__main__:Aggregating V5GL04.HybridPM25c_0p10.NorthAmerica.202004-202004.nc
INFO:__main__:Aggregating V5GL04.HybridPM25c_0p10.NorthAmerica.202005-202005.nc
INFO:__main__:Aggregating V5GL04.HybridPM25c_0p10.NorthAmerica.202006-202006.nc
INFO:__main__:Aggregating V5GL04.HybridPM25c_0p10.NorthAmerica.202007-202007.nc
INFO:__main__:Aggregating V5GL04.HybridPM25c_0p10.NorthAmerica.202008-202008.nc
INFO:__main__:Aggregating V5GL04.HybridPM25c_0p10.NorthAmerica.202009-202009.nc
INFO:__main__:Aggregating V5GL04.HybridPM25c_0p10.NorthAmerica.202010-202010.nc
INFO:__main__:Aggregating V5GL04.HybridPM25c_0p10.NorthAmerica.202011-202011.nc
INFO:__main__:Aggregating V5GL04.HybridPM25c_0p10.NorthAmerica.202012-202012.nc
