# Calculate zonal statistics

This is the fifth script in a series of six scripts designed to run sequentially. \
Run this script to calculate zonal statistics (mean) for each model set-model combination and for each multimodel weighted ensemble. \
This script uses a local file for the zone features, including counties, watershed boundaries, and tribal areas for CONUS. \
Download the zones (CRIS_Zonal_Stat_Zones_CONUS_4326.gdb) from https://cris.climate.gov/datasets/93ed19088971459f952b598a08ab0ec2/about to the zonalstats folder inside your base directory (<your-base-path>/CRIS/zonalstats).

# Import libraries

In [1]:
import time
import os
import logging
import pandas as pd
import geopandas as gpd
import traceback
import numpy as np
import xarray as xr
import rasterio
from rasterio.transform import from_origin
from rasterstats import zonal_stats

# Define functions

In [2]:
def create_dir(dir):
    """
    Creates the directory (and any missing parent directories) if it does not exist yet

    dir: str
        full path to the directory

    Raises FileExistsError if the path already exists but is not a directory.
    """
    # makedirs with exist_ok=True creates missing parents and avoids the
    # race between checking for the directory and creating it
    os.makedirs(dir, exist_ok=True)

In [3]:
def list_dir_return_list_by_selection(workspace, ssp, model):
    """
    Returns the full paths of the entries in a directory whose names contain both filter strings

    workspace: str
        full path to the directory to list
    ssp: str
        substring that must occur in the entry name
    model: str
        second substring that must occur in the entry name
        (an empty string matches every name)
    """
    return [
        os.path.join(workspace, entry)
        for entry in os.listdir(workspace)
        if ssp in entry and model in entry
    ]

In [4]:
def netcdf_to_tiff(netcdf, variable, t, temp_dir, base_year=1950):
    """
    Writes one time step of a NetCDF variable to a single-band GeoTIFF in temp_dir

    netcdf: str
        full path to the NetCDF file
    variable: str
        name of the variable to export ('-' is replaced by '_' before the lookup)
    t: int
        index along the 'time' dimension
    temp_dir: str
        directory the GeoTIFF is written to
    base_year: int
        year that corresponds to t == 0; only used to label the output file

    Returns a tuple (dst, output_tiff): the rasterio dataset (already closed,
    but its .name is still the file path) and the path of the written GeoTIFF.
    """
    # Open the dataset in a context manager so the file handle is released;
    # this function is called once per year for every input file
    with xr.open_dataset(netcdf) as ds:
        data = ds[variable.replace('-','_')].isel(time=t)

        # Pull everything that is needed into memory before the file is closed
        values = data.values
        dtype = data.dtype

        # Get the spatial dimensions and resolution
        # NOTE(review): the -360 shift presumably converts 0-360 longitudes to -180-180 - confirm
        lon = ds['lon'].values - 360
        lat = ds['lat'].values

    xsize = (lon[1] - lon[0])
    # NOTE(review): the sign of ysize and the np.flipud below depend on the
    # latitude ordering of the input files - confirm against the data
    ysize = -(lat[1] - lat[0])

    year = t + base_year

    # shift lon.min() and lat.max() by 0.5*pixel size
    lonmin = lon.min() - xsize/2
    latmax = lat.max() + ysize/2

    transform = from_origin(lonmin, latmax, xsize, ysize)

    # Remove the '.nc' extension explicitly; str.strip('.nc') would remove any
    # leading/trailing '.', 'n' or 'c' characters from the whole path
    file_name = os.path.basename(netcdf)
    if file_name.endswith('.nc'):
        file_name = file_name[:-len('.nc')]
    output_tiff = os.path.join(temp_dir, file_name + '_temp_' + str(year) + '.tif')

    # Write the time step to a single-band GeoTIFF using rasterio
    with rasterio.open(
        output_tiff,
        'w',
        driver='GTiff',
        height=values.shape[0],
        width=values.shape[1],
        count=1,
        dtype=dtype,
        crs='+proj=latlong',
        transform=transform,
    ) as dst:
        dst.write(np.flipud(values), 1)

    return dst, output_tiff

# Prepare Logger and Open Log File

In [None]:
# Path to your base directory.
# The logs, data and zonalstats paths used below are all built relative to it.
base = r'C:\<your-base-path>\CRIS'

In [6]:
# Directory that holds the log files; create it up front because
# logging.FileHandler does not create missing directories
LOG_FILE_PATH = os.path.join(base, 'logs')
os.makedirs(LOG_FILE_PATH, exist_ok=True)

# Embed a timestamp (to the second) in the log file name
FILE_TIME = time.strftime("%Y%m%d-%H%M%S")
LOG_FILE_NAME = "Output_Log_zonalstats_first_" + FILE_TIME + ".log"
LOG_FILE = os.path.join(LOG_FILE_PATH, LOG_FILE_NAME)

In [7]:
# Send INFO-and-above records to both the log file and the console
log_handlers = [
    logging.FileHandler(filename=LOG_FILE),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s:%(levelname)s: %(message)s",
    handlers=log_handlers,
)
# Last expression of the cell: displays the configured root logger
logging.getLogger()

<RootLogger root (INFO)>

# Set paths and variables

In [8]:
# Load the zones: CONUS Counties, Hydrologic Unit Codes (HUC8), and Tribal Areas.
# Download the zones (CRIS_Zonal_Stat_Zones_CONUS_4326.gdb) from https://cris.climate.gov/datasets/93ed19088971459f952b598a08ab0ec2/about
# to the 'zonalstats' folder inside the base directory.
# Path components are passed separately so that no backslash ends up inside a
# string literal ('\C' is an invalid escape sequence) and the path is portable.
input_zone = os.path.join(base, 'zonalstats', 'CRIS_Zonal_Stat_Zones_CONUS_4326.gdb')

# Input NetCDF files, final .CSV tables, and temporary GeoTIFFs
input_dir = os.path.join(base, 'data')
output_dir = os.path.join(base, 'zonalstats', 'out')
temp_dir = os.path.join(base, 'zonalstats', 'temp')

create_dir(temp_dir)
create_dir(output_dir)

In [9]:
# pick 'STAR', 'LOCA2', 'STAR_ensemble', 'LOCA2_ensemble', or 'LOCA2STAR_ensemble'
model_set_start = 'LOCA2STAR_ensemble'

# pick model (only used when model_set_start is 'STAR' or 'LOCA2')
model = 'ACCESS-CM2'

# pick variable (names the input folder for 'STAR' and 'LOCA2';
# for the ensembles the variable is read from each input file name)
variable = 'tavg'

# SSP scenarios to process; each entry is matched as a substring of the input file names
ssp_list = ['_ssp585'] #['_ssp245','_ssp370','_ssp585']

# zone layers of the geodatabase to process
proc_list = ['HUC8'] #['TribalAreas', 'Counties', 'HUC8']

# Calculate zonal statistics

In [None]:
# Years covered by every input file; index t along the time axis is FIRST_YEAR + t
FIRST_YEAR = 1950
LAST_YEAR = 2100
YEARS = range(FIRST_YEAR, LAST_YEAR + 1)

# Set cutoff year between historical (1950-2014) and projection (2015-2100)
cutoff_year = 2015 - FIRST_YEAR

SINGLE_MODEL_SETS = ('STAR', 'LOCA2')
ENSEMBLE_SETS = ('STAR_ensemble', 'LOCA2_ensemble', 'LOCA2STAR_ensemble')

# Column order of the exported .CSV tables
OUTPUT_COLUMNS = ['GEOID', 'AREA', 'Variable', 'Standard Time', 'MEAN']


def _drop_suffix(text, suffix):
    """Return text without the trailing suffix (str.strip removes a character set, not a suffix)."""
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text


# Fail early on an unknown model set instead of hitting an undefined path later
if model_set_start in SINGLE_MODEL_SETS:
    is_ensemble = False
elif model_set_start in ENSEMBLE_SETS:
    is_ensemble = True
else:
    raise ValueError(f"Unknown model_set_start: {model_set_start!r}")

# Create directory paths
if is_ensemble:
    netcdf_dir_list = os.path.join(input_dir, 'ensemble', _drop_suffix(model_set_start, '_ensemble'))
    # Ensemble file names carry no model name, so only the ssp is used as a filter
    name_filter = ''
else:
    netcdf_dir_list = os.path.join(input_dir, 'resample_mask', model_set_start, variable)
    name_filter = model

# Create output directories
output_dir_model_set = os.path.join(output_dir, model_set_start)
create_dir(output_dir_model_set)

for proc in proc_list:

    # Load the zone data from the GDB file (once per layer)
    zone_layer = gpd.read_file(input_zone, layer=proc)

    # Remove column 'Shape_Length'
    if proc == 'Counties':
        zone_layer = zone_layer.drop(columns=['SHAPE_Length'])
    elif proc in ('TribalAreas', 'HUC8'):
        zone_layer = zone_layer.drop(columns=['Shape_Length'])

    # Define the number of rows per year
    num_rows = len(zone_layer)

    # Keep the geometries separately: they are only needed for the zonal
    # statistics, not in the (year-repeated) output table
    zone_geometry = zone_layer.geometry

    # Change column name 'Shape_Area' to 'AREA'
    # NOTE(review): 'Counties' uses 'SHAPE_Length', so its area column may be
    # 'SHAPE_Area' and would then not be renamed here - confirm against the GDB
    zone_attributes = pd.DataFrame(zone_layer.drop(columns=['geometry']))
    zone_attributes = zone_attributes.rename(columns={'Shape_Area': 'AREA'})

    # Repeat the zone attributes for each year and add the year and an empty
    # statistics column; rows are ordered by year, then by zone
    zone_template = pd.concat([zone_attributes] * len(YEARS), ignore_index=True)
    zone_template['Standard Time'] = np.concatenate([np.full(num_rows, year) for year in YEARS])
    zone_template['MEAN'] = None

    output_proc = os.path.join(output_dir_model_set, proc)
    create_dir(output_proc)

    # Execute the zonal statistics workflow by ssp in a for loop
    for ssp in ssp_list:

        # List all netcdf files in the directory
        ssp_rasters = list_dir_return_list_by_selection(netcdf_dir_list, ssp, name_filter)

        # Drop the '.xml' suffix of sidecar files and de-duplicate, so every
        # netcdf is processed once; sorted for a reproducible processing order
        ssp_rasters = sorted({_drop_suffix(str(item), '.xml') for item in ssp_rasters})
        total_recs = len(ssp_rasters)
        logging.info(f"There are [{total_recs}] files in SSP[{ssp.replace('_ssp', '')}]")
        rec_count = 0

        for netcdf in ssp_rasters:

            rec_count += 1
            start = time.perf_counter()

            try:

                # Split the file name by underscore and collect the parts for processing.
                # The parts get their own names so the user settings (model, variable)
                # and the ssp loop variable are not overwritten.
                file_name_parts = _drop_suffix(os.path.basename(netcdf), '.nc').split("_")

                if is_ensemble:

                    # Collect file names parts
                    file_model_set, file_variable, file_ssp, start_year, end_year = file_name_parts
                    logging.info(f"Processing {os.path.join(netcdf_dir_list, '_'.join([file_variable,file_ssp]))}")

                    raster_name = '_'.join([model_set_start, file_variable, file_ssp.strip('_')]).replace('-','')

                else:

                    # Collect file names parts
                    file_model_set, file_model, file_variable, file_ssp, start_year, end_year = file_name_parts
                    logging.info(f"Processing {os.path.join(netcdf_dir_list, '_'.join([file_model,file_variable,file_ssp]))}")

                    raster_name = '_'.join([file_model_set, file_model, file_variable, file_ssp.strip('_')]).replace('-','')

                # Start from a fresh table, add the variable column and fix the column order
                zone_data = zone_template.copy()
                zone_data['Variable'] = file_variable.replace('-','_')
                zone_data = zone_data.reindex(columns=OUTPUT_COLUMNS)

                # Loop over years
                for year in YEARS:

                    print(year)
                    t = year - FIRST_YEAR

                    # Convert the netcdf to tiff
                    _, output_tiff = netcdf_to_tiff(netcdf, file_variable, t, temp_dir)

                    try:
                        # Calculate zonal statistics
                        stats = zonal_stats(zone_geometry, output_tiff, stats=['mean'], geojson_out=False)
                    finally:
                        # Delete temporary tiff file (also on failure) so there is
                        # no buildup of old tiffs that are not being used anymore
                        os.remove(output_tiff)

                    # Add the statistics to the table (.loc slicing is inclusive on both ends)
                    zone_data.loc[t*num_rows : (t+1)*num_rows - 1, 'MEAN'] = [stat['mean'] for stat in stats]

                # Take out only the projection data
                zone_data_projection = zone_data.loc[cutoff_year*num_rows:]

                # Create path and name of projection .CSV table
                output_table_projection = os.path.join(output_proc, raster_name + '.csv')

                # Export dataframe as .CSV file
                zone_data_projection.to_csv(output_table_projection, index=False)

                # Take out only the historical data
                zone_data_historical = zone_data.loc[:cutoff_year*num_rows-1]

                # Create path and name of historical .CSV table
                output_table_historical = os.path.join(output_proc, raster_name.replace('_ssp245','_historical').replace('_ssp370','_historical').replace('_ssp585','_historical') + '.csv')

                # Export dataframe as .CSV file
                zone_data_historical.to_csv(output_table_historical, index=False)

            except Exception as e:
                # Log the failure and continue with the next raster
                logging.error(f"Process encountered an error while processing {netcdf}:\n{str(e)}\n"
                              f"{traceback.format_exc()}")

            finally:
                # Report the processing time of this raster, whether it succeeded or not
                finish = time.perf_counter()
                duration = round(finish-start, 3)
                logging.info("Time to process rasters in %s:  %s secs", netcdf, duration)