# Mask and resample

This is the third notebook in a series of six notebooks designed to run sequentially. \
Run this notebook to apply a mask for the Contiguous United States (CONUS) to the STAR and LOCA2 data and to resample the STAR data to the LOCA2 grid. \
Download the mask from https://github.com/NOAA-CRIS/CRIS/tree/main/data/TSU_land_only_mask_LOCA2grid_STARwaterbodies.tif and save it in the `data` folder of your base directory (`<your-base-path>/CRIS/data`).

# Import libraries

In [1]:
import time
import os
import logging
import xarray as xr
import numpy as np
from scipy.interpolate import griddata
from osgeo import gdal
from copy import deepcopy

# Define functions

In [2]:
def create_dir(dir):
    """
    Creates a directory (including any missing parent directories) if it does not exist yet

    dir: str
        full path to the directory
    """
    if not os.path.exists(dir):
        # makedirs also builds missing parents; exist_ok avoids a failure if the
        # directory appears between the check above and the creation
        os.makedirs(dir, exist_ok=True)

In [3]:
def list_dir_return_list_by_selection(workspace, ssp, model):
    """
    Returns the full paths of the entries in a workspace whose names contain
    both the ssp and the model substrings

    workspace: str
        full path to the directory to list
    ssp: str
        substring that must occur in the file name (e.g. '_ssp245')
    model: str
        substring that must occur in the file name (e.g. 'ACCESS-CM2')
    """
    return [
        os.path.join(workspace, entry)
        for entry in os.listdir(workspace)
        if ssp in entry and model in entry
    ]

In [4]:
def source_to_target_rasters(source_grid, target_grid):
    """
    Builds the coordinate arrays used to interpolate from the source grid onto the target grid

    source_grid: object
        grid exposing 1D `longitude.values` and `latitude.values` arrays
    target_grid: object
        grid exposing 1D `lon.values` and `lat.values` arrays

    Returns a tuple (source_points, target_xx, target_yy): source_points is an
    (n_points, 2) array of (x, y) pairs of the source mesh, target_xx and target_yy
    are the 2D mesh grids of the target coordinates. Both latitude axes are
    reversed before the meshes are built.
    """
    # Target mesh (latitude order reversed)
    lon_target = target_grid.lon.values
    lat_target = np.flipud(target_grid.lat.values)
    target_xx, target_yy = np.meshgrid(lon_target, lat_target)

    # Source mesh (latitude order reversed), flattened into one (x, y) row per cell
    lon_source = source_grid.longitude.values
    lat_source = np.flipud(source_grid.latitude.values)
    mesh_x, mesh_y = np.meshgrid(lon_source, lat_source)
    source_points = np.column_stack((mesh_x.ravel(), mesh_y.ravel()))

    return source_points, target_xx, target_yy

# Prepare Logger and Open Log File

In [None]:
# Path to your base directory. Replace the <your-base-path> placeholder before running;
# the 'logs' and 'data' folders used by this notebook are resolved relative to this path.
base = r'C:\<your-base-path>\CRIS'

In [None]:
LOG_FILE_PATH = os.path.join(base, 'logs')

# logging.FileHandler does not create missing folders, so make sure the log directory exists
os.makedirs(LOG_FILE_PATH, exist_ok=True)

# Setup logging to a timestamped file and to the console
# (StreamHandler without arguments writes to stderr)
file_time = time.strftime("%Y%m%d-%H%M%S")

log_file_name = "Output_Log_ResamplingMasking_" + file_time + ".log"
LOG_FILE = os.path.join(LOG_FILE_PATH, log_file_name)

logging.basicConfig(level = logging.INFO,
                    format="%(asctime)s:%(levelname)s: %(message)s",
                    handlers=[
                       logging.FileHandler(filename=LOG_FILE),
                       logging.StreamHandler()
                   ])
# Root logger configured above (the return value is not stored)
logging.getLogger()

# Set paths and variables

In [7]:
# Folder the input NetCDF files are read from
input_dir = os.path.join(base, 'data', 'unitconvert')
# Folder the resampled and masked NetCDF files are written to
output_dir = os.path.join(base, 'data', 'resample_mask')

# Make sure the output folder exists before any processing starts
create_dir(output_dir)

In [None]:
# example of the source grid (STAR). it does not matter what the actual model and variable in the file are.
source_grid = xr.open_dataset(os.path.join(base, 'data', 'merge', 'STAR', 'cdd', 'STAR_ACCESS-CM2_cdd_ssp245_1950_2100.nc'))

# example of the correct LOCA2 grid (target). it does not matter what the actual model and variable in the file are.
target_grid = xr.open_dataset(os.path.join(base, 'data', 'merge', 'LOCA2', 'cdd', 'LOCA2_ACCESS-CM2_cdd_ssp245_1950_2100.nc'))

# create raster conversion
source_points, target_xx, target_yy = source_to_target_rasters(source_grid, target_grid)

# load mask
# Download the mask from https://github.com/NOAA-CRIS/CRIS/tree/main/data/TSU_land_only_mask_LOCA2grid_STARwaterbodies.tif
# and save it in the 'data' folder of the base directory
mask_path = os.path.join(base, 'data', 'TSU_land_only_mask_LOCA2grid_STARwaterbodies.tif')
mask_ds = gdal.Open(mask_path)
if mask_ds is None:
    # gdal.Open returns None instead of raising when GDAL exceptions are disabled
    raise FileNotFoundError(f"Could not open the mask raster: {mask_path}")
mask = mask_ds.GetRasterBand(1).ReadAsArray()
# Drop the reference to the GDAL dataset so the file handle is released
mask_ds = None

# Cells with value 3 become NaN so that multiplying by the mask blanks them out;
# the rows are flipped to match the reversed latitude order used during processing.
# NOTE(review): assumes the remaining mask values are 1 (keep data unchanged) - confirm
mask = np.flipud(np.where(mask == 3, np.nan, mask))

In [9]:
# Pick STAR or LOCA2 (selects the input/output sub-folder)
model_set = 'LOCA2'

# pick model (only files whose name contains this string are processed)
model = 'ACCESS-CM2'

# pick variable (selects the input/output sub-folder below the model set)
variable = 'tavg'

# scenario tags; each one is matched as a substring of the input file names
ssp_list = ['_ssp245','_ssp370','_ssp585']

# Resample and mask

In [None]:
start_proc_tm = time.time()

# Folder holding the input NetCDF files of the selected model set and variable
netcdf_dir_list = os.path.join(input_dir, model_set, variable)

# Create an output folder for the model set
output_dir_model_set = os.path.join(output_dir, model_set)
create_dir(output_dir_model_set)
logging.info(f"Created [{output_dir_model_set}] for processing")

# Create an output folder for the model variable
output_dir_model_set_variable = os.path.join(output_dir_model_set, variable)
create_dir(output_dir_model_set_variable)
logging.info(f"Created [{output_dir_model_set_variable}] for processing")

# Execute the workflow by ssp in a for loop
for ssp in ssp_list:

    candidates = list_dir_return_list_by_selection(netcdf_dir_list, ssp, model)

    # Keep only the NetCDF files: sidecar files such as *.xml are dropped.
    # (str.strip(".xml") removes a set of characters from both ends, not a suffix,
    # so it cannot be used to turn a sidecar name back into the NetCDF name.)
    ssp_rasters = sorted({path for path in candidates if path.lower().endswith('.nc')})
    total_recs = len(ssp_rasters)

    # Remove the leading '_ssp' tag for display only
    ssp_label = ssp[len('_ssp'):] if ssp.startswith('_ssp') else ssp
    logging.info(f"There are [{total_recs}] files in SSP[{ssp_label}]")

    for netcdf in ssp_rasters:

        start_rec_tm = time.time()

        # Split the name of the file being processed by underscore and collect the parts.
        # The values are kept in file_* names so the notebook configuration
        # (model_set, model, variable) and the loop variable ssp are not overwritten.
        file_stem = os.path.splitext(os.path.basename(netcdf))[0]
        file_name_parts = file_stem.split("_")
        if len(file_name_parts) != 6:
            raise ValueError(
                f"Unexpected file name [{os.path.basename(netcdf)}]: expected "
                "<model_set>_<model>_<variable>_<ssp>_<start_year>_<end_year>.nc"
            )
        file_model_set, file_model, file_variable, file_ssp, start_year, end_year = file_name_parts
        logging.info(f"Processing {os.path.join(netcdf_dir_list, '_'.join([file_model,file_variable,file_ssp]))}")

        # Name of the data variable inside the NetCDF file (hyphens replaced by underscores)
        data_var = file_variable.replace('-','_')

        # path to save output to
        out_raster_path = os.path.join(output_dir_model_set_variable, os.path.basename(netcdf))

        # Open the NetCDF file; the context manager closes it once the values are in memory
        with xr.open_dataset(netcdf) as raster_proc:

            if file_model_set == 'STAR':

                # create a temporary raster with the shape of the target grid
                temp_raster = np.zeros((target_grid['cdd'].shape))

                source_data = raster_proc[data_var]

                # loop through time dimension
                for t in range(source_data.shape[0]):

                    # NOTE(review): the printed year assumes the series starts in 1950 - confirm
                    print('Processing:', t+1950)

                    # flip the rows to match the reversed latitude order of source_points
                    source_values = np.flipud(source_data[t,:,:].values)

                    # convert raster values to 1D array
                    source_values_flat = source_values.ravel()

                    # resample data onto the target grid
                    temp_raster[t,:,:] = griddata(source_points, source_values_flat, (target_xx, target_yy), method='linear')

                # Clip temp_raster with mask
                out_raster = temp_raster * mask

            elif file_model_set == 'LOCA2':

                # Reverse the latitude axis so the data lines up with the flipped mask, then clip
                flipped = raster_proc.isel(lat=slice(None, None, -1))
                out_raster = flipped[data_var].values * mask

            else:
                # Without this guard a stale out_raster from a previous file would be written
                raise ValueError(f"Unsupported model set [{file_model_set}] in file [{netcdf}]")

        end_red_tm = time.time()
        elapsed_rec_tm = end_red_tm - start_rec_tm
        logging.info(f"[Processed: {os.path.join(netcdf_dir_list, ':', file_ssp)} in {elapsed_rec_tm:0,.2f} seconds]")

        # create an independent copy of target_grid and repopulate it with the newly calculated out_raster
        # NOTE(review): the attributes of the template's 'cdd' variable are carried over - confirm this is intended
        final_grid = deepcopy(target_grid)
        if data_var != 'cdd':
            final_grid = final_grid.rename({'cdd': data_var})
        # flip the latitude axis back to the order of the target grid
        final_grid[data_var][:,:,:] = np.flip(out_raster, axis=1)

        # save the regridded and masked data as netcdf
        final_grid.to_netcdf(out_raster_path, encoding={data_var: {'zlib': True, 'complevel': 9}})

logging.info("Processing complete.")
end_proc_tm = time.time()

elapsed_proc_tm = end_proc_tm - start_proc_tm

logging.info(f"[Overall Elapsed time: {elapsed_proc_tm/60:0,.2f} minutes]")