####################################
# Our early approaches of producing test heat-events-dataset
####################################

In [28]:
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
from matplotlib.patches import Rectangle

from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import xarray as xr
import zarr
import fsspec

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = 12,8

In [2]:
################################
# Read the dataset (only reads the metadata) - HAWAII
################################

# Ref: 
# https://daac.ornl.gov/DAYMET/guides/Daymet_Daily_V4.html#datasetoverview

import pystac
import fsspec
import xarray as xr

account_name = "daymeteuwest"
container_name = "daymet-zarr"

collection = pystac.Collection.from_file(
    "https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-daily-hi"
)
asset = collection.assets["zarr-https"]

store = fsspec.get_mapper(asset.href)
ds = xr.open_zarr(store, **asset.extra_fields["xarray:open_kwargs"])

In [117]:
################################
# core algorithm to flag heat extremes
################################

# input: 1D tmax data
# output: same-size 1D flags where heat event == True

# algorithm coefs:
TEMP_DIFF_THRESHOLD = 1 # Celcius (or K)
PERSISTED_FOR_MIN = 3 # days
    
def flag_heat_events(arr_tmax1d: np.array, timestamps: np.array) -> np.array:
    """
    # Same logic as in Notebook2, but this time it does not slice the 
    # xarray Dataset every time. Instead, it operates on the numpy array - faster.
    """
    df = pd.DataFrame({'tmax':arr_tmax1d})
    df['mov_avg'] = df.rolling(15, center=True).mean()
    df['diff'] = df['tmax'] - df['mov_avg']
    df['time'] = timestamps

    df['hot'] = df['diff'] > TEMP_DIFF_THRESHOLD 
    df['label'] = df['hot'].diff().ne(False).cumsum()
    df = df.reset_index().reset_index()
    
    # filter
    summer_months = [5,6,7,8,9]
    df['isSummer'] = df['time'].dt.month.isin(summer_months)
    
    dff = df[df['isSummer'] & df['hot']].dropna(subset=['diff']) 
    dfg = dff.groupby('label').agg({
        'index':[np.min,np.max,len],  
    })
    dfg.columns = ['i1','i2','count']
    dfg = dfg[dfg['count'] >= PERSISTED_FOR_MIN]
    dfg = dfg.drop('count', axis=1)
    dfg = dfg.reset_index(drop=True)

    arr = np.zeros(len(df), dtype=int) 
    for _, (i, j) in dfg.iterrows():
        arr[i:j+1] = 1

    return arr

In [84]:
################################
# Utils
################################

def get_xy_meshgrid(arr_tmax3d:np.ndarray) -> np.ndarray:
    """create grid for all x,y coordinate pairs [0,1],[0,2],..[283,583]"""
    
    shape_yx = arr_tmax3d.shape[1:] # np.shape order -> zyx
    arr_y = np.arange(shape_yx[0])
    arr_x = np.arange(shape_yx[1])
    ac = np.array(np.meshgrid(arr_x, arr_y)).T.reshape(-1, 2)
    
    return ac

def print_stats(arr:np.ndarray) -> None:
    size = round(arr_tmax3d.nbytes/1e9,2)
    shp = arr_tmax3d.shape
    print(f"""processing.. year={year}, shape z,y,x={shp}, in-memory={size} GB""")

In [123]:
%%time

import time

years = np.unique(ds['time'].dt.year).astype(str)

for year in years:
    
    t1 = time.time()
    # Read the entire tmax data into memory as np.ndarray
    arr_tmax3d = ds['tmax'].sel(time=year).values
    
    # Create same size empty 3d array to populate with heat event flags
    arr_heat3d = np.zeros(arr_tmax3d.shape).astype(int)
    print_stats(arr_heat3d)

    # loop through all iX,iY pairs
    meshgrid = get_xy_meshgrid(arr_tmax3d)
    for i, j in meshgrid:
        
        arr_tmax1d = arr_tmax3d[:,j,i]
        no_data = np.isnan(arr_tmax1d).all()
        
        if no_data:
            arr_heat1d = np.zeros(arr_tmax1d.shape, dtype=int)
        else:
            timestamps = ds['tmax'].sel(time=year)['time'].values
            arr_heat1d = flag_heat_events(arr_tmax1d, timestamps)

        arr_heat3d[:,j,i] = arr_heat1d  
        
    np.save(f'./arr_heat3d/arr_heat3d-{year}.npy', arr_heat3d)
    print(f'{round((time.time() - t1)/60, 2)}min')

year=1980, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
249.55sec
year=1981, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
247.38sec
year=1982, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
246.92sec
year=1983, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
244.56sec
year=1984, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
240.44sec
year=1985, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
240.96sec
year=1986, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
242.69sec
year=1987, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
237.67sec
year=1988, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
237.9sec
year=1989, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
234.94sec
year=1990, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
234.82sec
year=1991, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
235.09sec
year=1992, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
238.41sec
year=1993, shape z,y,x=(365, 584, 284), in-memory=0.24 GB
238.56sec
year=1994, shape z,y,x=(365, 584, 284), in-memory