In [1]:
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
from matplotlib.patches import Rectangle

from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import xarray as xr
import zarr
import fsspec

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = 12,8

# !pip install opencv-python-headless
import cv2
from matplotlib.patches import Rectangle
from typing import List, Tuple
import itertools
from copy import copy, deepcopy
import os
import shutil

import getpass
import azure.storage.blob
from azure.storage.blob import BlobClient, BlobServiceClient
from azure.core.exceptions import ResourceExistsError


In [3]:
# Prompt for the Azure SAS token without echoing what is typed.
# Deliberately NOT displayed afterwards: cell outputs are saved with the
# notebook, so showing the token would leak the credential to every reader.
sas_token = getpass.getpass()

 ········································································································································


'<SAS token redacted>'

In [4]:
def download_file(sas_url, filename, overwrite_local_file):
    """
    Downloads the specified file from Azure blob storage
    If asked to not overwrite, then first checks if the file is available locally and does not download again in that case.
    The data is first written to "<filename>.part" and only renamed to `filename` once the
    download has completed, so a failed download never leaves a truncated file behind that a
    later call would mistake for a valid local copy.
    - sas_url   : url complete with sas token
    - filename  : name of the downloaded file
    - overwrite_local_file : if True, will overwrite, else, if already available locally, will not download again.
    Returns:  None
    Raises:   whatever the blob client or the file system raises; any existing local file is left untouched in that case.
    """
    if not overwrite_local_file and os.path.isfile(filename):
        # A local copy exists and the caller did not ask to refresh it.
        return

    blob_client = BlobClient.from_blob_url(sas_url)
    partial_path = f"{filename}.part"
    try:
        with open(partial_path, "wb") as partial_file:
            download_stream = blob_client.download_blob()
            partial_file.write(download_stream.readall())
        # Atomic on the same file system: readers see either the old file or the complete new one.
        os.replace(partial_path, filename)
    except BaseException:
        # Do not leave a half-written file around after an error or an interrupt.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

In [5]:
# Blob to fetch and the folder it lives in.
filename = "10_year_Heatwaves_Data_CMIP6_ssp245__Region_1__2020_to_2029.nc.nc"
url_prefix = 'https://nasanex30analysis.blob.core.windows.net/cmip6/heatwaves/'

if url_prefix is not None:
    # Make sure the folder URL ends with a slash before appending the blob name.
    if not url_prefix[-1] == '/':
        url_prefix = url_prefix + '/'
    # Full blob URL with the SAS token as the query string.
    sas_url = f"{url_prefix}{filename}?{sas_token}"
    download_file(sas_url, filename, overwrite_local_file=False)

In [10]:
# Open every NetCDF file in ./Ravi_output as one dataset, concatenated along "time".
ds = xr.open_mfdataset(
    './Ravi_output/*.nc',
    combine="nested",
    concat_dim="time",
)
# Pull the 'heatwave_yn' variable into memory as a plain numpy array.
arr3d = ds['heatwave_yn'].values

In [20]:
# Run the box pipeline on every time slice and collect one record per slice.
rows = []
num_days = arr3d.shape[0]
for i in range(num_days):
    arr2d = arr3d[i, :, :]
    boxes = array2boxes(arr2d)
    rows.append({'time': i, 'boxes': boxes})
df = pd.DataFrame(rows)

# A slice "has an event" when at least one box came out of the pipeline.
df['hasEvent'] = df['boxes'].apply(len) > 0
# The label increases every time hasEvent flips, so each run of consecutive
# equal flags shares one label.
df['label'] = df['hasEvent'].diff().ne(False).cumsum()

df

Unnamed: 0,time,boxes,hasEvent,label
0,0,"[(0, 0, 161, 71)]",True,1
1,1,"[(0, 0, 161, 71)]",True,1
2,2,"[(0, 0, 161, 71)]",True,1
3,3,"[(0, 0, 161, 71)]",True,1
4,4,"[(0, 0, 161, 71)]",True,1
...,...,...,...,...
3645,3645,"[(0, 0, 161, 71)]",True,1
3646,3646,"[(0, 0, 161, 71)]",True,1
3647,3647,"[(0, 0, 161, 71)]",True,1
3648,3648,"[(0, 0, 161, 71)]",True,1


In [29]:
# Keep only the days that have an event; two rows are enough to try out the
# per-label aggregation sketched (still commented out) below.
# The assignment must stay active: otherwise `dff` is undefined on a fresh
# kernel and this cell raises NameError.
dff = df[df['hasEvent']].iloc[0:2]
# dfg = dff.groupby('label').agg({
#     'time':[np.min,np.max], 
#     'boxes':lambda _: collapse(np.sum(_))
# }).reset_index()
# dfg
dff

Unnamed: 0,time,boxes,hasEvent,label
0,0,"[(0, 0, 161, 71)]",True,1
1,1,"[(0, 0, 161, 71)]",True,1


In [28]:
# Sanity check: a single box has no partner to merge with, so it should come back unchanged.
collapse([(0, 0, 161, 71)])

[(0, 0, 161, 71)]

In [60]:
####################################
# INPUT: read heatwave algorithm output dataset
####################################

def enter_sas_token():
    """Prompt for the Azure SAS token in a hidden input box and return what was typed."""
    return getpass.getpass()
    
def import_dataset(url_prefix:str, filename:str, sas_token:str, if_exists:str="raise") -> "xr.Dataset":
    """
    Download one NetCDF file from Azure blob storage and open it with xarray.

    The blob is written to "<filename>.part" first and renamed once complete, so a failed
    download does not leave a stub file that would make every retry raise FileExistsError.

    - url_prefix : URL of the folder holding the blob; a trailing '/' is added when missing
    - filename   : blob name, also used as the local file name
    - sas_token  : SAS token appended to the URL as its query string
    - if_exists  : what to do when `filename` already exists locally:
                   "raise" (default, original behaviour) raises FileExistsError,
                   "reuse" opens the local copy without downloading,
                   "overwrite" downloads again and replaces the local copy.
    Returns: the dataset opened from the local file.
    Raises:  FileExistsError (see `if_exists`), ValueError for an unknown `if_exists` value.
    """
    if if_exists not in ("raise", "reuse", "overwrite"):
        raise ValueError(f"if_exists must be 'raise', 'reuse' or 'overwrite', got {if_exists!r}")

    # endswith (unlike [-1]) also copes with an empty prefix.
    if not url_prefix.endswith('/'):
        url_prefix += '/'

    sas_url = url_prefix + filename + "?" + sas_token

    already_local = os.path.isfile(filename)
    if already_local and if_exists == "raise":
        raise FileExistsError(
            f"{filename} already exists locally; pass if_exists='reuse' or if_exists='overwrite'"
        )

    if not already_local or if_exists == "overwrite":
        blob_client = BlobClient.from_blob_url(sas_url)
        partial_path = filename + ".part"
        try:
            with open(partial_path, "wb") as partial_file:
                download_stream = blob_client.download_blob()
                partial_file.write(download_stream.readall())
            # Only a complete download ever appears under the final name.
            os.replace(partial_path, filename)
        except BaseException:
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    ds = xr.open_mfdataset(filename, concat_dim="time", combine="nested")

    return ds

In [58]:
# Base URL of the "cmip6" folder; the cells below append a sub-folder per dataset.
url_prefix = 'https://nasanex30analysis.blob.core.windows.net/cmip6/'
# Prompt for a SAS token that covers the whole "cmip6" folder in Azure.
sas_token = enter_sas_token()

In [66]:
# Import the raw temperature data (10-year max-temp file, Region 1, 2020 to 2029).
filename = f"10_year_max_temp_CMIP6__Region_1__2020_to_2029.nc"
ds_temp = import_dataset(url_prefix+"10_year_temp/", filename, sas_token)

In [68]:
# Import the average max-temperature data (30-year average file, 1950 to 1979).
filename = f"Avg_temp_max_CMIP6__30_yrs__1950_to_1979.nc"
ds_avg = import_dataset(url_prefix+"averages/", filename, sas_token)

In [69]:
# Import the heatwave flag data.
# NOTE(review): the blob name ends in ".nc.nc" — confirm that matches the name in storage and is not a typo.
filename = f"10_year_Heatwaves_Data_CMIP6_ssp245__Region_1__2020_to_2029.nc.nc"
ds_heat = import_dataset(url_prefix+"heatwaves/", filename, sas_token)

In [55]:
####################################
# PROCESS: BOUNDING-BOXES ALGORITHM to extract metadata
####################################

"""
Pipeline to run the bounding-box algorithm to find frame local heat events. Processes by year.

Input: the np arrays produced by the 1st process before this pipeline. These are same-size
        arrays that have a heat event flag per iX,iY, but each location is unaware of the others. This
        pipeline aggregates them along the x,y axes first, then along the time axis.
Output: 3D Bounding boxes of all heat events.
"""

def bounding_boxes(arr2d: np.array) -> List[tuple]:
    """
    Find the bounding rectangle of every flagged region in one 2D slice.

    - arr2d : 2D array of flags; after the cast to uint8, every value > 0 counts as flagged
    Returns: list of (x, y, w, h) tuples from cv2.boundingRect, one per external contour.
    """
    mask = arr2d.astype(np.uint8)
    # cv2.threshold takes (src, thresh, maxval, type). The previous call passed the type flag
    # a second time as a fifth positional argument, which is the optional `dst` output array.
    _, binary = cv2.threshold(mask, 0, 1, cv2.THRESH_BINARY)
    # findContours returns (contours, hierarchy) on OpenCV 4.x and
    # (image, contours, hierarchy) on 3.x; the contours are second-to-last in both.
    contours = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    return [cv2.boundingRect(contour) for contour in contours]

def isoverlap(box1:tuple, box2:tuple) -> bool:
    """Tell whether two (x, y, w, h) windows intersect; windows that only touch count as overlapping."""
    left1, top1, width1, height1 = box1
    left2, top2, width2, height2 = box2

    # Separated horizontally: box2 lies entirely right or entirely left of box1.
    if left2 > left1 + width1 or left2 + width2 < left1:
        return False
    # Separated vertically: box2 lies entirely below or entirely above box1.
    if top2 > top1 + height1 or top2 + height2 < top1:
        return False
    return True

def outer(box1:tuple, box2:tuple) -> tuple:
    """Return the smallest (x, y, w, h) window that encloses both input windows."""
    ax, ay, aw, ah = box1
    bx, by, bw, bh = box2
    # Top-left corner is the minimum of both origins; far edges are the maximum of both extents.
    left, top = min(ax, bx), min(ay, by)
    right, bottom = max(ax + aw, bx + bw), max(ay + ah, by + bh)
    return (left, top, right - left, bottom - top)

def istiny(box:tuple, min_area:int) -> bool:
    """Return True when the window's area (w*h) is at most min_area."""
    _, _, width, height = box
    area = width * height
    return area <= min_area

def filter_tiny_ones(boxes:List[tuple], min_area:int=10) -> List[tuple]:
    """
    Drop the windows that are too small to matter.

    - boxes    : list of (x, y, w, h) windows
    - min_area : windows whose area is at most this value are removed (default 10, the
                 value that used to be hard-coded here)
    Returns: a new list with the remaining windows, in their original order.
    """
    return [box for box in boxes if not istiny(box, min_area)]

def collapse(boxes:List[tuple]) -> List[tuple]:
    """
    Merge overlapping windows until no two windows overlap any more.

    Works on a copy, so the caller's list is left untouched, and loops instead of recursing
    once per merge, so long chains of overlapping windows cannot hit the recursion limit.

    - boxes : list of (x, y, w, h) windows
    Returns: a new list of merged windows, sorted by their x coordinate.
    """
    merged = list(boxes)

    while True:
        for box1, box2 in itertools.combinations(merged, 2):
            if isoverlap(box1, box2):
                merged.remove(box1)
                merged.remove(box2)
                merged.append(outer(box1, box2))
                # The list changed: restart the pair scan from scratch.
                break
        else:
            # A full scan found no overlapping pair, so the result is final.
            merged.sort(key=lambda box: box[0])
            return merged

def array2boxes(arr2d:np.array) -> List[tuple]:
    """Full per-slice pipeline: detect windows in a 2D time-slice, drop the tiny ones, merge the overlapping ones."""
    detected = bounding_boxes(arr2d)
    large_enough = filter_tiny_ones(detected)
    return collapse(large_enough)

def groupby_heat_events(arr3d:np.array, dates=None) -> pd.DataFrame:
    """
    Group consecutive days that contain at least one window into heat events.

    - arr3d : 3D array indexed as [day, :, :]; each 2D slice is run through array2boxes
    - dates : optional sequence mapping a day index to its calendar date. When omitted,
              the notebook-level variable `dr` is used, as the function always did before.
    Returns: DataFrame with one row per event and the columns
             i1, i2 : first and last day index of the event
             boxes  : windows of all event days pooled together and merged with collapse
             d1, d2 : dates[i1] and dates[i2]
             The frame is empty (same columns) when no day has a window.
    """
    if dates is None:
        # Backward-compatible fallback to the global date range; pass `dates` explicitly
        # to avoid depending on hidden notebook state.
        dates = dr

    daily = [(day, array2boxes(arr3d[day, :, :])) for day in range(arr3d.shape[0])]

    events = []
    # itertools.groupby yields maximal runs of consecutive days sharing the same flag.
    for has_event, run in itertools.groupby(daily, key=lambda item: len(item[1]) > 0):
        if not has_event:
            continue
        run = list(run)
        first_day, last_day = run[0][0], run[-1][0]
        # Pool into a fresh list so collapse never works on the per-day lists themselves.
        pooled = [box for _, day_boxes in run for box in day_boxes]
        events.append(dict(
            i1=first_day,
            i2=last_day,
            boxes=collapse(pooled),
            d1=dates[first_day],
            d2=dates[last_day],
        ))

    return pd.DataFrame(events, columns=['i1', 'i2', 'boxes', 'd1', 'd2'])

# TODO: delete this later. Temporarily, I have to use my old np output.
year = 1971
# `dr` maps a day index to a calendar date; groupby_heat_events reads it as a global.
# NOTE(review): periods is fixed at 365, so an array with 366 time slices would not be fully covered.
dr = pd.date_range(start=f'1/1/{year}', periods=365, freq='D').date
arr3d = np.load(f'CMIP5_flagged/arr_heat3d-{year}.npy')
#-----

# One row per detected heat event.
df_events = groupby_heat_events(arr3d)
df_events


Unnamed: 0,i1,i2,boxes,d1,d2
0,133,140,"[(56, 121, 90, 75)]",1971-05-14,1971-05-21
1,155,162,"[(134, 177, 5, 5), (216, 160, 86, 40)]",1971-06-05,1971-06-12
2,167,169,"[(158, 165, 12, 7)]",1971-06-17,1971-06-19
3,175,180,"[(70, 125, 31, 29), (85, 157, 5, 4), (91, 185,...",1971-06-25,1971-06-30
4,182,195,"[(104, 155, 58, 30), (129, 103, 78, 47)]",1971-07-02,1971-07-15
5,203,206,"[(219, 160, 43, 15)]",1971-07-23,1971-07-26
6,210,222,"[(64, 167, 15, 8), (148, 152, 96, 33), (251, 1...",1971-07-30,1971-08-11
7,224,226,"[(127, 101, 7, 2)]",1971-08-13,1971-08-15
8,228,247,"[(141, 116, 102, 67)]",1971-08-17,1971-09-05
9,249,253,"[(135, 102, 15, 24), (192, 72, 10, 5)]",1971-09-07,1971-09-11


In [None]:
####################################
# PROCESS: Generate Images of detected-heat events
####################################

# NOTE(review): this cell reads names that are not defined in the part of the notebook
# visible here: `area`, `lon_min`/`lon_max`, `lat_min`/`lat_max` and `blob_folder`.
# It also expects `ds` to hold the temperature data ('tasmax'), as in the commented-out
# lines below — confirm all of these before a Restart & Run All.

# # temp data
# ds = xr.open_mfdataset('Koray/CMIP5/*.nc')

# # Ravi's avg 
# ds_avg = xr.open_dataset("Koray/Avg_temp_max_CMIP5__30_yrs__1950_to_1979.nc")


for year in range(1970,1980):

    arr3d = np.load(f'Koray/CMIP5_flagged/arr_heat3d-{year}.npy') # must be ready
    # One date per time slice. Sizing the range from the array (instead of a fixed 365)
    # keeps every day index valid even when a year has 366 slices.
    # `dr` must be set before the call below: groupby_heat_events reads it as a global.
    dr = pd.date_range(start=f'1/1/{year}', periods=arr3d.shape[0], freq='D').date
    df_events = groupby_heat_events(arr3d)

    for ev, (i1, i2) in df_events[['i1','i2']].iterrows():
        # to write the image to disk
        folder_name = f"{year}-event{str(ev).zfill(2)}"
        folder_path = f"Koray/CMIP5_images/{folder_name}"
        # makedirs also creates missing parent folders and tolerates an existing one.
        os.makedirs(folder_path, exist_ok=True)

        # The boxes belong to the whole event, so look them up once per event.
        boxes = df_events['boxes'].iloc[ev]

        # loop through heat events, some are 3 days long, some are 20
        for idx in range(i1, i2+1):

            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,4))

            day = dr[idx].strftime("%Y-%m-%d")
            tasmax = ds['tasmax'].sel(time=day, **area)
            tavg3d = ds_avg['tasmaxavg'].sel(day=idx, **area)
            tdiff = tasmax - tavg3d

            # left: difference to the average; right: tasmax minus 273.15
            tdiff.squeeze().plot.imshow(ax=ax1, cmap='Reds', vmin=-4, vmax=4)
            (tasmax-273.15).squeeze().plot.imshow(ax=ax2, vmin=5, vmax=50)

            # add bounding boxes
            _, leny, lenx = tdiff.shape
            for b in boxes:
                x,y,w,h = b
                # convert the box origin from array indices to lon/lat coordinates
                x = float(tdiff.coords['lon'][x])
                y = float(tdiff.coords['lat'][y])

                # convert the box size from cells to coordinate units
                w = w * (lon_max-lon_min) / lenx
                h = h * (lat_max-lat_min) / leny 

                # a patch can only live on one axes, so create one per panel
                for ax in (ax1, ax2):
                    ax.add_patch(Rectangle((x, y), w, h, color='b', fill=False, linewidth=2))

            fig.tight_layout()

            # save each image, each event has multiple images, one per event day.
            filename = f"{folder_path}/{day}.png"
            fig.savefig(filename, dpi=fig.dpi)
            plt.close(fig)

            # upload image to azure
            # rstrip avoids a double slash when url_prefix already ends with '/'.
            sas_url = f"{url_prefix.rstrip('/')}/{blob_folder}/{filename}?{sas_token}"
            blob_client = BlobClient.from_blob_url(sas_url)
            with open(filename, "rb") as f:
                try:
                    blob_client.upload_blob(f)
                except ResourceExistsError:
                    # best effort: an image uploaded by an earlier run is kept as is
                    pass
    
        # flush VM disk space
        shutil.rmtree(folder_path)

