In [1]:
import os
import gc   # garbage collector
import time
import pandas as pd
import numpy as np

import rasterio
import rioxarray as rioxr

import dask_gateway
import dask.array as da

from joblib import load

import geopandas as gpd
from shapely.geometry import mapping

from scipy.ndimage import median_filter

# custom modules
import raster_to_features as rf
import data_sampling_workflow.sample_rasters as sr

from skimage.feature import graycomatrix, graycoprops

In [2]:
# ---- run configuration ----------------------------------------
# NAIP year whose scenes are classified, and the window radius (in pixels)
# used by the entropy and GLCM texture features
year = 2020
window_r = 5

# output options: run the median filter on the predictions / write rasters to disk
filter_rasters = True
save_rasters = True

# prefix shared by the classifier file name and by every output file name
prefix = 'glcm2020_model'

# runtime reporting: per-scene messages / CSV with processing times
verbose = False
save_processing_times = False

In [3]:
# **************************************************************
# Coastal buffer: read the SB coastal buffer shapefile and convert every geometry
# with shapely's `mapping`; `coast_geo` and `coast.crs` are later passed to `rio.clip`.
fp = os.path.join(
    os.getcwd(),
    'separating_naip_flights',
    'SB_coastal_buffer',
    'SB_coastal_buffer.shp',
)
coast = gpd.read_file(fp)
coast_geo = coast.geometry.apply(mapping)

# **************************************************************
# Classifier: pre-trained random forest stored with joblib next to the notebook.
# NOTE(review): joblib.load unpickles the file, so only load model files you trust.
rfc_file = prefix + '_rfc.joblib'
rfc = load(rfc_file)

# **************************************************************
# Scenes to process.
# The itemids of all scenes that intersect the coast were previously stored in a csv;
# the commented code below selects the ones for `year`. A single scene is
# hard-coded instead for now.
# scene_ids = pd.read_csv(os.path.join(os.getcwd(),
#                                      'separating_naip_flights',
#                                      'coastal_scenes_ids.csv'))
# scene_ids = scene_ids.loc[scene_ids['year'] == year]
# scene_ids = scene_ids.reset_index().itemid

#scene_ids = ['ca_m_3412037_nw_10_060_20200607',
             # 'ca_m_3412039_nw_10_060_20200522',
             # 'ca_m_3412040_ne_10_060_20200522',
             # 'ca_m_3411934_sw_11_060_20200521',
             # 'ca_m_3411936_se_11_060_20200521']

scene_ids = ['ca_m_3411935_se_11_060_20200521']

# **************************************************************
# prepare folder to save rasters
# Resulting folder: <cwd>/processing_results/<prefix>_[filter_]clip_preds_<year>
# After this block `fp` points at that folder (it is reused when writing the rasters).
if save_rasters:
    if filter_rasters:
        preds_folder = prefix + '_filter_clip_preds_' + str(year)
    else:
        preds_folder = prefix + '_clip_preds_' + str(year)
    fp = os.path.join(os.getcwd(), 'processing_results', preds_folder)
    # creates 'processing_results' and the sub-folder in one call and is a no-op
    # when they already exist (no separate exists() check, so no race)
    os.makedirs(fp, exist_ok=True)

In [4]:
# initialize DASK cluster
# Start a Dask Gateway cluster, request 15 workers and connect a client to it.
cluster = dask_gateway.GatewayCluster()
cluster.scale(15)

client = cluster.get_client()
# last expression of the cell: displays the client summary (dashboard link)
client

0,1
Connection method: Cluster object,Cluster type: dask_gateway.GatewayCluster
Dashboard: https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.e3268597c1f3453ebc74b4a64cb01c56/status,


In [5]:
# ---------------------------------------
# per-scene bookkeeping, filled while the scenes are processed
times_pre, times_class, times_post = [], [], []   # seconds spent in each stage
processed, reason = [], []                        # processing flag and reason per scene
veg_pixels = []   # vegetation pixel counts (loop thresholds: ndwi_thresh=0.3, ndvi_thresh=0.05);
                  # only handed to rf.finish_processing in the visible code
n_pixels = []     # number of non-zero pixels in masked scene

# GLCM parameters: a single pixel distance and a single angle
distances = [1]
angles = [np.pi/2] # North

# scenes still queued for processing, and start of the overall timer
N = len(scene_ids)
t_total = time.time()

# ---------------------------------------
# ---------------------------------------

for itemid in scene_ids:
    t_alpha = time.time()

    # ***********************************************************************************************
    # *************************************** PRE-PROCESSING ****************************************
    # open the NAIP scene once (its datetime is needed further down) and clip it to the coast
    t0 = time.time()
    scene = rf.rioxr_from_itemid(itemid)
    raster = scene.rio.clip(coast_geo, coast.crs)

    # ---------------------------------------
    # select pixels with data (blacked out portions have 0 on all bands)
    df = rf.raster_as_df(raster.to_numpy(), ['r','g','b','nir'])
    df = df.loc[ (df['nir'] != 0) | (df['r'] != 0) | (df['g'] != 0) | (df['b'] != 0)]
    n_pixels.append(df.shape[0])

    # ---------------------------------------
    # stop if there's no data at intersection
    if df.shape[0] == 0:
        rf.finish_processing('no_data', processed, reason, times_pre, times_class, times_post, veg_pixels, itemid)
        if verbose:
            rf.finish_processing_message('no_data', itemid)

    else:
        # find vegetation pixels to go into model
        # keep indices of water and low-ndvi pixels
        # add ndvi and ndwi features for each pixel
        t0 = time.time()
        is_veg, water_index, not_veg_index = rf.add_spectral_features(df,
                                                                      ndwi_thresh = 0.3,
                                                                      ndvi_thresh = 0.05)
        # ---------------------------------------
        # stop if there are no vegetation pixels at intersection
        if is_veg.shape[0] == 0:
            rf.finish_processing('no_veg', processed, reason, times_pre, times_class, times_post, veg_pixels, itemid)
            if verbose:
                rf.finish_processing_message('no_veg', itemid)

        else:
            processed.append('Y')
            reason.append('processed')
            # NOTE(review): veg_pixels is not appended to in this branch; in the visible
            # code it is only passed to rf.finish_processing above - confirm that is intended.

            # ---------------------------------------
            # discard ndwi and add date features
            # (reassign instead of dropping in place: is_veg comes from a helper and
            # may be a slice of df)
            is_veg = is_veg.drop(columns='ndwi')
            is_veg = rf.add_date_features(is_veg, scene.datetime)


    # *************************************************************************************************
    # ******************************** CREATE R,G,B,NIR AUXILIARY RASTERS *****************************
            # NOTE(review): t0 is reset here, so the pre-processing time recorded later from
            # t0 does not include opening/clipping the scene or the spectral features.
            t0 = time.time()
            # make auxiliary spectral rasters from clipped NAIP
            band_names = ['r_', 'g_', 'b_', 'nir_']
            tags = ['_avgs', '_entrs']
            window_fps = []    # paths of the temporary auxiliary rasters
            window_cols = []   # feature column name for each path, same order

            for name, band in zip(band_names,range(1,5)):
                rast_name = name+itemid
                sr.avg_raster(raster=raster, band=band, rast_name=rast_name, n=3)
                sr.entropy_raster(raster=raster, band=band, rast_name=rast_name, n=window_r)

                for tag in tags:
                    # file expected at temp/<band>_<itemid>_avgs.tif (or _entrs.tif);
                    # column named <band>_avg / <band>_entr
                    window_fps.append(os.path.join(os.getcwd(), 'temp', rast_name + tag + '.tif'))
                    window_cols.append( name.replace('_','')+tag.replace('s',''))
            # print('created R,G,B,NIR auxiliary rasters (avgs,entr)', time.time()-t0)
            

In [9]:
# *************************************************************************************
# ******************************** CREATE TEXTURE RASTERS *****************************
# For the R, B and NIR bands compute, for every pixel, the GLCM contrast and correlation
# of the (2*window_r+1) x (2*window_r+1) window centred on that pixel. Pixels closer
# than window_r to the raster border have no full window and are left as NaN.
# Each result is saved as a temporary raster and registered in window_fps / window_cols.
# NOTE(review): this is still one graycomatrix call per pixel in a Python loop; the
# scene inspected in this notebook has about 55.7 million pixels, so expect a very
# long run time.
y_len = raster.shape[1]
x_len = raster.shape[2]

# load the pixel values into memory once instead of slicing the xarray object per pixel
raster_np = raster.to_numpy()

for band_n, band_name in zip(range(4),band_names):
    if band_n != 1:  # no texture features for Green band
        band_pixels = raster_np[band_n]

        # start from NaN everywhere; only pixels with a complete window are filled in
        contrast = np.full((y_len,x_len), np.nan)
        correlation = np.full((y_len,x_len), np.nan)

        # iterate over interior pixels only: exactly those whose window fits in the raster
        for y in range(window_r, y_len-window_r):
            for x in range(window_r, x_len-window_r):
                window = band_pixels[y-window_r:y+window_r+1, x-window_r:x+window_r+1]
                glcm = graycomatrix(window, distances=distances, angles=angles) # assuming 1 distance and 1 angle
                contrast[y,x] = graycoprops(glcm, 'contrast')[0,0]
                correlation[y,x] = graycoprops(glcm, 'correlation')[0,0]

        sr.save_raster_checkpoints(contrast, crs=raster.rio.crs, transf=raster.rio.transform(), rast_name=band_name+'contrast')
        sr.save_raster_checkpoints(correlation, crs=raster.rio.crs, transf=raster.rio.transform(), rast_name=band_name+'correlation')

        window_fps.append(os.path.join(os.getcwd(), 'temp',band_name + 'contrast.tif'))
        window_fps.append(os.path.join(os.getcwd(), 'temp',band_name + 'correlation.tif'))

        window_cols = window_cols + [band_name+'cont', band_name+'corr']

# free memory
del contrast, correlation, band_pixels, raster_np
gc.collect()

Exception in callback None()
handle: <Handle cancelled>
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/iostream.py", line 1391, in _do_ssl_handshake
    self.socket.do_handshake()
  File "/srv/conda/envs/notebook/lib/python3.10/ssl.py", line 1342, in do_handshake
    self._sslobj.do_handshake()
ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate (_ssl.c:997)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 189, in _handle_events
    handler_func(fileobj, events)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/iostream.py", line 696, in _handle_events
    self._ha

KeyboardInterrupt: 

In [14]:
# inspection cell: product of raster.shape[2] and raster.shape[1], i.e. pixels per band
x_len*y_len

55714550

  self.scheduler_comm.close_rpc()
2023-01-26 00:28:20,895 - distributed.client - ERROR - Exception raised while shutting down cluster prod.e3268597c1f3453ebc74b4a64cb01c56
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/dask_gateway/client.py", line 993, in _stop_internal
    await self.gateway._stop_cluster(self.name)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/dask_gateway/client.py", line 654, in _stop_cluster
    await self._request("DELETE", url)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/dask_gateway/client.py", line 397, in _request
    resp = await session.request(method, url, json=json, **self._request_kwargs)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/aiohttp/client.py", line 535, in _request
    conn = await self._connector.connect(
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/aiohttp/connector.py", line 542, in connect
    proto = await self._create_conn

In [13]:
# inspection cell: y_len is raster.shape[1]
y_len

5281

In [7]:
    # *******************************************************************************************
    # *********************** EXTRACT FEATURES FROM AUXILIARY RASTERS ***************************
            window_values = []    
            for fp_aux in window_fps:
                match = rioxr.open_rasterio(fp_aux).squeeze()
                match_vector = match.to_numpy().reshape(match.shape[0]*match.shape[1])
                window_values.append(match_vector)
                os.remove(fp_aux)

            df_window = pd.DataFrame(dict(zip( window_cols, window_values)))

            scene_features = pd.concat([is_veg, df_window.iloc[is_veg.index]], axis=1)

    # **********************************************************************************************
    # *************** REMOVE NA VALUES (PIXELS AT EDGE OF CLIPPED PART OF RASTER) ******************
           # remove these indices from scene_features
            # no need to add them anywhere else, they will be part of the raster's background
            remove = scene_features[scene_features.r_cont.isna() == True].index
            scene_features = scene_features.drop(remove)

            #free memory            
            del df_window, window_values, match_vector, match, remove
            gc.collect()
            
            times_pre.append(time.time()-t0)
       
    # ******************************************************************************
    # ******************************** ORDER FEATURES ****************************** 

            feature_order = [
                         'r', 'r_avg', 'r_entr','r_cont', 'r_corr', 
                         'g', 'g_avg', 'g_entr',
                         'b', 'b_avg', 'b_entr','b_cont', 'b_corr', 
                         'nir', 'nir_avg', 'nir_entr','nir_cont', 'nir_corr', 
                         'month', 'day_in_year'] # date

            scene_features = scene_features[feature_order]
            # print('finished assembling features')


    # ***********************************************************************************************
    # *************************************** CLASSIFICATION ****************************************
            # convert into dask.array and predict using model
            da_pixels = da.from_array(np.array(scene_features), chunks=728802)
            scene_preds = rfc.predict(da_pixels)
            
            # ---------------------------------------
            t0 = time.time()
            preds = scene_preds.compute()
            times_class.append(time.time() - t0)
            # print('finished classification')

    # ************************************************************************************************
    # *************************************** POST-PROCESSING ****************************************
            # recover pixel indices for iceplant classifications
            t0 = time.time()
            preds_df = pd.DataFrame(preds, 
                                 columns=['is_iceplant'], 
                                 index = scene_features.index)
            is_iceplant_index = preds_df[preds_df.is_iceplant == 1].index.to_numpy()
            non_iceplant_index = preds_df[preds_df.is_iceplant == 0].index.to_numpy()

            # ---------------------------------------
            # reconstruct indices into image
            indices = [non_iceplant_index,
                       is_iceplant_index, 
                       not_veg_index,
                       water_index]
            values = [0,    # values assigned to pixels from each index
                      1,
                      2,
                      3]
            reconstruct = rf.indices_to_image(raster.shape[1], raster.shape[2], indices, values, back_value=100)
            
            # ---------------------------------------
            # apply median 3x3 filter if needed
            if filter:
                reconstruct = median_filter(reconstruct, size=3)
            
            times_post.append(time.time() - t0)
            # print('finished post-processing')

    # ************************************************************************************************
    # *************************************** SAVE RASTERS *******************************************  
            if save_rasters:
                if filter_rasters:
                    filename = prefix+'_filter_clip_preds_' + itemid + '.tif'
                else:
                    filename = prefix+'_clip_preds_' + itemid + '.tif'
                
                with rasterio.open(
                    os.path.join(fp, filename),  # file path
                    'w',           # w = write
                    driver = 'GTiff', # format
                    height = reconstruct.shape[0], 
                    width = reconstruct.shape[1],
                    count = 1,  # number of raster bands in the dataset
                    dtype = rasterio.uint8,
                    crs = raster.rio.crs,
                    transform = raster.rio.transform(),
                ) as dst:
                    dst.write(reconstruct.astype(rasterio.uint8), 1)
                    
            if verbose:
                print('FINISHED: ', itemid)        
                

    # ************************************************************************************************
    # ************************************ FINAL INFO MESSAGE ***************************************            
    N = N-1        
    if verbose:
        print('REMAINING: ', N, 'scenes', '\n')
    else:
        print('REMAINING: ', N, 'scenes', end="\r")
    
print('TOTAL TIME: ', (time.time() - t_total)/60, ' mins')

IndentationError: unexpected indent (3277583864.py, line 3)

In [6]:
# collect the per-scene processing information into a single dataframe
# (one row per scene; all lists must therefore have the same length)
D = dict([
    ('itemid', scene_ids),
    ('processed', processed),
    ('reason', reason),
    ('pre_times', times_pre),
    ('class_times', times_class),
    ('post_times', times_post),
    ('processed_pix', n_pixels),
])
processing_df = pd.DataFrame(D)

In [7]:
# write the processing information to
# <cwd>/processing_results/<prefix>_[filter_]clip_processing_results_<year>.csv
if save_processing_times:
    fp = os.path.join(os.getcwd(),'processing_results')
    # create the folder if needed; no-op when it already exists
    os.makedirs(fp, exist_ok=True)

    if filter_rasters:
        filename = prefix+'_filter_clip_processing_results_' + str(year) + '.csv'
    else:
        filename = prefix+'_clip_processing_results_' + str(year) + '.csv'

    processing_df.to_csv(os.path.join(fp, filename ), index=False)