In [1]:
import os
import gc   # garbage collector
import time
import pandas as pd
import numpy as np

import rasterio

import dask_gateway
import dask.array as da

from joblib import load

import geopandas as gpd
from shapely.geometry import mapping

from scipy.ndimage import median_filter

# custom modules
import raster_to_features as rf
import data_sampling_workflow.sample_rasters as sr

In [2]:
# ---------------------------------------
# Run configuration (edit these before running the notebook)

# NAIP year whose scenes will be classified for iceplant
year = 2020

# if True, a median filter is applied to each predictions raster
filter_rasters = True

# if True, each predictions raster is written to disk as a GeoTIFF
save_rasters = True

In [3]:
# **************************************************************
# Create auxiliary canopy height files to sample from
#
# Builds the list `lidar_fps` (canopy height raster + its max/min/avg
# auxiliary rasters) and creates any auxiliary raster that is not on disk yet.

# name of output canopy height raster
rast_name = 'SB_canopy_height_' + str(year)

# function that builds each auxiliary raster, keyed by the tag used in its file name
# (insertion order fixes the order of lidar_fps below)
aux_builders = {'_maxs': sr.max_raster,
                '_mins': sr.min_raster,
                '_avgs': sr.avg_raster}

# list of file paths to aux canopy height rasters
# order of filepaths is: lidar, max, min, avg
lidar_fps = [sr.path_to_lidar(year)]
for tag in aux_builders:
    lidar_fps.append(os.path.join(os.getcwd(),
                                  'temp',
                                  rast_name + tag + '.tif'))

# builders whose output file is missing (lidar_fps[0] is the source raster, so skip it)
missing_builders = [build
                    for build, aux_fp in zip(aux_builders.values(), lidar_fps[1:])
                    if not os.path.exists(aux_fp)]

# create any missing aux raster
if missing_builders:
    t0 = time.time()

    # open the canopy height raster only when it is needed, and close it when done
    # (previously the reader was opened unconditionally and never closed)
    with rasterio.open(lidar_fps[0]) as lidar_rast_reader:
        # the sr.*_raster functions are expected to save their output in the temp
        # folder under rast_name + tag + '.tif'
        # NOTE(review): n=3 is presumably the window size -- confirm in sample_rasters
        for build in missing_builders:
            build(rast_reader = lidar_rast_reader, rast_name = rast_name, n=3)

    print('time to make auxiliary rasters: ', (time.time() - t0), 'seconds')

time to make auxiliary rasters:  22.281490564346313 seconds


In [4]:
# folder holding the coastal buffer shapefile and the list of coastal scenes
flights_dir = os.path.join(os.getcwd(), 'separating_naip_flights')

# ---------------------------------------
# coastal buffer: read the shapefile and turn each geometry into a
# GeoJSON-like mapping, which is what the clipping step downstream receives
fp = os.path.join(flights_dir, 'SB_coastal_buffer', 'SB_coastal_buffer.shp')
coast = gpd.read_file(fp)
coast_geo = coast.geometry.apply(mapping)

# ---------------------------------------
# pre-trained random forest classifier, loaded from a joblib file in the
# current working directory
rfc = load('lidar_spectral_rfc.joblib')
print('loaded model')

# ---------------------------------------
# item ids of the scenes from the selected year that intersect the coastal buffer
# (the ids of all coast-intersecting scenes were previously stored in this csv)
all_coastal_scenes = pd.read_csv(os.path.join(flights_dir, 'coastal_scenes_ids.csv'))
scene_ids = (all_coastal_scenes[all_coastal_scenes['year'] == year]
             .reset_index(drop=True)['itemid'])
scene_ids

loaded model


0     ca_m_3412037_nw_10_060_20200607
1     ca_m_3412037_ne_10_060_20200607
2     ca_m_3412029_sw_10_060_20200607
3     ca_m_3412004_sw_10_060_20200607
4     ca_m_3412003_se_10_060_20200607
5     ca_m_3412003_ne_10_060_20200607
6     ca_m_3412038_nw_10_060_20200523
7     ca_m_3412040_nw_10_060_20200522
8     ca_m_3412040_ne_10_060_20200522
9     ca_m_3412039_nw_10_060_20200522
10    ca_m_3412039_ne_10_060_20200522
11    ca_m_3412038_ne_10_060_20200522
12    ca_m_3411933_nw_11_060_20200522
13    ca_m_3411933_ne_11_060_20200522
14    ca_m_3411945_nw_11_060_20200521
15    ca_m_3411937_sw_11_060_20200521
16    ca_m_3411936_sw_11_060_20200521
17    ca_m_3411936_se_11_060_20200521
18    ca_m_3411935_sw_11_060_20200521
19    ca_m_3411935_se_11_060_20200521
20    ca_m_3411934_sw_11_060_20200521
21    ca_m_3411934_se_11_060_20200521
22    ca_m_3411933_sw_11_060_20200521
23    ca_m_3411933_se_11_060_20200521
Name: itemid, dtype: object

In [5]:
# start a Dask Gateway cluster and request a fixed number of workers
n_workers = 15
cluster = dask_gateway.GatewayCluster()
cluster.scale(n_workers)

# client connected to the cluster; shown as the cell output
client = cluster.get_client()
client

0,1
Connection method: Cluster object,Cluster type: dask_gateway.GatewayCluster
Dashboard: https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.7c1f0223375349e59a52a37f3edd0c83/status,


In [6]:
# ---------------------------------------
# collect processing information for each scene
# (each list gets exactly one entry per scene, in the order of scene_ids)
times_pre = []      # pre-processing time in seconds (0 if scene was skipped)
times_class = []    # classification time in seconds (0 if scene was skipped)
times_post = []     # post-processing time in seconds (0 if scene was skipped)
processed = []      # 'Y' if the scene was classified, 'N' otherwise
reason = []         # why the scene was / was not processed
n_pixels = []       # number of vegetation pixels sent to the model

# counter for scenes queued for processing
N = len(scene_ids)


def _record_skipped(why):
    """Register a scene that was not classified: flag 'N', the reason, and zeroed stats."""
    processed.append('N')
    reason.append(why)
    times_pre.append(0)
    times_class.append(0)
    times_post.append(0)
    n_pixels.append(0)


# ---------------------------------------
# prepare folder to save rasters
# prefix shared by the output folder name and by every raster written into it
preds_prefix = 'LS_filter_clip_preds_' if filter_rasters else 'LS_clip_preds_'

if save_rasters:
    # output folder is <cwd>/temp/<prefix><year> in both cases
    # (the unfiltered case used to add a second 'temp' level that was never created,
    #  so os.mkdir failed); makedirs also creates 'temp' itself when missing
    fp = os.path.join(os.getcwd(), 'temp', preds_prefix + str(year))
    os.makedirs(fp, exist_ok=True)

# ---------------------------------------
# ---------------------------------------

for itemid in scene_ids:
    t_alpha = time.time()

    # ***********************************************************************************************
    # *************************************** PRE-PROCESSING ****************************************
    # open NAIP scene and clip to coast
    t0 = time.time()
    raster = rf.rioxr_from_itemid(itemid).rio.clip(coast_geo, coast.crs)

    # ---------------------------------------
    # select pixels with data (blacked out portions have 0 on all bands)
    df = rf.raster_as_df(raster.to_numpy(), ['r','g','b','nir'])
    df = df.loc[ (df['nir'] != 0) | (df['r'] != 0) | (df['g'] != 0) | (df['b'] != 0)]

    # ---------------------------------------
    # stop if there's no data at intersection
    if df.shape[0] == 0:
        _record_skipped('no data in intersection')
        N -= 1
        print('no data at intersection of scene with coastal buffer')
        print('FINISHED: ', itemid , '\n')
        continue

    # find vegetation pixels to go into model
    # keep indices of water and low-ndvi pixels
    # add ndvi and ndwi features for each pixel
    # NOTE(review): t0 is restarted here, so times_pre does not include opening and
    # clipping the scene above -- confirm whether that is intended
    t0 = time.time()
    is_veg, water_index, not_veg_index = rf.add_spectral_features(df,
                                                                  ndwi_thresh = 0.3,
                                                                  ndvi_thresh = 0.05)

    # ---------------------------------------
    # stop if there are no vegetation pixels at intersection
    if is_veg.shape[0] == 0:
        # reason string (including its spelling) is kept as-is: it is written to the results csv
        _record_skipped('no vegeatation in intersection')
        N -= 1
        print('no vegetation pixels at intersection of scene data with coastal buffer')
        print('FINISHED: ', itemid , '\n')
        continue

    processed.append('Y')
    reason.append('processed')

    # ---------------------------------------
    # discard ndwi and add date features
    is_veg = is_veg.drop('ndwi', axis=1)
    is_veg = rf.add_date_features(is_veg, rf.rioxr_from_itemid(itemid).datetime)

    # ---------------------------------------
    # Resample canopy height layers to match NAIP scene resolution and extent

    # resampled canopy height layers as vectors (one flattened vector per file in lidar_fps)
    lidar_values = []
    for fp_aux in lidar_fps:
        match = sr.open_and_match(fp_aux, raster)
        match_vector = match.to_numpy().reshape(match.shape[0]*match.shape[1])
        lidar_values.append(match_vector)

    df_lidar = pd.DataFrame(dict(zip(['lidar', 'max_lidar', 'min_lidar', 'avg_lidar'], lidar_values)))
    df_lidar = df_lidar.assign(min_max_diff =  df_lidar['max_lidar'] - df_lidar['min_lidar'])

    # ---------------------------------------
    # add canopy height features to vegetation dataframe
    # (is_veg.index is used as positions into the flattened canopy height vectors)
    scene_features = pd.concat([is_veg, df_lidar.iloc[is_veg.index]], axis=1)

    # reorder columns to match classifier feature order
    feature_order = ['r', 'g', 'b', 'nir', 'ndvi',
                     'year', 'month', 'day_in_year',
                     'lidar', 'max_lidar', 'min_lidar', 'min_max_diff', 'avg_lidar']

    scene_features = scene_features[feature_order]

    # ---------------------------------------
    times_pre.append(time.time() - t0)
    n_pixels.append(len(is_veg))
    print('finished pre-processing')

    # ---------------------------------------
    # free memory
    del is_veg, df_lidar, match_vector, lidar_values
    gc.collect()

    # ***********************************************************************************************
    # *************************************** CLASSIFICATION ****************************************
    # convert into dask.array and predict using model
    # NOTE(review): the chunk size 728802 is unexplained here -- confirm how it was chosen
    da_pixels = da.from_array(np.array(scene_features), chunks=728802)
    scene_preds = rfc.predict(da_pixels)

    # ---------------------------------------
    # only the compute() call is timed
    t0 = time.time()
    preds = scene_preds.compute()
    times_class.append(time.time() - t0)
    print('finished classification')

    # ************************************************************************************************
    # *************************************** POST-PROCESSING ****************************************
    # recover pixel indices for iceplant classifications
    t0 = time.time()
    preds_df = pd.DataFrame(preds,
                            columns=['is_iceplant'],
                            index = scene_features.index)
    is_iceplant_index = preds_df[preds_df.is_iceplant == 1].index.to_numpy()
    non_iceplant_index = preds_df[preds_df.is_iceplant == 0].index.to_numpy()

    # ---------------------------------------
    # reconstruct indices into image
    indices = [non_iceplant_index,
               is_iceplant_index,
               not_veg_index,
               water_index]
    values = [0,    # values assigned to pixels from each index
              1,
              2,
              3]
    # NOTE(review): back_value=100 is presumably given to pixels in none of the
    # index sets -- confirm in raster_to_features
    reconstruct = rf.indices_to_image(raster.shape[1], raster.shape[2], indices, values, back_value=100)

    # ---------------------------------------
    # apply median 3x3 filter if needed
    # (must test the filter_rasters option: the builtin `filter` is always truthy,
    #  which made the filter run unconditionally)
    if filter_rasters:
        reconstruct = median_filter(reconstruct, size=3)

    times_post.append(time.time() - t0)
    print('finished post-processing')

    # ************************************************************************************************
    # *************************************** SAVE RASTERS *******************************************
    if save_rasters:
        filename = preds_prefix + itemid + '.tif'

        with rasterio.open(
            os.path.join(fp, filename),  # file path
            'w',           # w = write
            driver = 'GTiff', # format
            height = reconstruct.shape[0],
            width = reconstruct.shape[1],
            count = 1,  # number of raster bands in the dataset
            dtype = rasterio.uint8,
            crs = raster.rio.crs,
            transform = raster.rio.transform(),
        ) as dst:
            dst.write(reconstruct.astype(rasterio.uint8), 1)

    # ************************************************************************************************
    # ************************************ FINAL INFO MESSAGE ***************************************
    N -= 1
    print('total time:', time.time() - t_alpha)
    print('FINISHED: ', itemid)
    print('REMAINING: ', N, 'scenes \n')

finished pre-processing
finished classification
finished post-processing
total time: 85.81719446182251
FINISHED:  ca_m_3412037_nw_10_060_20200607
REMAINING:  23 scenes 

finished pre-processing
finished classification
finished post-processing
total time: 49.578001499176025
FINISHED:  ca_m_3412037_ne_10_060_20200607
REMAINING:  22 scenes 

finished pre-processing
finished classification
finished post-processing
total time: 30.285654306411743
FINISHED:  ca_m_3412029_sw_10_060_20200607
REMAINING:  21 scenes 

no data at intersection of scene with coastal buffer
FINISHED:  ca_m_3412004_sw_10_060_20200607 

no vegetation pixels at intersection of scene data with coastal buffer
FINISHED:  ca_m_3412003_se_10_060_20200607 

finished pre-processing
finished classification
finished post-processing
total time: 37.09280824661255
FINISHED:  ca_m_3412003_ne_10_060_20200607
REMAINING:  18 scenes 

finished pre-processing
finished classification
finished post-processing
total time: 48.50507354736328
F

In [7]:
# assemble the per-scene bookkeeping lists into one table (one row per scene id)
processing_df = pd.DataFrame({'itemid': scene_ids,
                              'processed': processed,
                              'reason': reason,
                              'pre_times': times_pre,
                              'class_times': times_class,
                              'post_times': times_post,
                              'n_pixels': n_pixels})

# results go in <cwd>/processing_results, created on first use
fp = os.path.join(os.getcwd(), 'processing_results')
if not os.path.exists(fp):
    os.mkdir(fp)

# file name records whether the rasters were median-filtered, plus the year
if filter_rasters:
    results_prefix = 'LS_filter_clip_processing_results_'
else:
    results_prefix = 'LS_clip_processing_results_'
filename = results_prefix + str(year) + '.csv'

processing_df.to_csv(os.path.join(fp, filename), index=False)

In [8]:
# delete aux canopy height rasters (entries 1-3 of lidar_fps: max, min, avg);
# entry 0 is the canopy height raster itself and is kept
for aux_fp in lidar_fps[1:4]:
    os.remove(aux_fp)


2022-10-28 23:25:23,236 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
Exception in callback None()
handle: <Handle cancelled>
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/iostream.py", line 1391, in _do_ssl_handshake
    self.socket.do_handshake()
  File "/srv/conda/envs/notebook/lib/python3.10/ssl.py", line 1342, in do_handshake
    self._sslobj.do_handshake()
ssl.SSLEOFError: EOF occurred in violation of protocol (_ssl.c:997)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 189, in _handle_events
    handler_func(fileobj, events)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packag