In [1]:
import os
import gc   # garbage collector
import time
import pandas as pd
import numpy as np

import rasterio
import rioxarray as rioxr

import dask_gateway
import dask.array as da

from joblib import load

import geopandas as gpd
from shapely.geometry import mapping

from scipy.ndimage import median_filter

# custom modules
#import raster_to_features as rf
import A_data_sampling_workflow.sample_rasters as sr
import shutil

In [2]:
# ------------------------------------------------------------------
# Run configuration: every value a reader might tune lives in this cell.
# ------------------------------------------------------------------

# NAIP year for which iceplant locations are predicted
year = 2020

# median filter on the reconstructed prediction raster:
# on/off switch and window size passed to scipy's median_filter
filter_rasters = False
filter_side = 5

# on/off switch for writing prediction rasters; `prefix` names the model
# file (<prefix>_rfc.joblib) and every output file/folder
save_rasters = False
prefix = 'salt13_p30'

# print progress messages for each scene while processing
verbose = True

# write the per-scene processing times table to csv at the end
save_processing_times = True

# remove each auxiliary (avg/entropy) raster once its values were read
delete_aux_rasters = True

# clip every scene to the coastal buffer before processing
clip = True

# process only the hardcoded areas-of-interest scenes instead of all coastal scenes
only_aois = False

# compute the NDVI window features in addition to the r, g, b, nir ones
uses_ndvi = True

# radius of the disk (in pixels) over which entropy is calculated
entropy_r = 6

# model input columns, in the order they are passed to the classifier
# (original note: features for snow13): for each band the raw value, its
# window average and its window entropy, followed by the date features
feature_order = [
    band + suffix
    for band in ('r', 'g', 'b', 'nir', 'ndvi')
    for suffix in ('', '_avg13', '_entr13')
]
feature_order += ['month', 'day_in_year']

In [3]:
# length of side of the square window over which average/max/min are calculated.
box_side = entropy_r *2 +1

# **************************************************************
# open shapefile of SB coastal buffer and process it to use it for clipping
fp = os.path.join(os.getcwd(), 
                  'separating_naip_flights', 
                  'SB_coastal_buffer', 
                  'SB_coastal_buffer.shp')
coast = gpd.read_file(fp)
# one GeoJSON-like mapping per geometry, the form passed to raster.rio.clip
coast_geo = coast.geometry.apply(mapping)

# **************************************************************
# load pre-trained random forest classifier
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
rfc = load(prefix+'_rfc.joblib') 

# **************************************************************
# select the scenes to process
if only_aois:
    scene_ids = ['ca_m_3412037_nw_10_060_20200607',
                 'ca_m_3412039_nw_10_060_20200522',
                 'ca_m_3412040_ne_10_060_20200522',
                 'ca_m_3411934_sw_11_060_20200521',
                 'ca_m_3411936_se_11_060_20200521']
else:    
    # select the scene ids from given year that intersect the coastal buffer
    # the itemids of all scenes that intersect the coast were previously stored in a csv
    coastal_scenes = pd.read_csv(os.path.join(os.getcwd(),
                                              'separating_naip_flights',
                                              'coastal_scenes_ids.csv'))
    # Series of item ids for `year`, re-indexed 0..n-1
    scene_ids = coastal_scenes.loc[coastal_scenes['year'] == year, 'itemid'].reset_index(drop=True)

# **************************************************************
# prepare folder to save rasters
# `fp` is left pointing at the predictions folder: the scene-processing loop
# joins it with each output file name when save_rasters is True.
if save_rasters:
    if filter_rasters:
        preds_folder = prefix+'_filter_clip_preds_' + str(year)
    else:
        preds_folder = prefix+'_clip_preds_' + str(year)
    fp = os.path.join(os.getcwd(), 'processing_results', preds_folder)
    # creates 'processing_results' and the predictions folder in one call and
    # does not fail if either already exists
    os.makedirs(fp, exist_ok=True)

In [4]:
# # initialize DASK cluster (currently disabled: the classification cell predicts on an in-memory NumPy array)
# cluster = dask_gateway.GatewayCluster()
# cluster.scale(15)

# client = cluster.get_client()
# client

In [5]:
# ---------------------------------------
# Classify every queued NAIP scene:
#   data access -> pre-processing -> feature creation -> classification
#   -> post-processing (-> save raster)
# The lists below collect processing information, one entry per scene.
times_access = []    # seconds spent opening the scene
times_pre = []       # seconds spent clipping and selecting pixels
times_features = []  # seconds spent assembling the model features
times_class = []     # seconds spent in rfc.predict
times_post = []      # seconds spent rebuilding the prediction image
processed = []       # 'Y' is appended here for classified scenes
reason = []          # 'processed' is appended here for classified scenes
# NOTE(review): veg_pixels is only handed to sr.finish_processing in this cell
# and is never appended to for classified scenes; presumably it counts the
# vegetation pixels selected with ndwi_thresh=0.3 / ndvi_thresh=0.05 - confirm.
veg_pixels = []
n_pixels = []        # number of pixels with at least one non-zero band in the (clipped) scene

# counter for scenes queued for processing
N = len(scene_ids)

# total time initial point
t_total = time.time()

# ---------------------------------------
# ---------------------------------------

for itemid in scene_ids:
    # ***********************************************************************************************
    # *************************************** DATA ACCESS ****************************************
    # open NAIP scene
    t0 = time.time()
    raster = sr.rioxr_from_itemid(itemid)
    times_access.append(time.time()-t0)

    # ***********************************************************************************************
    # *************************************** PRE-PROCESSING ****************************************
    t0 = time.time()
    if clip:
        # clip to the coastal buffer
        raster = raster.rio.clip(coast_geo, coast.crs)

    # ---------------------------------------
    # select pixels with data (blacked out portions have 0 on all bands)
    df = sr.raster_as_df(raster.to_numpy(), ['r','g','b','nir'])
    df = df.loc[ (df['nir'] != 0) | (df['r'] != 0) | (df['g'] != 0) | (df['b'] != 0)]
    n_pixels.append(df.shape[0])

    # ---------------------------------------
    # stop if there's no data at intersection
    if df.shape[0] == 0:
        times_pre.append(time.time()-t0)
        sr.finish_processing('no_data', processed, reason, 
                             times_features, times_class, times_post, 
                             veg_pixels, itemid)
        if verbose:
            sr.finish_processing_message('no_data', itemid)

    else:
        # find vegetation pixels to go into model
        # keep indices of water and low-ndvi pixels
        # add ndvi and ndwi features for each pixel
        if verbose:
            print('selected data on scene')

        is_veg, water_index, not_veg_index = sr.add_spectral_features(df, 
                                                                      ndwi_thresh = 0.3, 
                                                                      ndvi_thresh = 0.05) 
        # ---------------------------------------
        # stop if there are no vegetation pixels at intersection
        if is_veg.shape[0] == 0:
            times_pre.append(time.time()-t0)
            sr.finish_processing('no_veg', processed, reason, 
                                 times_features, times_class, times_post, 
                                 veg_pixels, itemid)
            if verbose:
                sr.finish_processing_message('no_veg', itemid)

        else:
            times_pre.append(time.time()-t0)
            processed.append('Y')
            reason.append('processed')
            if verbose:
                print('selected vegetation pixels')

            # **************************************************************************************
            # ******************************** START CREATING FEATURES *****************************
            t0 = time.time()
            # ---------------------------------------
            # discard ndwi and add date features
            # (reassign instead of dropping in place, so no slice of df is mutated)
            is_veg = is_veg.drop(columns='ndwi')
            # NOTE(review): the scene is opened a second time here only to read
            # its `datetime`; this time is counted as feature-creation time.
            is_veg = sr.add_date_features(is_veg, sr.rioxr_from_itemid(itemid).datetime)

            # *************************************************************************************************
            # ******************************** CREATE R,G,B,NIR AUXILIARY RASTERS *****************************
            # make auxiliary spectral rasters from clipped NAIP 
            band_names = ['r_', 'g_', 'b_', 'nir_']
            tags = ['_avgs', '_entrs']
            window_fps = []   # file path of each auxiliary raster
            window_cols = []  # feature column name matching each file path

            for name, band in zip(band_names,range(1,5)):
                rast_name = name+itemid

                for tag in tags:
                    rast_fp = os.path.join(os.getcwd(), 'temp', rast_name + tag + '.tif')
                    window_fps.append(rast_fp)
                    # e.g. 'r_' + '_avgs' -> 'r_avg13' when box_side is 13
                    window_cols.append( name.replace('_','')+tag.replace('s',str(box_side)))

                    # only create the auxiliary raster if it is not already on disk
                    if not os.path.isfile(rast_fp):
                        if tag == '_avgs':
                            sr.avg_raster(raster=raster, band=band, rast_name=rast_name, n=box_side)
                        elif tag == '_entrs':
                            sr.entropy_raster(raster=raster, band=band, rast_name=rast_name, n=entropy_r)

            if verbose:
                print('created/verified R,G,B,NIR auxiliary rasters (avgs, entr)')

            # ********************************************************************************************
            # ******************************** CREATE NDVI AUXILIARY RASTERS *****************************
            if uses_ndvi:
                # make auxiliary NDVI from clipped NAIP 
                ndvi = sr.ndvi_xarray(raster)
                band_names.append('ndvi_')
                rast_name = 'ndvi_'+itemid

                for tag in tags:
                    rast_fp = os.path.join(os.getcwd(), 'temp', rast_name + tag + '.tif')
                    window_fps.append(rast_fp)
                    window_cols.append('ndvi' + tag.replace('s',str(box_side)))

                    if not os.path.isfile(rast_fp):
                        if tag == '_avgs':
                            sr.avg_raster(rast_data=ndvi, 
                                          crs=raster.rio.crs, 
                                          transf=raster.rio.transform(), 
                                          rast_name=rast_name, 
                                          n=box_side)
                        elif tag == '_entrs':
                            # adjusting to entropy input types: rescale into a
                            # temporary so `ndvi` itself is not overwritten for
                            # any tag processed afterwards
                            ndvi_rescaled = ndvi*100+100
                            sr.entropy_raster(rast_data=ndvi_rescaled.astype('uint8'), 
                                              crs=raster.rio.crs, 
                                              transf=raster.rio.transform(), 
                                              rast_name=rast_name, 
                                              n=entropy_r)
                            del ndvi_rescaled
                if verbose:
                    print('created/verified NDVI auxiliary rasters (avgs,entr)')
                #free memory
                del ndvi
                gc.collect()

            # *******************************************************************************************
            # *********************** EXTRACT FEATURES FROM AUXILIARY RASTERS ***************************
            window_values = []
            for fp_aux in window_fps:
                # the context manager closes the file handle before the file is
                # (optionally) deleted below
                with rioxr.open_rasterio(fp_aux) as aux:
                    match = aux.squeeze()
                    # flatten the 2D raster into one value per pixel
                    match_vector = match.to_numpy().reshape(match.shape[0]*match.shape[1])
                window_values.append(match_vector)
                if delete_aux_rasters:
                    os.remove(fp_aux)

            df_window = pd.DataFrame(dict(zip( window_cols, window_values)))

            # is_veg's index is used as row positions into the flattened rasters
            scene_features = pd.concat([is_veg, df_window.iloc[is_veg.index]], axis=1)

            # **********************************************************************************************
            # *************** REMOVE NA VALUES (PIXELS AT EDGE OF CLIPPED PART OF RASTER) ******************
            # mark rows with r_min == 0 or with NaN in any NDVI window feature
            edge_mask = pd.Series(False, index=scene_features.index)
            if '_min' in tags:
                # not reachable with the current `tags`; the column name now
                # follows box_side instead of the previously hardcoded r_min11
                edge_mask |= scene_features['r_min' + str(box_side)] == 0
            if uses_ndvi:
                for tag in tags:
                    ndvi_col = 'ndvi' + tag.replace('s',str(box_side))
                    edge_mask |= scene_features[ndvi_col].isna()
            # remove these rows from scene_features
            # no need to add them anywhere else, they will be part of the raster's background
            scene_features = scene_features.loc[~edge_mask]

            #free memory
            del df_window, window_values, match_vector, match, aux, edge_mask
            gc.collect()

            # ******************************************************************************
            # ******************************** ORDER FEATURES ****************************** 

            scene_features = scene_features[feature_order]
            times_features.append(time.time()-t0)
            if verbose:
                print('finished assembling features')

            # ***********************************************************************************************
            # *************************************** CLASSIFICATION ****************************************
            t0 = time.time()
            # predict in memory with the loaded model (the dask.array variant is disabled)
            preds = rfc.predict(np.array(scene_features))

            times_class.append(time.time() - t0)
            if verbose:
                print('finished classification')

            # ************************************************************************************************
            # *************************************** POST-PROCESSING ****************************************
            t0 = time.time()
            # recover pixel indices for iceplant classifications
            preds_df = pd.DataFrame(preds, 
                                    columns=['is_iceplant'], 
                                    index = scene_features.index)
            is_iceplant_index = preds_df[preds_df.is_iceplant == 1].index.to_numpy()
            non_iceplant_index = preds_df[preds_df.is_iceplant == 0].index.to_numpy()

            # ---------------------------------------
            # reconstruct indices into image
            indices = [non_iceplant_index,
                       is_iceplant_index, 
                       not_veg_index,
                       water_index]
            values = [0,    # values assigned to pixels from each index
                      1,
                      2,
                      3]
            reconstruct = sr.indices_to_image(raster.shape[1], raster.shape[2], indices, values, back_value=100)

            # ---------------------------------------
            # apply median filter (window of filter_side pixels) only if requested.
            # This used to test the builtin `filter`, which is always truthy, so
            # the filter ran regardless of `filter_rasters`.
            if filter_rasters:
                reconstruct = median_filter(reconstruct, size=filter_side)

            times_post.append(time.time() - t0)
            if verbose:
                print('finished post-processing')

            # ************************************************************************************************
            # *************************************** SAVE RASTERS *******************************************  
            if save_rasters:
                if filter_rasters:
                    filename = prefix+'_filter_clip_preds_' + itemid + '.tif'
                else:
                    filename = prefix+'_clip_preds_' + itemid + '.tif'

                # `fp` must still hold the predictions folder prepared in the
                # set-up cell when save_rasters is True
                with rasterio.open(
                    os.path.join(fp, filename),  # file path
                    'w',           # w = write
                    driver = 'GTiff', # format
                    height = reconstruct.shape[0], 
                    width = reconstruct.shape[1],
                    count = 1,  # number of raster bands in the dataset
                    dtype = rasterio.uint8,
                    crs = raster.rio.crs,
                    transform = raster.rio.transform(),
                ) as dst:
                    dst.write(reconstruct.astype(rasterio.uint8), 1)

            if verbose:
                print('FINISHED: ', itemid)

    # ***********************************************************************************************
    # ************************************ FINAL INFO MESSAGE ***************************************
    N = N-1
    if verbose:
        print('REMAINING: ', N, 'scenes', '\n')
    else:
        # overwrite the same console line when not in verbose mode
        print('REMAINING: ', N, 'scenes', end="\r")

# total wall-clock time of the loop, in seconds (reported in minutes)
total_time = time.time() - t_total
print('TOTAL TIME: ', (total_time)/60, ' mins')

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3412037_nw_10_060_20200607
REMAINING:  23 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3412037_ne_10_060_20200607
REMAINING:  22 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3412029_sw_10_060_20200607
REMAINING:  21 scenes 

no data at intersection of scene with coastal buffer
FINISHED:  ca_m_3412004_sw_10_060_20200607 
REMAINING:  20 scenes 

no data at intersection of scene with coastal buffer
FINISHED:  ca_m_3412003_se_10_060_20200607 
REMAINING:  19 scenes 



  return data.astype(dtype, **kwargs)


selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3412003_ne_10_060_20200607
REMAINING:  18 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3412038_nw_10_060_20200523
REMAINING:  17 scenes 



  return data.astype(dtype, **kwargs)


selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3412040_nw_10_060_20200522
REMAINING:  16 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3412040_ne_10_060_20200522
REMAINING:  15 scenes 



  return data.astype(dtype, **kwargs)


selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3412039_nw_10_060_20200522
REMAINING:  14 scenes 



  return data.astype(dtype, **kwargs)


selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3412039_ne_10_060_20200522
REMAINING:  13 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3412038_ne_10_060_20200522
REMAINING:  12 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3411933_nw_11_060_20200522
REMAINING:  11 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3411933_ne_11_060_20200522
REMAINING:  10 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3411945_nw_11_060_20200521
REMAINING:  9 scenes 



  return data.astype(dtype, **kwargs)


selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3411937_sw_11_060_20200521
REMAINING:  8 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3411936_sw_11_060_20200521
REMAINING:  7 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3411936_se_11_060_20200521
REMAINING:  6 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3411935_sw_11_060_20200521
REMAINING:  5 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3411935_se_11_060_20200521
REMAINING:  4 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3411934_sw_11_060_20200521
REMAINING:  3 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3411934_se_11_060_20200521
REMAINING:  2 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3411933_sw_11_060_20200521
REMAINING:  1 scenes 

selected data on scene
selected vegetation pixels
created/verified R,G,B,NIR auxiliary rasters (avgs, entr)


  return data.astype(dtype, **kwargs)


created/verified NDVI auxiliary rasters (avgs,entr)
finished assembling features
finished classification
finished post-processing
FINISHED:  ca_m_3411933_se_11_060_20200521
REMAINING:  0 scenes 

TOTAL TIME:  117.83814803759257  mins


In [6]:
# Gather the per-scene processing information collected by the loop into a
# single table: one row per scene, one column per collected list.
column_names = ['itemid', 'processed', 'reason',
                'access_times', 'pre_times', 'fts_times',
                'class_times', 'post_times', 'processed_pix']
column_values = [scene_ids, processed, reason,
                 times_access, times_pre, times_features,
                 times_class, times_post, n_pixels]

# same mapping (and column order) as before, kept under its original name
D = dict(zip(column_names, column_values))
processing_df = pd.DataFrame(D)
processing_df

Unnamed: 0,itemid,processed,reason,access_times,pre_times,fts_times,class_times,post_times,processed_pix
0,ca_m_3412037_nw_10_060_20200607,Y,processed,1.587782,26.360348,557.220994,79.87214,13.112481,28546681
1,ca_m_3412037_ne_10_060_20200607,Y,processed,0.688008,20.211365,201.448281,16.53633,4.423868,19165736
2,ca_m_3412029_sw_10_060_20200607,Y,processed,0.546734,14.261548,93.635355,9.971226,2.360255,4940263
3,ca_m_3412004_sw_10_060_20200607,N,no data in intersection,0.347309,8.381728,0.0,0.0,0.0,0
4,ca_m_3412003_se_10_060_20200607,N,no data in intersection,0.388489,12.236419,0.0,0.0,0.0,0
5,ca_m_3412003_ne_10_060_20200607,Y,processed,0.285535,13.81072,115.54319,3.842109,2.513716,2379409
6,ca_m_3412038_nw_10_060_20200523,Y,processed,0.81632,20.790342,239.157969,29.535114,5.377177,19305320
7,ca_m_3412040_nw_10_060_20200522,Y,processed,0.353707,20.224323,227.793397,45.927072,5.195697,19429096
8,ca_m_3412040_ne_10_060_20200522,Y,processed,0.417923,18.940705,202.633986,70.38844,4.682322,19698261
9,ca_m_3412039_nw_10_060_20200522,Y,processed,0.438335,19.371392,173.227708,60.328789,3.963371,18733821


In [7]:
# Write the per-scene processing table to <cwd>/processing_results as csv.
if save_processing_times:
    # Use a dedicated name rather than reassigning `fp`: the scene-processing
    # loop reads `fp` as the raster output folder when save_rasters is True,
    # so overwriting it here would misdirect rasters if that cell is re-run.
    results_dir = os.path.join(os.getcwd(),'processing_results')
    # create the folder if needed; no error if it already exists
    os.makedirs(results_dir, exist_ok=True)

    if filter_rasters:
        filename = prefix+'_filter_clip_processing_results_' + str(year) + '.csv'
    else:
        filename = prefix+'_clip_processing_results_BIS_' + str(year) + '.csv'

    processing_df.to_csv(os.path.join(results_dir, filename), index=False)

In [8]:
# Persist the overall run time; total_time is a time.time() difference,
# i.e. a number of seconds.
total_time_message = "Total time " + str(total_time)
with open("TOTAL_TIME.txt", "w") as text_file:
    text_file.write(total_time_message)