In [None]:
import os
import gc
import time
import pandas as pd
import numpy as np

import rasterio
import rioxarray as rioxr
import geopandas as gpd

from shapely.geometry import mapping

from scipy.ndimage import median_filter

import dask_gateway
import dask.array as da

from joblib import load

# custom modules
import data_sampling_workflow.sample_rasters as sr
import raster_to_features as rf

In [None]:
# **************************************************************
# ********* SPECIFY LIDAR YEAR HERE; SCENE ITEMIDS ARE READ ****
# ********* FROM THE MATCHING CSV FILE FOR THAT YEAR ***********

year = 2014

# csv listing the NAIP scenes for the chosen year; it lives in
# <current working directory>/coastal_scenes_ids/
scene_ids_csv = os.path.join(os.getcwd(),
                             'coastal_scenes_ids',
                             'coastal_scenes_ids_{}.csv'.format(year))

# keep only the itemid column
# (append e.g. .iloc[3:22] to process just a subset of the scenes)
scene_ids = pd.read_csv(scene_ids_csv).itemid
scene_ids

In [None]:
# SB coastal zone shapefile: its geometries are used to clip each scene
coast_dir = os.path.join(os.getcwd(), 'separating_naip_flights', 'sb_coast')
fp = os.path.join(coast_dir, 'sb_coast.shp')

coast = gpd.read_file(fp)

# apply shapely's `mapping` to every geometry of the shapefile
coast_geo = coast.geometry.apply(mapping)

In [None]:
# **************************************************************
# Create auxiliary canopy height files to sample from
#
# Outputs of this cell:
#   rast_name         - base name shared by the auxiliary rasters
#   lidar_fps         - file paths, in the order: lidar, max, min, avg
#   lidar_rast_reader - reader for the canopy height raster (closed on exit)

rast_name = 'SB_canopy_height_'+str(year) # give a name to canopy height raster

# list of file paths to aux canopy height rasters
# order of filepaths is: lidar, max, min, avg
lidar_fps = [sr.path_to_lidar(year)]
for tag in ['_maxs', '_mins', '_avgs']:
    lidar_fps.append(os.path.join(os.getcwd(),
                                  'temp',
                                  rast_name+tag+'.tif'))

# function that builds each auxiliary raster, in the same order as lidar_fps[1:]
# (presumably each one writes temp/<rast_name><tag>.tif -- verify in sample_rasters)
aux_builders = [sr.max_raster, sr.min_raster, sr.avg_raster]

# Open the canopy height raster in a `with` block so the file handle is
# released even if building one of the auxiliary rasters fails.
with rasterio.open(lidar_fps[0]) as lidar_rast_reader:

    # Check each auxiliary raster against its own path. The previous version
    # indexed `fp` (the shapefile path string) instead of `lidar_fps`, so it
    # tested single characters and rebuilt every raster whenever one was missing.
    missing = [(aux_fp, build_aux)
               for aux_fp, build_aux in zip(lidar_fps[1:], aux_builders)
               if not os.path.exists(aux_fp)]

    if missing:
        t0 = time.time()

        # save the missing aux rasters in temp folder
        for aux_fp, build_aux in missing:
            build_aux(rast_reader = lidar_rast_reader, rast_name = rast_name, n=3)

        print('time to make auxiliary rasters: ', (time.time()-t0), 'seconds')

In [None]:
# load pre-trained random forest classifier
# NOTE(review): joblib.load unpickles the file, and unpickling can run arbitrary
# code -- only load model files that come from a trusted source.
# The model name has no directory part, so it is looked up relative to the
# current working directory.
model_name = 'lidar_spectral_rfc.joblib'
rfc = load(model_name) 
print('loaded model')

In [None]:
# initialize DASK cluster
# A new cluster is started through Dask Gateway and a client connected to it
# is created; the cluster is then scaled to 15 (presumably 15 workers -- confirm
# against the gateway's limits).
# NOTE(review): nothing in the visible notebook shuts the cluster down afterwards.
cluster = dask_gateway.GatewayCluster()
client = cluster.get_client()
cluster.scale(15)
# last expression of the cell: shows the client summary as cell output
client

In [None]:
# ---------------------------------------
# Run settings

# Whether to smooth the predictions with a 3x3 median filter before saving.
# This used to be guarded by `if filter:`, which tested Python's built-in
# `filter` function (always truthy) rather than a setting, so the filter
# could not be turned off. True keeps the behaviour the loop always had.
apply_median_filter = True

# chunk size handed to dask when the feature array is split for prediction
prediction_chunks = 728802

# column names for the resampled canopy height layers (same order as lidar_fps)
lidar_columns = ['lidar', 'max_lidar', 'min_lidar', 'avg_lidar']

# column order matching the classifier feature order
feature_order = ['r', 'g', 'b', 'nir', 'ndvi',
                 'year', 'month', 'day_in_year',
                 'lidar', 'max_lidar', 'min_lidar', 'min_max_diff', 'avg_lidar']

# ---------------------------------------
# Process every scene: clip, build features, predict, save predictions raster.
# Saved raster values: 0 = vegetation not classified as iceplant, 1 = iceplant,
# 2 = low-ndvi (not vegetation), 3 = water, 100 = background.

total_0 = time.time()
times = []   # total seconds per scene; only scenes that reach prediction are recorded

for itemid in scene_ids:
    t_alpha = time.time()

    # ---------------------------------------
    # open NAIP scene and clip to coast
    t0 = time.time()
    raster = rf.rioxr_from_itemid(itemid).rio.clip(coast_geo, coast.crs)
    print('clipped raster', time.time() - t0,' s')

    # ---------------------------------------
    # select non-zero pixels as df
    df = rf.raster_as_df(raster.to_numpy(), ['r','g','b','nir'])
    df = df.loc[ (df['nir'] != 0) | (df['r'] != 0) | (df['g'] != 0) | (df['b'] != 0)]
    print('selected non-zero pixels')

    # nothing left after clipping: skip this scene
    if df.shape[0] == 0:
        print('no data at intersection of scene with coastal buffer')
        print('FINISHED: ', itemid , '\n')
        continue

    # ---------------------------------------
    # find vegetation pixels to go into model
    # keep indices of water and low-ndvi pixels
    # adds ndvi and ndwi features for each pixel
    t0 = time.time()
    is_veg, water_index, not_veg_index = rf.add_spectral_features(df,
                                                                  ndwi_thresh = 0.3,
                                                                  ndvi_thresh = 0.05)

    # no vegetation to classify: skip this scene
    if is_veg.shape[0]==0:
        print('no vegetation pixels at intersection of scene data with coastal buffer')
        print('FINISHED: ', itemid , '\n')
        continue

    # select features
    is_veg.drop('ndwi', axis=1, inplace=True)
    is_veg = rf.add_date_features(is_veg, rf.rioxr_from_itemid(itemid).datetime)
    print('assembled pixels dataframe with features\n   time taken to assemble: ', time.time() - t0,' s')

    # ---------------------------------------
    # Resample canopy height layers to match NAIP scene resolution and extent
    t0 = time.time()

    # resampled canopy height layers as vectors (one flat vector per layer)
    lidar_values = []
    for fp in lidar_fps:
        match = sr.open_and_match(fp, raster)
        match_vector = match.to_numpy().reshape(match.shape[0]*match.shape[1])
        lidar_values.append(match_vector)

    df_lidar = pd.DataFrame(dict(zip(lidar_columns, lidar_values)))
    df_lidar = df_lidar.assign(min_max_diff =  df_lidar['max_lidar'] - df_lidar['min_lidar'])
    print('time to resample and reshape rasters: ', (time.time()-t0), 'seconds')

    # ****************** add LIDAR features to vegetation dataframe *************************
    # is_veg.index is used as row positions into the flattened canopy height layers
    scene_features = pd.concat([is_veg, df_lidar.iloc[is_veg.index]], axis=1)

    # reorder columns to match classifier feature order
    scene_features = scene_features[feature_order]

    # ---------------------------------------
    # free memory
    del is_veg, df_lidar, match_vector, lidar_values
    gc.collect()

    # ****************** PREDICT USING DASK *****************************
    # convert into dask.array and predict using model
    da_pixels = da.from_array(np.array(scene_features), chunks=prediction_chunks)
    scene_preds = rfc.predict(da_pixels)
    t0 = time.time()
    preds = scene_preds.compute()
    print('time taken to predict: ', time.time() - t0,' s')

    # ****************** RECONSTRUCT INTO IMAGE *************************
    # recover pixel indices for iceplant classifications
    preds_df = pd.DataFrame(preds,
                            columns=['is_iceplant'],
                            index = scene_features.index)
    is_iceplant_index = preds_df[preds_df.is_iceplant == 1].index.to_numpy()
    non_iceplant_index = preds_df[preds_df.is_iceplant == 0].index.to_numpy()

    # indices of different categories
    indices = [non_iceplant_index,
               is_iceplant_index,
               not_veg_index,
               water_index]
    values = [0,    # values assigned to pixels from each index
              1,
              2,
              3]
    t0 = time.time()
    reconstruct = rf.indices_to_image(raster.shape[1], raster.shape[2], indices, values, back_value=100)
    print('reconstructed image\n   time taken to reconstruct: ', time.time() - t0,' s')

    # ****************** APPLY 3x3 MEDIAN FILTER *************************
    if apply_median_filter:
        t0 = time.time()
        reconstruct = median_filter(reconstruct, size=3)
        print('time taken to filter: ', time.time()- t0,' s')

    # ****************** SAVE PREDICTIONS AS RASTER *********************
    # note: the file name contains 'filter' whatever apply_median_filter is set to
    filename = 'LS_clip_filter_preds_' + itemid + '.tif'
    with rasterio.open(
        os.path.join(os.getcwd(), 'temp', filename),  # file path
        'w',           # w = write
        driver = 'GTiff', # format
        height = reconstruct.shape[0],
        width = reconstruct.shape[1],
        count = 1,  # number of raster bands in the dataset
        dtype = rasterio.uint8,
        crs = raster.rio.crs,
        transform = raster.rio.transform(),
    ) as dst:
        dst.write(reconstruct.astype(rasterio.uint8), 1)
    print('saved predictions')

    # ****************** RECORD TOTAL TIME *********************
    total =  time.time() - t_alpha
    times.append(total)
    print('total time:', total)
    print('FINISHED: ', itemid , '\n')

# seconds taken to process all scenes
print(time.time() - total_0)

In [None]:
# delete aux canopy height rasters (max, min, avg)
# lidar_fps[0] is the canopy height raster itself and is kept.
# Files that are already gone are skipped, so this cell can be re-run safely.
for aux_fp in lidar_fps[1:]:
    if os.path.exists(aux_fp):
        os.remove(aux_fp)


In [None]:
# show the per-scene processing times in seconds; scenes skipped for having
# no data or no vegetation pixels are not included
times