In [1]:
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

import rasterio
import geopandas as gpd

import iceplant_detection_functions as ipf
import model_prep_and_evals as mpe

import warnings

import planetary_computer as pc
import rioxarray as rioxr

import gc # garbage collector

import dask
import dask_gateway
import dask.array as da

import data_sampling_workflow.lidar_sampling_functions as lsf

In [3]:
t_alpha = time.time()

## Import train/test sets

In [5]:
X_train = pd.read_csv(os.path.join(os.getcwd(),'data_sampling_workflow','train_set.csv')).loc[:,'r':'avg_lidar']
y_train = pd.read_csv(os.path.join(os.getcwd(),'data_sampling_workflow','train_set.csv')).loc[:,'iceplant'] 

In [6]:
# remove 2012 and 2014 since these do not have LIDAR data of their own.
X_train = X_train.loc[X_train.year != 2012]
X_train = X_train.loc[X_train.year != 2014]
X_train.year.unique()

array([2016, 2018, 2020])

In [7]:
y_train = y_train.iloc[X_train.index]

In [8]:
mpe.iceplant_proportions(np.array(y_train))

no-iceplant:iceplant ratio     2.6 :1
          counts  percentage
iceplant                    
0         190655       72.35
1          72856       27.65



## Train model

In [9]:
from dask_ml.wrappers import ParallelPostFit

In [10]:
t0 = time.time()

rfc = ParallelPostFit(RandomForestClassifier(n_estimators = 100, random_state = 42))
rfc.fit(X_train, y_train)

print('time to train: ', (time.time()-t0))

time to train:  25.778734922409058


In [11]:
del X_train, y_train
gc.collect()

397

# Select NAIP scene and LIDAR year

In [12]:
# gaviota_itemids = ['ca_m_3412039_nw_10_060_20200522',
# 'ca_m_3412039_nw_10_060_20180724_20190209',
# 'ca_m_3412039_nw_10_.6_20160616_20161004',
# 'ca_m_3412039_nw_10_1_20140603_20141030',
# 'ca_m_3412039_nw_10_1_20120518_20120730']

In [13]:
campus_itemids = ['ca_m_3411934_sw_11_060_20200521',
                  'ca_m_3411934_sw_11_060_20180722_20190209',
                  'ca_m_3411934_sw_11_.6_20160713_20161004']

In [14]:
# **************************************************************
# ********* SPECIFY ITEMID AND LIDAR YEAR TO MATCH HERE ********

itemid = campus_itemids[2] # NAIP scene over Conception Point
lidar_year = 2016
save_raster = True
filename = 'LIDAR+NAIP_campus_'+str(lidar_year)+'_predictions.tif'

# **************************************************************
# **************************************************************

# Open NAIP scene for classification

In [15]:
href = pc.sign(ipf.get_item_from_id(itemid).assets["image"].href)

# rioxarray of complete naip scene
naip_scene = rioxr.open_rasterio(rasterio.open(href))

# Prepare LIDAR data for sampling

In [16]:
# Create auxiliary LIDAR files to sample from
lidar_reader = rasterio.open(lsf.path_to_lidar(lidar_year))
lsf.save_min_max_rasters(lidar_reader, os.getcwd()+'/temp',lidar_year)
lsf.save_avg_rasters(lidar_reader, os.getcwd()+'/temp',lidar_year)

In [17]:
# Resample lidar layers to match NAIP scene resolution and extent

lidar_match = lsf.open_and_match(fp, naip_scene)

fp = os.path.join(os.getcwd(),'temp','lidar_mins_'+str(lidar_year)+'.tif')
mins_match = lsf.open_and_match(fp, naip_scene)

fp = os.path.join(os.getcwd(),'temp','lidar_maxs_'+str(lidar_year)+'.tif')
maxs_match = lsf.open_and_match(fp, naip_scene)

fp = os.path.join(os.getcwd(),'temp','lidar_avgs_'+str(lidar_year)+'.tif')
avg_match = lsf.open_and_match(fp, naip_scene)

NameError: name 'fp' is not defined

In [None]:
# Assemble LIDAR layers into dataframe 

df_lidar = pd.DataFrame()
df_lidar['lidar'] = lidar_match.to_numpy().reshape(avg_match.shape[0]*avg_match.shape[1])
df_lidar['max_lidar'] = maxs_match.to_numpy().reshape(avg_match.shape[0]*avg_match.shape[1])
df_lidar['min_lidar'] = mins_match.to_numpy().reshape(avg_match.shape[0]*avg_match.shape[1])
df_lidar['min_max_diff'] = df_lidar.max_lidar - df_lidar.min_lidar
df_lidar['avg_lidar'] = avg_match.to_numpy().reshape(avg_match.shape[0]*avg_match.shape[1])

In [None]:
# clear memory
del avg_match, mins_match, maxs_match, lidar_match
gc.collect()

In [None]:
# Delete auxiliary LIDAR rasters in temp folder
lidar_fps = []
for tag in ['maxs_', 'mins_', 'avgs_']:
    lidar_fps.append(os.path.join(os.getcwd(),
                                  'temp',
                                  'lidar_'+tag+ str(lidar_year)+'.tif'))
for fp in lidar_fps:
    os.remove(fp)

# Select points to be classified using NDVI threshold

In [None]:
# select pixel with vegetation
veg = ipf.features_over_aoi(ipf.get_item_from_id(itemid), 
                           naip_scene.to_numpy(), 
                           thresh=0.05)
veg

# Sample and add LIDAR to vegetation pixels

In [None]:
# select same indices as vegetation
df_lidar_veg = df_lidar.iloc[veg.index]
df_lidar_veg

In [None]:
# add LIDAR features

# FEATURES MUST BE IN THIS ORDER TO MATCH THE MODEL:
# ['r', 'g', 'b', 'nir', 'ndvi', 
#  'year', 'month', 'day_in_year',
#  'lidar', 'max_lidar', 'min_lidar', 'min_max_diff', 'avg_lidar'])

scene_features = pd.concat([veg,df_lidar_veg],axis=1)
scene_features.head(4)

In [None]:
# clear memory
del veg, df_lidar, df_lidar_veg
gc.collect()

# Make dask data array of pixel values from NAIP scene raster

In [None]:
da_pixels = da.from_array(scene_features.to_numpy(), chunks=728802)
da_pixels

# Initialize Dask Cluster

In [None]:
cluster = dask_gateway.GatewayCluster()
client = cluster.get_client()
cluster.scale(30)
client

# Predict using dask

In [None]:
predictions = rfc.predict(da_pixels)
predictions

In [None]:
t0 = time.time()

preds = predictions.compute()
print('time to predict: ', time.time()-t0)

# Reassemble predictions into image

In [None]:
t0 = time.time()
# attach predictions to index of analyzed pixels
is_iceplant = pd.DataFrame(predictions, columns=['is_iceplant'], index=scene_features.index)

# select pixels classified as iceplant
is_iceplant = is_iceplant[is_iceplant.is_iceplant == 1]
print('select iceplant time: ', time.time()-t0)

In [None]:
t0 = time.time()
preds_image = ipf.indices_backto_image(naip_scene.shape[1], naip_scene.shape[2], is_iceplant.index)
print('indices to image time: ', time.time()-t0)

In [None]:
fig, ax = plt.subplots(figsize=(15, 15))
plt.title("LIDAR + SPECTRAL PREDICTIONS : "+str(lidar_year)+ " campus point NAIP scene")
ax.imshow(preds_image)
plt.show()

In [None]:
# save if necessary

if save_raster == True:
    with rasterio.open(
        os.path.join(os.getcwd(),'temp',filename),  # file path
        'w',           # w = write
        driver='GTiff', # format
        height = preds_image.shape[0], 
        width = preds_image.shape[1],
        count = 1,  # number of raster bands in the dataset
        dtype = rasterio.uint8,
        crs = naip_scene.rio.crs,
        transform = naip_scene.rio.transform(),
    ) as dst:
        dst.write(preds_image.astype(rasterio.uint8), 1)

In [None]:
# clean memory
del naip_scene, scene_features
del preds_image, predictions
gc.collect()

In [None]:
print('total time:', time.time() - t_alpha )