In [1]:
import pandas as pd
import os

import planetary_computer as pc
import data_sampling_workflow.sample_rasters as sr

import xarray as xr
import rioxarray as rioxr

import dask_gateway
import numpy as np

In [2]:
# initialize DASK cluster
# Start a Dask Gateway cluster, scale it to 30 workers and attach a client to it;
# the last line displays the client summary as the cell output.
cluster = dask_gateway.GatewayCluster()
cluster.scale(30)
client = cluster.get_client()
client

0,1
Connection method: Cluster object,Cluster type: dask_gateway.GatewayCluster
Dashboard: https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.f46d3ef153d9438cbf5c4fdef4df742f/status,


In [3]:
# NAIP item ids of the Campus Lagoon scenes used in this notebook (newest first).
campus_lagoon_ids = ['ca_m_3411934_sw_11_060_20200521',
                     'ca_m_3411934_sw_11_060_20180722_20190209',
                     'ca_m_3411934_sw_11_.6_20160713_20161004',
                     'ca_m_3411934_sw_11_1_20140601_20141030',
                     'ca_m_3411934_sw_11_1_20120505_20120730']

# Acquisition year of every scene, in the same order as `campus_lagoon_ids`.
# The 7th '_'-separated token of an id is the first date stamp (YYYYMMDD); its year
# is what the `year` column of the train/test sets holds for that naip_id.
# Deriving the years from the ids keeps the two lists from drifting apart.
years = [int(itemid.split('_')[6][:4]) for itemid in campus_lagoon_ids]

In [4]:
# Per-scene statistics of every Campus Lagoon raster, reduced over both spatial
# dimensions. The rasters are opened as dask arrays, so nothing is evaluated here;
# the results are materialized later with `.compute()`.
means = []
sds = []

for itemid in campus_lagoon_ids:
    # look the item up by id and sign the URL of its image asset
    item = sr.get_item_from_id(itemid)
    href = pc.sign(item.assets["image"].href)

    # open raster as dask array (MAYBE FIGURE OUT BETTER CHUNK SIZES)
    scene = rioxr.open_rasterio(href, chunks=True)

    # collapse x and y, keeping the remaining (band) dimension
    means.append(scene.mean(dim=['x', 'y']))
    sds.append(scene.std(dim=['x', 'y']))

# stack the per-scene results along a new "year" dimension labelled with `years`
all_means, all_sds = (
    xr.concat(stats, dim=pd.Index(years, name="year"))
    for stats in (means, sds)
)

In [5]:
# Evaluate the lazily defined per-year standard deviations and show them.
all_sds = all_sds.compute()
all_sds

In [6]:
# Evaluate the lazily defined per-year means and show them.
all_means = all_means.compute()
all_means

In [7]:
# import dask.dataframe as dd

In [7]:
# Load the training points and keep only the rows sampled in the campus_lagoon AOI.
# The labels (iceplant column) are taken here; the feature columns are narrowed
# down in the following cells.
X_train = pd.read_csv(
    os.path.join(os.getcwd(), 'data_sampling_workflow', 'train_set.csv')
)

X_train = X_train[X_train['aoi'].eq('campus_lagoon')]
y_train = X_train['iceplant']
X_train.head(4)

Unnamed: 0,x,y,pts_crs,aoi,naip_id,polygon_id,r,g,b,nir,ndvi,year,month,day_in_year,lidar,max_lidar,min_lidar,min_max_diff,avg_lidar,iceplant
0,238488.905223,3810842.0,epsg:26911,campus_lagoon,ca_m_3411934_sw_11_.6_20160713_20161004,2,42,42,46,117,0.471698,2016,7,195,0.0,1.0,0.0,1.0,0.444444,1
1,238232.338371,3810947.0,epsg:26911,campus_lagoon,ca_m_3411934_sw_11_.6_20160713_20161004,4,61,54,62,75,0.102941,2016,7,195,1.0,2.0,1.0,1.0,1.333333,1
2,238701.506127,3812155.0,epsg:26911,campus_lagoon,ca_m_3411934_sw_11_.6_20160713_20161004,22,65,67,58,164,0.432314,2016,7,195,0.0,1.0,0.0,1.0,0.111111,0
3,239203.786381,3811961.0,epsg:26911,campus_lagoon,ca_m_3411934_sw_11_.6_20160713_20161004,23,81,71,68,130,0.232227,2016,7,195,0.0,0.0,0.0,0.0,0.0,0


In [70]:
# Keep the four spectral bands plus the date columns; ndvi and everything else
# (coordinates, ids, lidar features, label) is left out.
# The columns are listed explicitly instead of slicing 'r':'day_in_year' and then
# dropping 'ndvi': the result is the same, but it does not depend on the column
# order of the CSV and re-running the cell does not raise a KeyError.
X_train = X_train.loc[:, ['r', 'g', 'b', 'nir', 'year', 'month', 'day_in_year']]
X_train

Unnamed: 0,r,g,b,nir,year,month,day_in_year
0,42,42,46,117,2016,7,195
1,61,54,62,75,2016,7,195
2,65,67,58,164,2016,7,195
3,81,71,68,130,2016,7,195
4,48,44,50,107,2016,7,195
...,...,...,...,...,...,...,...
92948,103,121,88,174,2014,6,152
92949,79,82,80,91,2014,6,152
92950,80,111,77,191,2014,6,152
92951,71,70,64,69,2014,6,152


In [8]:
# Raw (un-normalized) band values, kept as the features of the baseline model.
# Take an explicit copy: X_train is modified in place further down, and a plain
# column slice is not guaranteed to be independent of the frame it came from.
spectral = X_train.loc[:, 'r':'nir'].copy()
spectral

Unnamed: 0,r,g,b,nir
0,42,42,46,117
1,61,54,62,75
2,65,67,58,164
3,81,71,68,130
4,48,44,50,107
...,...,...,...,...
92948,103,121,88,174
92949,79,82,80,91
92950,80,111,77,191
92951,71,70,64,69


In [9]:
# Standardize the bands per acquisition year: every point is shifted by the band
# means and scaled by the band standard deviations of the scene of its year.
band_cols = ['r', 'g', 'b', 'nir']

# Cast up front: the bands are read as integers, and writing the standardized
# (float) values into integer columns relies on silent upcasting.
X_train = X_train.astype({band: 'float64' for band in band_cols})

for yr in years:
    in_year = X_train['year'] == yr
    # plain NumPy vectors (one entry per band, in r, g, b, nir order) so that the
    # arithmetic broadcasts across the columns without pandas/xarray coercion
    band_means = all_means.sel(year=yr).values
    band_sds = all_sds.sel(year=yr).values
    X_train.loc[in_year, band_cols] = (X_train.loc[in_year, band_cols] - band_means) / band_sds

# only the standardized bands are used as features from here on
X_train = X_train.loc[:, band_cols]
X_train

Unnamed: 0,r,g,b,nir
0,-0.506887,-0.640088,-0.934050,0.976697
1,-0.036464,-0.295892,-0.345683,0.222662
2,0.062572,0.076986,-0.492775,1.820498
3,0.458718,0.191718,-0.125046,1.210089
4,-0.358332,-0.582722,-0.786958,0.797165
...,...,...,...,...
92948,0.248592,0.434472,-0.251770,1.741306
92949,-0.136236,-0.230880,-0.419724,0.503995
92950,-0.120202,0.263869,-0.482706,1.994731
92951,-0.264512,-0.435604,-0.755631,0.176033


In [64]:
# X_train['ndvi']= (X_train['nir']-X_train['r'])/(X_train['nir']+X_train['r'])
# X_train = X_train.drop('ndvi',axis=1)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from dask_ml.wrappers import ParallelPostFit

import model_prep_and_evals as mpe

In [11]:
# Convert features and labels to NumPy arrays for scikit-learn.
# np.asarray is used instead of .to_numpy() because these names are rebound to
# arrays here: np.asarray accepts both a DataFrame/Series and an ndarray, so
# running the cell a second time is harmless.
X_train = np.asarray(X_train)
spec_train = np.asarray(spectral)
y_train = np.asarray(y_train)

In [12]:
# Random forest (wrapped in dask-ml's ParallelPostFit) trained on X_train,
# i.e. the per-year standardized bands; fixed seed for reproducibility.
rfc_norm = ParallelPostFit(RandomForestClassifier(n_estimators = 100, 
                                             random_state = 42))
rfc_norm.fit(X_train, y_train)

ParallelPostFit(estimator=RandomForestClassifier(random_state=42))

In [13]:
# Baseline with identical settings and seed, trained on spec_train,
# i.e. the raw band values, for comparison with rfc_norm.
rfc_spec = ParallelPostFit(RandomForestClassifier(n_estimators = 100, 
                                             random_state = 42))
rfc_spec.fit(spec_train, y_train)

ParallelPostFit(estimator=RandomForestClassifier(random_state=42))

In [14]:
# Load the test points, keep only the campus_lagoon AOI and take the labels.
X_test = pd.read_csv(
    os.path.join(os.getcwd(), 'data_sampling_workflow', 'test_set.csv')
)

X_test = X_test[X_test['aoi'].eq('campus_lagoon')]
y_test = X_test['iceplant']

In [15]:
# Raw (un-normalized) band values of the test points, input of the baseline model.
# Explicit copy so that later in-place changes to X_test cannot leak into it.
test_spec = X_test.loc[:, 'r':'nir'].copy()
test_spec

Unnamed: 0,r,g,b,nir
0,79,64,61,107
1,56,53,60,124
2,62,65,56,157
3,59,66,66,120
4,63,61,57,153
...,...,...,...,...
39835,81,84,69,93
39836,107,152,81,186
39837,55,63,52,69
39838,60,99,68,207


In [16]:
# Standardize the test bands per acquisition year with the same scene statistics
# that were used for the training set.
band_cols = ['r', 'g', 'b', 'nir']

# Only the bands and the year are needed; the bands are cast to float up front
# because the standardized values would otherwise be written into integer columns.
X_test = X_test.loc[:, band_cols + ['year']].astype({band: 'float64' for band in band_cols})

for yr in years:
    in_year = X_test['year'] == yr
    # plain NumPy vectors (one entry per band, in r, g, b, nir order) so that the
    # arithmetic broadcasts across the columns without pandas/xarray coercion
    band_means = all_means.sel(year=yr).values
    band_sds = all_sds.sel(year=yr).values
    X_test.loc[in_year, band_cols] = (X_test.loc[in_year, band_cols] - band_means) / band_sds

# only the standardized bands are used as features
X_test = X_test.loc[:, band_cols]
X_test

Unnamed: 0,r,g,b,nir
0,0.409200,-0.009063,-0.382456,0.797165
1,-0.160259,-0.324575,-0.419229,1.102370
2,-0.011705,0.019620,-0.566321,1.694825
3,-0.085982,0.048303,-0.198592,1.030557
4,0.013054,-0.095111,-0.529548,1.623013
...,...,...,...,...
39835,-0.104167,-0.196759,-0.650660,0.533810
39836,0.312730,0.963342,-0.398730,1.920194
39837,-0.521064,-0.555026,-1.007561,0.176033
39838,-0.440892,0.059145,-0.671654,2.233249


In [17]:
# Predict the test labels with both models: rfc_norm gets the standardized bands,
# rfc_spec gets the raw band values.
preds_norm = rfc_norm.predict(X_test.to_numpy())
preds_spec  = rfc_spec.predict(test_spec.to_numpy())

In [18]:
# Test-set metrics of the model trained on the standardized bands.
mpe.print_accuracy_info(y_test,preds_norm)

true negatives: 30407     false positives: 622
false negatives: 695     true positives: 8116

sensitivity (TP/P): 92.11 %
specificity (TN/N): 98.0 %
G-mean:  0.95

precision (TP/(TP+FP)): 92.88 %

MCC:  0.9037691714487773

F1-measure:  0.92495
F0.5-measure (min false positives):  0.92727
F2-measure (min false negatives)  :  0.92265

accuracy: 96.69 %


In [19]:
# Test-set metrics of the baseline model trained on the raw bands.
mpe.print_accuracy_info(y_test,preds_spec)

true negatives: 30202     false positives: 827
false negatives: 710     true positives: 8101

sensitivity (TP/P): 91.94 %
specificity (TN/N): 97.33 %
G-mean:  0.95

precision (TP/(TP+FP)): 90.74 %

MCC:  0.8885741906019373

F1-measure:  0.91335
F0.5-measure (min false positives):  0.90975
F2-measure (min false negatives)  :  0.91698

accuracy: 96.14 %


In [20]:
import raster_to_features as rm

In [22]:
# Open the 2014 Campus Lagoon scene and turn it into a per-pixel table.
itemid = 'ca_m_3411934_sw_11_1_20140601_20141030'
raster = rm.rioxr_from_itemid(itemid)

# NOTE(review): judging by the names, add_spectral_features returns the pixels kept
# as vegetation plus the indices of the water and non-vegetation pixels, split by
# the two thresholds -- confirm against raster_to_features.
is_veg, water_index, not_veg_index = rm.add_spectral_features(
    df=rm.raster_as_df(raster.to_numpy(), ['r', 'g', 'b', 'nir']),  # names of bands
    ndwi_thresh=0.3,
    ndvi_thresh=0.05,
)

# the models were trained on the four bands only, so drop the derived indices
is_veg = is_veg.drop(columns=['ndwi', 'ndvi'])
is_veg

Unnamed: 0,r,g,b,nir
5,146,141,107,172
9,137,136,104,159
10,146,141,109,173
157,148,146,111,178
158,121,122,98,161
...,...,...,...,...
50344155,6,24,44,13
50346569,5,12,42,9
50347257,23,27,54,35
50351210,38,66,74,44


In [23]:
# Standardize the scene's pixels with the statistics of its own acquisition year.
# The year is read from the item id (7th '_'-separated token is the YYYYMMDD date
# stamp) instead of being hard-coded, so that switching `itemid` cannot silently
# normalize with another scene's statistics.
scene_year = int(itemid.split('_')[6][:4])

# plain NumPy vectors (one entry per band) broadcast across the r, g, b, nir columns
is_veg_norm = (is_veg - all_means.sel(year=scene_year).values) / all_sds.sel(year=scene_year).values
is_veg_norm

Unnamed: 0,r,g,b,nir
5,0.938076,0.775678,0.147120,1.711491
9,0.793765,0.690377,0.084137,1.517695
10,0.938076,0.775678,0.189108,1.726398
157,0.970145,0.860980,0.231096,1.800935
158,0.537213,0.451532,-0.041828,1.547510
...,...,...,...,...
50344155,-1.306755,-1.220378,-1.175515,-0.658779
50346569,-1.322790,-1.425102,-1.217503,-0.718408
50347257,-1.034168,-1.169197,-0.965573,-0.330817
50351210,-0.793651,-0.503845,-0.545689,-0.196651


In [25]:
# Classify the scene's pixels with the model trained on standardized bands.
# Note: this rebinds preds_norm, which previously held the test-set predictions.
preds_norm = rfc_norm.predict(is_veg_norm.to_numpy())

In [26]:
# Classify the same pixels with the baseline model, using the raw band values.
# Note: this rebinds preds_spec, which previously held the test-set predictions.
preds_spec = rfc_spec.predict(is_veg.to_numpy())

In [91]:
# # ---------------------------------------
# # recover pixel indices for iceplant classifications
# preds_df = pd.DataFrame(preds, 
#                      columns=['is_iceplant'], 
#                      index = is_veg.index)
# is_iceplant_index = preds_df[preds_df.is_iceplant == 1].index.to_numpy()
# non_iceplant_index = preds_df[preds_df.is_iceplant == 0].index.to_numpy()

# # ---------------------------------------
# # reconstruct indices into image
# indices = [non_iceplant_index,
#            is_iceplant_index, 
#            not_veg_index,
#            water_index]
# values = [0,    # values assigned to pixels from each index
#           1,
#           2,
#           3]
# reconstruct = rm.indices_to_image(raster.shape[1], raster.shape[2], indices, values, back_value=100)

In [27]:
def reconstruct(preds, is_veg, raster, non_veg_idx=None, water_idx=None):
    """Assemble per-pixel predictions into a single classification image.

    Parameters
    ----------
    preds : array-like
        Predicted labels (1 = iceplant, 0 = not iceplant), one per row of `is_veg`.
    is_veg : pandas.DataFrame
        Table of the classified pixels; only its index (the pixel indices) is used.
    raster : array-like
        Source raster; `raster.shape[1]` and `raster.shape[2]` give the size of
        the output image.
    non_veg_idx, water_idx : array-like, optional
        Pixel indices to label as non-vegetation / water. When omitted, the
        notebook-level `not_veg_index` / `water_index` are used, which is what
        this function always did before these parameters existed.

    Returns
    -------
    The image built by `rm.indices_to_image`, with the pixel values
    0 = non-iceplant, 1 = iceplant, 2 = non-vegetation, 3 = water and
    100 passed as `back_value`.
    """
    # fall back to the notebook globals so that existing three-argument calls still work
    if non_veg_idx is None:
        non_veg_idx = not_veg_index
    if water_idx is None:
        water_idx = water_index

    # ---------------------------------------
    # recover pixel indices for iceplant classifications
    preds_df = pd.DataFrame(preds,
                            columns=['is_iceplant'],
                            index=is_veg.index)
    is_iceplant_index = preds_df[preds_df.is_iceplant == 1].index.to_numpy()
    non_iceplant_index = preds_df[preds_df.is_iceplant == 0].index.to_numpy()

    # ---------------------------------------
    # reconstruct indices into image; values[i] is assigned to the pixels in indices[i]
    indices = [non_iceplant_index,
               is_iceplant_index,
               non_veg_idx,
               water_idx]
    values = [0, 1, 2, 3]

    # returned directly: the original stored the image in a local also called
    # `reconstruct`, shadowing the function inside its own body
    return rm.indices_to_image(raster.shape[1], raster.shape[2], indices, values, back_value=100)

In [28]:
# Classification image from the predictions of the normalized-bands model.
rec_norm = reconstruct(preds_norm, is_veg_norm, raster)

In [29]:
# Classification image from the predictions of the raw-bands baseline model.
rec_spec = reconstruct(preds_spec, is_veg, raster)

In [31]:
import rasterio

In [32]:
filename = 'trial_spec.tif'

# make sure the output folder exists before opening the file for writing
out_dir = os.path.join(os.getcwd(), 'temp')
os.makedirs(out_dir, exist_ok=True)

# ---------------------------------------
# save raster: single-band uint8 GeoTIFF of the baseline classification,
# georeferenced with the CRS and transform of the source raster
with rasterio.open(
    os.path.join(out_dir, filename),  # file path
    'w',           # w = write
    driver = 'GTiff', # format
    height = rec_spec.shape[0],
    width = rec_spec.shape[1],
    count = 1,  # number of raster bands in the dataset
    dtype = rasterio.uint8,
    crs = raster.rio.crs,
    transform = raster.rio.transform(),
) as dst:
    dst.write(rec_spec.astype(rasterio.uint8), 1)
# ---------------------------------------

In [None]:
  filename = 'trial_norm.tif'
# ---------------------------------------
# save raster 
with rasterio.open(
    os.path.join(os.getcwd(),'temp',filename),  # file path
    'w',           # w = write
    driver = 'GTiff', # format
    height = rec_norm.shape[0], 
    width = rec_norm.shape[1],
    count = 1,  # number of raster bands in the dataset
    dtype = rasterio.uint8,
    crs = raster.rio.crs,
    transform = raster.rio.transform(),
) as dst:
    dst.write(rec_norm.astype(rasterio.uint8), 1)
# ---------------------------------------