This notebook can be used to add features to a csv or shapefile that has the following information at every point:
x-coordinate (longitude), y-coordinate (latitude), itemid.

Iceplant classification can be added from a column present in the dataset, set as a constant value, or not included.

In particular, this can be used to add features to files coming from Collect Earth Online classifications. 

All points must come from a single year.

In [1]:
import os
import pandas as pd
import geopandas as gpd
import planetary_computer as pc
import rioxarray as rioxr

import rasterio as rio
from rasterio.crs import CRS

from shapely.geometry import box

import sample_rasters as sr

In [2]:
# file path to the csv or shp with the points that will get spectral info added
folder = '/home/jovyan/msai4earth-esa/iceplant_detection/models/modelAE5_FP_2020/false_negatives/'
file_name = 'iceplant_false_negatives_AE5_FP.csv'
# os.path.join gives the right path whether or not `folder` ends with a separator
fp = os.path.join(folder, file_name)
# preview the first rows of the input file
pd.read_csv(fp).head()

Unnamed: 0,id,aoi,x,y
0,ca_m_3412037_nw_10_060_20200607,point_conception,-120.496198,34.497
1,ca_m_3412037_nw_10_060_20200607,point_conception,-120.491921,34.493616
2,ca_m_3412037_nw_10_060_20200607,point_conception,-120.491217,34.492771
3,ca_m_3412037_nw_10_060_20200607,point_conception,-120.489627,34.489312
4,ca_m_3412037_nw_10_060_20200607,point_conception,-120.488552,34.487601


In [3]:
# ===========================================
# INPUT FILE TYPE
# set the flag matching the input file to True (csv is checked first when loading)
csv = True
shp = False

# ===========================================
# CSV-ONLY SETTINGS
# coordinate reference system of the point coordinates in the csv
crs = CRS.from_epsg(4326)

# names of the columns holding longitude and latitude
lon_label = 'x'
lat_label = 'y'

# ===========================================
# SCENE ID
# column holding the itemid of the NAIP scene that contains each point
itemid_col = 'id'

# ===========================================
# ICEPLANT CLASSIFICATION
# how to fill the iceplant column: 'ignore', 'included' or 'set_to_constant'
iceplant_param = 'set_to_constant'

# used when iceplant_param = 'included':
# column flagging each point as iceplant (1) or other vegetation (0)
iceplant_col = 'iceplant'

# used when iceplant_param = 'set_to_constant':
# value assigned to every point
iceplant_val = 1

# ===========================================
# YEAR
# all points must come from this single year
# NOTE(review): `year` is not referenced by the other cells visible in this notebook
year = 2020

# ===========================================
# WINDOW SIZES
# radius of the disk used for the entropy calculation
entropy_r = 5

# side length of the square window used for the max/min/avg calculations
box_side = 2 * entropy_r + 1

# ===========================================
# OUTPUT
# when save is True the feature table is written to `folder` as `csv_name`
save = True
csv_name = 'features_false_negatives_AE5_FP.csv'

In [4]:
# ---------------------------------------------------
# temporary folder for aux rasters
folp = os.path.join(os.getcwd(),'temp','aux_naip_rasters')
# makedirs also creates the intermediate 'temp' folder (os.mkdir would fail
# on a fresh checkout) and does nothing when the folder already exists
os.makedirs(folp, exist_ok=True)

# ---------------------------------------------------
# open the points as a geodataframe
if csv:
    pts = sr.geodataframe_from_csv(fp = fp, 
                                   lon_label = lon_label, 
                                   lat_label = lat_label, 
                                   crs = crs)
elif shp:
    pts = gpd.read_file(fp)
else:
    # fail here with a clear message instead of a NameError on `pts` below
    raise ValueError("Set either `csv` or `shp` to True to indicate the input file type.")

# ids of the NAIP scenes that contain at least one point
scene_ids = list(pts[itemid_col].unique())    
pts.head(3)

Unnamed: 0,id,aoi,x,y,geometry
0,ca_m_3412037_nw_10_060_20200607,point_conception,-120.496198,34.497,POINT (-120.49620 34.49700)
1,ca_m_3412037_nw_10_060_20200607,point_conception,-120.491921,34.493616,POINT (-120.49192 34.49362)
2,ca_m_3412037_nw_10_060_20200607,point_conception,-120.491217,34.492771,POINT (-120.49122 34.49277)


In [5]:

# Build the feature table. For every NAIP scene referenced by the points:
#   1. sample the R, G, B, NIR values of the scene at each point,
#   2. for each point, clip the scene to a small box around it, write auxiliary
#      max/min/avg/entropy rasters for R, G, B, NIR and NDVI into `folp`,
#      sample them at the point and delete them again,
#   3. attach the scene date and id.
# Afterwards the iceplant, max-min difference, coordinate and NDVI columns are added.

# bands sampled directly from the scene, listed in raster band order (band 1..4)
spectral_band_names = ['r', 'g', 'b', 'nir']
# bands that get window statistics; `band_names` is also read by later cells
band_names = spectral_band_names + ['ndvi']
tags = ['_max', '_min', '_avg', '_entr']
window_cols = [band+tag for band in band_names for tag in tags]

sampled_points = []
N = len(scene_ids)  # counter to finish

for itemid in scene_ids:

    to_sample = pts[pts[itemid_col] == itemid].geometry

    if len(to_sample) > 0:
        # ---------------------------------------------------
        # sample spectral bands from NAIP
        item = sr.get_item_from_id(itemid)
        scene_rast_r = sr.get_raster_from_item(item)

        spectral_bands = sr.sample_raster_from_pts(to_sample, scene_rast_r, spectral_band_names).set_index(to_sample.index)

        # ---------------------------------------------------
        # sample max, min, avg and entrs from NAIP
        to_sample_match = to_sample.to_crs(scene_rast_r.crs)
        scene_rast = rioxr.open_rasterio(pc.sign(item.assets["image"].href))

        window_features = []
        for pt_pos in range(len(to_sample_match)):
            # double brackets keep `pt` as a one-row GeoSeries
            pt = to_sample_match.iloc[[pt_pos]]

            # clip scene to box around point
            # NOTE(review): the buffer of 6 is in the units of the scene CRS -- confirm it
            # covers the box_side/entropy_r windows at the scene resolution
            reduce_box = box(*(pt.iloc[0].buffer(6).bounds))
            rast = scene_rast.rio.clip_box(*reduce_box.bounds)

            # save auxiliary rasters for R,G,B,NIR: max,min,avg,entr
            # (raster bands are 1-based; a dedicated index avoids shadowing the point index)
            for band_pos, band_name in enumerate(spectral_band_names, start=1):
                sr.max_min_avg_rasters(raster=rast, band=band_pos, rast_name=band_name, n=box_side, folder_path=folp)
                sr.entropy_raster(raster=rast, band=band_pos, rast_name=band_name, n=entropy_r, folder_path=folp)
            # ------------------------------
            # make auxiliary NDVI of clipped scene
            ndvi = sr.ndvi_xarray(rast)

            # save auxiliary NDVI rasters: max,min,avg
            sr.max_min_avg_rasters(rast_data=ndvi, 
                                   crs=rast.rio.crs, 
                                   transf=rast.rio.transform(), 
                                   rast_name='ndvi', 
                                   n=box_side, 
                                   folder_path=folp)

            # adjust ndvi to entropy input types
            ndvi = ndvi*100 +100
            sr.entropy_raster(rast_data=ndvi.astype('uint8'), 
                              crs=rast.rio.crs, transf=rast.rio.transform(), 
                              rast_name='ndvi', 
                              n=entropy_r, 
                              folder_path=folp)
            # ---------------------------------------
            # sample raster values for current point
            pt_samples = []
            for col_name in window_cols:
                # separate name so the input file path `fp` set in the config is not overwritten
                aux_fp = os.path.join(folp, col_name+'s.tif')
                # close the raster before deleting its file
                with rio.open(aux_fp) as aux_rast_r:
                    sample = sr.sample_raster_from_pts(pt, aux_rast_r, [col_name])
                os.remove(aux_fp)
                pt_samples.append(sample)

            # ---------------------------------------
            # one row with all window features of the current point
            window_features.append(pd.concat(pt_samples, axis = 1))
        # ---------------------------------------------------
        # concatenate sampled data
        window_features = pd.concat(window_features).set_index(to_sample.index)
        df = pd.concat([to_sample, spectral_bands, window_features], axis=1)
        # ---------------------------------------------------
        # add date and naipid information
        kwargs = {'year' : item.datetime.year,
                  'month' : item.datetime.month,
                  'day_in_year' : sr.day_in_year(item.datetime.day, item.datetime.month, item.datetime.year),
                  'naip_id' : itemid}
        df = df.assign(**kwargs)
        # ---------------------------------------------------
        sampled_points.append(gpd.GeoDataFrame(df))

    # ---------------------------------------
    # processing message
    N = N-1
    print('REMAINING: ', N, 'scenes', end="\r")


samples = pd.concat(sampled_points).sort_index()

# ---------------------------------------
# create iceplant column
if iceplant_param == 'set_to_constant':
    samples['iceplant'] = iceplant_val
elif iceplant_param == 'included':
    # read the column named in the config instead of a hard-coded 'iceplant'
    samples['iceplant'] = pts[iceplant_col]

# ---------------------------------------
# create max-min difference columns
for band in band_names:
    col_name = band + '_diff'
    samples[col_name] = samples[band +'_max'] - samples[band +'_min']

# ---------------------------------------
# coordinates of the points and the CRS they are expressed in;
# `crs` is a csv-only setting, so for shapefile input report the CRS read from the file
pts_crs_name = crs.to_string() if csv else pts.crs.to_string()
samples = samples.assign( x = lambda df : df.geometry.x, 
                          y = lambda df : df.geometry.y,
                         pts_crs = pts_crs_name)

# ---------------------------------------
# create ndvi column
# cast to int16 before subtracting/adding the bands
samples['ndvi'] = (samples.nir.astype('int16') - samples.r.astype('int16'))/(samples.nir.astype('int16') + samples.r.astype('int16'))

print('FINISHED PROCESSING')       

FINISHED PROCESSINGs


In [6]:
# tag the avg and entropy columns with the radius used to compute them,
# e.g. 'r_avg' -> 'r_avg5' when entropy_r is 5
radius_tag = str(entropy_r)
renamed_cols = {}
for band in band_names:
    renamed_cols[band+'_avg'] = band+'_avg'+radius_tag
    renamed_cols[band+'_entr'] = band+'_entr'+radius_tag
samples = samples.rename(columns=renamed_cols)

In [7]:
# list every column produced so far, before selecting and ordering the final features
samples.columns

Index(['geometry', 'r', 'g', 'b', 'nir', 'r_max', 'r_min', 'r_avg5', 'r_entr5',
       'g_max', 'g_min', 'g_avg5', 'g_entr5', 'b_max', 'b_min', 'b_avg5',
       'b_entr5', 'nir_max', 'nir_min', 'nir_avg5', 'nir_entr5', 'ndvi_max',
       'ndvi_min', 'ndvi_avg5', 'ndvi_entr5', 'year', 'month', 'day_in_year',
       'naip_id', 'iceplant', 'r_diff', 'g_diff', 'b_diff', 'nir_diff',
       'ndvi_diff', 'x', 'y', 'pts_crs', 'ndvi'],
      dtype='object')

In [8]:
# final column order: point location and scene id, then the per-band features,
# then the scene date and (unless ignored) the iceplant label
location_cols = ['x', 'y', 'pts_crs', 'naip_id']
date_cols = ['year', 'month', 'day_in_year']

# per band: raw value, window max/min, max-min difference, window avg and entropy
radius_tag = str(entropy_r)
stat_suffixes = ['', '_max', '_min', '_diff', '_avg'+radius_tag, '_entr'+radius_tag]
band_cols = [band + suffix 
             for band in ['r', 'g', 'b', 'nir', 'ndvi'] 
             for suffix in stat_suffixes]

features = location_cols + band_cols + date_cols
if iceplant_param != 'ignore':
    features.append('iceplant')

samples = samples[features]
samples

Unnamed: 0,x,y,pts_crs,naip_id,r,r_max,r_min,r_diff,r_avg5,r_entr5,...,ndvi,ndvi_max,ndvi_min,ndvi_diff,ndvi_avg5,ndvi_entr5,year,month,day_in_year,iceplant
0,-120.496198,34.497000,EPSG:4326,ca_m_3412037_nw_10_060_20200607,103,180,79,101,110.768593,5.163304,...,0.185771,0.286245,-0.125000,0.411245,0.090909,4.972627,2020,6,159,1
1,-120.491921,34.493616,EPSG:4326,ca_m_3412037_nw_10_060_20200607,105,173,73,100,118.652893,5.666648,...,0.236364,0.411290,-0.077882,0.489172,0.157025,5.059720,2020,6,159,1
2,-120.491217,34.492771,EPSG:4326,ca_m_3412037_nw_10_060_20200607,111,147,78,69,109.669418,4.958203,...,0.198556,0.324675,-0.086758,0.411433,0.132231,4.394407,2020,6,159,1
3,-120.489627,34.489312,EPSG:4326,ca_m_3412037_nw_10_060_20200607,123,183,104,79,141.016525,5.518500,...,0.168919,0.229091,-0.097872,0.326963,0.041322,4.634749,2020,6,159,1
4,-120.488552,34.487601,EPSG:4326,ca_m_3412037_nw_10_060_20200607,117,196,111,85,161.537186,5.507948,...,0.204082,0.229167,-0.060109,0.289276,0.041322,4.530648,2020,6,159,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,-119.519484,34.390461,EPSG:4326,ca_m_3411936_se_11_060_20200521,102,161,67,94,117.363640,5.286959,...,0.233083,0.436975,-0.064846,0.501821,0.165289,4.705237,2020,5,142,1
508,-119.520288,34.391000,EPSG:4326,ca_m_3411936_se_11_060_20200521,92,167,64,103,92.462807,5.509181,...,0.315985,0.492063,0.017647,0.474416,0.314050,4.910855,2020,5,142,1
509,-119.520538,34.390967,EPSG:4326,ca_m_3411936_se_11_060_20200521,108,193,91,102,142.735535,5.463066,...,0.197026,0.280899,-0.045198,0.326097,0.090909,4.788631,2020,5,142,1
510,-119.520184,34.390843,EPSG:4326,ca_m_3411936_se_11_060_20200521,82,181,73,108,128.710739,5.449245,...,0.366795,0.421875,-0.042683,0.464558,0.140496,4.912406,2020,5,142,1


In [9]:
# confirm the selected columns and their order before saving
samples.columns

Index(['x', 'y', 'pts_crs', 'naip_id', 'r', 'r_max', 'r_min', 'r_diff',
       'r_avg5', 'r_entr5', 'g', 'g_max', 'g_min', 'g_diff', 'g_avg5',
       'g_entr5', 'b', 'b_max', 'b_min', 'b_diff', 'b_avg5', 'b_entr5', 'nir',
       'nir_max', 'nir_min', 'nir_diff', 'nir_avg5', 'nir_entr5', 'ndvi',
       'ndvi_max', 'ndvi_min', 'ndvi_diff', 'ndvi_avg5', 'ndvi_entr5', 'year',
       'month', 'day_in_year', 'iceplant'],
      dtype='object')

In [10]:
# you can add/drop more columns here

# samples['LSWE_class'] = pts.map_class
# samples['ref_class'] = pts.ref_class
# samples = samples.drop(['iceplant'], axis=1)

In [11]:
# write the feature table into `folder` as `csv_name`, without the index column
if save:
    out_path = os.path.join(os.getcwd(), folder, csv_name)
    samples.to_csv(out_path, index=False)