This notebook can be used to add features to a csv or shapefile that has the following information at every point:
x-coordinate (longitude), y-coordinate (latitude), itemid.

Iceplant classification can be added from a column present in the dataset, set as a constant value, or not included.

In particular, this can be used to add features to files coming from Collect Earth Online classifications.

All points must come from a single year.

In [2]:
import os
import pandas as pd
import geopandas as gpd
import planetary_computer as pc
import rioxarray as rioxr

import rasterio as rio
from rasterio.crs import CRS

from shapely.geometry import box

import sample_rasters as sr

In [3]:
# Location of the input points file (csv or shp) that will receive the spectral features.
folder = '/home/jovyan/msai4earth-esa/iceplant_detection/models/modelAE5_FP_2020/validation/'
file_name = 'AE5FP_validation_pts_map_ref_classes.csv'

# `folder` already ends with a path separator, so this is exactly folder + file_name
fp = os.path.join(folder, file_name)

# preview the first rows of the input table
pd.read_csv(fp).head()

Unnamed: 0,lon,lat,naip_id,pts_crs,map_class,ref_class
0,-120.465232,34.462486,ca_m_3412037_nw_10_060_20200607,EPSG:4326,1,1
1,-119.997217,34.459656,ca_m_3412040_ne_10_060_20200522,EPSG:4326,1,1
2,-119.96981,34.438493,ca_m_3411933_nw_11_060_20200522,EPSG:4326,3,3
3,-120.469599,34.465779,ca_m_3412037_nw_10_060_20200607,EPSG:4326,1,1
4,-120.266262,34.471258,ca_m_3412038_ne_10_060_20200522,EPSG:4326,2,2


In [4]:
# Configuration for the feature-sampling run. Every value a user may need to
# tune lives in this cell; later cells only read these names.

# indicate if file is a csv or shapefile (the csv flag is checked first)
csv = True
shp = False

# -------------------------------------------
# if file is a csv indicate these:
# coordinate reference system of the lon/lat columns
crs = CRS.from_epsg(4326)

# column names for longitude and latitude
lon_label = 'lon' # lon = x
lat_label = 'lat' # lat = y

# -------------------------------------------
# name of column containing itemid of the NAIP scene containing the point if there is one
itemid_col = 'naip_id'

# -------------------------------------------
# one of 'ignore', 'included', 'set_to_constant'
iceplant_param = 'ignore'

# if iceplant_param = 'included' indicate which column should be used as iceplant classification column
# name of column indicating whether the point is iceplant (1) or other vegetation (0)
iceplant_col = 'iceplant'

# if iceplant_param = 'set_to_constant', indicate the constant value for all points
iceplant_val = 0

# -------------------------------------------
# all pts need to come from same year: used to look for NAIP scenes
year = 2020

# -------------------------------------------
# radius of the disk over which entropy is calculated
entropy_r = 6

# length of side of the square window over which the average is calculated
# (diameter of the entropy disk: 2*r + 1)
box_side = 2 * entropy_r + 1

# -------------------------------------------
# name of output csv with added features to points
# saved in folder
save = True
# the window size in the file name is derived from box_side so it always
# matches the suffix of the feature columns (e.g. 'features13_' when box_side is 13)
csv_name = 'features' + str(box_side) + '_' + file_name

In [5]:
# ---------------------------------------------------
# temporary folder for aux rasters
# makedirs also creates the intermediate 'temp' directory and does nothing
# when the folder already exists, so the cell works on a fresh checkout and
# on re-runs alike.
folp = os.path.join(os.getcwd(), 'temp', 'aux_naip_rasters')
os.makedirs(folp, exist_ok=True)

# ---------------------------------------------------
# open validation points as geodataframe
if csv:
    pts = sr.geodataframe_from_csv(fp = fp,
                                   lon_label = lon_label,
                                   lat_label = lat_label,
                                   crs = crs)
elif shp:
    pts = gpd.read_file(fp)
else:
    # fail here with a clear message instead of a NameError on `pts` below
    raise ValueError("set either `csv` or `shp` to True to indicate the input file type")

# distinct NAIP scene ids referenced by the points; one sampling pass per scene
scene_ids = list(pts[itemid_col].unique())
pts.head(3)

Unnamed: 0,lon,lat,naip_id,pts_crs,map_class,ref_class,geometry
0,-120.465232,34.462486,ca_m_3412037_nw_10_060_20200607,EPSG:4326,1,1,POINT (-120.46523 34.46249)
1,-119.997217,34.459656,ca_m_3412040_ne_10_060_20200522,EPSG:4326,1,1,POINT (-119.99722 34.45966)
2,-119.96981,34.438493,ca_m_3411933_nw_11_060_20200522,EPSG:4326,3,3,POINT (-119.96981 34.43849)


In [7]:
# Sample, for every point, the NAIP band values plus window statistics
# (average and entropy) of each band and of NDVI, scene by scene.
# Result: `samples`, one row per input point, indexed like `pts`.

# NAIP bands sampled directly, listed in raster band order (band 1..4)
spectral_band_names = ['r', 'g', 'b', 'nir']
# bands that get window features; later cells read `band_names` to rename columns
band_names = spectral_band_names + ['ndvi']
ndvi_name = band_names[-1]

# window statistics kept per band ('_max' and '_min' are currently disabled)
tags = ['_avg', '_entr']
window_cols = [band+tag for band in band_names for tag in tags]

sampled_points = []
N = len(scene_ids)  # counter to finish

for itemid in scene_ids:

    # points that fall in this scene (empty when itemid is missing/NaN)
    to_sample = pts[pts[itemid_col] == itemid].geometry

    if len(to_sample) > 0:
        # ---------------------------------------------------
        # sample spectral bands from NAIP
        item = sr.get_item_from_id(itemid)
        scene_rast_r = sr.get_raster_from_item(item)

        spectral_bands = sr.sample_raster_from_pts(to_sample, scene_rast_r, spectral_band_names).set_index(to_sample.index)

        # ---------------------------------------------------
        # sample avg and entropy window features from NAIP
        to_sample_match = to_sample.to_crs(scene_rast_r.crs)
        scene_rast = rioxr.open_rasterio(pc.sign(item.assets["image"].href))

        window_features = []
        for pt_idx in range(len(to_sample_match)):
            pt = to_sample_match.iloc[[pt_idx]]

            # clip scene to box around point
            # NOTE(review): the buffer distance 6 is in the scene CRS units and is
            # independent of entropy_r/box_side - confirm it still covers the
            # window if those are increased.
            reduce_box = box(*(pt.iloc[0].buffer(6).bounds))
            rast = scene_rast.rio.clip_box(*reduce_box.bounds)

            # save auxiliary rasters for R,G,B,NIR: max,min,avg,entr
            for band_idx, band_name in enumerate(spectral_band_names, start=1):
                sr.max_min_avg_rasters(raster=rast, band=band_idx, rast_name=band_name, n=box_side, folder_path=folp)
                sr.entropy_raster(raster=rast, band=band_idx, rast_name=band_name, n=entropy_r, folder_path=folp)

            # ------------------------------
            # make auxiliary NDVI of clipped scene
            ndvi = sr.ndvi_xarray(rast)

            # save auxiliary NDVI raster: avg only (max/min disabled)
            sr.avg_raster(rast_data=ndvi,
                          crs=rast.rio.crs,
                          transf=rast.rio.transform(),
                          rast_name=ndvi_name,
                          n=box_side,
                          folder_path=folp)

            # adjust ndvi to entropy input types: shift and scale before the
            # uint8 cast (maps [-1, 1] to [0, 200], assuming NDVI lies in [-1, 1])
            ndvi_scaled = ndvi*100 + 100
            sr.entropy_raster(rast_data=ndvi_scaled.astype('uint8'),
                              crs=rast.rio.crs, transf=rast.rio.transform(),
                              rast_name=ndvi_name,
                              n=entropy_r,
                              folder_path=folp)

            # ---------------------------------------
            # sample raster values for current point
            point_samples = []
            for col_name in window_cols:
                # use a dedicated name: `fp` is the configured input file path
                # and must not be overwritten by this loop
                aux_fp = os.path.join(folp, col_name+'s.tif')
                # close the dataset before deleting the file on disk
                with rio.open(aux_fp) as aux_rast_r:
                    sample = sr.sample_raster_from_pts(pt, aux_rast_r, [col_name])
                os.remove(aux_fp)
                point_samples.append(sample)

            # ---------------------------------------
            # one row holding all window features of the current point
            window_features.append(pd.concat(point_samples, axis = 1))

        # ---------------------------------------------------
        # concatenate sampled data
        window_features_df = pd.concat(window_features).set_index(to_sample.index)
        df = pd.concat([to_sample, spectral_bands, window_features_df], axis=1)

        # ---------------------------------------------------
        # add date and naipid information
        kwargs = {'year' : item.datetime.year,
                  'month' : item.datetime.month,
                  'day_in_year' : sr.day_in_year(item.datetime.day, item.datetime.month, item.datetime.year),
                  'naip_id' : itemid}
        df = df.assign(**kwargs)

        # ---------------------------------------------------
        sampled_points.append(gpd.GeoDataFrame(df))

    # ---------------------------------------
    # processing message
    N = N-1
    print('REMAINING: ', N, 'scenes', end="\r")


# stack all scenes and restore the original point order
samples = pd.concat(sampled_points).sort_index()

# ---------------------------------------
# create iceplant column
if iceplant_param == 'set_to_constant':
    samples['iceplant'] = iceplant_val
elif iceplant_param == 'included':
    # read the classification from the column chosen in the configuration
    samples['iceplant'] = pts[iceplant_col]

# ---------------------------------------
# create max-min difference columns (disabled together with '_max'/'_min')
# for band in band_names:
#     col_name = band + '_diff'
#     samples[col_name] = samples[band +'_max'] - samples[band +'_min']

# ---------------------------------------
# point coordinates and the crs they are expressed in
samples = samples.assign( x = lambda df : df.geometry.x,
                          y = lambda df : df.geometry.y,
                          pts_crs = crs.to_string())

# ---------------------------------------
# create ndvi column
# cast to int16 first so the difference and sum are computed on signed values
red_i16 = samples.r.astype('int16')
nir_i16 = samples.nir.astype('int16')
samples['ndvi'] = (nir_i16 - red_i16)/(nir_i16 + red_i16)

print('FINISHED PROCESSING')

FINISHED PROCESSINGss


In [8]:
# Tag every window-feature column with the window side length,
# e.g. 'r_avg' -> 'r_avg13' when box_side is 13.
window_suffix = str(box_side)
renamed_cols = {}
for band in band_names:
    for stat in ('_avg', '_entr'):
        renamed_cols[band + stat] = band + stat + window_suffix

# columns missing from the frame are ignored by rename
samples = samples.rename(columns=renamed_cols)

In [10]:
# inspect the column names after the window-size suffix was appended
samples.columns

Index(['geometry', 'r', 'g', 'b', 'nir', 'r_avg13', 'r_entr13', 'g_avg13',
       'g_entr13', 'b_avg13', 'b_entr13', 'nir_avg13', 'nir_entr13',
       'ndvi_avg13', 'ndvi_entr13', 'year', 'month', 'day_in_year', 'naip_id',
       'x', 'y', 'pts_crs', 'ndvi'],
      dtype='object')

In [12]:
# Select and order the output columns: point location, scene id, then for each
# band its raw value followed by the window average and entropy, then the date.
# (max/min/diff window features are currently disabled and therefore not listed)
window_tag = str(box_side)

features = ['x', 'y', 'pts_crs', 'naip_id']
for band_label in ['r', 'g', 'b', 'nir', 'ndvi']:
    features += [band_label,
                 band_label + '_avg' + window_tag,
                 band_label + '_entr' + window_tag]
features += ['year', 'month', 'day_in_year']

# the classification column only exists when it was requested
if iceplant_param != 'ignore':
    features.append('iceplant')

samples = samples[features]
samples

Unnamed: 0,x,y,pts_crs,naip_id,r,r_avg13,r_entr13,g,g_avg13,g_entr13,...,b_entr13,nir,nir_avg13,nir_entr13,ndvi,ndvi_avg13,ndvi_entr13,year,month,day_in_year
0,-120.465232,34.462486,EPSG:4326,ca_m_3412037_nw_10_060_20200607,122,119.798813,4.680873,98,97.094673,4.772045,...,4.494718,146,144.213013,5.074388,0.089552,0.088757,4.223701,2020,6,159
1,-119.997217,34.459656,EPSG:4326,ca_m_3412040_ne_10_060_20200522,115,122.355026,4.328408,123,122.633133,4.227055,...,3.788774,178,168.502960,4.115915,0.215017,0.153846,3.714357,2020,5,143
2,-119.969810,34.438493,EPSG:4326,ca_m_3411933_nw_11_060_20200522,27,28.023668,1.903023,57,57.449703,1.778224,...,1.627375,11,10.928994,1.570813,-0.421053,-0.437870,3.306622,2020,5,143
3,-120.469599,34.465779,EPSG:4326,ca_m_3412037_nw_10_060_20200607,98,102.775146,4.361212,89,89.224854,4.382992,...,4.223971,163,164.763321,3.877177,0.249042,0.230769,3.802276,2020,6,159
4,-120.266262,34.471258,EPSG:4326,ca_m_3412038_ne_10_060_20200522,180,180.266266,3.718565,167,168.443787,4.203931,...,4.661599,168,167.491119,3.790279,-0.034483,-0.035503,1.731678,2020,5,143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589,-119.864898,34.410694,EPSG:4326,ca_m_3411934_sw_11_060_20200521,66,81.692307,5.669391,90,96.414200,5.581866,...,5.288260,145,103.639053,5.621064,0.374408,0.100592,5.258715,2020,5,142
590,-120.472786,34.472076,EPSG:4326,ca_m_3412037_nw_10_060_20200607,98,107.017754,4.876243,75,84.668640,4.492466,...,4.317893,151,150.538467,4.574808,0.212851,0.165680,4.213130,2020,6,159
591,-120.463615,34.450510,EPSG:4326,ca_m_3412037_nw_10_060_20200607,78,73.443787,4.034180,89,85.366867,3.397365,...,2.883639,177,168.875732,4.008023,0.388235,0.390533,3.202714,2020,6,159
592,-119.798882,34.422402,EPSG:4326,ca_m_3411934_se_11_060_20200521,87,107.497040,5.606616,90,102.792900,5.006817,...,4.672184,142,157.644974,4.926486,0.240175,0.189349,4.282148,2020,5,142


In [14]:
# carry the map and reference classes over from the input points;
# values are aligned on the shared index
samples = samples.assign(map_class=pts['map_class'],
                         ref_class=pts['ref_class'])

In [15]:
# final check of the columns that will be written to the output csv
samples.columns

Index(['x', 'y', 'pts_crs', 'naip_id', 'r', 'r_avg13', 'r_entr13', 'g',
       'g_avg13', 'g_entr13', 'b', 'b_avg13', 'b_entr13', 'nir', 'nir_avg13',
       'nir_entr13', 'ndvi', 'ndvi_avg13', 'ndvi_entr13', 'year', 'month',
       'day_in_year', 'map_class', 'ref_class'],
      dtype='object')

In [16]:
# you can add/drop more columns here

# samples['LSWE_class'] = pts.map_class
# samples['ref_class'] = pts.ref_class
# samples = samples.drop(['iceplant'], axis=1)

In [17]:
# write the feature table next to the input file, without the index column
if save:
    out_fp = os.path.join(os.getcwd(), folder, csv_name)
    samples.to_csv(out_fp, index=False)