This notebook can be used to add features to a csv or shapefile that has the following information at every point:
x-coordinate (longitude), y-coordinate (latitude), itemid.

Iceplant classification can be added from a column present in the dataset, set as a constant value, or not included.

In particular, this can be used to add features to files coming from Collect Earth Online classifications. 

All points must come from a single year.

In [1]:
import os
import pandas as pd
import geopandas as gpd
import planetary_computer as pc
import rioxarray as rioxr

import rasterio as rio
from rasterio.crs import CRS

from shapely.geometry import box

import sample_rasters as sr

In [2]:
# Location of the csv or shapefile with the points that will receive spectral features.
folder = '/home/jovyan/msai4earth-esa/iceplant_detection/models/model_pepper/'
file_name = 'pepper13_false_positives.csv'

# os.path.join builds the same path as plain concatenation when `folder` ends with a
# separator, and still yields a valid path when it does not.
fp = os.path.join(folder, file_name)

# quick preview of the input table (this preview assumes the input is a csv)
pd.read_csv(fp).head()

Unnamed: 0,lon,lat,naip_id,aoi,crs
0,-120.251567,34.472316,ca_m_3412039_nw_10_060_20200522,gaviota,EPSG:4326
1,-120.251699,34.472494,ca_m_3412039_nw_10_060_20200522,gaviota,EPSG:4326
2,-120.252003,34.472158,ca_m_3412039_nw_10_060_20200522,gaviota,EPSG:4326
3,-120.25121,34.476551,ca_m_3412039_nw_10_060_20200522,gaviota,EPSG:4326
4,-120.251851,34.476266,ca_m_3412039_nw_10_060_20200522,gaviota,EPSG:4326


In [3]:
# ===========================================
# INPUT FILE TYPE
# flags telling the notebook how to open the points file;
# `csv` is checked first when the points are loaded
csv = True
shp = False

# ===========================================
# CSV-ONLY SETTINGS
# coordinate reference system of the longitude/latitude columns
crs = CRS.from_epsg(4326)

# names of the coordinate columns
lon_label = 'lon' # x-coordinate
lat_label = 'lat' # y-coordinate

# ===========================================
# NAIP SCENE
# column holding the itemid of the NAIP scene that contains each point
itemid_col = 'naip_id'

# ===========================================
# ICEPLANT CLASSIFICATION
# how to build the iceplant column, one of:
#   'ignore'          : do not include an iceplant column
#   'included'        : copy it from an existing column of the points file
#   'set_to_constant' : use the same value for every point
iceplant_param = 'set_to_constant'

# for iceplant_param = 'included':
# column flagging each point as iceplant (1) or other vegetation (0)
iceplant_col = 'iceplant'

# for iceplant_param = 'set_to_constant':
# value assigned to every point
iceplant_val = 0

# ===========================================
# YEAR
# all points must come from the same year (used to look for NAIP scenes)
# NOTE(review): `year` is not referenced by the cells visible in this notebook - confirm it is still needed
year = 2020

# ===========================================
# WINDOW SIZES
# radius of the disk over which entropy is calculated
entropy_r = 6

# side length of the square window over which the average is calculated
box_side = 2 * entropy_r + 1

# ===========================================
# OUTPUT
# whether to write the resulting table, and the name of the csv
# (written inside `folder`)
save = True
csv_name = 'features_pepper13_false_positives.csv'

In [4]:
# ---------------------------------------------------
# temporary folder for the auxiliary rasters written while sampling
folp = os.path.join(os.getcwd(),'temp','aux_naip_rasters')
# makedirs also creates the intermediate 'temp' folder (os.mkdir would fail if it is
# missing) and exist_ok=True makes re-running this cell safe
os.makedirs(folp, exist_ok=True)

# ---------------------------------------------------
# open validation points as geodataframe
if csv:
    pts = sr.geodataframe_from_csv(fp = fp, 
                                   lon_label = lon_label, 
                                   lat_label = lat_label, 
                                   crs = crs)
elif shp:
    pts = gpd.read_file(fp)
else:
    # without this, `pts` would be undefined and the failure would surface as a NameError below
    raise ValueError("Set either `csv` or `shp` to True to indicate the type of the input file.")

# ids of the NAIP scenes referenced by the points
scene_ids = list(pts[itemid_col].unique())    
pts.head(3)

Unnamed: 0,lon,lat,naip_id,aoi,crs,geometry
0,-120.251567,34.472316,ca_m_3412039_nw_10_060_20200522,gaviota,EPSG:4326,POINT (-120.25157 34.47232)
1,-120.251699,34.472494,ca_m_3412039_nw_10_060_20200522,gaviota,EPSG:4326,POINT (-120.25170 34.47249)
2,-120.252003,34.472158,ca_m_3412039_nw_10_060_20200522,gaviota,EPSG:4326,POINT (-120.25200 34.47216)


In [5]:
# Sample, scene by scene, the NAIP spectral bands at every point together with
# window statistics (average and entropy) of each band and of NDVI around the point.
# Result: `samples`, one row per point, indexed like `pts`.

# names given to NAIP raster bands 1-4, and the derived NDVI layer
spectral_band_names = ['r', 'g', 'b', 'nir']
band_names = spectral_band_names + ['ndvi']   # also used by a later cell to rename columns

# window statistics that are sampled (max/min statistics are currently disabled)
tags = ['_avg', '_entr']
window_cols = [band+tag for band in band_names for tag in tags]

sampled_points = []
N = len(scene_ids)  # number of scenes still to process

for itemid in scene_ids:

    to_sample = pts[pts[itemid_col] == itemid].geometry

    # the selection is empty when itemid is NaN (NaN never compares equal)
    if len(to_sample) > 0:
        # ---------------------------------------------------
        # sample spectral bands from NAIP
        item = sr.get_item_from_id(itemid)
        scene_rast_r = sr.get_raster_from_item(item)

        spectral_bands = sr.sample_raster_from_pts(to_sample, scene_rast_r, spectral_band_names).set_index(to_sample.index)

        # ---------------------------------------------------
        # sample window statistics from NAIP
        # points are reprojected to the scene's CRS so they can be buffered and
        # sampled in raster coordinates
        to_sample_match = to_sample.to_crs(scene_rast_r.crs)
        scene_rast = rioxr.open_rasterio(pc.sign(item.assets["image"].href))

        window_rows = []
        for pt_pos in range(len(to_sample_match)):
            pt = to_sample_match.iloc[[pt_pos]]

            # clip scene to a box around the point (buffer of 6 units of the scene's CRS)
            reduce_box = box(*(pt.iloc[0].buffer(6).bounds))
            rast = scene_rast.rio.clip_box(*reduce_box.bounds)

            # save auxiliary rasters for R,G,B,NIR; raster bands are 1-indexed
            # (a separate index name avoids shadowing the point index of the outer loop)
            for band_pos, band_name in enumerate(spectral_band_names):
                sr.max_min_avg_rasters(raster=rast, band=band_pos+1, rast_name=band_name, n=box_side, folder_path=folp)
                sr.entropy_raster(raster=rast, band=band_pos+1, rast_name=band_name, n=entropy_r, folder_path=folp)

            # ------------------------------
            # make auxiliary NDVI of clipped scene
            ndvi = sr.ndvi_xarray(rast)
            ndvi_name = band_names[4]

            # save auxiliary NDVI average raster
            sr.avg_raster(rast_data=ndvi, 
                          crs=rast.rio.crs, 
                          transf=rast.rio.transform(), 
                          rast_name=ndvi_name, 
                          n=box_side, 
                          folder_path=folp)

            # adjust ndvi to entropy input types: shift/scale before casting to uint8
            # (assuming NDVI lies in [-1, 1], this maps it to [0, 200])
            ndvi = ndvi*100 +100
            sr.entropy_raster(rast_data=ndvi.astype('uint8'), 
                              crs=rast.rio.crs, transf=rast.rio.transform(), 
                              rast_name=ndvi_name, 
                              n=entropy_r, 
                              folder_path=folp)

            # ---------------------------------------
            # sample auxiliary raster values for current point
            pt_samples = []
            for col_name in window_cols:
                # a dedicated name is used so the input file path `fp` is not overwritten
                aux_fp = os.path.join(folp, col_name+'s.tif')
                # close the dataset before deleting the file
                with rio.open(aux_fp) as aux_rast_r:
                    sample = sr.sample_raster_from_pts(pt, aux_rast_r, [col_name])
                os.remove(aux_fp)
                pt_samples.append(sample)

            # ---------------------------------------
            # one row with all window features of the current point
            window_rows.append(pd.concat(pt_samples, axis = 1))

        # ---------------------------------------------------
        # concatenate sampled data, aligned on the original point index
        window_features = pd.concat(window_rows).set_index(to_sample.index)
        df = pd.concat([to_sample, spectral_bands, window_features], axis=1)

        # ---------------------------------------------------
        # add date and naipid information
        kwargs = {'year' : item.datetime.year,
                  'month' : item.datetime.month,
                  'day_in_year' : sr.day_in_year(item.datetime.day, item.datetime.month, item.datetime.year),
                  'naip_id' : itemid}
        df = df.assign(**kwargs)

        # ---------------------------------------------------
        sampled_points.append(gpd.GeoDataFrame(df))

    # ---------------------------------------
    # processing message
    N = N-1
    print('REMAINING: ', N, 'scenes', end="\r")


if not sampled_points:
    raise ValueError("No points were sampled: check that '" + itemid_col + "' contains NAIP item ids.")

samples = pd.concat(sampled_points).sort_index()

# ---------------------------------------
# create iceplant column
if iceplant_param == 'set_to_constant':
    samples['iceplant'] = iceplant_val
elif iceplant_param == 'included':
    # use the configured column name; assignment aligns on the shared point index
    samples['iceplant'] = pts[iceplant_col]
elif iceplant_param != 'ignore':
    raise ValueError("iceplant_param must be one of 'ignore', 'included', 'set_to_constant'.")

REMAINING:  0 scenes

In [6]:
# ---------------------------------------
# CRS label stored with the coordinates: `crs` is a csv-only setting, so for any
# other input the CRS carried by the points themselves is recorded instead
pts_crs_label = crs.to_string() if csv else pts.crs.to_string()

# ---------------------------------------
# add point coordinates, their CRS and the NDVI at the point
# (bands are cast to int16 so the difference and the sum cannot wrap around
#  in case the sampled values are stored as unsigned 8-bit integers)
samples = samples.assign( x = lambda df : df.geometry.x, 
                          y = lambda df : df.geometry.y,
                          pts_crs = pts_crs_label,
                          ndvi = lambda df : (df.nir.astype('int16') - df.r.astype('int16'))
                                             / (df.nir.astype('int16') + df.r.astype('int16')))

print('FINISHED PROCESSING')       

FINISHED PROCESSING


In [7]:
# append the window side length to the names of the window-statistic columns,
# e.g. 'r_avg' -> 'r_avg' + str(box_side)
window_suffix = str(box_side)
renamed_cols = {}
for band in band_names:
    for stat in ('_avg', '_entr'):
        renamed_cols[band + stat] = band + stat + window_suffix

samples = samples.rename(columns=renamed_cols)

In [8]:
# list the columns of `samples` to check the names after the rename above
samples.columns

Index(['geometry', 'r', 'g', 'b', 'nir', 'r_avg13', 'r_entr13', 'g_avg13',
       'g_entr13', 'b_avg13', 'b_entr13', 'nir_avg13', 'nir_entr13',
       'ndvi_avg13', 'ndvi_entr13', 'year', 'month', 'day_in_year', 'naip_id',
       'iceplant', 'x', 'y', 'pts_crs', 'ndvi'],
      dtype='object')

In [9]:
# Select and order the output columns:
# location, NAIP scene id, per-band features, acquisition date and (optionally) iceplant.
window_suffix = str(box_side)

location_cols = ['x', 'y', 'pts_crs', 'naip_id']

# for every band: value at the point, window average and window entropy
# (max, min and diff columns are not produced at the moment)
band_cols = []
for band in ['r', 'g', 'b', 'nir', 'ndvi']:
    band_cols += [band, band + '_avg' + window_suffix, band + '_entr' + window_suffix]

date_cols = ['year', 'month', 'day_in_year']

features = location_cols + band_cols + date_cols
if iceplant_param != 'ignore':
    features.append('iceplant')

samples = samples[features]
samples

Unnamed: 0,x,y,pts_crs,naip_id,r,r_avg13,r_entr13,g,g_avg13,g_entr13,...,nir,nir_avg13,nir_entr13,ndvi,ndvi_avg13,ndvi_entr13,year,month,day_in_year,iceplant
0,-120.251567,34.472316,EPSG:4326,ca_m_3412039_nw_10_060_20200522,139,153.940826,5.096221,118,136.869827,5.254605,...,167,165.130173,3.701939,0.091503,0.035503,4.144262,2020,5,143,0
1,-120.251699,34.472494,EPSG:4326,ca_m_3412039_nw_10_060_20200522,116,117.272186,5.440273,119,119.485207,5.147836,...,161,159.094681,4.401204,0.162455,0.153846,4.812214,2020,5,143,0
2,-120.252003,34.472158,EPSG:4326,ca_m_3412039_nw_10_060_20200522,119,105.526627,5.740416,114,108.514793,5.294940,...,172,171.449707,3.183500,0.182131,0.254438,5.180070,2020,5,143,0
3,-120.251210,34.476551,EPSG:4326,ca_m_3412039_nw_10_060_20200522,131,140.994080,5.535449,127,131.585800,5.231338,...,154,154.242599,4.788577,0.080702,0.047337,4.163909,2020,5,143,0
4,-120.251851,34.476266,EPSG:4326,ca_m_3412039_nw_10_060_20200522,115,143.230774,5.691576,124,138.763321,5.429025,...,178,164.461533,5.229712,0.215017,0.082840,4.615462,2020,5,143,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,-119.504150,34.391185,EPSG:4326,ca_m_3411936_se_11_060_20200521,91,89.781067,4.009650,79,74.869820,3.569486,...,157,157.289948,4.573819,0.266129,0.272189,3.639899,2020,5,142,0
624,-119.504222,34.391116,EPSG:4326,ca_m_3411936_se_11_060_20200521,89,84.662720,4.309521,74,71.852074,3.599072,...,157,163.224854,4.345073,0.276423,0.313609,3.613855,2020,5,142,0
625,-119.503308,34.391315,EPSG:4326,ca_m_3411936_se_11_060_20200521,53,128.343201,5.595022,75,130.426041,5.340758,...,143,162.000000,4.558819,0.459184,0.136095,4.686324,2020,5,142,0
626,-119.503869,34.390912,EPSG:4326,ca_m_3411936_se_11_060_20200521,74,88.307693,4.730565,92,99.952660,4.273396,...,156,144.769226,4.807467,0.356522,0.260355,4.328707,2020,5,142,0


In [10]:
# list the columns of `samples` after the feature selection above
samples.columns

Index(['x', 'y', 'pts_crs', 'naip_id', 'r', 'r_avg13', 'r_entr13', 'g',
       'g_avg13', 'g_entr13', 'b', 'b_avg13', 'b_entr13', 'nir', 'nir_avg13',
       'nir_entr13', 'ndvi', 'ndvi_avg13', 'ndvi_entr13', 'year', 'month',
       'day_in_year', 'iceplant'],
      dtype='object')

In [11]:
# you can add/drop more columns here

# samples['LSWE_class'] = pts.map_class
# samples['ref_class'] = pts.ref_class
# samples = samples.drop(['iceplant'], axis=1)

In [12]:
# write the table with the added features as a csv (row index is not written)
if save:
    out_fp = os.path.join(os.getcwd(), folder, csv_name)
    samples.to_csv(out_fp, index=False)