In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
def path_points_csv(aoi, year,
                    root='/home/jovyan/msai4earth-esa/iceplant_detection/data_sampling/points_from_naip_images'):
    """Build the path of the csv holding the points sampled for an aoi and year.

    The returned path has the form
        <root>/<aoi>_points/<aoi>_pts_spectral_lidar_<year>.csv
    Nothing is read from disk and the existence of the file is not checked.

    Parameters
    ----------
    aoi : str
        Name of the area of interest, e.g. 'campus_lagoon'.
    year : int or str
        Year used in the file name (converted with str()).
    root : str, optional
        Root folder for all polygons collected on naip scenes, i.e. the
        directory containing the '<aoi>_points' folders. Defaults to the
        location that was previously hard-coded; pass another directory
        to use data stored elsewhere.

    Returns
    -------
    str
        Path to the csv file.
    """
    return os.path.join(root,
                        aoi + '_points',
                        aoi + '_pts_spectral_lidar_' + str(year) + '.csv')

In [3]:
import calendar

def day_in_year(day,month,year):
    """Return the position of a calendar date within its year (Jan 1st -> 1).

    Parameters
    ----------
    day : int
        Day of the month.
    month : int
        Month number, 1 (January) to 12 (December).
    year : int
        Year, used only to decide whether it is a leap year.

    Returns
    -------
    int
        Days in the months before `month`, plus `day`, plus one extra day
        when the year is a leap year and the date falls after February.
    """
    # month lengths of a non-leap year
    month_lengths = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)

    # days in the complete months that come before `month`
    elapsed = sum(month_lengths[idx] for idx in range(month - 1))

    # the table above has a 28-day February, so Feb 29th must be added
    # for dates from March onwards in leap years
    leap_shift = 1 if (calendar.isleap(year) and month > 2) else 0

    return elapsed + day + leap_shift

## Assemble data frame with all sampled points

In [4]:
# years of the sample-point csv files loaded for each aoi
years = [2012, 2014, 2016, 2018, 2020]

In [5]:
# initialize dataframe with the campus lagoon sample pts of every year in `years`
aoi = 'campus_lagoon'

# Read one csv per year, tag it, and concatenate everything once at the end
# (instead of special-casing the first year and re-concatenating df on
# every iteration).
aoi_frames = []
for year in years:
    year_pts = pd.read_csv(path_points_csv(aoi, year))
    # the date of the first row is applied to all rows of the file
    # NOTE(review): assumes each csv holds points from a single scene/date -- confirm
    year_pts['day_in_year'] = day_in_year(year_pts.day[0], year_pts.month[0], year_pts.year[0])
    year_pts['aoi'] = aoi
    aoi_frames.append(year_pts)

# row labels of the individual files are kept here (no ignore_index)
df = pd.concat(aoi_frames)

In [6]:
# add carpinteria sample pts
aoi = 'carpinteria'

# start from the points loaded so far and append one frame per year,
# concatenating once after the loop
aoi_frames = [df]
for year in years:
    year_pts = pd.read_csv(path_points_csv(aoi, year))
    # the date of the first row is applied to all rows of the file
    # NOTE(review): assumes each csv holds points from a single scene/date -- confirm
    year_pts['day_in_year'] = day_in_year(year_pts.day[0], year_pts.month[0], year_pts.year[0])
    year_pts['aoi'] = aoi
    aoi_frames.append(year_pts)
df = pd.concat(aoi_frames)

In [7]:
# add gaviota sample pts
aoi = 'gaviota'

# start from the points loaded so far and append one frame per year,
# concatenating once after the loop
aoi_frames = [df]
for year in years:
    year_pts = pd.read_csv(path_points_csv(aoi, year))
    # the date of the first row is applied to all rows of the file
    # NOTE(review): assumes each csv holds points from a single scene/date -- confirm
    year_pts['day_in_year'] = day_in_year(year_pts.day[0], year_pts.month[0], year_pts.year[0])
    year_pts['aoi'] = aoi
    aoi_frames.append(year_pts)
df = pd.concat(aoi_frames)

In [8]:
# point conception is loaded for these years only (2016 is not in the list).
# A dedicated name is used so the shared `years` list is not overwritten,
# which would make the earlier loading cells fail if they were re-run.
point_conception_years = [2012,2014,2018,2020]

# add point conception sample pts
aoi = 'point_conception'

# start from the points loaded so far and append one frame per year,
# concatenating once after the loop
aoi_frames = [df]
for year in point_conception_years:
    year_pts = pd.read_csv(path_points_csv(aoi, year))
    # the date of the first row is applied to all rows of the file
    # NOTE(review): assumes each csv holds points from a single scene/date -- confirm
    year_pts['day_in_year'] = day_in_year(year_pts.day[0], year_pts.month[0], year_pts.year[0])
    year_pts['aoi'] = aoi
    aoi_frames.append(year_pts)
df = pd.concat(aoi_frames)

In [9]:
# the concatenated frame still carries the row labels of the individual
# csv files; replace them with a fresh 0..n-1 index and discard the old one
df = df.reset_index(drop=True)

In [10]:
# remove the 'Unnamed: 0' column read from the csv files.
# errors='ignore' keeps this cell re-runnable: once the column is gone a
# second run is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head(3)

Unnamed: 0,iceplant,r,g,b,nir,year,month,day,naip_id,polygon_id,geometry,lidar,max_lidar,min_lidar,min_max_diff,avg_lidar,day_in_year,aoi
0,1,134,125,103,170,2012,5,5,ca_m_3411934_sw_11_1_20120505_20120730,0,POINT (238565.79498225075 3810768.627232482),2,2,0,2,1.111111,126,campus_lagoon
1,1,130,114,101,164,2012,5,5,ca_m_3411934_sw_11_1_20120505_20120730,0,POINT (238553.15545424985 3810802.7926417096),2,3,0,3,0.888889,126,campus_lagoon
2,1,132,110,98,160,2012,5,5,ca_m_3411934_sw_11_1_20120505_20120730,0,POINT (238552.77597268307 3810773.0767946127),1,3,0,3,1.222222,126,campus_lagoon


In [12]:
# inspect the column names of the assembled dataframe
df.columns

Index(['iceplant', 'r', 'g', 'b', 'nir', 'year', 'month', 'day', 'naip_id',
       'polygon_id', 'geometry', 'lidar', 'max_lidar', 'min_lidar',
       'min_max_diff', 'avg_lidar', 'day_in_year', 'aoi'],
      dtype='object')

## Stats about sample distribution among aois and scenes

In [13]:
# check that all data was loaded: number of sample points per aoi
df['aoi'].value_counts()

point_conception    207450
campus_lagoon       163518
carpinteria         120476
gaviota              73740
Name: aoi, dtype: int64

In [14]:
# number of sample points per year
df['year'].value_counts()

2020    160058
2018    149962
2014    101801
2012     86932
2016     66431
Name: year, dtype: int64

In [15]:
# number of sample points per naip scene
df['naip_id'].value_counts()

ca_m_3412037_nw_10_060_20200607             75414
ca_m_3411934_sw_11_060_20180722_20190209    69391
ca_m_3412037_nw_10_1_20140603_20141030      61529
ca_m_3412037_nw_10_1_20120518_20120730      37126
ca_m_3412037_nw_10_060_20180913_20190208    33381
ca_m_3411934_sw_11_060_20200521             33293
ca_m_3411936_se_11_060_20200521             32332
ca_m_3411936_se_11_060_20180724_20190209    29081
ca_m_3411934_sw_11_.6_20160713_20161004     26540
ca_m_3411936_se_11_.6_20160713_20161004     23529
ca_m_3411934_sw_11_1_20120505_20120730      19816
ca_m_3411936_se_11_1_20120505_20120730      19020
ca_m_3412039_nw_10_060_20200522             19019
ca_m_3412039_nw_10_060_20180724_20190209    18109
ca_m_3411936_se_11_1_20140901_20141030      16514
ca_m_3412039_nw_10_.6_20160616_20161004     16362
ca_m_3411934_sw_11_1_20140601_20141030      14478
ca_m_3412039_nw_10_1_20120518_20120730      10970
ca_m_3412039_nw_10_1_20140603_20141030       9280
Name: naip_id, dtype: int64

## Add NDVI feature

In [16]:
# NDVI = (nir - red) / (nir + red), computed per point.
# Both bands are cast to int16 before the arithmetic, each only once.
nir_band = df['nir'].astype('int16')
red_band = df['r'].astype('int16')
df['ndvi'] = (nir_band - red_band) / (nir_band + red_band)
df.head(3)

Unnamed: 0,iceplant,r,g,b,nir,year,month,day,naip_id,polygon_id,geometry,lidar,max_lidar,min_lidar,min_max_diff,avg_lidar,day_in_year,aoi,ndvi
0,1,134,125,103,170,2012,5,5,ca_m_3411934_sw_11_1_20120505_20120730,0,POINT (238565.79498225075 3810768.627232482),2,2,0,2,1.111111,126,campus_lagoon,0.118421
1,1,130,114,101,164,2012,5,5,ca_m_3411934_sw_11_1_20120505_20120730,0,POINT (238553.15545424985 3810802.7926417096),2,3,0,3,0.888889,126,campus_lagoon,0.115646
2,1,132,110,98,160,2012,5,5,ca_m_3411934_sw_11_1_20120505_20120730,0,POINT (238552.77597268307 3810773.0767946127),1,3,0,3,1.222222,126,campus_lagoon,0.09589


In [18]:
# ORDER COLUMNS
# Columns are grouped by theme and then selected in that order.
# Only the columns listed here are kept, so 'day' is dropped at this point.

spectral_cols = ['r', 'g', 'b', 'nir', 'ndvi']
date_cols = ['year', 'month', 'day_in_year']
lidar_cols = ['lidar', 'max_lidar', 'min_lidar', 'min_max_diff', 'avg_lidar']
label_cols = ['iceplant']
coords_cols = ['geometry']   # point coords (CRS is one from scene in naip_id)
id_cols = ['aoi', 'naip_id', 'polygon_id']

df = df[spectral_cols + date_cols + lidar_cols + label_cols + coords_cols + id_cols]
df.head(3)

Unnamed: 0,r,g,b,nir,ndvi,year,month,day_in_year,lidar,max_lidar,min_lidar,min_max_diff,avg_lidar,iceplant,geometry,aoi,naip_id,polygon_id
0,134,125,103,170,0.118421,2012,5,126,2,2,0,2,1.111111,1,POINT (238565.79498225075 3810768.627232482),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,0
1,130,114,101,164,0.115646,2012,5,126,2,3,0,3,0.888889,1,POINT (238553.15545424985 3810802.7926417096),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,0
2,132,110,98,160,0.09589,2012,5,126,1,3,0,3,1.222222,1,POINT (238552.77597268307 3810773.0767946127),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,0


In [19]:
# write the assembled samples to the current working directory,
# without the dataframe index
samples_fp = os.path.join(os.getcwd(), 'samples_for_model.csv')
df.to_csv(samples_fp, index=False)