# About

This notebook assembles all the csvs produced by the `3_add_lidar.ipynb` notebook into a single dataframe and saves it as a single csv. It also includes some statistics of the combined dataset. The notebook assumes all the possible areas of interest (aois) and years have been sampled and that each one has a csv of points associated with it.

**VARIABLES**

`delete_files` (bool): whether to delete the individual csvs from which the final dataset is assembled

**OUTPUT:**
A single csv named 'samples_for_model.csv'. This contains all the points with all the features sampled. 

In [1]:
import numpy as np
import pandas as pd
import os

from utility import iceplant_proportions

# Specify notebook variables

In [2]:
# ***************************************************
# ************* NOTEBOOK VARIABLES ******************

# delete_files (bool): when True, the individual per-aoi/per-year csvs in the
# 'temp' folder are removed after 'samples_for_model.csv' has been written.
delete_files = False

# ***************************************************
# ***************************************************

# Assemble data frame with all sampled points

In [3]:
def path_spectral_points_csv(aoi, year):
    """Return the path of the csv with points and ONLY spectral information.

    The path is built as <current working directory>/temp/<aoi>_points_<year>.csv.

    Parameters
    ----------
    aoi : str
        Name of the area of interest, used as the file name prefix.
    year : int or str
        Year of the sample; converted with str() before being used.

    Returns
    -------
    str
        File path for the given aoi and year. The file is not checked for existence.
    """
    file_name = ''.join([aoi, '_points_', str(year), '.csv'])
    temp_dir = os.path.join(os.getcwd(), 'temp')
    return os.path.join(temp_dir, file_name)

#---------------------

def path_lidar_spectral_points_csv(aoi, year):
    """Return the path of the csv with points, canopy height AND spectral information.

    The path is built as
    <current working directory>/temp/<aoi>_pts_spectral_lidar_<year>.csv.

    Parameters
    ----------
    aoi : str
        Name of the area of interest, used as the file name prefix.
    year : int or str
        Year of the sample; converted with str() before being used.

    Returns
    -------
    str
        File path for the given aoi and year. The file is not checked for existence.
    """
    file_name = ''.join([aoi, '_pts_spectral_lidar_', str(year), '.csv'])
    temp_dir = os.path.join(os.getcwd(), 'temp')
    return os.path.join(temp_dir, file_name)

In [4]:
lidar_years = [2016,2018,2020]
spec_years = [2012,2014]
aois = ['campus_lagoon','carpinteria','gaviota','point_conception']

# canopy height columns, present only in the lidar csvs
lidar_cols = ['lidar', 'max_lidar', 'min_lidar', 'avg_lidar', 'min_max_diff']

# (aoi, year) pairs with a csv of canopy height + spectral info.
# There's no data for Point Conception on 2016, so that pair is excluded here
# once and the same list is reused for both reading and deleting.
lidar_samples = [(aoi, year)
                 for aoi in aois
                 for year in lidar_years
                 if (aoi, year) != ('point_conception', 2016)]

# (aoi, year) pairs with a csv of ONLY spectral info
spec_samples = [(aoi, year) for aoi in aois for year in spec_years]

# *******************************************************************************
# Open and concatenate csv files of points with canopy height + spectral info
df_lidar = pd.concat(
    [pd.read_csv(path_lidar_spectral_points_csv(aoi, year)) for aoi, year in lidar_samples],
    axis=0)

# only keep points with non-negative canopy height values
# (rows with NaN in any of these columns are dropped too, since NaN >= 0 is False)
df_lidar = df_lidar.loc[(df_lidar[lidar_cols] >= 0).all(axis=1)]

# *******************************************************************************
# Open and concatenate csv files of points with ONLY spectral info
df_spec = pd.concat(
    [pd.read_csv(path_spectral_points_csv(aoi, year)) for aoi, year in spec_samples],
    axis=0)

# fill in canopy height columns with NaN
for col in lidar_cols:
    df_spec[col] = np.nan

# *******************************************************************************
# concatenate both data frames and clean index and columns
# (ignore_index=True gives a fresh 0..n-1 index, same as reset_index(drop=True))
df = pd.concat([df_lidar, df_spec], axis=0, ignore_index=True)

df = df[['x', 'y', 'pts_crs', #  point location
         'aoi','naip_id', 'polygon_id',  # sampling info
         'r','g','b','nir','ndvi',     # spectral
         'year','month','day_in_year', # date
         'lidar', 'max_lidar', 'min_lidar', 'min_max_diff', 'avg_lidar', # lidar
         'iceplant'
         ]]

# *******************************************************************************
# save data frame as csv
df.to_csv(os.path.join(os.getcwd(),'samples_for_model.csv'), index=False)


# *******************************************************************************
# Delete individual files
# This runs only after the combined csv has been written. The loops reuse the
# same (aoi, year) lists used for reading, so exactly the files that were read
# are removed (the original looped over an undefined name `years`).
if delete_files:
    for aoi, year in spec_samples:
        os.remove(path_spectral_points_csv(aoi, year))
    for aoi, year in lidar_samples:
        os.remove(path_lidar_spectral_points_csv(aoi, year))

# Statistics about data distribution
## non-ice plant : iceplant proportion

In [17]:
# class balance of the 'iceplant' label column in the combined dataset
iceplant_proportions(df["iceplant"])

no-iceplant:iceplant ratio     1.8 :1
          counts  percentage
iceplant                    
0         313132        63.8
1         177691        36.2



## Number of points by area of interest

In [18]:
# check that points from every aoi were loaded
df["aoi"].value_counts()

point_conception    192684
campus_lagoon       132816
carpinteria         102192
gaviota              63131
Name: aoi, dtype: int64

## Number of points by year

In [19]:
# number of points per year
df["year"].value_counts()

2020    150684
2018    121937
2014     89193
2012     77247
2016     51762
Name: year, dtype: int64

## Number of points by NAIP scene

In [20]:
# number of points per NAIP scene
df["naip_id"].value_counts()

ca_m_3412037_nw_10_060_20200607             81605
ca_m_3411934_sw_11_060_20180722_20190209    54950
ca_m_3412037_nw_10_1_20140603_20141030      50853
ca_m_3412037_nw_10_1_20120518_20120730      31965
ca_m_3412037_nw_10_060_20180913_20190208    28261
ca_m_3411936_se_11_060_20200521             27995
ca_m_3411934_sw_11_060_20200521             25767
ca_m_3411936_se_11_060_20180724_20190209    23836
ca_m_3411934_sw_11_.6_20160713_20161004     20283
ca_m_3411936_se_11_.6_20160713_20161004     18421
ca_m_3411934_sw_11_1_20120505_20120730      18379
ca_m_3411936_se_11_1_20120505_20120730      16576
ca_m_3411936_se_11_1_20140901_20141030      15364
ca_m_3412039_nw_10_060_20200522             15317
ca_m_3412039_nw_10_060_20180724_20190209    14890
ca_m_3411934_sw_11_1_20140601_20141030      13437
ca_m_3412039_nw_10_.6_20160616_20161004     13058
ca_m_3412039_nw_10_1_20120518_20120730      10327
ca_m_3412039_nw_10_1_20140603_20141030       9539
Name: naip_id, dtype: int64

## Number of NAIP scenes sampled

In [21]:
# number of distinct NAIP scenes (one row of value_counts per scene)
scene_counts = df["naip_id"].value_counts()
len(scene_counts)

19