# About

This notebook assembles all the csvs produced by the `3_add_lidar.ipynb` notebook into a single dataframe and saves it as a single csv. It also includes some statistics of the combined dataset. The notebook assumes all the possible aois and years have been sampled and have a csv of points associated with them (the only exception is Point Conception in 2016, for which there is no data).

**VARIABLES**

`delete_files` (bool): whether to delete the individual csvs from which the final dataset is assembled

**OUTPUT:**
A single csv named 'samples_for_model.csv'. This contains all the points with all the features sampled. 

In [1]:
import numpy as np
import pandas as pd
import os

from extracting_points_from_polygons import iceplant_proportions

# Specify notebook variables

In [2]:
# ***************************************************
# ************* NOTEBOOK VARIABLES ******************

# bool: when True, the individual per-aoi/per-year csvs are removed from disk
# after they have been loaded into the combined dataframe.
delete_files = False

# ***************************************************
# ***************************************************

# Assemble data frame with all sampled points

In [3]:
def path_to_points_csv(aoi, year):
    """Return the path of the csv of sampled points for one aoi and year.

    The file is expected in the 'temp' folder of the current working
    directory and is named '<aoi>_points_<year>.csv'.

    Parameters
    ----------
    aoi : str
        Name of the area of interest, used as the file name prefix.
    year : int or str
        Year of the sample; converted with str() to build the file name.

    Returns
    -------
    str
        Path to the csv file (the file is not checked for existence).
    """
    # NOTE: an alternative naming scheme, '<aoi>_pts_spectral_lidar_<year>.csv',
    # was left commented out in the original cell and is not used here.
    file_name = aoi + '_points_' + str(year) + '.csv'
    temp_dir = os.path.join(os.getcwd(), 'temp')
    return os.path.join(temp_dir, file_name)

In [4]:
# Years sampled: every second year from 2012 through 2020
years = list(range(2012, 2021, 2))

# Areas of interest sampled
aois = [
    'campus_lagoon',
    'carpinteria',
    'gaviota',
    'point_conception',
]

In [5]:
# Read the csv of every (aoi, year) pair and stack them into one dataframe.
# Point Conception in 2016 is skipped because there's no data for it.
samples = [
    pd.read_csv(path_to_points_csv(aoi, year))
    for aoi in aois
    for year in years
    if not (aoi == 'point_conception' and year == 2016)
]

# Rows are appended one sample after another; the original row labels are kept
df = pd.concat(samples, axis=0)

In [6]:
# Delete individual files
# NOTE(review): this cell runs before the combined csv is written in the
# "Save csv" section. With delete_files enabled, a failure in a later cell
# could leave neither the individual csvs nor the combined csv on disk.
if delete_files:
    for aoi in aois:
        for year in years:
            # there's no data for Point Conception on 2016, so no file to remove
            if aoi == 'point_conception' and year == 2016:
                continue
            os.remove(path_to_points_csv(aoi, year))

In [7]:
# Dataframe cleaning
# - renumber the rows 0..n-1, since the concatenated frame keeps each csv's own row labels
# - drop the leftover 'Unnamed: 0' column read from the csvs
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
df = (
    df
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [None]:
# Order columns

# Preferred column order for the final dataset, grouped by kind of feature
column_order = ['r','g','b','nir','ndvi',     # spectral
                'year','month','day_in_year', # date
                'lidar', 'max_lidar', 'min_lidar', 'min_max_diff', 'avg_lidar', # lidar
                'iceplant',
                'geometry',         # point coords (CRS is one from scene in naip_id)
                'aoi','naip_id', 'polygon_id']

# Selecting a column that is not in the dataframe raises KeyError (e.g. when the
# csvs that were loaded carry no ndvi or lidar features), so report the missing
# columns and keep only the ones that are present, in the preferred order.
missing_columns = [col for col in column_order if col not in df.columns]
if missing_columns:
    print('Columns not found in the sampled points (skipped):', missing_columns)

df = df[[col for col in column_order if col in df.columns]]
df.head(2)

In [11]:
# Show which columns the combined dataframe has
df.columns

Index(['r', 'g', 'b', 'nir', 'year', 'month', 'day_in_year', 'iceplant',
       'geometry', 'aoi', 'naip_id', 'polygon_id'],
      dtype='object')

# Save csv

In [9]:
# Write the combined samples to the current working directory,
# leaving the dataframe index out of the file.
output_fp = os.path.join(os.getcwd(), 'samples_for_model.csv')
df.to_csv(output_fp, index=False)

# Statistics about data distribution
## Non-iceplant : iceplant proportion

In [10]:
# Summarise the 'iceplant' label column of the combined dataset
iceplant_proportions(df['iceplant'])

no-iceplant:iceplant ratio     1.8 :1
          counts  percentage
iceplant                    
0         313132        63.8
1         177691        36.2



## Number of points by area of interest

In [11]:
# Checking all data was loaded: number of points per area of interest
df['aoi'].value_counts()

point_conception    192684
campus_lagoon       132816
carpinteria         102192
gaviota              63131
Name: aoi, dtype: int64

## Number of points by year

In [12]:
# Number of points per year
df['year'].value_counts()

2020    150684
2018    121937
2014     89193
2012     77247
2016     51762
Name: year, dtype: int64

## Number of points by NAIP scene

In [13]:
# Number of points per NAIP scene
df['naip_id'].value_counts()

ca_m_3412037_nw_10_060_20200607             81605
ca_m_3411934_sw_11_060_20180722_20190209    54950
ca_m_3412037_nw_10_1_20140603_20141030      50853
ca_m_3412037_nw_10_1_20120518_20120730      31965
ca_m_3412037_nw_10_060_20180913_20190208    28261
ca_m_3411936_se_11_060_20200521             27995
ca_m_3411934_sw_11_060_20200521             25767
ca_m_3411936_se_11_060_20180724_20190209    23836
ca_m_3411934_sw_11_.6_20160713_20161004     20283
ca_m_3411936_se_11_.6_20160713_20161004     18421
ca_m_3411934_sw_11_1_20120505_20120730      18379
ca_m_3411936_se_11_1_20120505_20120730      16576
ca_m_3411936_se_11_1_20140901_20141030      15364
ca_m_3412039_nw_10_060_20200522             15317
ca_m_3412039_nw_10_060_20180724_20190209    14890
ca_m_3411934_sw_11_1_20140601_20141030      13437
ca_m_3412039_nw_10_.6_20160616_20161004     13058
ca_m_3412039_nw_10_1_20120518_20120730      10327
ca_m_3412039_nw_10_1_20140603_20141030       9539
Name: naip_id, dtype: int64

## Number of NAIP scenes sampled

In [17]:
# Count the distinct NAIP scene ids (missing values are not counted)
df['naip_id'].nunique()

19