# About

This notebook assembles all the csvs produced by the `3_add_lidar.ipynb` notebook into a single dataframe and saves it as a single csv. It also includes some statistics of the combined dataset. The notebook assumes that all the possible aois and years have been sampled and that each of them has a csv of points associated with it.

**VARIABLES**

`delete_files` (bool): whether to delete the individual csvs from which the final dataset is assembled

**OUTPUT:**
A single csv named 'samples_for_model.csv'. This contains all the points with all the features sampled. 

In [1]:
import numpy as np
import pandas as pd
import os

from utility import iceplant_proportions

# Specify notebook variables

In [7]:
# ***************************************************
# ************* NOTEBOOK VARIABLES ******************

# whether to delete the individual csvs once the combined dataset has been assembled
delete_files = False

# print stats as notebook runs
verbose = True

# save stats (they are appended to 'stats.txt' in the current working directory)
write_stats = True
# ***************************************************
# ***************************************************

## Paths to sample points

In [3]:
def path_spectral_points_csv(aoi, year):
    """
    Build the path to the csv holding the points with ONLY spectral information.

    The file is looked up in the 'temp' folder of the current working directory
    and is named '<aoi>_points_<year>.csv'.

    Parameters:
        aoi (str): name of the area of interest
        year (int or str): year of the sampled points; converted with str()
    Returns:
        str: file path to the csv for the given aoi and year
    """
    file_name = ''.join([aoi, '_points_', str(year), '.csv'])
    return os.path.join(os.getcwd(), 'temp', file_name)

#---------------------

def path_lidar_spectral_points_csv(aoi, year):
    """
    Build the path to the csv holding the points with canopy height AND spectral information.

    The file is looked up in the 'temp' folder of the current working directory
    and is named '<aoi>_pts_spectral_lidar_<year>.csv'.

    Parameters:
        aoi (str): name of the area of interest
        year (int or str): year of the sampled points; converted with str()
    Returns:
        str: file path to the csv for the given aoi and year
    """
    file_name = ''.join([aoi, '_pts_spectral_lidar_', str(year), '.csv'])
    return os.path.join(os.getcwd(), 'temp', file_name)

# Assemble data frame with all sampled points

In [4]:
# Years and areas of interest that were sampled
lidar_years = [2016, 2018, 2020]   # csvs with canopy height + spectral info
spec_years = [2012, 2014]          # csvs with spectral info only
aois = ['campus_lagoon', 'carpinteria', 'gaviota', 'point_conception']

# canopy height columns: present in the lidar csvs, filled with NaN for the spectral-only csvs
canopy_cols = ['lidar', 'max_lidar', 'min_lidar', 'avg_lidar', 'min_max_diff']

# *******************************************************************************
# Open and concatenate csv files of points with canopy height + spectral info
# (the (point_conception, 2016) pair is skipped: there's no data for Point Conception on 2016)
df_lidar = pd.concat(
    [pd.read_csv(path_lidar_spectral_points_csv(aoi, year))
     for aoi in aois
     for year in lidar_years
     if not (aoi == 'point_conception' and year == 2016)],
    axis=0)

# only keep points where every canopy height column is non-negative
non_negative = (df_lidar[canopy_cols] >= 0).all(axis=1)
df_lidar = df_lidar.loc[non_negative]

# *******************************************************************************
# Open and concatenate csv files of points with ONLY spectral info
df_spec = pd.concat(
    [pd.read_csv(path_spectral_points_csv(aoi, year))
     for aoi in aois
     for year in spec_years],
    axis=0)

# fill in canopy height columns with NaN
for col in canopy_cols:
    df_spec[col] = np.nan

# *******************************************************************************
# concatenate both data frames and clean index and columns
model_columns = [
    'x', 'y', 'pts_crs',               # point location
    'aoi', 'naip_id', 'polygon_id',    # sampling info
    'r', 'g', 'b', 'nir', 'ndvi',      # spectral
    'year', 'month', 'day_in_year',    # date
    'lidar', 'max_lidar', 'min_lidar', 'min_max_diff', 'avg_lidar',  # lidar
    'iceplant',
]
df = pd.concat([df_lidar, df_spec], axis=0).reset_index(drop=True)[model_columns]

# *******************************************************************************
# save data frame as csv
df.to_csv(os.path.join(os.getcwd(), 'samples_for_model.csv'), index=False)

# *******************************************************************************
# Delete individual files
# Each group of csvs is removed by looping over its own list of years:
# `spec_years` for the spectral-only csvs and `lidar_years` for the
# canopy height + spectral csvs.
if delete_files:
    for aoi in aois:
        for year in spec_years:
            os.remove(path_spectral_points_csv(aoi, year))
        for year in lidar_years:
            # there's no data (hence no csv to remove) for Point Conception on 2016
            if ('point_conception' != aoi) or (year != 2016):
                os.remove(path_lidar_spectral_points_csv(aoi, year))

# Statistics about data distribution

In [8]:
# Builds a plain-text summary of the combined data frame `df` (number of points,
# class balance, points per aoi / year / NAIP scene), optionally appends it to
# 'stats.txt' and optionally prints it.
sep = '\n\n\n'

n_sample = '# points sampled: ' + str(df.shape[0]) + sep

# ----------------------------
# ratios and percentages of iceplant vs non-iceplant
unique, counts = np.unique(df.iceplant, return_counts=True)

# Look the classes up by label (0 = no iceplant, 1 = iceplant) instead of by
# position, so a dataset with a single class does not fail on a missing index.
n_no_iceplant = counts[unique == 0].sum()
n_iceplant = counts[unique == 1].sum()
if n_iceplant > 0:
    ratio = str(round(n_no_iceplant / n_iceplant, 1)) + ':1'
else:
    # the ratio has no meaning without iceplant points; avoid dividing by zero
    ratio = 'undefined (no iceplant points)'
icep_ratio = 'no-iceplant:iceplant ratio   ' + ratio + sep

n = df.iceplant.shape[0]
# one percentage per class found, in the same order as `unique`
perc = [round(c / n * 100, 2) for c in counts]
counts_percents = pd.DataFrame({'iceplant': unique,
                                'counts': counts,
                                'percentage': perc}).set_index('iceplant')
counts_percents = counts_percents.to_string() + sep

# ----------------------------
# Number of points by area of interest
counts_aoi = 'Points sampled per area of interest\n' + df.aoi.value_counts().to_string() + sep

# Number of points by year
counts_year = 'Points sampled per year\n' + df.year.value_counts().to_string() + sep

# Number of points by NAIP scene
naip_counts = df.naip_id.value_counts()
counts_naipid = ('# NAIP scenes sampled: ' + str(len(naip_counts))
                 + '\nPoints sampled per NAIP scene\n' + naip_counts.to_string() + sep)


# ----------------------------
# assemble all stats into string
stats = n_sample + icep_ratio + counts_percents + counts_aoi + counts_year + counts_naipid

# *******************************************************************************
if write_stats:
    # append mode: stats from previous runs are kept in the file;
    # the `with` block closes the file, no explicit close() is needed
    with open(os.path.join(os.getcwd(), 'stats.txt'), 'a') as f:
        f.write(stats)

# *******************************************************************************
if verbose:
    print(stats)

# points sampled: 489415


no-iceplant:iceplant ratio   1.8:1


          counts  percentage
iceplant                    
0         313132       63.98
1         176283       36.02


Points sampled per area of interest
point_conception    191958
campus_lagoon       132326
carpinteria         102089
gaviota              63042


Points sampled per year
2020    149382
2018    121902
2014     89163
2012     77224
2016     51744


# NAIP scenes sampled: 19
Points sampled per NAIP scene
ca_m_3412037_nw_10_060_20200607             80913
ca_m_3411934_sw_11_060_20180722_20190209    54945
ca_m_3412037_nw_10_1_20140603_20141030      50837
ca_m_3412037_nw_10_1_20120518_20120730      31957
ca_m_3412037_nw_10_060_20180913_20190208    28251
ca_m_3411936_se_11_060_20200521             27930
ca_m_3411934_sw_11_060_20200521             25293
ca_m_3411936_se_11_060_20180724_20190209    23820
ca_m_3411934_sw_11_.6_20160713_20161004     20278
ca_m_3411936_se_11_.6_20160713_20161004     18413
ca_m_3411934_sw