# About

This notebook samples the same specified percentage of points per scene to go into the training set. This is an effort to keep the training and test sets unbiased towards scenes that have more points sampled from them. The notebook assumes all the data points are in a csv file named `samples_for_model.csv`. 

**VARIABLES**
- `test_size` (float in (0,1)): percentage of data samples that should go into test set. The notebook will sample this percentage of test points from each scene. 

- `delete_files` (bool): whether to delete the file with all original the sample data. 

**OUTPUT**
The notebook generates two csv files one for the train set and another for the test set.


In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from extracting_points_from_polygons import iceplant_proportions

# Specify notebook variables

In [2]:
# ***************************************************
# ************* NOTEBOOK VARIABLES ******************


test_size = 0.3

delete_files = True

# ***************************************************
# ***************************************************

# Open data samples 

In [3]:
samples = pd.read_csv(os.path.join(os.getcwd(),'samples_for_model.csv'))

# Separate into train and test set keeping same proportion per scene

In [4]:
# initialize empy train and test lists
all_train = []
all_test = []

# save label names
X_labels = samples.columns.drop('iceplant')

# list of aois
aois = samples.aoi.unique()

for aoi in aois:
    
    # retrieve all scenes from  aoi
    in_aoi = samples[samples.aoi == aoi]    
    scenes = in_aoi.naip_id.unique()
    
    for scene in scenes:
        # get all pts in scene
        in_scene = in_aoi[in_aoi.naip_id == scene]
        
        # sample test_size fraction of pts in scene for testing
        # keep same iceplant/non-ice plant proportion in test an train sets
        X = np.array(in_scene.drop('iceplant', axis = 1))
        y = np.array(in_scene['iceplant'])
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size = test_size, 
                                                            random_state = 42)
        
        # reassemble train set into data frame
        train = pd.DataFrame(X_train, columns = X_labels)
        train['iceplant'] = y_train
       
        # reassemble test set into data 
        test = pd.DataFrame(X_test, columns = X_labels)
        test['iceplant'] = y_test
        
        # add to rest of train/test pts
        all_train.append(train)
        all_test.append(test)
        
train = pd.concat(all_train, ignore_index=True)
test = pd.concat(all_test, ignore_index=True)

# Check sets

In [5]:
train.head(3)

Unnamed: 0,r,g,b,nir,ndvi,year,month,day_in_year,lidar,max_lidar,min_lidar,min_max_diff,avg_lidar,geometry,aoi,naip_id,polygon_id,iceplant
0,86,93,103,115,0.144279,2012,5,126,1,2,1,1,1.222222,POINT (237384.81492726543 3812901.816317874),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,28,0
1,123,116,106,169,0.157534,2012,5,126,2,6,0,6,2.555556,POINT (239452.35860786948 3812008.182551823),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,9,1
2,120,106,99,158,0.136691,2012,5,126,0,2,0,2,0.555556,POINT (238500.30868452237 3810836.327497523),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,3,1


In [6]:
test.head(3)

Unnamed: 0,r,g,b,nir,ndvi,year,month,day_in_year,lidar,max_lidar,min_lidar,min_max_diff,avg_lidar,geometry,aoi,naip_id,polygon_id,iceplant
0,84,81,99,147,0.272727,2012,5,126,9,25,0,25,8.555555,POINT (237704.5929384376 3811326.0729273846),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,32,0
1,56,58,78,75,0.145038,2012,5,126,3,6,0,6,2.0,POINT (238152.31719737497 3811244.452314676),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,29,0
2,57,60,83,126,0.377049,2012,5,126,0,7,0,7,1.666667,POINT (235775.6949041834 3811704.484703782),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,15,0


# Check non-iceplant:iceplant ratio in train and test sets

In [7]:
iceplant_proportions(np.array(train.iceplant))

no-iceplant:iceplant ratio     1.8 :1
          counts  percentage
iceplant                    
0         219376       63.85
1         124191       36.15



In [8]:
iceplant_proportions(np.array(test.iceplant))

no-iceplant:iceplant ratio     1.8 :1
          counts  percentage
iceplant                    
0          93756       63.67
1          53500       36.33



# Save train and test sets

In [10]:
train.to_csv(os.path.join(os.getcwd(),'train_set.csv'), index=False)
test.to_csv(os.path.join(os.getcwd(),'test_set.csv'), index=False)

# Delete original dataset

In [13]:
if delete_files == True:
    os.remove(os.path.join(os.getcwd(),'samples_for_model.csv'))