In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

This notebook samples the same specified percentage of points per scene, in an effort to keep the training and test sets unbiased towards scenes that have more points sampled from them.

In [2]:
samples = pd.read_csv(os.path.join(os.getcwd(),'samples_for_model.csv'))
# created at feature_selection.ipynb

In [3]:
samples.shape

(565184, 18)

In [4]:
def iceplant_proportions(labels):
    unique, counts = np.unique(labels, return_counts=True)
    print('no-iceplant:iceplant ratio    ',round(counts[0]/counts[1],1),':1')
    n = labels.shape[0]
    perc = [round(counts[0]/n*100,2), round(counts[1]/n*100,2)]
    df = pd.DataFrame({'iceplant':unique,
             'counts':counts,
             'percentage':perc}).set_index('iceplant')
    print(df)
    print()

In [5]:
all_train = []
all_test = []

X_labels = samples.columns.drop('iceplant')

aois = samples.aoi.unique()

for aoi in aois:
    # retrieve all scenes from  aoi
    in_aoi = samples[samples.aoi == aoi]    
    scenes = in_aoi.naip_id.unique()
    for scene in scenes:
        # get all pts in scene
        in_scene = in_aoi[in_aoi.naip_id == scene]
        
        # sample 70% of pts in scene for training
        # keep same iceplant/non-ice plant proportion in test an train sets
        X = np.array(in_scene.drop('iceplant', axis = 1))
        y = np.array(in_scene['iceplant'])
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size = 0.3, 
                                                            random_state = 42)
        
        # reassemble train set into data frame
        train = pd.DataFrame(X_train, columns = X_labels)
        train['iceplant'] = y_train
       
        # reassemble test set into data 
        test = pd.DataFrame(X_test, columns = X_labels)
        test['iceplant'] = y_test
        
        # add to rest of train/test pts
        all_train.append(train)
        all_test.append(test)
        
train = pd.concat(all_train, ignore_index=True)
test = pd.concat(all_test, ignore_index=True)

In [6]:
iceplant_proportions(np.array(train.iceplant))

no-iceplant:iceplant ratio     2.6 :1
          counts  percentage
iceplant                    
0         285908       72.27
1         109714       27.73



In [7]:
iceplant_proportions(np.array(test.iceplant))

no-iceplant:iceplant ratio     2.6 :1
          counts  percentage
iceplant                    
0         122776       72.41
1          46786       27.59



In [8]:
train.head(3)

Unnamed: 0,r,g,b,nir,ndvi,year,month,day_in_year,lidar,max_lidar,min_lidar,min_max_diff,avg_lidar,geometry,aoi,naip_id,polygon_id,iceplant
0,43,51,78,89,0.348485,2012,5,126,2,10,0,10,3.0,POINT (235774.86665080223 3811711.199119436),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,15,0
1,65,59,84,79,0.097222,2012,5,126,0,6,0,6,1.0,POINT (238133.2205170226 3811247.685769488),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,29,0
2,81,86,93,132,0.239437,2012,5,126,11,14,6,8,9.222222,POINT (236234.3014255809 3811420.479858158),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,16,0


In [9]:
test.head(3)

Unnamed: 0,r,g,b,nir,ndvi,year,month,day_in_year,lidar,max_lidar,min_lidar,min_max_diff,avg_lidar,geometry,aoi,naip_id,polygon_id,iceplant
0,124,104,99,154,0.107914,2012,5,126,0,0,0,0,0.0,POINT (238531.52353940875 3810778.486143305),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,1,1
1,135,119,99,167,0.10596,2012,5,126,1,3,0,3,1.222222,POINT (238550.1707933048 3810782.096102707),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,0,1
2,96,98,100,127,0.139013,2012,5,126,3,3,2,1,2.333333,POINT (238105.80866084044 3814676.1355223465),campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,23,0


In [10]:
train.to_csv(os.path.join(os.getcwd(),'train_set.csv'), index=False)

In [11]:
test.to_csv(os.path.join(os.getcwd(),'test_set.csv'), index=False)

In [13]:
os.remove(os.path.join(os.getcwd(),'samples_for_model.csv'))