In [1]:
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import rasterio
import geopandas as gpd

from shapely.geometry import Polygon

import iceplant_detection_functions as ipf
import model_prep_and_evals as mpe

import warnings
import sys

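Background reading on how Python and NumPy store numeric types in memory (Python Data Science Handbook, "Understanding Data Types in Python"): https://jakevdp.github.io/PythonDataScienceHandbook/02.01-understanding-data-types.html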

## Create test/train set from samples

In [16]:
# Build the path to the labelled samples once; every later read goes through
# `samples_fp` so the location cannot drift between cells.
samples_fp = os.path.join(os.getcwd(),'feature_selection','samples_for_model.csv')
samples = pd.read_csv(samples_fp)
samples.head(3)

Unnamed: 0,r,g,b,nir,ndvi,year,month,day_in_year,iceplant,geometry,x,y,aoi,naip_id,polygon_id
0,134,125,103,170,0.118421,2012,5,126,1,POINT (238565.79498225075 3810768.627232482),238565.794982,3810769.0,campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,0
1,130,114,101,164,0.115646,2012,5,126,1,POINT (238553.15545424985 3810802.7926417096),238553.155454,3810803.0,campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,0
2,132,110,98,160,0.09589,2012,5,126,1,POINT (238552.77597268307 3810773.0767946127),238552.775973,3810773.0,campus_lagoon,ca_m_3411934_sw_11_1_20120505_20120730,0


In [17]:
# Inspect the scalar type returned for one element of the `r` column and its
# shallow size in bytes as reported by sys.getsizeof.
print(type(samples.r[0]))
sys.getsizeof(samples.r[0])

<class 'numpy.int64'>


32

In [42]:
# Recursive size of the same `r` element, for comparison with sys.getsizeof.
# NOTE(review): deep_getsizeof is defined in a later cell of this notebook, so
# this cell fails on a fresh top-to-bottom run.
deep_getsizeof(samples.r[0],set())

32

In [18]:
# Same inspection for one element of the `ndvi` column: scalar type and
# shallow size in bytes.
print(type(samples.ndvi[0]))
sys.getsizeof(samples.ndvi[0])

<class 'numpy.float64'>


32

In [41]:
# Recursive size of the same `ndvi` element, for comparison with sys.getsizeof.
# NOTE(review): deep_getsizeof is defined in a later cell of this notebook, so
# this cell fails on a fresh top-to-bottom run.
deep_getsizeof(samples.ndvi[0],set())

32

In [20]:
# Split the labelled samples into train and test sets with the project helper.
# NOTE(review): this re-reads the CSV rather than reusing `samples`; confirm
# whether the helper needs a fresh frame before passing `samples` instead.
train, test = mpe.test_train_aois_scenes(pd.read_csv(samples_fp))

In [21]:
# Feature matrix: every column from `r` through `day_in_year` (label-based
# slice, both ends inclusive). A single np.array call is enough; wrapping it
# twice only created a second full copy of the data.
X_train = np.array(train.loc[:,'r':'day_in_year'])
X_train

array([[43, 51, 78, ..., 2012, 5, 126],
       [65, 59, 84, ..., 2012, 5, 126],
       [81, 86, 93, ..., 2012, 5, 126],
       ...,
       [84, 100, 71, ..., 2020, 5, 142],
       [65, 77, 72, ..., 2020, 5, 142],
       [87, 93, 84, ..., 2020, 5, 142]], dtype=object)

In [22]:
# Look at one element of the first feature column of the training array:
# its value, its Python type, and its shallow size in bytes.
print(X_train[0,0])
print(type((X_train[0,0])))
sys.getsizeof(X_train[0,0])

43
<class 'int'>


28

In [23]:
# Same inspection for column index 4 of the training array
# (presumably `ndvi`, given the r..day_in_year slice — TODO confirm column order).
print(X_train[0,4])
print(type((X_train[0,4])))
sys.getsizeof(X_train[0,4])

0.3484848484848485
<class 'float'>


24

In [None]:
# X_test = np.array(np.array(test.loc[:,'r':'day_in_year']))
# X_test

In [None]:
# Training labels: the `iceplant` column of the training frame as a NumPy array.
y_train = np.array(train['iceplant'])
y_train

In [None]:
# y_test = np.array(test['iceplant'])
# y_test

In [None]:
# Summarise the training labels with the project helper
# (presumably the share of iceplant vs. non-iceplant samples — see model_prep_and_evals).
mpe.iceplant_proportions(y_train)

## Train model

In [None]:
from dask_ml.wrappers import ParallelPostFit

In [None]:
# Start a wall-clock timer so the training cost is visible in the output.
t0 = time.time()

# Random forest with 100 trees and a fixed seed for reproducibility, wrapped in
# dask-ml's ParallelPostFit meta-estimator. Fitting is delegated to the wrapped
# scikit-learn estimator on the in-memory training arrays.
rfc = ParallelPostFit(RandomForestClassifier(n_estimators = 100, random_state = 42))
rfc.fit(X_train, y_train)

# Elapsed training time in seconds.
print('time to train: ', (time.time()-t0))

In [None]:
# predictions = rfc.predict(X_test)
# mpe.print_threshold_metrics(y_test, predictions)

In [38]:
# https://code.tutsplus.com/tutorials/understand-how-much-memory-your-python-objects-use--cms-25609
from collections.abc import Mapping, Container
from sys import getsizeof
 
def deep_getsizeof(o, ids=None):
    """Return the recursive memory footprint of a Python object, in bytes.

    ``sys.getsizeof`` is shallow: for a container it counts only the
    container itself plus one pointer per element, no matter how large the
    elements are. This function walks the object graph (mappings, lists,
    tuples, sets, ...) and adds up the shallow size of every distinct object
    it reaches.

    :param o: the object to measure
    :param ids: set of ``id()`` values of objects that have already been
        counted. It is updated in place, so passing the same set to several
        calls avoids counting shared objects twice. Defaults to a fresh
        empty set.
    :return: total size in bytes of ``o`` and everything reachable from it
        that is not already recorded in ``ids`` (0 if ``o`` itself was
        already counted)
    """
    if ids is None:
        ids = set()

    # Shared or self-referencing objects are counted once only.
    if id(o) in ids:
        return 0

    r = getsizeof(o)
    ids.add(id(o))

    # Text and byte strings are Containers, but their shallow size already
    # includes their payload; iterating them would only add the sizes of
    # temporary one-character strings or ints.
    if isinstance(o, (str, bytes, bytearray)):
        return r

    # Mappings: count both keys and values (Python 3 spelling: items()).
    if isinstance(o, Mapping):
        return r + sum(
            deep_getsizeof(k, ids) + deep_getsizeof(v, ids)
            for k, v in o.items()
        )

    # Any other container: count each element it yields when iterated.
    if isinstance(o, Container):
        return r + sum(deep_getsizeof(x, ids) for x in o)

    return r

# Pre-process NAIP scene for prediction

In [2]:
# Identifier of the NAIP scene to run predictions on, and the corresponding
# item looked up through the project helper (the helper prints its own
# retrieval time).
itemid = 'ca_m_3412037_nw_10_060_20200607'
item = ipf.get_item_from_id(itemid)
# Reading the raster into a standalone variable is disabled here; the next
# code cell reads the four bands inline instead.
#scene = ipf.get_raster_from_item(item).read([1,2,3,4])

time to retrieve itemid:  1.569197177886963


## Data frame

In [3]:
# Build the per-pixel feature table for the scene: read bands 1-4 of the
# raster and hand them to the project helper with thresh=0.05
# (presumably an NDVI cut-off that drops low-vegetation pixels — TODO confirm in ipf).
df = ipf.features_over_aoi(item, 
                           ipf.get_raster_from_item(item).read([1,2,3,4]), 
                           thresh=0.05)
df.head(3)

Unnamed: 0,r,g,b,nir,ndvi,year,month,day_in_year
580,92,91,82,102,0.051546,2020,6,159
588,76,75,71,85,0.055901,2020,6,159
591,96,95,82,107,0.054187,2020,6,159


In [33]:
# Recursive size of the feature DataFrame in megabytes (bytes / 10**6).
# Iterating a DataFrame yields its column labels, so the recursive walk only
# adds those labels on top of the shallow size.
# NOTE(review): the saved output below looks like an undivided byte count —
# re-run this cell to refresh it.
deep_getsizeof(df, set())/10**6

1017036062

In [7]:
# Size of the feature DataFrame as reported by sys.getsizeof, in megabytes (bytes / 10**6).
sys.getsizeof(df)/10**6

1017.03564

In [5]:
# Scalar type and shallow size in bytes of the first `r` value in the feature frame.
print(type(df.r.iloc[0]))
sys.getsizeof(df.r.iloc[0])

<class 'numpy.uint8'>


25

In [6]:
# Scalar type and shallow size in bytes of the first `ndvi` value in the feature frame.
print(type(df.ndvi.iloc[0]))
sys.getsizeof(df.ndvi.iloc[0])

<class 'numpy.float64'>


32

## Naive np array

In [8]:
# Convert the feature frame to a NumPy array without specifying a dtype, so
# NumPy/pandas pick a single common dtype for all columns.
pixels_raw = np.array(df)

In [9]:
# Size of the default-dtype array as reported by sys.getsizeof, in megabytes (bytes / 10**6).
sys.getsizeof(pixels_raw)/10**6

1479.324672

In [37]:
# Recursive size of the default-dtype array in megabytes, for comparison with
# the sys.getsizeof figure.
deep_getsizeof(pixels_raw, set())/10**6

1479.326128

In [10]:
# Type and shallow size in bytes of an element from column 0 of the
# default-dtype array.
print(type(pixels_raw[0,0]))
sys.getsizeof(pixels_raw[0,0])

<class 'numpy.float64'>


32

In [11]:
# Type and shallow size in bytes of an element from column 4 of the
# default-dtype array.
print(type(pixels_raw[0,4]))
sys.getsizeof(pixels_raw[0,4])

<class 'numpy.float64'>


32

## Object type np array

In [12]:
# Same conversion, but forcing dtype=object so each cell of the array holds a
# reference to a Python object instead of a packed numeric value.
pixels_obj = np.array(df, dtype=object)

In [13]:
# Size of the object-dtype array as reported by sys.getsizeof, in megabytes (bytes / 10**6).
sys.getsizeof(pixels_obj)/10**6

1479.324672

In [36]:
# Recursive size of the object-dtype array in megabytes, for comparison with
# the sys.getsizeof figure.
deep_getsizeof(pixels_obj, set())/10**6

1479.326192

In [14]:
# Type and shallow size in bytes of an element from column 0 of the
# object-dtype array.
print(type(pixels_obj[0,0]))
sys.getsizeof(pixels_obj[0,0])

<class 'float'>


24

In [15]:
# Type and shallow size in bytes of an element from column 4 of the
# object-dtype array.
print(type(pixels_obj[0,4]))
sys.getsizeof(pixels_obj[0,4])

<class 'float'>


24