In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import time

import rasterio
import rioxarray as rioxr
import geopandas as gpd

import planetary_computer as pc

from shapely.geometry import Polygon

import data_sampling_workflow.utility as utility

In [2]:
import iceplant_detection_functions as ipf
import refactoring_modules as rm

In [None]:
itemid = 'ca_m_3411934_sw_11_060_20200521'
# Fields next to Goleta Slough
reduce_box = Polygon([[-119.8284196946,34.4162731913],
                       [-119.8101541026,34.4162731913],
                       [-119.8101541026,34.4353838099],
                       [-119.8284196946,34.4353838099],
                       [-119.8284196946,34.4162731913]])

reduce_box_crs="EPSG:4326"

In [None]:
item = utility.get_item_from_id(itemid)
item.datetime

In [None]:
type(item.datetime)

In [None]:
type(item.datetime.date())

In [None]:
rast = rm.rioxr_from_itemid(itemid)
rast

In [None]:
rast.attrs['datetime'] == item.datetime

In [None]:
type(rast)

In [None]:
rast.rio.crs

In [None]:
rast.rio.transform()

In [None]:
# Same loader as for `rast`, this time also passing the polygon and its CRS
# (presumably to restrict the raster to that box; confirm in refactoring_modules).
# The function lives in the `rm` module; the bare name is not defined in this notebook.
rast_small = rm.rioxr_from_itemid(itemid, reduce_box, reduce_box_crs)
rast_small

In [None]:
rast_small.rio.crs

In [None]:
rast_small.rio.transform()

In [None]:
pixels = rm.raster_as_df(rast.to_numpy(),  ['r','g','b','nir'])

In [None]:
type(pixels)

In [None]:
# ***************************************************************************************************
# ***************************************************************************************************

def normalized_difference_index(df, *args):
    """Return the normalized difference (a - b) / (a + b) of two columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are selected by position.
    *args : int
        args[0] is the position of column `a`, args[1] the position of
        column `b`. Any additional positional arguments are ignored.

    Returns
    -------
    pandas.Series
        Row-wise (a - b) / (a + b), computed after casting both columns
        to int16.
    """
    first_pos = args[0]
    second_pos = args[1]

    # Cast to a signed type before the arithmetic (presumably so that the
    # difference of unsigned band values can be negative).
    first_band = df.iloc[:, first_pos].astype('int16')
    second_band = df.iloc[:, second_pos].astype('int16')

    difference = first_band.sub(second_band)
    total = first_band.add(second_band)
    return difference.div(total)

In [None]:
ndvi = normalized_difference_index(pixels,3,0)
ndvi

In [None]:
# ***************************************************************************************************
# ***************************************************************************************************

def feature_df_treshold(df, feature_name, thresh, keep_gr, func, *args):
    """Compute a feature column on `df` and split its rows on a threshold.

    `func(df, *args)` is evaluated and stored in `df[feature_name]`; the
    caller's DataFrame is modified in place. If `df` is itself a filtered
    slice of another frame, pandas may emit a SettingWithCopyWarning here.

    The rows are then partitioned:
      * keep_gr truthy : rows with feature >  thresh are kept
      * keep_gr falsy  : rows with feature <  thresh are kept
    Every other row is reported as deleted. That includes rows equal to
    `thresh` and rows whose feature is NaN (e.g. 0/0 in a normalized
    difference), so the two outputs together always cover every row of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
    feature_name : str
        Name of the column that receives the computed feature.
    thresh : number
        Threshold the feature is compared against.
    keep_gr : bool
        True to keep rows strictly above `thresh`, False to keep rows
        strictly below it.
    func : callable
        Called as func(df, *args); must return one value per row of `df`.
    *args
        Extra positional arguments forwarded to `func`.

    Returns
    -------
    keep : pandas.DataFrame
        Rows of `df` that satisfy the threshold condition.
    deleted_indices : numpy.ndarray
        Index labels of all rows of `df` that are not in `keep`.
    """
    df[feature_name] = func(df, *args)
    feature = df[feature_name]

    # One mask drives both outputs. Comparisons with NaN are False, so NaN rows
    # fall into the complement instead of being silently lost.
    if keep_gr:
        keep_mask = (feature > thresh).to_numpy()
    else:
        keep_mask = (feature < thresh).to_numpy()

    keep = df[keep_mask]
    # Take the labels straight from the index; no second filtered frame is built.
    deleted_indices = df.index[~keep_mask].to_numpy()

    return keep, deleted_indices

In [None]:
not_water, water_index = feature_df_treshold(pixels, 'ndwi', 0.3, False, normalized_difference_index, 1,3)

In [None]:
is_veg, non_veg_index = feature_df_treshold(not_water, 'ndvi', 0.05, True, normalized_difference_index, 3,0)

In [None]:
is_veg

In [None]:
water_index

In [None]:
non_veg_index

In [None]:
def indices_to_image(nrows, ncols, indices_list, values, back_value):
    """Rebuild a 2-D image from groups of flattened pixel indices.

    Each entry of `indices_list` holds flat, row-major pixel positions
    (position = row * ncols + col). Every pixel listed in group k is set to
    values[k]; pixels that appear in no group keep `back_value`. When a pixel
    appears in several groups, the last group wins.

    Parameters
    ----------
    nrows, ncols : int
        Shape of the output image.
    indices_list : sequence of array-like of int
        One collection of flat pixel indices per group.
    values : sequence of numbers
        Value written for each group; must have the same length as
        `indices_list`.
    back_value : number
        Background value for pixels not covered by any group.

    Returns
    -------
    numpy.ndarray
        float64 array of shape (nrows, ncols).

    Raises
    ------
    ValueError
        If `indices_list` and `values` do not have the same length.
    """
    if len(indices_list) != len(values):
        raise ValueError(
            'indices_list and values must have the same length, got '
            f'{len(indices_list)} and {len(values)}'
        )

    # background, any pixel not in the union of indices will be given this value
    reconstruct = np.full((nrows, ncols), back_value, dtype=float)

    for flat_indices, value in zip(indices_list, values):
        flat_indices = np.asarray(flat_indices, dtype=np.intp)
        # Integer divmod recovers (row, col) without a float round-trip.
        rows, cols = np.divmod(flat_indices, ncols)
        reconstruct[rows, cols] = value

    return reconstruct

In [None]:
# Take the image size from the raster itself instead of hard-coding it; these are the
# same two dimensions passed to utility.save_raster as the raster shape.
reconstruct = indices_to_image(rast.shape[1], rast.shape[2],
                               [water_index, non_veg_index], [3,2],
                               back_value=1)

In [None]:
# Save the reconstructed image as trial.tif in the current working directory,
# passing along the CRS and affine transform of the source raster.
# NOTE(review): `reconstruct` is a float array (built from np.ones) while 'int16' is
# passed here; confirm utility.save_raster casts it as intended.
utility.save_raster(reconstruct, 
                    os.getcwd()+'/trial.tif', 
                    (rast.shape[1],rast.shape[2]), 
                    1, 
                    rast.rio.crs, 
                    rast.rio.transform(), 'int16' )


In [None]:
# ***************************************************************************************************
# ***************************************************************************************************

def add_date_features(df, date):
    """Return `df` with constant `year`, `month` and `day_in_year` columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; the new columns come from DataFrame.assign, so the
        result is a new object.
    date : object with `year`, `month` and `day` attributes
        Date whose components fill the new columns.

    Returns
    -------
    pandas.DataFrame
        `df` plus three columns: `year`, `month`, and `day_in_year` as
        computed by utility.day_in_year(day, month, year).
    """
    day_of_year = utility.day_in_year(date.day, date.month, date.year)
    return df.assign(year=date.year,
                     month=date.month,
                     day_in_year=day_of_year)

In [None]:
# Add the date columns to the vegetation pixels kept by the NDVI filter.
# Passing `pixels` here would overwrite `is_veg` with every pixel, filtered or not.
is_veg = add_date_features(is_veg, rast.attrs['datetime'])

In [None]:
is_veg

In [None]:
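# TODO: resolve the pandas SettingWithCopyWarning, presumably raised when
# feature_df_treshold assigns the new feature column to a filtered DataFrame (e.g. not_water)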

TIMING OLD VS NEW

In [None]:
# # ORIGINAL METHOD

# itemid = 'ca_m_3411934_sw_11_060_20200521'
# t0 = time.time()
# item = utility.get_item_from_id(itemid)

# df = ipf.features_over_aoi(item, 
#                            utility.get_raster_from_item(item).read([1,2,3,4]), 
#                            thresh=0.05)
# print('time to make features df: ', (time.time()-t0))
# # memory 2.14 GB
# # time to make features df:  13.830940961837769

In [None]:
# def add_spectral_features(df, ndwi_thresh, ndvi_thresh):
    
#     not_water, water_index = rm.feature_df_treshold(df, 
#                                              'ndwi', ndwi_thresh, False, 
#                                              rm.normalized_difference_index, 1,3)   
#     is_veg, not_veg_index = rm.feature_df_treshold(not_water, 
#                                                    'ndvi', ndvi_thresh, True, 
#                                                    rm.normalized_difference_index, 3,0)
#     return is_veg, water_index, not_veg_index


In [None]:
itemid = 'ca_m_3411934_sw_11_060_20200521'

# perf_counter is a monotonic clock intended for measuring elapsed time
t0 = time.perf_counter()
raster = rm.rioxr_from_itemid(itemid)

is_veg, water_index, not_veg_index = rm.add_spectral_features(df = rm.raster_as_df(raster.to_numpy(), ['r','g','b','nir']), 
                                                           ndwi_thresh = 0.3, 
                                                           ndvi_thresh = 0.05) 
# Non in-place drop: an in-place drop on a filtered frame is a source of
# pandas' SettingWithCopyWarning and makes the cell non-idempotent.
is_veg = is_veg.drop(columns='ndwi')

# Read the acquisition date from the raster attributes, as done for `rast` earlier
is_veg = rm.add_date_features(is_veg, raster.attrs['datetime'])

print('time to make features df: ', (time.perf_counter()-t0))
# time to make features df:  19.167106866836548
# memory usage after: 3.43 GB

# SAVING MODEL

In [10]:
import model_prep_and_evals as mpe

In [3]:
# Read the training set once; features and labels are both sliced from the same frame
train_set = pd.read_csv(os.path.join(os.getcwd(),'data_sampling_workflow','train_set.csv'))

# select features from r (Red band) to day_in_year
# excludes x,y, pts_crs, aoi, naip_id, polygon_id, iceplant and lidar features
X_train = train_set.loc[:,'r':'day_in_year']

# select iceplant feature column
y_train = train_set.loc[:,'iceplant']

In [9]:
# select test features and labels (the CSV is read once and sliced twice)
test_set = pd.read_csv(os.path.join(os.getcwd(),'data_sampling_workflow','test_set.csv'))
X_test = test_set.loc[:,'r':'day_in_year']
y_test = test_set.loc[:,'iceplant']

In [4]:
from sklearn.ensemble import RandomForestClassifier

In [5]:
rfc = RandomForestClassifier(n_estimators = 100, 
                                             random_state = 42)
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [11]:
predictions = rfc.predict(X_test)
mpe.print_threshold_metrics(y_test, predictions)


true negatives: 91504     false positives: 2336
false negatives: 2478     true positives: 50514

sensitivity (TP/P): 95.32 %
specificity (TN/N): 97.51 %
G-mean:  0.96

precision (TP/(TP+FP)): 95.58 %

MCC:  0.9288885758749532

F1-measure:  0.95452
F0.5-measure (min false positives):  0.95529
F2-measure (min false negatives)  :  0.95375

accuracy: 96.72 %



https://scikit-learn.org/stable/model_persistence.html
In order to rebuild a similar model with future versions of scikit-learn, additional metadata should be saved along the pickled model:

- The training data, e.g. a reference to an immutable snapshot
- The python source code used to generate the model
- The versions of scikit-learn and its dependencies
- The cross validation score obtained on the training data

In [6]:
from joblib import dump, load

In [7]:
dump(rfc, 'rfc_trial.joblib')

['rfc_trial.joblib']

In [12]:
rfc_reopen = load('rfc_trial.joblib') 

In [13]:
predictions_reopen = rfc_reopen.predict(X_test)
mpe.print_threshold_metrics(y_test, predictions_reopen)


true negatives: 91504     false positives: 2336
false negatives: 2478     true positives: 50514

sensitivity (TP/P): 95.32 %
specificity (TN/N): 97.51 %
G-mean:  0.96

precision (TP/(TP+FP)): 95.58 %

MCC:  0.9288885758749532

F1-measure:  0.95452
F0.5-measure (min false positives):  0.95529
F2-measure (min false negatives)  :  0.95375

accuracy: 96.72 %



In [None]:
## TRY DOING IT WITH PARALLEL POST-FIT

In [2]:
from sklearn.ensemble import RandomForestClassifier
from dask_ml.wrappers import ParallelPostFit
from joblib import dump, load
import model_prep_and_evals as mpe

In [3]:
# Each CSV is read once; features and labels are sliced from the same frame
data_dir = os.path.join(os.getcwd(),'data_sampling_workflow')

# select train features and labels
train_set = pd.read_csv(os.path.join(data_dir,'train_set.csv'))
X_train = train_set.loc[:,'r':'day_in_year']
y_train = train_set.loc[:,'iceplant']

# select test features and labels
test_set = pd.read_csv(os.path.join(data_dir,'test_set.csv'))
X_test = test_set.loc[:,'r':'day_in_year']
y_test = test_set.loc[:,'iceplant']

In [4]:
rfc = ParallelPostFit(RandomForestClassifier(n_estimators = 100, 
                                             random_state = 42))
rfc.fit(X_train, y_train)

ParallelPostFit(estimator=RandomForestClassifier(random_state=42))

In [5]:
#predictions = rfc.predict(X_test)
#mpe.print_threshold_metrics(y_test, predictions)

In [7]:
dump(rfc, 'rfc_trial.joblib')

['rfc_trial.joblib']

In [9]:
rfc_reopen = load('rfc_trial.joblib') 

In [10]:
predictions_reopen = rfc_reopen.predict(X_test)
mpe.print_threshold_metrics(y_test, predictions_reopen)


true negatives: 91504     false positives: 2336
false negatives: 2478     true positives: 50514

sensitivity (TP/P): 95.32 %
specificity (TN/N): 97.51 %
G-mean:  0.96

precision (TP/(TP+FP)): 95.58 %

MCC:  0.9288885758749532

F1-measure:  0.95452
F0.5-measure (min false positives):  0.95529
F2-measure (min false negatives)  :  0.95375

accuracy: 96.72 %



In [None]:
# SEEMS OK...