<a href="https://githubtocolab.com/giswqs/geemap/blob/master/examples/notebooks/46_local_rf_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/></a>

Uncomment the following line to install [geemap](https://geemap.org) if needed.

In [1]:
# !pip install geemap scikit-learn

# How to use locally trained machine learning models with GEE

This notebook illustrates how to train a random forest (or any other ensemble tree estimator) locally using scikit-learn, convert the estimator into a string representation that Earth Engine can interpret, and how to apply the machine learning model with EE. 

In [1]:
import ee
import geemap
import pandas as pd

from geemap import ml
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score, f1_score

In [2]:
geemap.ee_initialize()
ee.Authenticate()
ee.Initialize()

Enter verification code: 4/1AX4XfWhC688Dh9LOS6DrWAnjbZoDkGOXVGb39MYIr4krEwJWmGLjibjlVg0

Successfully saved authorization token.


## Train a model locally using scikit-learn

In this demo, we are going to use the training data from [here](https://github.com/giswqs/geemap/blob/master/examples/data/rf_example.csv). 

In [3]:
# read the feature table to train our RandomForest model
# data taken from ee.FeatureCollection('GOOGLE/EE/DEMOS/demo_landcover_labels')

url = "https://raw.githubusercontent.com/giswqs/geemap/master/examples/data/rf_example.csv"
df = pd.read_csv(url)

In [4]:
df

Unnamed: 0,B2,B3,B4,B5,B6,B7,landcover
0,0.139846,0.114738,0.109982,0.119542,0.125795,0.105720,0
1,0.130316,0.109207,0.107499,0.140210,0.132006,0.108497,0
2,0.146690,0.135766,0.146550,0.225686,0.218105,0.167111,0
3,0.119413,0.108924,0.105196,0.144868,0.159775,0.122056,0
4,0.155492,0.139932,0.137486,0.151377,0.153771,0.133134,0
...,...,...,...,...,...,...,...
93,0.117331,0.092176,0.062548,0.020362,0.005813,0.004047,2
94,0.118353,0.093785,0.060253,0.020083,0.007317,0.004719,2
95,0.123362,0.095831,0.069663,0.027320,0.011386,0.008357,2
96,0.122907,0.100083,0.079527,0.024564,0.008570,0.006321,2


In [5]:
# specify the names of the features (i.e. band names) and label
# feature names used to extract out features and define what bands

feature_names = ['B2', 'B3', 'B4', 'B5', 'B6', 'B7']
label = "landcover"

In [6]:
# get the features and labels into separate variables
X = df[feature_names]
y = df[label]

In [7]:
# create a classifier and fit
n_trees = 100 
rf = ensemble.RandomForestClassifier(n_trees).fit(X,y)

In [8]:
#We perform hyper parameter tunning.
param_grid = {'n_estimators': [300, 500],
                     'max_depth':[50, 70],
                     'min_samples_split': [5, 10],
                     'min_samples_leaf': [2, 10]}
#inner loop for tuning the hyperparameters
cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
#outer loop for testing on holdout set
cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = ensemble.RandomForestClassifier()
scorer = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1_macro': make_scorer(f1_score, average = 'macro'),
           'f1_weighted': make_scorer(f1_score, average = 'weighted')}
clf = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, cv=cv_inner, random_state=42, n_iter=5)
nested_score = cross_validate(clf, X=X, y=y, cv=cv_outer, scoring=scorer, return_estimator=True)



In [9]:
model = clf.fit(X,y)

In [10]:
#We print the optimal parameters for our model.
from pprint import pprint
pprint(model.best_estimator_.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 50,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 300,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [11]:
#We evaluate the performance of our model with the optimal parameters.
nested_score

{'fit_time': array([17.90048337, 17.88489127, 17.65965772, 18.19124794, 18.12558866]),
 'score_time': array([0.06671715, 0.07812881, 0.08868647, 0.06399179, 0.07067323]),
 'estimator': [RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
                     estimator=RandomForestClassifier(), n_iter=5,
                     param_distributions={'max_depth': [50, 70],
                                          'min_samples_leaf': [2, 10],
                                          'min_samples_split': [5, 10],
                                          'n_estimators': [300, 500]},
                     random_state=42),
  RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
                     estimator=RandomForestClassifier(), n_iter=5,
                     param_distributions={'max_depth': [50, 70],
                                          'min_samples_leaf': [2, 10],
                                          'min_samples_split

In [12]:
#We now fit our model with the optimum parameter to the data.
rf= ensemble.RandomForestClassifier(bootstrap= True,
ccp_alpha= 0.0,
class_weight= None,
criterion= 'gini',
max_depth= 50,
max_features= 'auto',
max_leaf_nodes= None,
max_samples= None,
 min_impurity_decrease= 0.0,
 min_samples_leaf= 2,
 min_samples_split= 5,
 min_weight_fraction_leaf= 0.0,
 n_estimators= 300,
 n_jobs= None,
 oob_score= False,
 random_state= None,
 verbose= 0,
 warm_start= False).fit(X,y)

## Convert a sklearn classifier object to a list of strings

In [13]:
# convert the estimator into a list of strings
# this function also works with the ensemble.ExtraTrees estimator
trees =  ml.rf_to_strings(rf,feature_names)

In [14]:
# print the first tree to see the result
print(trees[0])

1) root 57 9999 9999 (1.6179707257241982)
  2) B7 <= 0.019053 22 0.0000 2 *
  3) B7 > 0.019053 57 0.6589 2
    6) B7 <= 0.125081 35 0.4911 1
      12) B4 <= 0.102422 16 0.0000 1 *
      13) B4 > 0.102422 3 0.0000 0 *
    7) B7 > 0.125081 35 0.4911 1
      14) B2 <= 0.134185 3 0.0000 1 *
      15) B2 > 0.134185 13 0.0000 0 *



In [15]:
print(trees[1])

1) root 60 9999 9999 (1.7463389615768867)
  2) B5 <= 0.063316 23 0.0000 2 *
  3) B5 > 0.063316 60 0.6641 1
    6) B4 <= 0.078633 16 0.0000 1 *
    7) B4 > 0.078633 37 0.4942 1
      14) B6 <= 0.198045 11 0.0000 0 *
      15) B6 > 0.198045 21 0.2130 0
        30) B3 <= 0.120434 3 0.0000 1 *
        31) B3 > 0.120434 7 0.0000 0 *



In [16]:
# number of trees we converted should equal the number of trees we defined for the model
len(trees) == n_trees

False

## Convert sklearn classifier to GEE classifier

At this point you can take the list of strings and save them locally to avoid training again. However, we want to use the model with EE so we need to create an ee.Classifier and persist the data on ee for best results.

In [17]:
# create a ee classifier to use with ee objects from the trees
ee_classifier = ml.strings_to_classifier(trees)

In [18]:
# ee_classifier.getInfo()

## Classify image using GEE classifier

In [19]:
# Make a cloud-free Landsat 8 TOA composite (from raw imagery).
l8 = ee.ImageCollection('LANDSAT/LC08/C01/T1');

image = ee.Algorithms.Landsat.simpleComposite(
  collection= l8.filterDate('2018-01-01', '2018-12-31'),
  asFloat= True
)

In [20]:
# classify the image using the classifier we created from the local training
# note: here we select the feature_names from the image that way the classifier knows which bands to use
classified = image.select(feature_names).classify(ee_classifier)

In [21]:
# display results
legend_keys = ['vegetation', 'water', 'bare soil']
legend_colors = ['#008000', '#0000FF', '#FF0000', ]
Map = geemap.Map(center=(37.75,-122.25), zoom=11)

Map.addLayer(image,{"bands": ['B7', 'B5', 'B3'], "min":0.05, "max": 0.55, "gamma":1.5}, 'image')
Map.addLayer(classified, {"min": 0, "max": 2, "palette": ['red', 'green', 'blue']},'classification')
Map.add_legend(legend_keys=legend_keys, legend_colors=legend_colors, position='bottomright')

Map

Map(center=[37.75, -122.25], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=HBox(child…

In [22]:
import folium
from folium import plugins
from IPython.display import Image
import geopandas as gpd
import json
print(folium.__version__)
from ipygee import*
import math
import pandas as pd
from tslearn.clustering import TimeSeriesKMeans
from tslearn.utils import to_time_series_dataset

0.12.1


In [23]:
months = ee.List.sequence(1,12)
years = ee.List.sequence(2018, 2019)
POI = ee.Geometry.Point([-122.2769, 37.7435])

In [24]:
MD_NDVI = ee.ImageCollection('LANDSAT/LC08/C01/T1_8DAY_NDVI').filterDate('2018-1-1','2019-12-31').filterBounds(POI).select('NDVI')
modis_ndvi = MD_NDVI.median().clip(POI)
mean_ndvi = MD_NDVI.mean().clip(POI)

In [25]:
def monthly(collection):
 img_coll = ee.ImageCollection([])
 for y in years.getInfo():
  for m in months.getInfo():
   filtered = collection.filter(ee.Filter.calendarRange(y,y,'year')).filter(ee.Filter.calendarRange(m, m, 'month'))
   filtered = filtered.median()
   img_coll = img_coll.merge(filtered.set('year', y).set('month', m).set('system:time_start',ee.Date.fromYMD(y, m, 1).getInfo()['value']))
 return img_coll
Monthly_MD = monthly(MD_NDVI)

In [26]:
Point_1 = ee.Geometry.Point([-122.2776, 37.8181])
MD_ndvi = chart.Image.series(**{'imageCollection': Monthly_MD,
'region': Point_1,
'reducer': ee.Reducer.mean(),
'scale': 250,
'xProperty': 'system:time_start'})
MD_ndvi.renderWidget(width='80%')

HTML(value='<embed src=data:image/svg+xml;charset=utf-8;base64,PD94bWwgdmVyc2lvbj0nMS4wJyBlbmNvZGluZz0ndXRmLTg…

In [27]:
Point_1 = ee.Geometry.Point([-122.5118, 37.8438])
MD_ndvi = chart.Image.series(**{'imageCollection': Monthly_MD,
'region': Point_1,
'reducer': ee.Reducer.mean(),
'scale': 500,
'xProperty': 'system:time_start'})
MD_ndvi.renderWidget(width='200%')

HTML(value='<embed src=data:image/svg+xml;charset=utf-8;base64,PD94bWwgdmVyc2lvbj0nMS4wJyBlbmNvZGluZz0ndXRmLTg…

In [28]:
Point_1 = ee.Geometry.Point([-122.2769, 37.7435])
MD_ndvi = chart.Image.series(**{'imageCollection': Monthly_MD,
'region': Point_1,
'reducer': ee.Reducer.mean(),
'scale': 250,
'xProperty': 'system:time_start'})
MD_ndvi.renderWidget(width='80%')

HTML(value='<embed src=data:image/svg+xml;charset=utf-8;base64,PD94bWwgdmVyc2lvbj0nMS4wJyBlbmNvZGluZz0ndXRmLTg…

In [29]:
chart.Image.series?

In [30]:
Point_2 = ee.Geometry.Point([-122.5118, 37.8438])
Point_1 = ee.Geometry.Point([-122.5118, 37.8438])
MD_ndvi = chart.Image.series(**{'imageCollection': Monthly_MD,
'region': Point_2, 
'reducer': ee.Reducer.mean(),
'scale': 250,
'xProperty': 'system:time_start'})
MD_ndvi.renderWidget(width='80%')

HTML(value='<embed src=data:image/svg+xml;charset=utf-8;base64,PD94bWwgdmVyc2lvbj0nMS4wJyBlbmNvZGluZz0ndXRmLTg…

## Save trees to the cloud

Now we have the strings in a format that ee can use, we want to save it for later use. There is a function to export a list of tree strings to a feature collection. The feature collection will have a pro

In [131]:
user_id = geemap.ee_user_id()
user_id

'users/mouliomngouh'

In [132]:
# specify asset id where to save trees
# be sure to change <user_name> to your ee user name
asset_id = user_id +  "/random_forest_strings_test"
asset_id

'users/mouliomngouh/random_forest_strings_test'

In [133]:
# kick off an export process so it will be saved to the ee asset
ml.export_trees_to_fc(trees,asset_id)

# this will kick off an export task, so wait a few minutes before moving on

In [100]:
# read the exported tree feature collection
rf_fc = ee.FeatureCollection(asset_id)

# convert it to a classifier, very similar to the `ml.trees_to_classifier` function
another_classifier = ml.fc_to_classifier(rf_fc)

# classify the image again but with the classifier from the persisted trees
classified = image.select(feature_names).classify(another_classifier)

## Save trees locally

In [101]:
import os
out_csv = os.path.expanduser("~/Downloads/trees.csv")

In [102]:
ml.trees_to_csv(trees, out_csv)

In [103]:
another_classifier = ml.csv_to_classifier(out_csv)

In [104]:
classified = image.select(feature_names).classify(another_classifier)

In [134]:
# display results
# we should get the exact same results as before
Map = geemap.Map(center=(37.75,-122.25), zoom=11)

Map.addLayer(image,{"bands": ['B7', 'B5', 'B3'], "min":0.05, "max": 0.55, "gamma":1.5}, 'image')
Map.addLayer(classified, {"min": 0, "max": 2, "palette": ['red', 'green', 'blue']},'classification')

Map

Map(center=[37.75, -122.25], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=HBox(child…

In [135]:
param_grid = {'n_estimators': [300, 500],
                     'max_depth':[50, 70],
                     'min_samples_split': [5, 10],
                     'min_samples_leaf': [2, 10]}
# inner loop for tuning the hyperparameters
cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# outer loop for testing on holdout set
cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
clf = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, cv=cv_inner, random_state=42, n_iter=5)
nested_score = cross_validate(clf, X=X, y=y, cv=cv_outer, scoring=('recall', 'precision', 'roc_auc'), return_estimator=True)

Traceback (most recent call last):
  File "C:\Users\Jean Paul\.conda\envs\env_ishango\lib\site-packages\sklearn\model_selection\_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Jean Paul\.conda\envs\env_ishango\lib\site-packages\sklearn\metrics\_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\Jean Paul\.conda\envs\env_ishango\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\Jean Paul\.conda\envs\env_ishango\lib\site-packages\sklearn\metrics\_classification.py", line 1872, in recall_score
    _, r, _, _ = precision_recall_fscore_support(
  File "C:\Users\Jean Paul\.conda\envs\env_ishango\lib\site-packages\sklearn\metrics\_classification.py", line 1534, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_la

In [136]:
scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average = 'weighted')

NameError: name 'sklearn' is not defined

In [147]:
scorer = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1_macro': make_scorer(f1_score, average = 'macro',
           'f1_weighted': make_scorer(f1_score, average = 'weighted')}

SyntaxError: positional argument follows keyword argument (Temp/ipykernel_2304/520681512.py, line 5)

In [None]:
#nested_score