# ML Training Notebook for Cultivated

Notebook to train a machine-learning model to discriminate between cultivated and natural vegetation. The model is trained using data extracted to a text file, which is read with `numpy.loadtxt`.

The version with all pixels can be downloaded from: https://rsg.pml.ac.uk/shared_files/dac/train_input_geomedian_tmad.txt.gz

As the geomedian and MADs are calculated separately, they need to be combined into a single file using:
```python
import numpy
input_data = numpy.loadtxt("geomedian_stats_2015.txt", skiprows=1)
input_data_mads = numpy.loadtxt("tmad_stats_2015.txt", skiprows=1)

combined_data = numpy.hstack((input_data, input_data_mads[:,1:]))

column_names = 'classnum blue green red nir swir1 swir2 BUI BSI NBI EVI NDWI MSAVI sdev edev bcdev'

numpy.savetxt("training_data_2015_geomedian_mads_poly_mean",
              combined_data,             
              header=column_names, comments='', fmt='%.4f')
```

A version using the mean value for each feature is in the same repo as this notebook.

In [1]:
import os
import pickle
import numpy
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

In [2]:
import pydotplus
from IPython.display import Image  
from sklearn.externals.six import StringIO  



In [3]:
def evaluate(model, test_features, test_labels):
    """Print and return the classification accuracy of ``model`` on test data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``predict(features)`` method.
    test_features : array-like
        Feature rows passed unchanged to ``model.predict``.
    test_labels : array-like
        True class code for each row of ``test_features``.

    Returns
    -------
    float
        Percentage (0-100) of samples whose predicted class equals the true class.
    """
    predictions = numpy.asarray(model.predict(test_features))
    test_labels = numpy.asarray(test_labels)
    # Class codes are categorical, so predictions are compared for equality.
    # A numeric difference between codes (e.g. 112 - 111) divided by the code,
    # as in a percentage-error metric, is meaningless and always looks ~99 % accurate.
    correct = predictions == test_labels
    accuracy = float(100 * numpy.mean(correct))
    print('Model Performance')
    print('Misclassified: {} of {} samples.'.format(int(correct.size - correct.sum()), correct.size))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [4]:
# Lookup from numeric class code to its descriptive class name
class_names = {
    111: 'Cultivated Terrestrial Vegetated',
    112: 'Natural Terrestrial Vegetated',
    123: 'Cultivated Aquatic Vegetated',
    124: 'Natural Aquatic Vegetated',
    215: 'Artificial Surface',
    216: 'Natural Surface',
    227: 'Artificial Water',
    228: 'Natural Water',
}

In [5]:
# Set up working dir
# Defaults to the original hard-coded location; set the CULTIVATED_WORKING_DIR
# environment variable to run the notebook elsewhere without editing this cell.
working_dir = os.environ.get('CULTIVATED_WORKING_DIR',
                             '/home/jovyan/LCCS/cultivated_classification_4')

In [6]:
# Read in text file
# As it takes a while first see if there is a cached binary (.npy) version from a previous run
pickled_model_input = os.path.join(working_dir, 'training_data_2015_geomedian_mads_poly_mean_numpy.npy')

if os.path.isfile(pickled_model_input):
    print('Loading pickled model input file')
    model_input = numpy.load(pickled_model_input)
else:
    print('Reading model input from text file...')
    model_input = numpy.loadtxt(os.path.join(working_dir, 'training_datatrim.txt'), skiprows=1)
    # Cache the parsed array so later runs can skip the slow text parse
    numpy.save(pickled_model_input, model_input)

# Headers are
# classnum blue green red nir swir1 swir2 BUI BSI NBI EVI NDWI MSAVI sdev edev bcdev
# NOTE(review): the header comment above lists 16 columns but column_names below has 10;
# presumably 'training_datatrim.txt' is a trimmed version of the full file - confirm.
column_names = 'classnum blue green red nir swir1 swir2 sdev edev bcdev'.split()
# column_names = 'classnum BS_PC_10 PV_PC_10 NPV_PC_10 BS_PC_50 PV_PC_50 NPV_PC_50 BS_PC_90 PV_PC_90 NPV_PC_90 blue green red nir swir1 swir2 sdev edev bcdev'.split()

# Columns are selected by position further down, so a width mismatch between the loaded
# array and column_names would silently pick the wrong features. Fail loudly instead.
if model_input.ndim != 2 or model_input.shape[1] != len(column_names):
    raise ValueError(
        "Model input has shape {} but {} column names are defined ({}); "
        "check the input file / cached .npy against column_names".format(
            model_input.shape, len(column_names), ' '.join(column_names)))

# Map each variable name to its column position in model_input
column_names_indices = {var_name: col_num for col_num, var_name in enumerate(column_names)}

print("Input shape:",model_input.shape)

Reading model input from text file...


OSError: /home/jovyan/LCCS/cultivated_classification_4/training_datatrim.txt not found.

In [None]:
# Display the loaded array as the cell output
model_input

In [None]:
# Drop every row that contains at least one NaN value
rows_with_nan = numpy.isnan(model_input).any(axis=1)
model_input = model_input[numpy.logical_not(rows_with_nan)]
print("Cleaned input shape:", model_input.shape)

In [None]:
# Hold back 20 % of the rows for testing and train on the remaining 80 %.
# Stratifying on the class column (column 0) gives the training and testing
# data a similar distribution across classes.
model_train, model_test = model_selection.train_test_split(
    model_input,
    train_size=0.8,
    random_state=0,
    stratify=model_input[:, 0],
)

print("Train shape:", model_train.shape)
print("Test shape:", model_test.shape)

## Baseline model

In [None]:
# Set up model
# An earlier variant used:
# model = RandomForestClassifier(n_estimators=200, n_jobs=-1, verbose=0, oob_score=True)

# Parameters from the last grid optimisation run.
# Two arguments differ from the estimator repr they were copied from, so that the
# cell runs on both old and current scikit-learn without changing the model:
#  * max_features='sqrt' - what 'auto' selected for classifiers; 'auto' is no longer accepted
#  * min_impurity_split is not passed - None was its default and the argument has been removed
model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=20, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

# Variables used as model inputs
model_variables = ['blue', 'green','red', 'nir', 'swir1', 'swir2', 'sdev', 'edev']

# Column position of each selected variable, in the same order as model_variables
model_col_indices = [column_names_indices[model_var] for model_var in model_variables]

In [None]:
# Fit the model: selected variable columns as inputs, class column (column 0) as target
model.fit(
    model_train[:, model_col_indices],
    model_train[:, 0],
)

In [None]:
# Score the model on the data held back for testing
score = model.score(
    model_test[:, model_col_indices],
    model_test[:, 0],
)
print("Accuracy: {:.03}".format(score))

In [None]:
# Print the importance the fitted model reports for each input variable
importances = model.feature_importances_
for name, weight in zip(model_variables, importances):
    print("{}: {:.04}".format(name, weight))

## Explore parameter tuning with Randomized Search CV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Estimator handed to the parameter searches below; every setting other than
# n_jobs and verbose is left at the library default.
model = RandomForestClassifier(verbose=0, n_jobs=-1)

# Header line, a blank line, then the full parameter dictionary
print('Parameters currently in use:\n', model.get_params(), sep='\n')

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in numpy.linspace(start = 200, stop = 300, num = 10)]
# Number of features to consider at every split.
# 'auto' was an alias of 'sqrt' for classifiers, so searching ['auto', 'sqrt'] tried the
# same setting twice, and current scikit-learn rejects 'auto'. Search two distinct options.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in numpy.linspace(10, 100, num = 11)]
# None lets the trees grow without a depth limit
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Variables used as model inputs for the parameter searches
model_variables = ['red', 'nir', 'blue', 'green', 'swir1', 'swir2', 'sdev', 'edev']

# Column position of each selected variable, in the same order as model_variables
model_col_indices = [column_names_indices[name] for name in model_variables]

In [None]:
# Randomised search: 80 sampled parameter combinations, each scored with 3-fold cross-validation
model_rcv = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    n_iter=80,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
classifier = model_rcv.fit(model_train[:, model_col_indices], model_train[:, 0])

In [None]:
# classifier.best_params_ (the values below appear to be recorded from an earlier run):
# {'n_estimators': 266,
#  'min_samples_split': 2,
#  'min_samples_leaf': 1,
#  'max_features': 'auto',
#  'max_depth': 37,
#  'bootstrap': False}

In [None]:
# Baseline for comparison: a small forest of 10 trees with otherwise default settings
base_model = RandomForestClassifier(n_estimators = 10, n_jobs=-1, verbose=0)
base_model.fit( model_train[:,model_col_indices], model_train[:,0])
# base_accuracy is needed by the "Improvement" cell further down; it was never
# defined, so that cell failed with a NameError.
base_accuracy = evaluate(base_model, model_test[:,model_col_indices], model_test[:,0])

In [None]:
# Evaluate the best estimator found by the randomised search on the test data
best_random = model_rcv.best_estimator_
random_accuracy = evaluate(
    best_random,
    model_test[:, model_col_indices],
    model_test[:, 0],
)

In [None]:
# Relative change of random_accuracy with respect to base_accuracy, as a percentage
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

In [None]:
# Variable importance
# `classifier` is the fitted RandomizedSearchCV object, which has no feature_importances_
# of its own; the importances live on the forest refitted with the best parameters.
for var_name, var_importance in zip(model_variables, classifier.best_estimator_.feature_importances_):
    print("{}: {:.04}".format(var_name, var_importance))

## Detailed parameter tuning with Grid Search CV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Parameter grid for the exhaustive grid search
param_grid = {
#     'bootstrap': [True],
    'max_depth': [25, 30, 35, 40, 45],
    # 'sqrt' is what 'auto' selected for classifiers; 'auto' is rejected by current scikit-learn
    'max_features': ['sqrt'],
    'min_samples_leaf': [1],
    'min_samples_split': [2, 3],
    'n_estimators': [300]
}

In [None]:
# Exhaustive search over param_grid, each combination scored with 3-fold cross-validation
model_grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
)
model_grid.fit(model_train[:, model_col_indices], model_train[:, 0])

In [None]:
# Show the best estimator found by the grid search and evaluate it on the test data
best_grid = model_grid.best_estimator_
print(best_grid)
grid_accuracy = evaluate(
    best_grid,
    model_test[:, model_col_indices],
    model_test[:, 0],
)

## Model testing and assessment

In [None]:
# Test the grid-search model using the data held back for testing.
# (`classifier_grid` was never defined; the fitted grid search is `model_grid`,
# whose score() uses the estimator refitted with the best parameters.)
score = model_grid.score(model_test[:,model_col_indices], model_test[:,0])
print("Accuracy: {:.03}".format(score))

In [None]:
# Variable importance of the model assessed in this section.
# Search objects do not expose feature_importances_ themselves, so read it from
# the grid search's best estimator (the same model that was scored above).
for var_name, var_importance in zip(model_variables, model_grid.best_estimator_.feature_importances_):
    print("{}: {:.04}".format(var_name, var_importance))

## Save out model

In [None]:
# Bundle the classifier with the variable list and class codes
ml_model_dict = {}

# Input variables, in the column order the classifier was trained with
ml_model_dict['variables'] = model_variables
ml_model_dict['classes'] = {'Not natural terrestrial vegetation' : 111,
                            'Natural terrestrial vegetation ' : 112}
# Store the fitted forest selected by the grid search. The module-level `model` is only
# the unfitted template passed to the searches (they fit clones of it), so pickling
# `model` would save a classifier that has never been trained.
ml_model_dict['classifier'] = model_grid.best_estimator_

# Pickle model
with open(os.path.join(working_dir, 'model_pickle.pickle'), 'wb') as f:
    pickle.dump(ml_model_dict, f)

In [None]:
# Display the saved dictionary as the cell output
ml_model_dict