# Random Forest classification of land cover (LC) from semantically segmented pictures

We use a sample of 1120 photos covering eight major LC types over the European Union.

We applied semantic segmentation on these photos using a neural network trained with the ADE20k dataset. 

For each photo, we extracted the original LC identified, the segmented classes, and the pixel count for each class.

Using the latter as input features, we trained a Random Forest to classify the LC.

The results show a mean F1-score of 89\%, increasing to 93\% when the Wetland class is not considered. 


In [None]:
#imports
import pandas as pd
import matplotlib.pyplot as plt 
from PIL import Image 

import os, glob
import sys
sys.path.append("./git")
import RF_utils as utils

from sklearn.model_selection import StratifiedShuffleSplit, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, make_scorer, precision_recall_fscore_support
from sklearn.utils.multiclass import unique_labels
from sklearn import tree

import numpy as np

import pickle

import ast


#### Set the paths.


gt_path is the path to a CSV file containing the pixel counts per class for all the images used in the dataset.

In [None]:
# Root directory of the project data; the input, output and plot paths set in
# the next cell are all built from it.
project_path = '/eos/jeodpp/data/projects/REFOCUS/data'

In [None]:
# Folder that is globbed for the per-backbone 'test*.csv' result files.
Path_to_resnets = project_path + '/LandCOverCLass/outputs/'
# CSV describing the dataset (read into `data` / `gt`).
gt_path = project_path + '/LandCOverCLass/inputs/Final-dataset-140xclass.csv'
# Folder for everything the notebook writes (tuning summary, pickles).
# Currently the same location as Path_to_resnets.
out_path = project_path + '/LandCOverCLass/outputs/'
# Folder with the photos themselves.
# NOTE(review): project_path already ends in '/data', so this resolves to
# '.../data/data/LandCOverCLass/...' unlike the sibling paths - confirm intended.
path_images = project_path + '/data/LandCOverCLass/inputs/dataset/'
# Folder where the figures are saved.
path_plots = project_path + '/LandCOverCLass/Plots/'


In [None]:
# Name of the column that holds the class label in the CSV
class_gt = 'lc1'

In [None]:
# Read the dataset CSV located at gt_path into a DataFrame
data = pd.read_csv(gt_path)


In [None]:
# TODO: move this setting to a more suitable place.

# Factor multiplied by a fixed pixel total further down to obtain the cut-off
# below which pixel counts are zeroed before the relation plot.
THRESH = 0

## Spatial histogram of the distribution of the dataset by LC class

In [None]:
# Display names for the single-letter class codes found in the label column.
names = {
    'A': "Artificial Land",
    'B': "Cropland",
    'C': "Woodland",
    'D': "Shrubland",
    'E': "Grassland",
    'F': "Bare soil",
    'G': "Water areas",
    'H': "Wetland",
}

# Number of rows per class code. The label column is taken from class_gt so
# this cell follows the configured column name instead of a hard-coded copy.
df = data[class_gt].value_counts()

# Replace the letters with the labels
df.index = df.index.map(names)

# Plot the bar chart. `df` is a Series (index = class name, value = count), so
# there are no 'land_type' / 'values' columns to select with x= / y=.
ax = df.plot.bar()
ax.set_xlabel('Land Type')
ax.set_ylabel('Values')


### Check which ResNet backbone (trained on ADE20K) gives the best segmentation input for the Random Forest classifier

Inference was run with several backbones of DeepLabV3+ (see https://github.com/dmlc/gluon-cv).

We therefore tune the Random Forest hyperparameters and also treat the backbone as a parameter to search.

In [None]:

# One results CSV per segmentation backbone; sorted so the run order (and the
# row order of the summary) is reproducible.
files = sorted(glob.glob('{}test*.csv'.format(Path_to_resnets)))

print(files)

# Tuning summary: one row per backbone file. Rows are collected in a list and
# the frame is rebuilt from it, because DataFrame.append was removed in pandas 2.0.
hypertune_columns = ['model', 'best-params', 'acc-train', 'acc-test', 'Precision', 'Recall', 'F_score']
hypertune_rows = []
hypertune = pd.DataFrame(columns=hypertune_columns)
gt = pd.read_csv(gt_path)
THRESH = 0

train, test = utils.split_dataset(gt, class_gt, 'Dataset_partition', out_path)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Search spaces are the same for every backbone, so they are built once here.
# 'auto' is left out of max_features: for classifiers it was an alias of 'sqrt'
# (it only duplicated work in the grid) and it was removed in scikit-learn 1.3.
rf_param = {
    'n_estimators': [50, 100, 150, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [20, 50, 75, 100, 125, 150],
    'min_samples_leaf': [1, 2, 4, 8, 10, 12, 14, 16, 18, 20],
    'min_samples_split': [2, 5, 8, 10, 13, 15, 18, 20],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
}

# Wider candidate lists; not used by the "grid" search below.
# NOTE(review): presumably meant for a randomised search mode of
# utils.search_params - confirm before relying on it.
max_depth = [int(x) for x in np.linspace(10, 110, num=50)]
max_depth.append(None)
rf_dist = {
    'n_estimators': [int(x) for x in np.linspace(start=50, stop=2000, num=50)],
    'max_features': ['sqrt', 'log2'],
    'max_depth': max_depth,
    # starts at 2: an integer min_samples_split of 1 is rejected by scikit-learn
    'min_samples_split': [int(x) for x in np.linspace(start=2, stop=50, num=49)],
    'min_samples_leaf': [int(x) for x in np.linspace(start=1, stop=50, num=50)],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
}

scoring = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')

for f in files:
    print('----------------{}--------------'.format(f))
    results = pd.read_csv(f)
    model = os.path.basename(f)
    results = utils.erase_empty_cols(results)
    X_train, X_test, y_train, y_test = utils.prepare_set(train, test, results)

    # Random Forest: search the hyper-parameters on the training split.
    rf_grid = utils.search_params(RandomForestClassifier(), rf_param, "grid", X_train, y_train, cv)

    clf = RandomForestClassifier(**rf_grid.best_params_).fit(X_train, y_train)

    # NOTE(review): cross_validate clones the estimator and refits it on folds
    # of (X_test, y_test), so the fit on the training split above does not
    # influence these scores. Confirm this is the intended "test" evaluation.
    scores = cross_validate(clf, X_test, y_test, cv=5, scoring=scoring)

    for metric in scoring:
        fold_scores = scores['test_' + metric]
        print("%0.2f %s with a standard deviation of %0.2f" % (fold_scores.mean(), metric, fold_scores.std()))

    hypertune_rows.append({
        'model': model,
        'best-params': rf_grid.best_params_,
        'acc-train': rf_grid.best_score_,
        'acc-test': scores['test_accuracy'].mean(),
        'Precision': scores['test_precision_macro'].mean(),
        'Recall': scores['test_recall_macro'].mean(),
        'F_score': scores['test_f1_macro'].mean(),
    })
    hypertune = pd.DataFrame(hypertune_rows, columns=hypertune_columns)

    # Written after every backbone so a partial run still leaves a summary.
    hypertune.to_csv('{}/hypertune.csv'.format(out_path))




Check which is the best model and the parameters selected

In [None]:
# Reload the tuning summary from disk.
hypertune = pd.read_csv('{}/hypertune.csv'.format(out_path))

# Rank the rows from best to worst by 'acc-train', breaking ties with
# 'F_score', and keep only the top row (still as a one-row DataFrame).
ranked = hypertune.sort_values(by=['acc-train', 'F_score'], ascending=False)
model_sel = ranked.iloc[:1]

print(model_sel)

In [None]:
# File name of the results CSV belonging to the selected row.
best_model_file = model_sel['model'].item()
print(best_model_file)

# os.path.join copes with Path_to_resnets already ending in '/', so the path
# has no doubled separator and matches the one built in the later cells.
final_path_results = os.path.join(Path_to_resnets, best_model_file)
results = pd.read_csv(final_path_results)
# Discard every column whose name starts with 'Unnamed'.
results = results.loc[:, ~results.columns.str.contains('^Unnamed')]


## Final metrics of the best model with/without Wetlands

Once we have the parameters from the tuning process, we compute the final accuracy metrics on the test set, both taking the Wetland class into account and excluding it.

In [None]:
# Show the selected results file and the hyper-parameters stored with it.
for column in ('model', 'best-params'):
    print(model_sel[column].item())

# Full path of the selected results file (Path_to_resnets ends in '/').
selected_file = model_sel['model'].item()
final_path_results = '{}{}'.format(Path_to_resnets, selected_file)


#### Performance metrics for all classes

In [None]:
plt.rcParams["figure.figsize"] = (15, 5)

# Reload the stored train/test partition (first two items of the pickled object).
# NOTE: pickle.load can execute arbitrary code; only open partition files that
# this project wrote itself.
with open('{}Dataset_partition.pickle'.format(out_path), "rb") as input_file:
    e = pickle.load(input_file)
train = e[0]
test = e[1]

# Results CSV of the selected backbone, read once and cleaned.
f = '{}{}'.format(Path_to_resnets, model_sel['model'].item())
print('----------------{}--------------'.format(f))
model = os.path.basename(f)
results = utils.erase_empty_cols(pd.read_csv(f))
X_train, X_test, y_train, y_test = utils.prepare_set(train, test, results)

# The parameters were stored in the CSV as the text of a dict.
params = ast.literal_eval(model_sel['best-params'].item())
# 'auto' was removed in scikit-learn 1.3; for classifiers it meant 'sqrt',
# so the substitution keeps the model the same on every version.
if params.get('max_features') == 'auto':
    params['max_features'] = 'sqrt'
print(params)
clf = RandomForestClassifier(**params)

print('----------------ALL CLASSES FINAL PERFoRMANCE--------------')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
labels = unique_labels(y_test, y_pred)
print(classification_report(y_test, y_pred, target_names=labels))
print(precision_recall_fscore_support(y_test, y_pred, average='macro'))

cfmatrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cfmatrix,
                              display_labels=labels)
disp.plot()
# Save before showing: some backends close the figure once it has been shown.
disp.figure_.savefig('{}CM-Allclasses.jpg'.format(path_plots))
plt.show()


#### Performance metrics without the Wetland class

In [None]:
# Filter class Wetlands: same evaluation as for all classes, but the sets are
# prepared with wet=True.
# NOTE(review): wet=True presumably makes utils.prepare_set drop the Wetland
# samples - confirm in RF_utils.

plt.rcParams["figure.figsize"] = (15, 5)

# Reload the stored train/test partition (first two items of the pickled object).
# NOTE: pickle.load can execute arbitrary code; only open partition files that
# this project wrote itself.
with open('{}Dataset_partition.pickle'.format(out_path), "rb") as input_file:
    e = pickle.load(input_file)
train = e[0]
test = e[1]

# Results CSV of the selected backbone, read once and cleaned.
f = '{}{}'.format(Path_to_resnets, model_sel['model'].item())
print('----------------{}--------------'.format(f))
model = os.path.basename(f)
results = utils.erase_empty_cols(pd.read_csv(f))
X_train, X_test, y_train, y_test = utils.prepare_set(train, test, results, wet=True)

# The parameters were stored in the CSV as the text of a dict.
params = ast.literal_eval(model_sel['best-params'].item())
# 'auto' was removed in scikit-learn 1.3; for classifiers it meant 'sqrt',
# so the substitution keeps the model the same on every version.
if params.get('max_features') == 'auto':
    params['max_features'] = 'sqrt'

clf = RandomForestClassifier(**params)

print('---------------- CLASSES WITHOUT WETLAND FINAL PERFoRMANCE--------------')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)
labels = unique_labels(y_test, y_pred)

# Keep the predictions, class probabilities, labels and ground truth for later plots.
with open('{}Predictions-noWetlands.pickle'.format(out_path), 'wb') as handle:
    pickle.dump([y_pred, y_proba, labels, y_test], handle, protocol=pickle.HIGHEST_PROTOCOL)

print(classification_report(y_test, y_pred, target_names=labels))
print(precision_recall_fscore_support(y_test, y_pred, average='macro'))

cfmatrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cfmatrix,
                              display_labels=labels)
disp.plot()
# Save before showing: some backends close the figure once it has been shown.
disp.figure_.savefig('{}CM-NoWetlands.jpg'.format(path_plots))
plt.show()


### Some Plots of the results

Show one of the trees in the Forest

In [None]:
# Draw one tree of the most recently fitted forest and save it as a PNG.
tree_index = 10  # which estimator of the forest to draw
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 12), dpi=800)
# feature_names / class_names are passed as plain lists: some scikit-learn
# releases reject a pandas Index or a NumPy array for these arguments.
# NOTE(review): results.columns is used for the feature names; confirm it has
# the same columns, in the same order, as the X_train the forest was fitted on.
tree.plot_tree(clf.estimators_[tree_index],
               feature_names=list(results.columns),
               class_names=[str(label) for label in labels],
               filled=True,
               ax=axes);
fig.savefig('{}rf_individualtree.png'.format(path_plots))

Sankey diagram showing the relation between the LUCAS LC Classes and the ADE20K classes

In [None]:
results = pd.read_csv(final_path_results)

# Work on every column except the last two. The explicit copy guarantees that
# the in-place thresholding below cannot write through to `results`, which is
# used again by later cells.
aux = results.iloc[:, :-2].copy()
# Cut-off = THRESH times a fixed pixel total.
# NOTE(review): 1920000 is presumably the pixel count of one photo
# (e.g. 1600 x 1200) - confirm.
T = 1920000 * THRESH
aux[aux < T] = 0
# Binarise: zeros stay 0, every other value becomes 1.
aux = aux.where(aux == 0, 1)
# Re-attach the last two columns unchanged.
df_unary = pd.concat([aux, results.iloc[:, -2:]], axis=1)

#plot
utils.plot_relation_feat_LC(df_unary)


Plot of the semantic segmentation with DeepLabv3+ trained on the ADE20k dataset, and a pie diagram showing the share of each class detected

In [None]:
# pie_class_plot is not defined anywhere in this notebook, so the bare call
# raises NameError; the project helpers are only imported as `utils`.
# NOTE(review): assumes the function lives in RF_utils - confirm.
utils.pie_class_plot(path_plots, results, path_images)

Plot showing the LUCAS point photo distribution by pixel area of the ADE20k objects segmented and the probability output by the Random Forest for each LC class.

In [None]:
# The bare names plot_falsep, proba and pred are never defined in this
# notebook. The helpers are imported as `utils`, and the prediction cells
# store their outputs as y_proba / y_pred.
# NOTE(review): assumes plot_falsep lives in RF_utils and expects the class
# probabilities and predictions of the latest fit (the no-Wetland run) - confirm.
utils.plot_falsep(path_images, f, y_proba, test, y_pred)