In [1]:
import pandas as pd
import numpy as np
import optuna
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [2]:
rf_errors = []
rf_metrics = []

dataset = pd.read_csv('./data/ds_canada/dataset.csv')

labels = np.array(dataset['SOC (%)'])
features = np.array(dataset.drop('SOC (%)', axis = 1))

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth,min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, random_state=42)
    rf_model.fit(train_features, train_labels)

    score = rf_model.score(test_features, test_labels)
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

rf = RandomForestRegressor(**study.best_params, random_state=42)
rf.fit(train_features, train_labels)

predictions = rf.predict(test_features)

mae = mean_absolute_error(test_labels, predictions)
r2 = r2_score(test_labels, predictions)

rf_errors.append(mae)
rf_metrics.append(r2)

importances = list(rf.feature_importances_)

feature_list = list(dataset.columns)
feature_list.remove('SOC (%)')

feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

print()

for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

[I 2023-11-20 10:13:00,422] A new study created in memory with name: no-name-94ae3e1a-5353-4de7-91b1-206738acaeed


[I 2023-11-20 10:13:07,690] Trial 0 finished with value: 0.4941189463543847 and parameters: {'n_estimators': 934, 'max_depth': 11, 'min_samples_split': 6, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.4941189463543847.
[I 2023-11-20 10:13:11,416] Trial 1 finished with value: 0.24620219131899468 and parameters: {'n_estimators': 1191, 'max_depth': 2, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.4941189463543847.
[I 2023-11-20 10:13:15,315] Trial 2 finished with value: 0.3537250174080141 and parameters: {'n_estimators': 941, 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.4941189463543847.
[I 2023-11-20 10:13:30,099] Trial 3 finished with value: 0.5047190166438158 and parameters: {'n_estimators': 1732, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 6}. Best is trial 3 with value: 0.5047190166438158.
[I 2023-11-20 10:13:40,602] Trial 4 finished with value: 0.5011338857874601 and parameters:


Variable: VDepth               Importance: 0.41
Variable: ndvi_mean            Importance: 0.19
Variable: Elev                 Importance: 0.16
Variable: evi_mean             Importance: 0.09
Variable: evi2_mean            Importance: 0.06
Variable: savi_mean            Importance: 0.04
Variable: gndvi_mean           Importance: 0.03
Variable: ndwi_mean            Importance: 0.03



In [3]:
xgb_errors = []
xgb_metrics = []

dataset = pd.read_csv('./data/ds_canada/dataset.csv')

labels = np.array(dataset['SOC (%)'])
features = np.array(dataset.drop('SOC (%)', axis = 1))

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    max_depth = trial.suggest_int('max_depth', 2, 32)

    xgboost = XGBRegressor(n_estimators=n_estimators, max_depth=max_depth)
    xgboost.fit(train_features, train_labels)

    score = xgboost.score(test_features, test_labels)
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

xgboost = XGBRegressor(**study.best_params)
xgboost.fit(train_features, train_labels)

predictions = xgboost.predict(test_features)

mae = mean_absolute_error(test_labels, predictions)
r2 = r2_score(test_labels, predictions)

xgb_errors.append(mae)
xgb_metrics.append(r2)

importances = list(xgboost.feature_importances_)

feature_list = list(dataset.columns)
feature_list.remove('SOC (%)')

feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

print()

for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair)) 

[I 2023-11-20 10:17:12,939] A new study created in memory with name: no-name-97d4c643-bdc4-4129-b586-c1216e2880ae


[I 2023-11-20 10:17:13,284] Trial 0 finished with value: 0.338896633288268 and parameters: {'n_estimators': 871, 'max_depth': 2}. Best is trial 0 with value: 0.338896633288268.
[I 2023-11-20 10:17:13,645] Trial 1 finished with value: 0.4445544245778854 and parameters: {'n_estimators': 1074, 'max_depth': 5}. Best is trial 1 with value: 0.4445544245778854.
[I 2023-11-20 10:17:13,776] Trial 2 finished with value: 0.37738944779906725 and parameters: {'n_estimators': 270, 'max_depth': 3}. Best is trial 1 with value: 0.4445544245778854.
[I 2023-11-20 10:17:14,526] Trial 3 finished with value: 0.38471409122405875 and parameters: {'n_estimators': 1299, 'max_depth': 19}. Best is trial 1 with value: 0.4445544245778854.
[I 2023-11-20 10:17:15,213] Trial 4 finished with value: 0.3982287560344 and parameters: {'n_estimators': 907, 'max_depth': 21}. Best is trial 1 with value: 0.4445544245778854.
[I 2023-11-20 10:17:15,807] Trial 5 finished with value: 0.3744950940232713 and parameters: {'n_estimato


Variable: VDepth               Importance: 0.2800000011920929
Variable: ndvi_mean            Importance: 0.15000000596046448
Variable: evi2_mean            Importance: 0.12999999523162842
Variable: Elev                 Importance: 0.10999999940395355
Variable: evi_mean             Importance: 0.10000000149011612
Variable: gndvi_mean           Importance: 0.07999999821186066
Variable: savi_mean            Importance: 0.07999999821186066
Variable: ndwi_mean            Importance: 0.05999999865889549


In [7]:
# plt.plot(range(2017, 2022), rf_errors)
# plt.plot(range(2017, 2022), xgb_errors)
# plt.legend(["random forest", "xgboost"], loc ="lower left")
# plt.title("Errors of the models per year") 
# plt.show() 

In [6]:
# plt.plot(range(2017, 2022), rf_metrics)
# plt.plot(range(2017, 2022), xgb_metrics)
# plt.legend(["random forest", "xgboost"], loc ="lower left")
# plt.title("Metrics of the models per year") 
# plt.show()