In [2]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [5]:
dataset = pd.read_csv('data/U.S. Midwest fields/us_midwest_dataset.csv')

labels = np.array(dataset['SOCc'])
features = np.array(dataset.drop('SOCc', axis = 1))

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    rf_model = RandomForestRegressor(n_estimators=n_estimators, 
                                     max_depth=max_depth,
                                     min_samples_split=min_samples_split,
                                     min_samples_leaf=min_samples_leaf,
                                     random_state=42)
    
    rf_model.fit(train_features, train_labels)

    score = rf_model.score(test_features, test_labels)
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

rf = RandomForestRegressor(**study.best_params, random_state=42)
rf.fit(train_features, train_labels)

predictions = rf.predict(test_features)

mae = mean_absolute_error(test_labels, predictions)
r2 = r2_score(test_labels, predictions)

importances = list(rf.feature_importances_)

feature_list = list(dataset.columns)
feature_list.remove('SOCc')

feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

print()
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

[I 2023-11-28 18:31:56,448] A new study created in memory with name: no-name-386a7481-4f7c-421f-9987-bd9749dc86db
[I 2023-11-28 18:32:18,632] Trial 0 finished with value: 0.8009303199631158 and parameters: {'n_estimators': 388, 'max_depth': 18, 'min_samples_split': 6, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.8009303199631158.
[I 2023-11-28 18:32:29,659] Trial 1 finished with value: 0.8009245104794229 and parameters: {'n_estimators': 194, 'max_depth': 15, 'min_samples_split': 7, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.8009303199631158.
[I 2023-11-28 18:33:29,165] Trial 2 finished with value: 0.7968160219009758 and parameters: {'n_estimators': 1167, 'max_depth': 31, 'min_samples_split': 5, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.8009303199631158.
[I 2023-11-28 18:33:57,545] Trial 3 finished with value: 0.7991508523450518 and parameters: {'n_estimators': 525, 'max_depth': 19, 'min_samples_split': 3, 'min_samples_leaf': 8}. Best is trial 0 with val


Variable: sample_depth_min     Importance: 0.59
Variable: X                    Importance: 0.07
Variable: BD                   Importance: 0.05
Variable: Y                    Importance: 0.04
Variable: ndvi_mean            Importance: 0.03
Variable: evi2_mean            Importance: 0.03
Variable: sample_depth_max     Importance: 0.02
Variable: evi_mean             Importance: 0.02
Variable: savi_mean            Importance: 0.02
Variable: gndvi_mean           Importance: 0.01
Variable: ndwi_mean            Importance: 0.01
Variable: dem                  Importance: 0.01
Variable: T_GRAVEL             Importance: 0.0
Variable: T_SAND               Importance: 0.0
Variable: T_SILT               Importance: 0.0
Variable: T_CLAY               Importance: 0.0
Variable: T_USDA_TEX_CLASS     Importance: 0.0
Variable: T_REF_BULK_DENSITY   Importance: 0.0
Variable: T_BULK_DENSITY       Importance: 0.0
Variable: T_OC                 Importance: 0.0
Variable: T_PH_H2O             Importance: 0.0


In [6]:
dataset = pd.read_csv('data/U.S. Midwest fields/us_midwest_dataset.csv')

labels = np.array(dataset['SOCc'])
features = np.array(dataset.drop('SOCc', axis = 1))

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    max_depth = trial.suggest_int('max_depth', 2, 32)

    xgboost = XGBRegressor(n_estimators=n_estimators, max_depth=max_depth)
    xgboost.fit(train_features, train_labels)

    score = xgboost.score(test_features, test_labels)
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

xgboost = XGBRegressor(**study.best_params)
xgboost.fit(train_features, train_labels)

predictions = xgboost.predict(test_features)

mae = mean_absolute_error(test_labels, predictions)
r2 = r2_score(test_labels, predictions)

importances = list(xgboost.feature_importances_)

feature_list = list(dataset.columns)
feature_list.remove('SOCc')

feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

print()
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair)) 

[I 2023-11-28 19:35:21,740] A new study created in memory with name: no-name-a78f321d-e686-48ef-8739-0d3b18077fd5
[I 2023-11-28 19:35:23,025] Trial 0 finished with value: 0.7660070880590163 and parameters: {'n_estimators': 392, 'max_depth': 20}. Best is trial 0 with value: 0.7660070880590163.
[I 2023-11-28 19:35:24,573] Trial 1 finished with value: 0.766007089366036 and parameters: {'n_estimators': 1755, 'max_depth': 20}. Best is trial 1 with value: 0.766007089366036.
[I 2023-11-28 19:35:27,471] Trial 2 finished with value: 0.7676509404468951 and parameters: {'n_estimators': 289, 'max_depth': 30}. Best is trial 2 with value: 0.7676509404468951.
[I 2023-11-28 19:35:27,809] Trial 3 finished with value: 0.8148277873405722 and parameters: {'n_estimators': 337, 'max_depth': 6}. Best is trial 3 with value: 0.8148277873405722.
[I 2023-11-28 19:35:30,899] Trial 4 finished with value: 0.7676509404468951 and parameters: {'n_estimators': 271, 'max_depth': 30}. Best is trial 3 with value: 0.814827


Variable: sample_depth_min     Importance: 0.6000000238418579
Variable: Y                    Importance: 0.07999999821186066
Variable: X                    Importance: 0.05000000074505806
Variable: dem                  Importance: 0.05000000074505806
Variable: precipipation        Importance: 0.03999999910593033
Variable: evi2_mean            Importance: 0.029999999329447746
Variable: savi_mean            Importance: 0.029999999329447746
Variable: sample_depth_max     Importance: 0.019999999552965164
Variable: BD                   Importance: 0.019999999552965164
Variable: ndvi_mean            Importance: 0.019999999552965164
Variable: evi_mean             Importance: 0.019999999552965164
Variable: gndvi_mean           Importance: 0.019999999552965164
Variable: ndwi_mean            Importance: 0.019999999552965164
Variable: T_GRAVEL             Importance: 0.0
Variable: T_SAND               Importance: 0.0
Variable: T_SILT               Importance: 0.0
Variable: T_CLAY               I