Inspired by the scikit-learn tutorial at https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html (preprocessing, hyperparameter optimization), with the hyperparameter search done via bayesian-optimization. Pipelines are used heavily and are saved with dill instead of pickle.

# TODO: Optuna - A Hyperparameter Optimization Framework

In [2]:
# Create a baseline composite from a Sentinel image with the average across 1 year
from ltm.data import sentinel_composite, list_bands
from datetime import datetime
from pathlib import Path
from sklearn.metrics import make_scorer, mean_squared_error
from ltm.features import load_raster, drop_nan_rows
from sklearn.model_selection import cross_validate

# Scorer for the root mean squared error. `greater_is_better=False` makes
# scikit-learn negate the value, so reported scores are -RMSE.
# The `squared=False` keyword was deprecated in scikit-learn 1.4 and removed in
# 1.6; use `root_mean_squared_error` when the installed version provides it and
# fall back to the old spelling otherwise.
try:
    from sklearn.metrics import root_mean_squared_error
except ImportError:  # scikit-learn < 1.4
    rmse_scorer = make_scorer(mean_squared_error, squared=False, greater_is_better=False)
else:
    rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)

# Raster with the predictors (created below if missing) and raster with the targets
X_path = "../data/processed/hyperparameter_tuning/X.tif"
y_path = "../data/processed/ground_truth/y.tif"

# Create the composite if it does not exist
if not Path(X_path).exists():
    # List all available Sentinel 2 Level-2A bands
    bands = list_bands()
    # Keep only the bands whose name starts with "B"
    b_bands = [band for band in bands if band.startswith("B")]

    # The output folder must exist before the composite is written into it
    Path(X_path).parent.mkdir(parents=True, exist_ok=True)
    sentinel_composite(
        y_path_from=y_path,
        X_path_to=X_path,
        # One-year window: 2017-04-01 to 2018-04-01
        time_window=(datetime(2017, 4, 1), datetime(2018, 4, 1)),
        sentinel_bands=b_bands,
    )

# Score the baseline composite with a default random forest
from sklearn.ensemble import RandomForestRegressor

# Read the predictor composite and the ground truth
X = load_raster(X_path)
y = load_raster(y_path)

# Remove NaN columns from X, then drop the rows flagged by drop_nan_rows
X = X.dropna(axis=1)
X, y = drop_nan_rows(X, y)

# 5-fold cross-validation of an untuned random forest.
# The scorer is negated, hence the sign flip when printing.
baseline_forest = RandomForestRegressor(n_jobs=-1, random_state=42)
cv_result = cross_validate(
    baseline_forest,
    X,
    y,
    cv=5,
    scoring=rmse_scorer,
    n_jobs=-1,
)
score = cv_result["test_score"].mean()

print(f"Baseline RMSE: {-score}")

Baseline RMSE: 0.30346329731474664


In [None]:
import matplotlib.pyplot as plt
# scienceplots is never referenced by name in this notebook; presumably it is
# imported only so that the 'science' style becomes available — confirm.
import scienceplots

# Use the 'science' style for the figures drawn later in the notebook
plt.style.use('science')

In [None]:
# Define the metric and load the data
from sklearn.metrics import make_scorer, mean_squared_error
from ltm.features import load_raster, drop_nan_rows

# Define the metric: RMSE, negated by scikit-learn (greater_is_better=False).
# The `squared=False` keyword was deprecated in scikit-learn 1.4 and removed in
# 1.6; use `root_mean_squared_error` when the installed version provides it and
# fall back to the old spelling otherwise.
try:
    from sklearn.metrics import root_mean_squared_error
except ImportError:  # scikit-learn < 1.4
    rmse_scorer = make_scorer(mean_squared_error, squared=False, greater_is_better=False)
else:
    rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)

# Load the data
X_path = "../data/processed/reducer_composites/X.tif"
y_path = "../data/processed/ground_truth/y.tif"
X = load_raster(X_path)
y = load_raster(y_path)

# Remove NaN columns from X and drop rows with NaN in y
X = X.dropna(axis=1)
X, y = drop_nan_rows(X, y)

In [None]:
# Create save folder and wrapper functions
from pathlib import Path

# Folder handed to every hyperparam_search call below as `save_folder`;
# create it (and any missing parents) up front.
save_folder = "../models/"
models_dir = Path(save_folder)
models_dir.mkdir(parents=True, exist_ok=True)

def suggest_categorical(*args, **kwargs):
    """Record a deferred ``suggest_categorical`` call.

    Nothing is sampled here; the arguments are only captured so that the
    call can be listed as an entry of a ``search_space``.

    Returns:
        tuple: ``("suggest_categorical", args, kwargs)``.
    """
    method_name = "suggest_categorical"
    return (method_name, args, kwargs)

def suggest_discrete_uniform(*args, **kwargs):
    """Record a deferred ``suggest_discrete_uniform`` call.

    Nothing is sampled here; the arguments are only captured so that the
    call can be listed as an entry of a ``search_space``.

    Returns:
        tuple: ``("suggest_discrete_uniform", args, kwargs)``.
    """
    method_name = "suggest_discrete_uniform"
    return (method_name, args, kwargs)

def suggest_float(*args, **kwargs):
    """Record a deferred ``suggest_float`` call.

    Nothing is sampled here; the arguments are only captured so that the
    call can be listed as an entry of a ``search_space``.

    Returns:
        tuple: ``("suggest_float", args, kwargs)``.
    """
    method_name = "suggest_float"
    return (method_name, args, kwargs)

def suggest_int(*args, **kwargs):
    """Record a deferred ``suggest_int`` call.

    Nothing is sampled here; the arguments are only captured so that the
    call can be listed as an entry of a ``search_space``.

    Returns:
        tuple: ``("suggest_int", args, kwargs)``.
    """
    method_name = "suggest_int"
    return (method_name, args, kwargs)

In [None]:
from skelm import ELMRegressor
from ltm.models import hyperparam_search

# Extreme Learning Machine
# Untuned estimator; also reused later as the "default" reference model
elm_default = ELMRegressor(random_state=42)
search_space = [
    suggest_float("alpha", 1e-8, 1e5, log=True),
    suggest_categorical("include_original_features", [True, False]),
    # n_neurons is a number of neurons, so it is sampled as an integer
    # (it was previously sampled as a float).
    # NOTE(review): if hyperparam_search resumes a study cached under
    # save_folder that was created with a float n_neurons, that cache may have
    # to be removed first — confirm how caching is handled.
    suggest_int("n_neurons", 1, 1000),
    suggest_categorical("ufunc", ["tanh", "sigm", "relu", "lin"]),
    suggest_float("density", 0.01, 0.99),
]

# hyperparam_search returns the tuned model together with the study object
elm_model, elm_study = hyperparam_search(
    elm_default,
    search_space,
    X,
    y,
    rmse_scorer,
    n_trials=500,
    save_folder=save_folder,
    random_state=42
)
# Last expression of the cell: displays the tuned model
elm_model

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from ltm.models import hyperparam_search

# Extra Trees
# Untuned estimator; also reused later as the "default" reference model
et_default = ExtraTreesRegressor(n_jobs=-1, random_state=42)

# One deferred suggest_* call per hyperparameter to tune
search_space = [
    suggest_int("n_estimators", 1, 200),
    suggest_float("min_impurity_decrease", 1e-5, 0.5, log=True),
    suggest_categorical("criterion", ["squared_error", "absolute_error"]),
]

# hyperparam_search returns the tuned model together with the study object
et_model, et_study = hyperparam_search(
    et_default, search_space, X, y, rmse_scorer,
    n_trials=100, save_folder=save_folder, random_state=42,
)
# Last expression of the cell: displays the tuned model
et_model

In [None]:
# HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from ltm.models import hyperparam_search

# Untuned estimator; also reused later as the "default" reference model
hgbr_default = HistGradientBoostingRegressor(random_state=42)

# One deferred suggest_* call per hyperparameter to tune;
# l2_regularization is restricted to a fixed list of values
search_space = [
    suggest_int("max_iter", 100, 1000),
    suggest_float("learning_rate", 0.001, 0.5, log=True),
    suggest_int("max_leaf_nodes", 2, 1000),
    suggest_categorical("l2_regularization", [0, 1e-10, 1e-5, 1e-3, 1e-1, 1]),
]

# hyperparam_search returns the tuned model together with the study object
hgbr_model, hgbr_study = hyperparam_search(
    hgbr_default, search_space, X, y, rmse_scorer,
    n_trials=100, save_folder=save_folder, random_state=42,
)
# Last expression of the cell: displays the tuned model
hgbr_model

In [None]:
# K-Nearest Neighbour
from sklearn.neighbors import KNeighborsRegressor
from ltm.models import hyperparam_search

# Untuned estimator; also reused later as the "default" reference model
knn_default = KNeighborsRegressor(n_jobs=-1)

# One deferred suggest_* call per hyperparameter to tune
search_space = [
    suggest_int("n_neighbors", 1, 100),
    suggest_categorical("weights", ["uniform", "distance"]),
    suggest_categorical("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]),
]

# hyperparam_search returns the tuned model together with the study object
knn_model, knn_study = hyperparam_search(
    knn_default, search_space, X, y, rmse_scorer,
    n_trials=500, save_folder=save_folder, random_state=42,
)
# Last expression of the cell: displays the tuned model
knn_model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from ltm.models import hyperparam_search
from sklearn.model_selection import cross_validate

# Random Forest
# Untuned estimator; also reused later as the "default" reference model
rf_default = RandomForestRegressor(n_jobs=-1, random_state=42)

# One deferred suggest_* call per hyperparameter to tune.
# max_features, min_samples_split and min_samples_leaf are sampled as floats.
search_space = [
    suggest_int("n_estimators", 1, 200),
    suggest_int("max_depth", 1, 1000),
    suggest_float("max_features", 0.1, 1.0),
    suggest_float("min_samples_split", 1e-5, 0.5, log=True),
    suggest_float("min_samples_leaf", 1e-5, 0.5, log=True),
    suggest_categorical("bootstrap", [True, False]),
    suggest_categorical(
        "criterion",
        ["squared_error", "absolute_error", "poisson", "friedman_mse"],
    ),
]

# hyperparam_search returns the tuned model together with the study object
rf_model, rf_study = hyperparam_search(
    rf_default, search_space, X, y, rmse_scorer,
    n_trials=100, save_folder=save_folder, random_state=42,
)
# Last expression of the cell: displays the tuned model
rf_model

In [None]:
# SGD Linear Regression
from sklearn.linear_model import SGDRegressor
from ltm.models import hyperparam_search

# Untuned estimator; also reused later as the "default" reference model
sgd_default = SGDRegressor(random_state=42)

# One deferred suggest_* call per hyperparameter to tune.
# NOTE(review): `penalty` is not part of the search and is left at the
# estimator default; l1_ratio presumably only matters for an elastic-net
# penalty — confirm whether tuning it has any effect here.
search_space = [
    suggest_categorical(
        "loss",
        ["squared_error", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"],
    ),
    suggest_float("alpha", 1e-6, 1e5, log=True),
    suggest_float("l1_ratio", 0, 1),
]

# hyperparam_search returns the tuned model together with the study object
sgd_model, sgd_study = hyperparam_search(
    sgd_default, search_space, X, y, rmse_scorer,
    n_trials=500, save_folder=save_folder, random_state=42,
)
# Last expression of the cell: displays the tuned model
sgd_model

In [None]:
# Support Vector Machine
from sklearn.svm import SVR
from ltm.models import hyperparam_search

# Untuned estimator; also reused later as the "default" reference model
svr_default = SVR()

# One deferred suggest_* call per hyperparameter to tune;
# C and epsilon are both sampled on a log scale
search_space = [
    suggest_float("C", 1e-5, 1e5, log=True),
    suggest_float("epsilon", 1e-5, 1e5, log=True),
    suggest_categorical("kernel", ["poly", "rbf", "sigmoid"]),
]

# hyperparam_search returns the tuned model together with the study object
svr_model, svr_study = hyperparam_search(
    svr_default, search_space, X, y, rmse_scorer,
    n_trials=500, save_folder=save_folder, random_state=42,
)
# Last expression of the cell: displays the tuned model
svr_model

In [None]:
from xgboost import XGBRegressor
from ltm.models import hyperparam_search

# XGBoost
# Untuned estimator; also reused later as the "default" reference model
xgb_default = XGBRegressor(n_jobs=-1, random_state=42)

# One deferred suggest_* call per hyperparameter to tune
search_space = [
    suggest_int("n_estimators", 10, 200),
    suggest_int("max_depth", 1, 20),
    suggest_float("learning_rate", 0.001, 0.5, log=True),
    suggest_float("gamma", 0, 0.5),
    suggest_int("min_child_weight", 1, 11),
]

# hyperparam_search returns the tuned model together with the study object
xgb_model, xgb_study = hyperparam_search(
    xgb_default, search_space, X, y, rmse_scorer,
    n_trials=100, save_folder=save_folder, random_state=42,
)
# Last expression of the cell: displays the tuned model
xgb_model

In [None]:
import pandas as pd

# Summary table: cross-validated RMSE of every default model next to the best
# RMSE found by its study. The table is cached as CSV and reloaded when the
# file already exists.
csv_path = "../reports/hyperparameter_tuning.csv"
studies = [
    elm_study,
    et_study,
    hgbr_study,
    knn_study,
    rf_study,
    sgd_study,
    svr_study,
    xgb_study,
]

if not Path(csv_path).exists():
    # Create the report folder before the slow cross-validation starts;
    # otherwise `to_csv` fails on a missing directory after all the work is done.
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)

    default_models = [
        elm_default,
        et_default,
        hgbr_default,
        knn_default,
        rf_default,
        sgd_default,
        svr_default,
        xgb_default,
    ]
    # Tuned models; not used further in this cell
    models = [
        elm_model,
        et_model,
        hgbr_model,
        knn_model,
        rf_model,
        sgd_model,
        svr_model,
        xgb_model
    ]
    # Best value of each study with its sign flipped, keyed by study name
    # (presumably back to a positive RMSE, since the scorer is negated — confirm)
    max_scores = {
        study.study_name: -study.best_value for study in studies
    }

    # Get the scores for default models
    # NOTE(review): the lookup below assumes each study is named after the
    # class name of its model — confirm against hyperparam_search.
    model_names = [model.__class__.__name__ for model in default_models]
    scores = [-cross_validate(model, X, y, scoring=rmse_scorer, n_jobs=-1)["test_score"].mean() for model in default_models]

    df = pd.DataFrame({
        "Model": model_names,
        "Default Score": scores,
        "Best Score": [max_scores[model_name] for model_name in model_names]
    })
    df.set_index("Model", inplace=True)
    df.to_csv(csv_path)
else:
    # Reuse the cached table
    df = pd.read_csv(csv_path, index_col="Model")

# Last expression of the cell: displays the table
df

In [None]:
# Default-model score of the SVR row, read with a single label-based lookup
df.loc["SVR", "Default Score"]

In [None]:
# Plot the hyperparam search scores during search with performance of baseline models as reference
# One figure per study: the sign-flipped value of every trial, with dashed
# horizontal lines for the best and the default score taken from `df`.
for study in studies:
    trials = study.trials_dataframe()
    study_name = study.study_name

    plt.figure(figsize=(10, 6))
    plt.plot(-trials['value'], label='Score')
    plt.title(study_name)
    plt.xlabel("Trial")
    plt.ylabel("RMSE Score")

    # Reference lines: green for the best score, red for the default score
    reference_lines = (
        ("Best Score", 'g', 'Best score'),
        ("Default Score", 'r', 'Default score'),
    )
    for column, color, label in reference_lines:
        plt.axhline(df.loc[study_name, column], color=color, linestyle='--', label=label)

    # Fixed y-range so the figures are comparable across studies
    plt.ylim(0, 1)
    plt.legend()
    plt.show()