# Legit Stuff

In [None]:
# Create composite images from plot and Sentinel-2 data
from src.data import sentinel_composite
import pandas as pd

# Plot table that the composites are built from.
plot = pd.read_csv("../data/processed/plot.csv")

# Keyword options for sentinel_composite, gathered in one place so they are
# easy to tweak without touching the call itself.
composite_options = dict(
    time_window=("2015-07-01", "2016-06-30"),
    num_composites=6,
    level_2a=False,
    temporal_reducers=["min", "max", "median", "variance", "mean", "skew"],
    indices=["NDVI", "NDWI"],
    # sentinel_bands=["B2", "B3", "B4"],  # currently disabled
    areas_as_y=True,
)

# The two return values are kept as X_path and y_path.
X_path, y_path = sentinel_composite(plot=plot, **composite_options)

In [None]:
# Interpolate missing values in X and save it as a new raster (optional)
from src.features import load_multi_band_raster, interpolate_X_and_bands, save_raster

# Source raster and the destination of its interpolated copy.
# NOTE: this rebinds X_path, which the composite cell above also assigns.
X_path = "../data/processed/X.tif"
interpolated_X_path = "../data/processed/interpolated_X.tif"

# Load -> interpolate -> save. band_names is replaced by the names returned
# from the interpolation step, so they stay aligned with interpolated_X.
X, band_names = load_multi_band_raster(X_path)
interpolated_X, band_names = interpolate_X_and_bands(X, band_names)
# X_path is passed next to the output path; presumably it serves as the
# georeferencing template for the new file — TODO confirm in save_raster.
save_raster(interpolated_X, band_names, X_path, interpolated_X_path)

In [None]:
# Visualize time series for a specific temporal reducer
import rasterio
import numpy as np
import matplotlib.pyplot as plt

# Same file the (optional) interpolation cell writes; spelled out here so this
# cell does not depend on that cell having been run in the current kernel.
raster_path = "../data/processed/interpolated_X.tif"
reducer = "Min"  # Important: title case
num_composites = 6  # keep in sync with num_composites passed to sentinel_composite

# Read the raster once: the file content does not change between composites,
# so opening and decoding it inside the loop is wasted I/O.
with rasterio.open(raster_path) as src:
    raster = src.read()
    bands = list(src.descriptions)

for i in range(1, num_composites + 1):
    # Band descriptions follow the "<composite> <Reducer> <band>" pattern;
    # B4/B3/B2 are used as the red/green/blue channels.
    rgb_names = [f"{i} {reducer} {band}" for band in ("B4", "B3", "B2")]

    # Fancy indexing copies the data, so the normalisation below never touches
    # `raster`. Casting to float keeps the in-place division valid for integer
    # rasters as well.
    rgb = raster[[bands.index(name) for name in rgb_names]].astype(float)
    rgb = rgb.transpose(1, 2, 0)  # (band, row, col) -> (row, col, band) for imshow

    # Scale by the largest finite value; skip when it is 0 or NaN, because
    # dividing would only produce inf/NaN pixels.
    peak = np.nanmax(rgb)
    if np.isfinite(peak) and peak > 0:
        rgb /= peak

    plt.imshow(rgb)
    plt.title(f"Composite {i} ({reducer})")
    plt.show()

In [None]:
# Load and preprocess data
from src.features import load_multi_band_raster, interpolate_X_and_bands, drop_nan, get_similarity_matrix, show_similarity_matrix, show_dendrogram
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
plt.style.use("ltm.mplstyle")

# Raster locations: interpolated features and the label raster
X_path = "../data/processed/interpolated_X.tif"
y_path = "../data/processed/y.tif"

# Features come with their band names; the band names of y are not needed
X, band_names = load_multi_band_raster(X_path)
y, _ = load_multi_band_raster(y_path)

# A single-band y is flattened to 1-D so estimators do not warn about column vectors
n_label_bands = y.shape[1]
if n_label_bands == 1:
    y = y.ravel()

# Interpolate, drop remaining NaN rows, then hold out 20 % for testing
X, band_names = interpolate_X_and_bands(X, band_names)
X, y = drop_nan(X, y)  # TODO time series imputation (e.g. linear interpolation)
# Stratified splitting is not really possible, unless every band is binned...
data_split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = data_split

# Standardise for non-tree models: statistics are fitted on the training split
# only and then applied unchanged to the test split
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

# TODO: Correlation/mutual information based dim red; TODO permutation importance
# similarity = get_similarity_matrix(X, band_names, "pearson")
# sm_ax = show_similarity_matrix(similarity)
# ddg_ax = show_dendrogram(similarity)

# dimred="mutual_info", threshold=0.1
# most_similar()  # https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance_multicollinear.html
# plot_pairplot() # Pairplot of most similar features
# # Idea: Hierarchical dimensionality reduction based on correlation/mutual information with the label band

In [None]:
# Train models
from skelm import ELMRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.metrics import (make_scorer, max_error, mean_squared_error,
                             median_absolute_error, r2_score)
from scipy.stats import randint, uniform, loguniform
from xgboost import XGBRegressor
from src.models import hyperparam_search, best_scores

# Hyperparameter search space. Each model maps parameter names to either a list
# of choices or a scipy.stats distribution (expon, gamma, uniform, loguniform or
# randint). scipy conventions worth remembering when editing the bounds:
#   uniform(loc, scale) samples from [loc, loc + scale]
#   randint(low, high)  samples integers from low to high - 1
#   loguniform(a, b)    samples floats from [a, b]
elm_space = {
    "alpha": loguniform(1e-8, 1e5),
    "include_original_features": [True, False],
    "n_neurons": loguniform(1, 100-1),
    "ufunc": ["tanh", "sigm", "relu", "lin"],
    "density": uniform(0.01, 0.99),
}

xgb_space = {
    "n_estimators": randint(10, 100-10),
    "max_depth": randint(1, 20),
    "learning_rate": uniform(0.01, 0.5),
    "gamma": uniform(0, 0.5),
    "min_child_weight": randint(1, 11),
}

random_forest_space = {
    "n_estimators": randint(1, 100-1),
    "max_depth": randint(1, 20),
    "max_features": randint(1, 11),
    "min_samples_split": randint(2, 11),
    "min_samples_leaf": randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["squared_error", "absolute_error", "poisson", "friedman_mse"],
}

# Estimator instances are the keys; insertion order is ELM, XGBoost, random forest.
search_space = {
    ELMRegressor(): elm_space,
    XGBRegressor(): xgb_space,
    RandomForestRegressor(): random_forest_space,
    # ExtraTreesRegressor(): {
    #     "n_estimators": randint(1, 100-1),
    #     "min_impurity_decrease": loguniform(1e-5, 0.5),
    #     "criterion": ["squared_error", "absolute_error"],
    # },
}

# Define the scorers and refit (scorer for best model selection).
# make_scorer(..., greater_is_better=False) marks a metric as a loss: the
# scorer negates it so that "larger is better" holds for model selection.
scoring = {
    # "max_error": make_scorer(max_error, greater_is_better=False),
    "median_absolute_error": make_scorer(median_absolute_error,
                                         greater_is_better=False),
    "mean_squared_error": make_scorer(mean_squared_error,
                                      greater_is_better=False),
    # R^2 is a score, not a loss (1.0 is perfect), so it must keep the default
    # greater_is_better=True; flagging it as a loss would negate it and rank
    # the worst fit first whenever it is used for selection.
    # NOTE(review): confirm best_scores() derives the sign from the scorer
    # rather than negating every column unconditionally.
    "coefficient of determination": make_scorer(r2_score),
}
# Errors are selected on mean squared error; the other scorers are reported only.
refit = "mean_squared_error"

# Run the hyperparameter search for every model in search_space on the training
# split. Later cells index and iterate the result (search_results[2],
# zip(pred_plots, search_results)) and read .best_estimator_ from each element.
# NOTE(review): the meaning of the kfold_* options is defined in
# src.models.hyperparam_search and is not visible here — TODO document them there.
search_results = hyperparam_search(
    X_train,
    y_train,
    search_space,
    scoring,
    refit,
    kfold_from_endmembers=True,
    kfold_n_splits=5,
    kfold_n_iter=10,
    random_state=0,
)

# Serialize the search results so later cells can reload them without
# repeating the search.
import dill as pickle # dill instead of the stdlib pickle: necessary to serialize lambda functions

# The file is written relative to the kernel's working directory.
with open("search_results.pkl", "wb") as f:
    pickle.dump(search_results, f)

# Show scores
scores = best_scores(search_results, scoring)
# RMSE is derived as the square root of the MSE column. This assumes
# best_scores reports MSE as a non-negative value (i.e. undoes the scorer's
# sign flip); a negative entry would become NaN — TODO confirm.
scores["root_mean_squared_error"] = scores["mean_squared_error"]**0.5
scores

In [1]:
# Reload the search results written by the training cell, so the evaluation
# cells below can start from a stored search.
import dill as pickle # dill instead of the stdlib pickle: necessary to deserialize lambda functions

# SECURITY: unpickling executes arbitrary code embedded in the file. Only load
# a search_results.pkl that this notebook produced itself.
with open("search_results.pkl", "rb") as f:
    search_results = pickle.load(f)

# Raster paths used by the prediction cell below.
X_path = "../data/processed/interpolated_X.tif"
y_path = "../data/processed/y.tif"

In [None]:
from src.models import cv_predict

# Options for cv_predict.
# NOTE(review): kfold_from_endmembers is False here but True in the
# hyperparameter search — confirm the difference is intentional.
cv_predict_options = dict(
    # rgb_bands=None,
    kfold_from_endmembers=False,
    random_state=0,
)

# Returns the original plot plus a collection of predicted plots; a later cell
# pairs pred_plots with search_results element by element.
orig_plot, pred_plots = cv_predict(search_results, X_path, y_path, **cv_predict_options)

In [None]:
# Refit the tuned estimator stored at index 2 on the training split.
# Presumably index 2 is the RandomForestRegressor (third entry of search_space);
# verify that hyperparam_search preserves that order.
tuned_estimator = search_results[2].best_estimator_
tuned_estimator.fit(X_train, y_train)

In [None]:
# Predict the held-out split with the tuned estimator stored at index 2.
test_estimator = search_results[2].best_estimator_
predicted_y = test_estimator.predict(X_test)

In [None]:
import numpy as np


def broadleaf_share(areas, clip_negative=False):
    """Return broadleaf / (broadleaf + conifer) for every row of `areas`.

    Parameters
    ----------
    areas : array-like with at least two columns
        Column 0 holds the broadleaf value, column 1 the conifer value.
    clip_negative : bool, default False
        Clip negative values to 0 first. Use this for model output, which may
        go slightly below zero; reference data is left untouched.

    Returns
    -------
    numpy.ndarray
        One share per row. Rows whose total is 0 are NaN: the share is
        undefined there, and computing it directly would divide by zero.
    """
    areas = np.asarray(areas, dtype=float)
    broadleaf, conifer = areas[:, 0], areas[:, 1]
    if clip_negative:
        broadleaf = np.maximum(broadleaf, 0)
        conifer = np.maximum(conifer, 0)
    total = broadleaf + conifer
    share = np.full(total.shape, np.nan)
    # `where` skips zero totals, leaving the pre-filled NaN in place without
    # raising a divide-by-zero runtime warning.
    np.divide(broadleaf, total, out=share, where=total != 0)
    return share


broadleaf_share_test = broadleaf_share(y_test)
broadleaf_share_pred = broadleaf_share(predicted_y, clip_negative=True)

# Keep only samples where both shares are defined: sklearn metrics reject NaN
# inputs, and the two arrays must stay aligned sample by sample.
share_is_defined = np.isfinite(broadleaf_share_test) & np.isfinite(broadleaf_share_pred)
broadleaf_share_test = broadleaf_share_test[share_is_defined]
broadleaf_share_pred = broadleaf_share_pred[share_is_defined]

In [None]:
# Evaluate the raw targets directly instead of a derived share. This rebinds
# both broadleaf_share_* names, so whichever cell ran last determines what the
# metric cells below score. Presumably meant for a single-band y — TODO confirm.
broadleaf_share_pred, broadleaf_share_test = predicted_y, y_test

In [None]:
# R^2 of the predictions against the held-out reference values.
# (The variable name keeps its original spelling because the display cell reads it.)
coefficient_of_dermination = r2_score(
    y_true=broadleaf_share_test,
    y_pred=broadleaf_share_pred,
)

In [None]:
# Display the R^2 computed in the previous cell. The trailing number appears to
# be a value noted from an earlier run — TODO confirm it is still current.
coefficient_of_dermination  # 0.5776951683455496

In [None]:
# Mean squared error on the same reference/prediction pair used for R^2.
mean_squared_error(
    y_true=broadleaf_share_test,
    y_pred=broadleaf_share_pred,
)

In [None]:
import matplotlib.pyplot as plt

# Reference vs. predicted values as a scatter plot. Low alpha makes dense
# regions visible; labelled axes let the figure stand on its own.
fig, ax = plt.subplots()
ax.plot(broadleaf_share_test, broadleaf_share_pred, "o", alpha=0.1)
ax.set_xlabel("Reference (broadleaf_share_test)")
ax.set_ylabel("Predicted (broadleaf_share_pred)")
ax.set_title("Predicted vs. reference")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Raw test targets vs. raw predictions as a scatter plot, with labelled axes so
# the figure stands on its own.
fig, ax = plt.subplots()
ax.plot(y_test, predicted_y, "o", alpha=0.1)
# ax.plot(y_test[:, 1], predicted_y[:, 1], "o", alpha=0.1)
# ax.plot(broadleaf_share_test, broadleaf_share_pred, "o", alpha=0.1)
ax.set_xlabel("Reference (y_test)")
ax.set_ylabel("Predicted (predicted_y)")
ax.set_title("Predicted vs. reference")
plt.show()

In [None]:
# Plot original and predicted values, one plot per prediction.
# The original values are identical for every model, so flatten them once.
# Dedicated names are used instead of `x`/`y` so the label array `y` loaded
# earlier in the notebook is not overwritten.
original_values = orig_plot.flatten()

for pred_plot, results in zip(pred_plots, search_results):
    predicted_values = pred_plot.flatten()

    fig, ax = plt.subplots()
    ax.plot(original_values, predicted_values, ".", alpha=0.1)
    ax.set_xlabel("Original")
    # Label the y axis with the class name of the model that produced the prediction.
    ax.set_ylabel(results.best_estimator_.__class__.__name__)
    plt.show()
    plt.close(fig)  # release the figure so repeated runs do not pile up memory

In [None]:
# Impurity based feature importance (bad if high cardinality features with many unique values are present)
import numpy as np

import pandas as pd

# Tuned ensemble stored at index 2; it must expose feature_importances_ and estimators_.
forest = search_results[2].best_estimator_
importances = forest.feature_importances_

# Spread of the importances across the individual trees, used as error bars.
per_tree_importances = np.array([tree.feature_importances_ for tree in forest.estimators_])
std = per_tree_importances.std(axis=0)

# One entry per feature, labelled with the band names.
forest_importances = pd.Series(importances, index=band_names)

fig, ax = plt.subplots(figsize=(10, 10))
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set(title="Feature importances using MDI", ylabel="Mean decrease in impurity")
fig.tight_layout()

In [None]:
# Permutation importance
from sklearn.inspection import permutation_importance

# Permutation importance of the tuned estimator at index 2, measured on the
# held-out split with 10 shuffles per feature.
estimator = search_results[2].best_estimator_
result = permutation_importance(
    estimator,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=2,
)

# Mean score decrease per feature; the standard deviation over repeats is used as error bars.
mean_decrease, decrease_std = result.importances_mean, result.importances_std
forest_importances = pd.Series(mean_decrease, index=band_names)

fig, ax = plt.subplots(figsize=(10, 10))
forest_importances.plot.bar(yerr=decrease_std, ax=ax)
ax.set(
    title="Feature importances using permutation on full model",
    ylabel="Mean score decrease",
)
fig.tight_layout()
plt.show()