# Band Importance

The goal of the first experiment is to find the best combination of input bands for our problem. The candidate bands consist of the Sentinel-2 Level-2A bands and indices derived from them. The experiment assumes the following:

- A composite raster is created by averaging across a one-year time window starting on April 1, 2017, thus covering all seasons. April 2017 was the first whole month with Level-2A imagery and the closest to the recording dates.
- Random Forest from scikit-learn with default parameters is chosen as the regression model.
- RMSE (Root Mean Squared Error) is used as the evaluation metric for the model.

The experiment is conducted on all Level-2A bands accessible from Google Earth Engine, as well as on all indices offered by the [eemont](https://github.com/davemlz/eemont) package that can be derived from those bands:

In [None]:
# List all available Sentinel 2 Level-2A bands.
# The result is kept in `bands` and later passed as `sentinel_bands` when the
# composite raster is created.
from ltm.data import list_bands

bands = list_bands()

# Bare expression so the notebook displays the value as the cell output
bands

In [None]:
# List all available indices offered by eemont.
# The result is kept in `indices` and later passed as `indices` when the
# composite raster is created.
from ltm.data import list_indices

indices = list_indices()

# Bare expression so the notebook displays the value as the cell output
indices

We first create the raster for the study area and create a scree plot to show the explained variance across a range of principal components:

In [None]:
# Build the Sentinel composite containing every available band and index
from datetime import datetime
from pathlib import Path

from ltm.data import sentinel_composite

# Location of the label raster and destination of the composite raster.
# Both stay plain strings because later cells read them directly.
target_path = "../data/processed/target.tif"
data_path = "../data/processed/band_importance/data.tif"

# Make sure the output directory exists before anything is written
composite_file = Path(data_path)
composite_file.parent.mkdir(parents=True, exist_ok=True)

# Only compute the composite when it is not already on disk
if not composite_file.exists():
    # One-year window from 2017-04-01 to 2018-04-01
    composite_window = (datetime(2017, 4, 1), datetime(2018, 4, 1))
    sentinel_composite(
        target_path_from=target_path,
        data_path_to=data_path,
        time_window=composite_window,
        indices=indices,
        sentinel_bands=bands,
    )

In [None]:
# Use science style for plots
import matplotlib.pyplot as plt
# NOTE(review): scienceplots is never referenced by name; presumably importing
# it registers the "science" style with matplotlib -- confirm before removing.
import scienceplots

# Applies to every figure created after this cell has run
plt.style.use("science")

In [None]:
# Draw the scree plot for the data: cumulative share of variance explained by
# the first k principal components of the standardized composite raster.
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from ltm.features import load_raster
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Define the dataset
# NOTE(review): assumes load_raster returns a DataFrame without NaN values
# (PCA cannot handle missing values) -- confirm against ltm.features.
data = load_raster(data_path)
# Standardize every column so that no band dominates the PCA through its scale
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

# Define PCA model to use; without n_components all components are kept
pca = PCA()
pca_fit = pca.fit(scaled_df)
# 1-based component numbers for the x axis
PC_values = np.arange(pca.n_components_) + 1

# Plot the running total of the explained variance ratios. The y axis is
# labelled as cumulative because that is what np.cumsum produces here.
plt.plot(PC_values, np.cumsum(pca.explained_variance_ratio_))
plt.title("Scree Plot")
plt.xlabel("Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.show()

The scree plot indicates that above 50 components there is no significant increase in explained variance. Now we perform recursive feature elimination to determine the best combination of bands and indices:

In [None]:
# Compute the scores for each step of the RFE.
# The result is cached in a CSV report: if the report already exists it is
# loaded, otherwise the RFE and the per-step cross-validation are run.
# Relies on `Path`, `data_path` and `target_path` defined in an earlier cell.
from ltm.data import split_band_name
from ltm.features import load_raster
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import make_scorer, root_mean_squared_error
from sklearn.model_selection import cross_validate

import pandas as pd
from tqdm.notebook import tqdm

# Check if the report is already available
band_importance_path = "../reports/band_importance.csv"
if not Path(band_importance_path).exists():
    Path(band_importance_path).parent.mkdir(parents=True, exist_ok=True)

    # Load the raster, label and drop rows with NaN label
    # NOTE(review): the boolean mask assumes `target` is a Series aligned with
    # the rows of `data` -- confirm the return type of load_raster.
    data = load_raster(data_path)
    target = load_raster(target_path)
    data, target = data[target.notna()], target[target.notna()]

    # Create the RFE object and rank each feature: one feature is removed per
    # step until a single one remains, giving a complete elimination order
    rf = RandomForestRegressor(n_jobs=-1, random_state=42)
    rfe = RFE(estimator=rf, n_features_to_select=1, step=1, verbose=1)
    rfe.fit(data, target)

    # Define the scoring metrics; the dict key doubles as the column name of
    # the report. The plotting cell takes the minimum, i.e. lower is better.
    scoring = {"Root Mean Squared Error": make_scorer(root_mean_squared_error)}

    # Get the features in reverse order of elimination (rank 1 first)
    ranking = rfe.ranking_
    feature_names = rfe.feature_names_in_
    order_index = ranking.argsort()
    ordered_features = feature_names[order_index]

    # Compute the score for each step of the RFE
    score_df = pd.DataFrame(columns=scoring.keys())
    for i in tqdm(range(ordered_features.shape[0])):
        # Fit the model with the current features: the i + 1 best-ranked ones
        curr_data = data[ordered_features[: i + 1]]
        random_forest = RandomForestRegressor(n_jobs=-1, random_state=42)
        scores = cross_validate(
            random_forest, curr_data, target, scoring=scoring, n_jobs=-1
        )

        # Store the score in the dataframe, keyed by the feature added in this
        # step and averaged over the cross-validation folds
        # NOTE(review): uses the second element returned by split_band_name as
        # row label; a repeated label would overwrite an earlier row -- verify
        # that the labels are unique.
        band_label = split_band_name(ordered_features[i])[1]
        score_df.loc[band_label] = [
            scores[f"test_{key}"].mean() for key in scoring.keys()
        ]

    # Save the scores to a CSV file
    score_df.to_csv(band_importance_path, index_label="Band")
    # Name the in-memory index like the CSV index column so that both branches
    # leave `score_df` in the same shape
    score_df.index.name = "Band"
else:
    # Load the scores from the CSV file
    score_df = pd.read_csv(band_importance_path, index_col=0)

In [None]:
# Plot the RFE results: score against the number of features used, with a
# marker at the feature count that gives the lowest score.
# Relies on `score_df` and `plt` defined in earlier cells.
from ltm.visualize import plot_report

# Replace the index with number of features (1-based); the former index
# becomes the first column, the score column the second
df = score_df.reset_index()
df.index = df.index + 1

# Plot
ax = plot_report(
    df,
    "RFE Result",
    "Features",
    "Score",
    label_rotation=90,
    figsize=(6, 3),
    categorical_x=False,
)

# Plot vertical line for the optimal number of features.
# Only the score column (second column) is reduced: taking idxmin over the
# whole frame would also reduce the string column holding the band names,
# which is meaningless and can raise a TypeError for object dtype.
optimal_features = df.iloc[:, 1].idxmin()
ax.axvline(x=optimal_features, color="red", linestyle="--")
# TODO: proper legend

plt.show()

In [None]:
# List all bands up to the last band with an optimal score.
# The selection is read from the CSV report at `band_importance_path`, which
# is defined in the RFE cell.
from ltm.models import bands_from_importance

# The helper returns two values, unpacked here as Sentinel bands and index bands
sentinel_bands, index_bands = bands_from_importance(band_importance_path)

# Bare expression so the notebook displays both values as the cell output
sentinel_bands, index_bands