# Together with PALSAR & other Level 2A bands

In [None]:
# Create a raster from a PALSAR image
from datetime import datetime
from pathlib import Path

from ltm.data import palsar_raster

# The label raster is defined here as well so this cell runs on a fresh
# kernel without depending on a later cell having been executed first.
y_path = "../data/processed/y.tif"
X_path = "../data/processed/band_importance/X_palsar.tif"

# Only build the PALSAR raster when the *output* file is missing. (Checking
# the label raster instead would skip creation whenever the labels exist.)
if not Path(X_path).exists():
    start = datetime(2017, 4, 1)
    end = datetime(2018, 4, 1)
    # Midpoint of the one-year window, passed as the requested timestamp.
    middle = start + (end - start) / 2

    Path(X_path).parent.mkdir(parents=True, exist_ok=True)

    raster = palsar_raster(
        y_path_from=y_path,
        X_path_to=X_path,
        timestamp=middle,
    )

In [None]:
# Build the Sentinel composite containing every available band and index
from datetime import datetime
from pathlib import Path

from ltm.data import list_bands, list_indices, sentinel_composite

y_path = "../data/processed/y.tif"
X_path = "../data/processed/band_importance/X_sentinel.tif"

# Skip the work when the composite has already been written to disk.
sentinel_path = Path(X_path)
if not sentinel_path.exists():
    window_start = datetime(2017, 4, 1)
    window_end = datetime(2018, 4, 1)
    sentinel_composite(
        y_path_from=y_path,
        X_path_to=X_path,
        time_window=(window_start, window_end),
        indices=list_indices(),
        sentinel_bands=list_bands(),
    )

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ltm.features import load_raster
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Define the dataset: the Sentinel raster with the PALSAR raster appended
# column-wise.
X = load_raster("../data/processed/band_importance/X_sentinel.tif")
X_palsar = load_raster("../data/processed/band_importance/X_palsar.tif")
X = pd.concat([X, X_palsar], axis=1)

# Standardise every column before PCA so that no feature dominates the
# components merely because of its scale.
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Fit a PCA that keeps all components.
pca = PCA()
pca.fit(scaled_df)
PC_values = np.arange(pca.n_components_) + 1

# The curve is the running total of the explained-variance ratios, so the
# y axis is labelled as cumulative.
plt.plot(PC_values, np.cumsum(pca.explained_variance_ratio_))
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Cumulative Variance Explained')
plt.show()

In [None]:
# Compute the cross-validated scores for each step of the RFE feature ranking
from pathlib import Path

import pandas as pd
from ltm.features import load_raster, drop_nan_rows
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_validate
from tqdm import tqdm

try:
    # Available from scikit-learn 1.4; replaces the deprecated
    # mean_squared_error(..., squared=False).
    from sklearn.metrics import root_mean_squared_error

    rmse_scorer = make_scorer(root_mean_squared_error)
except ImportError:
    # Older scikit-learn: fall back to the original spelling.
    rmse_scorer = make_scorer(mean_squared_error, squared=False)

band_importance_path = "../reports/band_importance.csv"
if not Path(band_importance_path).exists():
    # Load the raster, label and drop rows with NaN values
    # NOTE(review): only the Sentinel raster is loaded here, although the
    # section heading mentions PALSAR as well — confirm whether X_palsar.tif
    # should be concatenated before ranking.
    X = load_raster("../data/processed/band_importance/X_sentinel.tif")
    y = load_raster("../data/processed/y.tif")
    X, y = drop_nan_rows(X, y)

    # Create the RFE object and rank each feature; eliminating one feature
    # per step down to a single survivor yields a complete ordering.
    # The estimator is seeded like the cross-validation model below so the
    # ranking is reproducible.
    rf = RandomForestRegressor(n_jobs=-1, random_state=42)
    rfe = RFE(estimator=rf, n_features_to_select=1, step=1, verbose=1)
    rfe.fit(X, y)

    # All three scorers report the raw metric value (errors are positive,
    # lower is better).
    scoring = {
        "R2 Score": make_scorer(r2_score),
        "Mean Absolute Error": make_scorer(mean_absolute_error),
        "Root Mean Squared Error": rmse_scorer,
    }

    ranking = rfe.ranking_
    feature_names = rfe.feature_names_in_

    # Order the features from best rank (1) to worst.
    order_index = ranking.argsort()
    ordered_features = feature_names[order_index]

    # Row i holds the mean CV scores of a model trained on the i+1
    # best-ranked features; the row label is the feature added at that step.
    score_df = pd.DataFrame(columns=list(scoring))
    for i, feature in enumerate(tqdm(ordered_features)):
        curr_X = X[ordered_features[:i + 1]]
        random_forest = RandomForestRegressor(n_jobs=-1, random_state=42)
        scores = cross_validate(random_forest, curr_X, y, scoring=scoring, n_jobs=-1)
        score_df.loc[feature] = [scores[f"test_{key}"].mean() for key in scoring]

    # Make sure the report directory exists before caching the results.
    Path(band_importance_path).parent.mkdir(parents=True, exist_ok=True)
    score_df.to_csv(band_importance_path, index_label="Band")
else:
    # Reuse the cached results instead of recomputing the expensive RFE.
    score_df = pd.read_csv(band_importance_path, index_col=0)

score_df

In [None]:
import matplotlib.pyplot as plt
# Imported for its side effect only (the name is never used below);
# presumably this registers the 'science' style with matplotlib — confirm
# against the scienceplots documentation.
import scienceplots

# Switch matplotlib to the 'science' style.
plt.style.use('science')

In [None]:
from ltm.models import plot_report

# Label override handed to plot_report so the R2 column is shown with
# LaTeX markup.
label_overrides = {"R2 Score": "$R^2$ Score"}

# Plot the RFE score table; the third and fourth positional arguments are
# presumably the axis labels — confirm against plot_report's signature.
ax = plot_report(
    score_df, "RFE Results", "Feature", "Score",
    label_rotation=90,
    replace_labels=label_overrides,
    figsize=(8, 2),
)

plt.show()

In [None]:
# Best step per metric: R2 is maximised, while the two error metrics are
# minimised. The column names match the keys written by the RFE cell.
(
    score_df["R2 Score"].idxmax(),
    score_df["Mean Absolute Error"].idxmin(),
    score_df["Root Mean Squared Error"].idxmin(),
)

# Idx 21 is best

In [None]:
# Zero-based position (in ranking order) of the step with the highest R2.
best_r2_band = score_df["R2 Score"].idxmax()
score_df.index.tolist().index(best_r2_band)

# Scree Plot

# Partial Autocorrelation inspired correlation

Partial autocorrelation: x(t) and x(t-1) are correlated, as are x(t-1) and x(t-2). x(t) and x(t-2) are correlated too, but how much of that is merely because both are correlated with x(t-1)? Partial autocorrelation removes the (linear) effect of x(t-1) from both x(t) and x(t-2), and then measures the correlation between what remains, i.e. the correlation between x(t) and x(t-2) "cleaned" of their shared correlation with x(t-1).