# Reducers Clustering

This experiment continues the second experiment. After determining the individual performance of reducers on a varying number of composites, we will now determine the best combination of reducers for a fixed number of composites.

In [1]:
# List the bands determined by the previous experiment.
# The band-importance report is read by bands_from_importance and its result
# is unpacked into two names: the Sentinel bands and the index bands, which
# the later cells pass on as `sentinel_bands` and `indices`.
from ltm.models import bands_from_importance

sentinel_bands, index_bands = bands_from_importance("../reports/band_importance.csv")

Initializing Earth Engine API...


In [2]:
from ltm.data import sentinel_composite
from datetime import datetime

# Temporal reducers requested for the composite
temporal_reducers = ["median", "mean", "mode", "min", "max", "sampleVariance"]

# Compute the composite for all reducers over one year and write it to
# X_path_to. 7 composites are used; 8 and 11 were noted as too much for GEE.
sentinel_composite(
    sentinel_bands=sentinel_bands,
    indices=index_bands,
    temporal_reducers=temporal_reducers,
    num_composites=7,
    time_window=(
        datetime(year=2017, month=4, day=1),
        datetime(year=2018, month=4, day=1),
    ),
    X_path_to="../data/processed/TMP_X_mega.tif",
    y_path_from="../data/processed/y.tif",
)

Preparing Sentinel-2 data...
Computing data...
GeoTIFF saved as ../data/processed/TMP_X_mega.tif


'../data/processed/TMP_X_mega.tif'

In [2]:
# Define auto_kmeans for finding the ideal number of clusters
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import os
import platform
import warnings

def auto_kmeans(data, n_clusters_min=2, n_clusters_max=10, random_state=42):
    """Fit KMeans for a range of cluster counts and keep the best silhouette score.

    Args:
        data: Samples to cluster. Passed unchanged to ``KMeans.fit_predict``
            and ``silhouette_score``.
        n_clusters_min: Smallest number of clusters to try (inclusive).
        n_clusters_max: Largest number of clusters to try (inclusive).
        random_state: Seed forwarded to every ``KMeans`` instance.

    Returns:
        The fitted ``KMeans`` estimator with the highest silhouette score.
        On ties the estimator with fewer clusters is kept, because candidates
        are tried in ascending order and only a strictly better score wins.

    Raises:
        ValueError: If ``n_clusters_max`` is smaller than ``n_clusters_min``,
            or if no candidate scored above the initial ``-inf`` baseline.
    """
    if n_clusters_max < n_clusters_min:
        raise ValueError("n_clusters_max must not be smaller than n_clusters_min")

    # Avoid memory leak on windows. The previous value is remembered so that a
    # user-defined OMP_NUM_THREADS is restored instead of being deleted.
    on_windows = platform.system() == "Windows"
    previous_omp_threads = os.environ.get("OMP_NUM_THREADS")
    if on_windows:
        os.environ["OMP_NUM_THREADS"] = "1"

    best_score = float("-inf")
    best_kmeans = None
    try:
        for n_clusters in range(n_clusters_min, n_clusters_max + 1):
            kmeans = KMeans(n_clusters=n_clusters, n_init="auto", random_state=random_state)

            # Suppress memory leak on windows warning
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                labels = kmeans.fit_predict(data)
            s_score = silhouette_score(data, labels)

            if s_score > best_score:
                best_score = s_score
                best_kmeans = kmeans
    finally:
        # Restore the environment even when fitting or scoring raised
        if on_windows:
            if previous_omp_threads is None:
                os.environ.pop("OMP_NUM_THREADS", None)
            else:
                os.environ["OMP_NUM_THREADS"] = previous_omp_threads

    if best_kmeans is None:
        raise ValueError("This error should not occur... best_kmeans should at least always be set to the first kmeans")

    return best_kmeans

In [3]:
# Choose the 1,...,n-1 best reducer clusters for each metric and number of composites
import pandas as pd
import numpy as np
from datetime import datetime

# Map each metric to whether it is a loss, i.e. whether lower values are better
is_loss_dict = {
    "R2 Score": False,
    "Mean Absolute Error": True,
    "Root Mean Squared Error": True,
}

# Create one DataFrame per metric with reducers as index and number of
# composites as columns. Pivoting aligns every score by its (Reducer,
# Composites) labels, so the tables do not depend on the row order of the CSV.
# Combinations missing from the report become NaN, and reducers with any NaN
# score are dropped so that only complete rows are clustered.
reducer_scores = pd.read_csv("../reports/reducer_composites.csv")
dfs = {
    metric: reducer_scores.pivot(index="Reducer", columns="Composites", values=metric).dropna()
    for metric in is_loss_dict
}

list_of_kwargs = []
for metric, is_loss in is_loss_dict.items():
    metric_df = dfs[metric]

    for num_composites in metric_df.columns:
        # Single-column frame: one score per reducer for this number of composites
        composite_df = metric_df[[num_composites]]

        # Compute the kmeans with best silhouette score
        kmeans = auto_kmeans(composite_df)
        labels = kmeans.predict(composite_df)
        cluster_centers = np.array(kmeans.cluster_centers_[:, 0])

        # Sort the clusters from best to worst according to the metric:
        # ascending centers for losses, descending centers for scores
        sorted_indices = np.argsort(cluster_centers)
        if not is_loss:
            sorted_indices = sorted_indices[::-1]

        # Create parameter configs for the 1,...,n-1 best clusters
        for max_idx in range(1, len(sorted_indices)):
            valid_centers = set(sorted_indices[:max_idx].tolist())
            valid_reducers = [
                reducer
                for reducer, label in zip(composite_df.index, labels)
                if int(label) in valid_centers
            ]

            kwargs = {
                "time_window": (datetime(2017, 4, 1), datetime(2018, 4, 1)),
                "num_composites": num_composites,
                "temporal_reducers": valid_reducers,
                "indices": index_bands,
                "sentinel_bands": sentinel_bands,
            }

            list_of_kwargs.append(kwargs)

In [28]:
# Score each kwargs combination
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from ltm.data import sentinel_composite
from ltm.features import load_raster, interpolate_X, to_float32, drop_nan_rows
from pathlib import Path
from tqdm import tqdm

# Scorers used during cross-validation, keyed by the column name written to
# the report. Each entry is (report name, metric function, extra metric kwargs).
scoring = {
    scorer_name: make_scorer(metric_function, **metric_kwargs)
    for scorer_name, metric_function, metric_kwargs in (
        ("R2 Score", r2_score, {}),
        ("Mean Absolute Error", mean_absolute_error, {}),
        ("Root Mean Squared Error", mean_squared_error, {"squared": False}),
    )
}

# Running number consumed by create_X_name
X_counter = 1


def create_X_name(kwargs):
    """Return the next sequential raster file name and advance the counter.

    Args:
        kwargs: Accepted but not used by this function.

    Returns:
        A name of the form ``X_<n>.tif`` where ``<n>`` is the value of the
        module-level ``X_counter`` before this call incremented it.
    """
    global X_counter

    current_number = X_counter
    X_counter = current_number + 1

    return f"X_{current_number}.tif"

def create_row(kwargs):
    """Build the identifying part of a report row for one configuration.

    Args:
        kwargs: Mapping with the keys ``"temporal_reducers"`` (an iterable of
            strings) and ``"num_composites"``.

    Returns:
        A new dict with ``"Reducer"`` set to the space-joined reducer names
        and ``"Composites"`` set to the number of composites.
    """
    reducer_label = " ".join(kwargs["temporal_reducers"])
    composites = kwargs["num_composites"]

    return {"Reducer": reducer_label, "Composites": composites}

def check_row_exists(row, report_path):
    """Check whether the report already contains a row matching ``row``.

    Args:
        row: Mapping of column name to expected value.
        report_path: Path of the CSV report.

    Returns:
        ``False`` if the report file does not exist. Otherwise ``True`` as soon
        as one report row has every key of ``row`` as a column with an equal
        value, and ``False`` if no such row is found.
    """
    if not Path(report_path).exists():
        return False

    report = pd.read_csv(report_path)

    def matches(record):
        # A record matches only if it has every requested key with an equal value
        for key, expected in row.items():
            if key not in record.keys():
                return False
            if not (expected == record[key]):
                return False
        return True

    return any(matches(dict(series)) for _, series in report.iterrows())

def create_X(y_path, kwargs, create_X_name=create_X_name):
    """Assemble the feature table for one configuration from existing rasters.

    For every reducer in ``kwargs["temporal_reducers"]`` the raster
    ``X_{reducer}_{num_composites}.tif`` is loaded from
    ``../data/processed/reducer_composites/``. The loaded tables are joined
    column-wise, passed through ``interpolate_X`` and converted with
    ``to_float32``.

    Args:
        y_path: Accepted but not used by this function.
        kwargs: Mapping with the keys ``"temporal_reducers"`` and
            ``"num_composites"``.
        create_X_name: Accepted but not used by this function.

    Returns:
        The result of ``to_float32(interpolate_X(...))`` on the joined table.
    """
    # Use existing X_{temporal_reducer}_{num_composites}.tif
    reducer_tables = [
        load_raster(f"../data/processed/reducer_composites/X_{reducer}_{kwargs['num_composites']}.tif")
        for reducer in kwargs["temporal_reducers"]
    ]

    # Fill missing values by linear (circular) interpolation
    combined = pd.concat(reducer_tables, axis=1)

    return to_float32(interpolate_X(combined))

def callback(kwargs, scores, report_path, create_row=create_row):
    """Append the scores of one configuration to the CSV report.

    Args:
        kwargs: Configuration handed to ``create_row`` to build the
            identifying columns.
        scores: Mapping of metric name to value, merged into the row.
        report_path: Path of the CSV report. It is created if it does not
            exist; otherwise it is read, extended by one row and rewritten.
        create_row: Function turning ``kwargs`` into the base row dict.
    """
    record = create_row(kwargs)
    record.update(scores)

    if Path(report_path).exists():
        report = pd.read_csv(report_path)
        # len(report) is the next free label of the freshly read default index
        report.loc[len(report)] = record
    else:
        report = pd.DataFrame([record])

    report.to_csv(report_path, index=False)

def evaluate_kwargs(list_of_kwargs, scoring, report_path, y_path, create_X_name=create_X_name, create_row=create_row, callback=callback):
    """Cross-validate a random forest on every configuration and report the scores.

    Configurations whose row is already present in the report are skipped, so
    an interrupted run can be resumed by calling the function again.

    Args:
        list_of_kwargs: Iterable of configuration dicts.
        scoring: Mapping of metric name to scorer, passed to ``cross_validate``.
        report_path: Path of the CSV report that is checked and extended.
        y_path: Path of the target raster, loaded once with ``load_raster``.
        create_X_name: Forwarded to ``create_X``.
        create_row: Builds the identifying row of a configuration.
        callback: Called with ``(kwargs, scores, report_path, create_row)``
            after each evaluated configuration.
    """
    target = load_raster(y_path)
    regressor = RandomForestRegressor(n_jobs=-1, random_state=42)

    for config in tqdm(list_of_kwargs):
        already_reported = check_row_exists(create_row(config), report_path)
        if already_reported:
            continue

        features = create_X(y_path, config, create_X_name)
        features_clean, target_clean = drop_nan_rows(features, target)

        if len(features_clean) >= 5:
            cv_result = cross_validate(regressor, features_clean, target_clean, scoring=scoring, n_jobs=-1)
            scores = {name: cv_result[f"test_{name}"].mean() for name in scoring.keys()}
        else:
            # Fewer than 5 usable rows: record NaN for every metric instead of
            # cross-validating (presumably tied to the default 5 folds — confirm)
            scores = {name: np.nan for name in scoring.keys()}

        callback(config, scores, report_path, create_row)


# Score every configuration and collect the results in the clustering report
evaluate_kwargs(
    list_of_kwargs=list_of_kwargs,
    scoring=scoring,
    report_path="../reports/reducer_clustering.csv",
    y_path="../data/processed/y.tif",
)

  3%|▎         | 3/97 [25:19<13:13:15, 506.33s/it]


KeyboardInterrupt: 

In [14]:
# Inspect the size of the configuration at index 3: number of bands
# (indices plus Sentinel bands), number of composites and number of reducers
num_composites = list_of_kwargs[3]["num_composites"]
num_reducers = len(list_of_kwargs[3]["temporal_reducers"])
num_bands = sum(len(list_of_kwargs[3][key]) for key in ("indices", "sentinel_bands"))

num_bands, num_composites, num_reducers

(31, 2, 22)