# SVM Classification

## Modules

In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Resolve the project's "src" directory, a sibling of the notebook's working directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "../src"))

# Put the src directory first on sys.path so the local modules imported below are found
sys.path.insert(0, parent_dir)

from scripting import load_dems, calculate_aspect_from_dems, calculate_slope_from_dems, load_composites, print_dataframe
from scripting import load_features
from svm_pipeline import SVMS, load_SVMS, DegreeToSinCos
from generate_lc_map import expand_bands_and_reduce, create_empty_tif,process_chunk, load_map_and_plot, load_map_with_probs_and_plot, map_lc_codes_to_rgba

import dask
import dask.array as da
import dask.distributed
import matplotlib.pyplot as plt
import rasterio
from rasterio.windows import Window
from rasterio.transform import from_origin
from dask import delayed
import yaml
import pandas as pd
from svm_pipeline import get_scaler
from sklearn.decomposition import PCA
import numpy as np
import xarray as xr
from concurrent.futures import ThreadPoolExecutor, as_completed
import copy
import time

def read_yaml(file_path: str) -> dict:
    """Parse the YAML file at ``file_path`` with ``yaml.safe_load`` and return the result."""
    with open(file_path, 'r') as yaml_file:
        content = yaml.safe_load(yaml_file)
    return content

def fix_paths_for_nb(input_dict, old_substring = "/home/hrlcuser/media", new_substring = "/media/datapart/lucazanolo"):
    """Return a copy of ``input_dict`` with ``old_substring`` replaced in its string values.

    Only top-level values that are ``str`` instances are rewritten; every
    other value is carried over unchanged. The input mapping is not modified.

    Args:
        input_dict: Mapping whose string values may contain ``old_substring``.
        old_substring: Text to look for inside each string value.
        new_substring: Text that replaces every occurrence of ``old_substring``.

    Returns:
        A new dict with the same keys, in the same order.
    """
    remapped = {}
    for name, entry in input_dict.items():
        if isinstance(entry, str):
            entry = entry.replace(old_substring, new_substring)
        remapped[name] = entry
    return remapped

## Parameters

In [None]:
# Load the run configuration and rewrite its path prefixes for this machine
# (fix_paths_for_nb is called with its default old/new substrings).
parameters = fix_paths_for_nb(read_yaml("/home/lucazanolo/luca-zanolo/scripts/config_files/7.generate_lc_map.yaml"))
os.makedirs(parameters["output_path"], exist_ok=True)
# Path stem for the classified rasters: <output_path>/<tile_id>_<composites_year>_<chunks_limit>r
parameters["classified_image_path"] = f"{parameters['output_path']}/{parameters['tile_id']}_{parameters['composites_year']}_{parameters['chunks_limit']}r"
# "images" sub-directory of the output path; created if missing.
png_images_path = f"{parameters['output_path']}/images"
os.makedirs(png_images_path, exist_ok=True)
# Bare expression: the notebook displays the resulting parameter dict.
parameters

## Load xarray dataset

In [None]:
with dask.distributed.Client(
    processes=False,
    threads_per_worker=(os.cpu_count() or 2),
) as client:
    # A threads-only, in-process Dask client stays active for the whole cell.
    print(f"Dask dashboard: {client.dashboard_link}")
    print("Loading dataset.")

    # The second-to-last path component of features_path is split on "-".
    # NOTE(review): presumably "<year>-<month>"; neither value is used in this cell.
    glcm_year, glcm_month = str(parameters['features_path'].split('/')[-2]).split("-")

    composites = load_composites(parameters["composites_path"], year=parameters["composites_year"], tile=parameters["tile_id"])
    dems = load_dems(parameters["dems_path"], year=parameters["dems_year"], tile=parameters["tile_id"])
    slope = calculate_slope_from_dems(dems.band_data)
    aspect = calculate_aspect_from_dems(dems.band_data)
    glcm_features = load_features(parameters['features_path'])

    # Attach elevation, slope and aspect to the composites, then keep the requested tile.
    dataset = composites.assign({
        "dems": dems.band_data,
        "slopes": slope,
        "aspects": aspect,
    }).sel(tile=parameters["tile_id"])

    # Only the first time step of every GLCM feature is kept.
    # NOTE(review): the original comment states this is January — confirm against the data.
    dataset = dataset.assign({
        f_name: feature.isel(time=0) for f_name, feature in glcm_features.items()
    })

    # Each input is printed once (the composites used to be printed twice).
    print(f"Dems:\n{dems}\n\n")
    print(f"Aspect:\n{aspect}\n\n")
    print(f"Slope:\n{slope}\n\n")
    print(f"Composites:\n{composites}\n\n")
    print(f"Dataset:\n{dataset}\n\n")

    dataset = dataset.unify_chunks()

    # Load the trained model and derive the feature list and scalers from its
    # preprocessing metadata.
    svm = load_SVMS(parameters["model_path"])
    features = list(svm.preprocessing_metadata.keys())
    dataset_flattened = expand_bands_and_reduce(dataset, features)
    scalers = {f: get_scaler(info["method"], info["params"]) for f, info in svm.preprocessing_metadata.items()}
    # NOTE(review): not referenced anywhere else in this cell; kept because other
    # cells may read it from the notebook's global state.
    to_exlude = set(['ground_truth_index', 'ground_truth_label', 'split', 'x', 'y'])
    # Keep only the variables the model was trained on.
    dataset_selected = dataset_flattened[svm.features_selected]

    print("\n\nInitial xarray dataset flattened:\n\n",dataset_flattened)
    print("\n\nDataset flattened selected:\n\n",dataset_selected)
    

## Tests

### Process patch toy example

In [None]:

# Toy grid used to try out xr.apply_ufunc: a 1000x1000 plane split into Dask chunks.
X_SIZE, Y_SIZE = 1000, 1000
CHUNK_SIZE = 250  # Chunk edge length: 250x250 chunks along both X and Y

# Three random Dask-backed variables sharing the same ("x", "y") dims and chunking.
ds = xr.Dataset({
    'var1': (['x', 'y'], da.random.random((X_SIZE, Y_SIZE), chunks=(CHUNK_SIZE, CHUNK_SIZE))),
    'var2': (['x', 'y'], da.random.random((X_SIZE, Y_SIZE), chunks=(CHUNK_SIZE, CHUNK_SIZE))),
    'var3': (['x', 'y'], da.random.random((X_SIZE, Y_SIZE), chunks=(CHUNK_SIZE, CHUNK_SIZE)))
})

def predict_svm(var1, var2, var3):
    """Return three output arrays, each with the same shape as the inputs.

    Every output is a fixed linear combination of the three inputs standing
    in for one SVM; the outputs are stacked along a new leading axis.
    """
    # One row of (var1, var2, var3) weights per toy model.
    weights = (
        (0.5, 0.3, -0.2),  # SVM 1
        (0.4, -0.1, 0.7),  # SVM 2
        (0.6, 0.2, 0.2),   # SVM 3
    )
    outputs = [var1 * w1 + var2 * w2 + var3 * w3 for w1, w2, w3 in weights]
    return np.stack(outputs, axis=0)

# Run predict_svm over the full ("x", "y") plane of the three variables; the
# declared output core dims add a leading "svm" dimension to the result.
predictions = xr.apply_ufunc(
    predict_svm,
    ds['var1'], ds['var2'], ds['var3'],
    input_core_dims=[("x", "y")] * 3,
    output_core_dims=[("svm", "x", "y")],
    vectorize=True,
    dask="allowed",
    output_dtypes=[np.float64],
)
print(predictions)

# Split the stacked result into one 2-D variable per toy model.
pred_ds = xr.Dataset({
    'svm1': (["x", "y"], predictions.sel(svm=0).data),
    'svm2': (["x", "y"], predictions.sel(svm=1).data),
    'svm3': (["x", "y"], predictions.sel(svm=2).data),
})

print(pred_ds)
# Materialise the dataset.
pred_ds = pred_ds.compute()


### Process single pixel

In [None]:
def process_pixel(features, scalers, svm, x, y, args):
    """Scale the features of one pixel and classify it with three SVM modes.

    Args:
        features: Mapping of feature name to the pixel's value. Entries named
            "x" or "y" are treated as coordinates and skipped.
        scalers: Mapping of feature name to a fitted scaler exposing
            ``transform``. A ``DegreeToSinCos`` scaler expands its feature
            into ``<name>_sin`` and ``<name>_cos``.
        svm: Model wrapper exposing ``pca_metadata``, ``predict`` and the
            ``use_*`` mode flags. The flags are overwritten and left in the
            "binary SVMs" configuration.
        x: Pixel x position, echoed in the output.
        y: Pixel y position, echoed in the output.
        args: Extra objects; ``args['pca']`` must hold a fitted PCA when
            ``svm.pca_metadata`` is not None.

    Returns:
        dict with keys "x", "y", "predictions_SvmMc", "predictions_SvmMcCal"
        and "predictions_SvmsBin".

    Raises:
        ValueError: If a feature required by the PCA was not produced by the
            scaling step.
    """
    preprocessed_features = {}
    print(f"Processing point ({x}, {y})")

    # Step 1: scale every feature individually.
    for feature_name, feature_value in features.items():
        if feature_name in ("x", "y"):
            continue  # Skip coordinates

        feature_value = np.asarray(feature_value).reshape(-1, 1)  # Scalers expect 2-D input
        curr_scaler = scalers[feature_name]

        if isinstance(curr_scaler, DegreeToSinCos):
            sin_col, cos_col = curr_scaler.transform(feature_value)
            preprocessed_features[f"{feature_name}_sin"] = sin_col.flatten()
            preprocessed_features[f"{feature_name}_cos"] = cos_col.flatten()
            print(f"Scaled {feature_name} to sin: {sin_col}, cos: {cos_col}")
        else:
            preprocessed_features[feature_name] = curr_scaler.transform(feature_value).flatten()
            print(f"Scaled {feature_name}: {preprocessed_features[feature_name]} with {curr_scaler}")

    # One sample with n_features columns, i.e. shape (1, n_features). This is the
    # (n_samples, n_features) layout that the PCA branch below and the patch-level
    # functions pass to svm.predict; the previous (n_features, 1) layout was
    # interpreted as n_features samples of a single feature.
    feature_array = np.array(
        [preprocessed_features[key] for key in preprocessed_features]
    ).reshape(1, -1)

    # Step 2: apply PCA if the model was trained with it.
    if svm.pca_metadata is not None:
        pca = args['pca']
        feature_names = pca.feature_names_in_

        if not all(f in preprocessed_features for f in feature_names):
            raise ValueError(f"Some PCA features are missing: {set(feature_names) - set(preprocessed_features)}")

        # Feed the features in the order the PCA was fitted with.
        pca_input = np.array([preprocessed_features[f] for f in feature_names]).reshape(1, -1)
        feature_array = pca.transform(pca_input).reshape(1, -1)

    output = {"x": x, "y": y}

    # Step 3: one prediction per mode -> (output key, use_binary_svms, use_multiclass_svm_calibrated).
    prediction_modes = (
        ("predictions_SvmMc", False, False),     # Multiclass SVM, not calibrated
        ("predictions_SvmMcCal", False, True),   # Multiclass SVM, calibrated
        ("predictions_SvmsBin", True, False),    # Binary SVMs
    )
    for output_key, use_binary, use_calibrated in prediction_modes:
        svm.use_binary_svms = use_binary
        svm.use_multiclass_svm_calibrated = use_calibrated
        svm.use_binary_svms_with_softmax = False
        output[output_key] = svm.predict(feature_array)

    return output


# Rebuild a scikit-learn PCA object from the parameters stored with the model,
# so process_pixel can call ``transform`` without refitting.
args = {}
if svm.pca_metadata is not None:
    pca_params = svm.pca_metadata["params"]
    pca_model = PCA()
    for fitted_attr in (
        "components_",
        "n_components_",
        "explained_variance_",
        "singular_values_",
        "mean_",
        "n_samples_",
        "noise_variance_",
        "n_features_in_",
        "feature_names_in_",
    ):
        setattr(pca_model, fitted_attr, pca_params[fitted_attr])

    args["pca"] = pca_model


# Materialise the pixel at index (x=0, y=0).
first_pixel = dataset_selected.isel(x=0, y=0).compute()
print(f"First pixel:\n\n {first_pixel}")

# Extract features for a single pixel
first_pixel_values = {var: first_pixel[var].data for var in dataset_selected.data_vars}
# Show the extracted values (this used to print ``first_pixel`` a second time).
print(f"\n\nFirst pixel values:\n\n {first_pixel_values}")

# Report the coordinates the pixel was actually read from, (0, 0); the call
# and the message previously said (0, 1).
output = process_pixel(first_pixel_values, scalers, svm, args = args, x=0, y=0)

print(f"Predictions for pixel (0,0): {output}")


### Process a single patch

In [None]:
def predict_svm_patch(features_array, feature_names, scalers, svm, x_coords, y_coords, args, output_images):
    """Scale a patch of features, classify it with three SVM modes and store the label maps.

    Args:
        features_array: Array of shape (n_features, patch_x, patch_y).
        feature_names: Feature names, ordered like the first axis of
            ``features_array``.
        scalers: Mapping of feature name to a fitted scaler exposing
            ``transform``.
        svm: Model wrapper exposing ``pca_metadata``, ``predict`` and the
            ``use_*`` mode flags. The flags are overwritten and left in the
            "binary SVMs" configuration.
        x_coords: Consecutive pixel indexes covered by the patch along x.
        y_coords: Consecutive pixel indexes covered by the patch along y.
        args: Extra objects. ``args['pca']`` is required when
            ``svm.pca_metadata`` is not None. ``args['label2id']`` maps label
            descriptions to ids; when absent the module-level ``label2id``
            is used, as before.
        output_images: Mapping with the keys "predictions_SvmMc",
            "predictions_SvmMcCal" and "predictions_SvmsBin"; every value must
            support 2-D slice assignment indexed as ``[y, x]``.

    Returns:
        dict mapping the three keys above to the label-id arrays of shape
        (patch_x, patch_y) that were written to ``output_images``.
    """
    patch_x, patch_y = features_array.shape[1:]  # Spatial patch size
    # Floating-point buffer, so scaled values are not truncated if the input is
    # integer typed (float inputs keep their dtype).
    preprocessed_features = np.zeros_like(
        features_array, dtype=np.result_type(features_array.dtype, np.float32)
    )

    print(f"Processing patch of shape {features_array.shape} at X range {x_coords} and Y range {y_coords}")

    # Step 1: scale every feature over the flattened patch.
    for i, feature_name in enumerate(feature_names):
        feature_values = features_array[i, :, :].reshape(-1, 1)
        curr_scaler = scalers[feature_name]

        if isinstance(curr_scaler, DegreeToSinCos):
            # NOTE(review): only the sine component is stored and the cosine is
            # dropped, whereas process_pixel keeps both — confirm which layout
            # the model expects.
            sin_col, cos_col = curr_scaler.transform(feature_values)
            preprocessed_features[i, :, :] = sin_col.reshape(patch_x, patch_y)
        else:
            preprocessed_features[i, :, :] = curr_scaler.transform(feature_values).reshape(patch_x, patch_y)

    # Step 2: apply PCA if the model was trained with it.
    if svm.pca_metadata is not None:
        pca = args['pca']
        feature_names_pca = pca.feature_names_in_

        # Feed the features in the order the PCA was fitted with.
        feature_indices = [feature_names.index(f) for f in feature_names_pca]
        pca_input = preprocessed_features[feature_indices, :, :].reshape(len(feature_names_pca), -1).T  # (n_samples, n_features)

        preprocessed_features = pca.transform(pca_input).T.reshape(pca.n_components_, patch_x, patch_y)

    # Step 3: predict with every mode on a (n_samples, n_features) matrix.
    preprocessed_features_flat = preprocessed_features.reshape(preprocessed_features.shape[0], -1).T

    # Prefer the mapping handed in through ``args``; fall back to the global one.
    label_to_id = args["label2id"] if "label2id" in args else label2id
    map_labels_to_ids = np.vectorize(lambda label: label_to_id.get(label, 0))  # 0 when the label is unknown

    # (output key, use_binary_svms, use_multiclass_svm_calibrated)
    prediction_modes = (
        ("predictions_SvmMc", False, False),     # Multiclass SVM, not calibrated
        ("predictions_SvmMcCal", False, True),   # Multiclass SVM, calibrated
        ("predictions_SvmsBin", True, False),    # Binary SVMs
    )
    output = {}
    for output_key, use_binary, use_calibrated in prediction_modes:
        svm.use_binary_svms = use_binary
        svm.use_multiclass_svm_calibrated = use_calibrated
        svm.use_binary_svms_with_softmax = False
        labels = svm.predict(preprocessed_features_flat).reshape(patch_x, patch_y)
        output[output_key] = map_labels_to_ids(labels)

    # Write into the window described by the coordinates passed in (the globals
    # ``x_indexes``/``y_indexes`` were used here before).
    y_window = slice(y_coords[0], y_coords[-1] + 1)
    x_window = slice(x_coords[0], x_coords[-1] + 1)
    for output_key, label_ids in output.items():
        output_images[output_key][y_window, x_window] = label_ids

    print(f"Written predictions to output images for patch {x_coords[0]}:{x_coords[-1]+1}, {y_coords[0]}:{y_coords[-1]+1}")

    return output



# Rebuild a scikit-learn PCA object from the parameters stored with the model.
args = {}
if svm.pca_metadata is not None:
    pca_model = PCA()
    pca_model.components_ = svm.pca_metadata["params"]["components_"]
    pca_model.n_components_ = svm.pca_metadata["params"]["n_components_"]
    pca_model.explained_variance_ = svm.pca_metadata["params"]["explained_variance_"]
    pca_model.singular_values_ = svm.pca_metadata["params"]["singular_values_"]
    pca_model.mean_ = svm.pca_metadata["params"]["mean_"]
    pca_model.n_samples_ = svm.pca_metadata["params"]["n_samples_"]
    pca_model.noise_variance_ = svm.pca_metadata["params"]["noise_variance_"]
    pca_model.n_features_in_ = svm.pca_metadata["params"]["n_features_in_"]
    pca_model.feature_names_in_ = svm.pca_metadata["params"]["feature_names_in_"]

    args["pca"] = pca_model

# Read the label table once and derive both lookup directions from it.
# Ids are the row index labels of the CSV, as before.
# NOTE(review): a later cell keys the same table by its "LC_code" column
# instead — confirm which id scheme the output maps should use.
labels_table = pd.read_csv(parameters["labels_path"])
id2label = labels_table["description"].to_dict()
label2id = {description: row_id for row_id, description in id2label.items()}
args['label2id'] = label2id

def create_image_for_classification(dataset):
    """Return a zero-filled uint8 (y, x) image aligned with ``dataset``.

    Args:
        dataset: xarray object providing the "y" and "x" coordinates and a
            ``spatial_ref`` attribute/coordinate.

    Returns:
        xr.DataArray with dims ("y", "x"), the coordinates of ``dataset`` and
        its ``spatial_ref`` attached as a coordinate.
    """
    # Size the image from the dataset instead of a hard-coded 10980x10980, so
    # the data always matches the coordinate lengths.
    output_shape = (dataset.sizes["y"], dataset.sizes["x"])
    return xr.DataArray(
        # NaN cannot be represented in uint8; use an explicit 0 fill.
        np.zeros(output_shape, dtype=np.uint8),
        dims=("y", "x"),
        coords={"y": dataset["y"], "x": dataset["x"]},
    ).assign_coords({"spatial_ref": dataset.spatial_ref})


# One (y, x) label image per classifier variant, built with the helper defined
# in this cell (the previous calls used ``create_output_image()``, which is not
# defined with that signature at this point of the notebook).
output_images = {
    "predictions_SvmMc": create_image_for_classification(dataset),
    "predictions_SvmMcCal": create_image_for_classification(dataset),
    "predictions_SvmsBin": create_image_for_classification(dataset),
    "predictions_SvmsBinSm": create_image_for_classification(dataset),
}

chunk_size = 50


# Extract patch from dataset
patch = dataset_selected.isel(
    x=slice(0, chunk_size),
    y=slice(0, chunk_size)
).compute()

# Convert patch to a NumPy array with the feature axis first
feature_names = list(patch.data_vars.keys())
features_array = np.array([patch[var].data for var in feature_names])
print(f"Features array shape: {features_array.shape}")

# Pixel indexes covered by the patch along x and y
x_indexes = np.arange(0, chunk_size)
y_indexes = np.arange(0, chunk_size)

# Test function on the patch
output_patch = predict_svm_patch(features_array, feature_names, scalers, svm, x_indexes, y_indexes, args, output_images)
print("Patch Predictions:")
print(output_patch)



### Process chunks in parallel



In [None]:
def process_chunk(
    features_array: np.ndarray, 
    feature_names: list, 
    scalers: dict, 
    svm_path : str, 
    args: dict, 

) -> tuple:
    """Scale one chunk of features and classify it with four SVM modes.

    The model is loaded from ``svm_path`` on every call, so each call works on
    its own model object while toggling the ``use_*`` mode flags.

    NOTE(review): this definition shadows the ``process_chunk`` imported from
    ``generate_lc_map`` at the top of the notebook.

    Args:
        features_array: Array of shape (n_features, patch_x, patch_y).
        feature_names: Feature names, ordered like the first axis of
            ``features_array``.
        scalers: Mapping of feature name to a fitted scaler exposing
            ``transform``.
        svm_path: Path handed to ``load_SVMS``.
        args: Extra objects. ``args['label_mapper']`` converts an array of
            predicted labels into ids; ``args['pca']`` is required when the
            loaded model has PCA enabled.

    Returns:
        Tuple of eight arrays: the label maps for SvmMc, SvmMcCal, SvmsBin and
        SvmsBinSm (each of shape (patch_x, patch_y)), followed by the matching
        probability arrays (each of shape (patch_x, patch_y, n) where n is
        whatever ``predict_proba`` returns per sample).
    """
    svm = load_SVMS(svm_path)
    patch_x, patch_y = features_array.shape[1:]  # Patch size
    # Floating-point buffer, so scaled values are not truncated if the input is
    # integer typed (float inputs keep their dtype).
    preprocessed_features = np.zeros_like(
        features_array, dtype=np.result_type(features_array.dtype, np.float32)
    )

    # Step 1: scale every feature over the flattened chunk.
    # NOTE(review): unlike predict_svm_patch there is no special handling of
    # DegreeToSinCos scalers here — confirm none is configured for this model.
    for i, feature_name in enumerate(feature_names):
        feature_values = features_array[i, :, :].reshape(-1, 1)
        curr_scaler = scalers[feature_name]
        preprocessed_features[i, :, :] = curr_scaler.transform(feature_values).reshape(patch_x, patch_y)

    # Step 2: apply PCA if enabled.
    # NOTE(review): other cells test ``svm.pca_metadata`` rather than
    # ``svm.pca`` — confirm which attribute the model class exposes.
    if svm.pca is not None:
        pca = args['pca']
        feature_names_pca = pca.feature_names_in_
        # Feed the features in the order the PCA was fitted with.
        feature_indices = [feature_names.index(f) for f in feature_names_pca]
        pca_input = preprocessed_features[feature_indices, :, :].reshape(len(feature_names_pca), -1).T

        preprocessed_features = pca.transform(pca_input).T.reshape(pca.n_components_, patch_x, patch_y)

    # Step 3: predict on a (n_samples, n_features) matrix.
    preprocessed_features_flat = preprocessed_features.reshape(preprocessed_features.shape[0], -1).T

    label_mapper = args['label_mapper']

    # (use_binary_svms_with_softmax, use_binary_svms, use_multiclass_svm_calibrated)
    prediction_modes = (
        (False, False, False),  # Multiclass SVM (not calibrated)
        (False, False, True),   # Multiclass SVM (calibrated)
        (False, True, False),   # Binary SVMs
        (True, True, False),    # Binary SVMs with softmax
    )
    predictions = []
    probabilities = []
    for use_softmax, use_binary, use_calibrated in prediction_modes:
        svm.use_binary_svms_with_softmax = use_softmax
        svm.use_binary_svms = use_binary
        svm.use_multiclass_svm_calibrated = use_calibrated
        predictions.append(
            label_mapper(svm.predict(preprocessed_features_flat).reshape(patch_x, patch_y))
        )
        probabilities.append(
            svm.predict_proba(preprocessed_features_flat).reshape(patch_x, patch_y, -1)
        )

    # The locals are released when the function returns; the explicit ``del``
    # that used to be here failed when ``feature_names`` was empty.
    return (*predictions, *probabilities)

def create_output_image(dims_labels):
    """Return a zero-filled uint8 (band, y, x) image aligned with the global ``dataset``.

    Args:
        dims_labels: Labels of the "band" dimension; one band is allocated
            per label.

    Returns:
        xr.DataArray with dims ("band", "y", "x"), the y/x coordinates of
        ``dataset`` and its ``spatial_ref`` attached as a coordinate.
    """
    # Size the image from the dataset instead of a hard-coded 10980x10980, so
    # the data always matches the coordinate lengths.
    output_shape = (len(dims_labels), dataset.sizes["y"], dataset.sizes["x"])
    return xr.DataArray(
        # NaN cannot be represented in uint8; use an explicit 0 fill.
        np.zeros(output_shape, dtype=np.uint8),
        dims=("band", "y", "x"),
        coords={"band": dims_labels, "y": dataset["y"], "x": dataset["x"]},
    ).assign_coords({"spatial_ref": dataset.spatial_ref})
    
# Band layout: one band per SVM class followed by a final "labels" band.
band_dim_labels = [*svm.classes.tolist(), "labels"]
print(f"band dim labels: {band_dim_labels}")
output_images = {
    image_name: create_output_image(band_dim_labels)
    for image_name in (
        "predictions_SvmMc",
        "predictions_SvmMcCal",
        "predictions_SvmsBin",
        "predictions_SvmsBinSm",
    )
}

args = {}

# Rebuild a scikit-learn PCA object from the parameters stored with the model.
if svm.pca_metadata is not None:
    pca_params = svm.pca_metadata["params"]
    pca_model = PCA()
    for fitted_attr in (
        "components_",
        "n_components_",
        "explained_variance_",
        "singular_values_",
        "mean_",
        "n_samples_",
        "noise_variance_",
        "n_features_in_",
        "feature_names_in_",
    ):
        setattr(pca_model, fitted_attr, pca_params[fitted_attr])

    args["pca"] = pca_model

feature_names = list(dataset_selected.data_vars)
n_features = len(feature_names)
patch_x, patch_y = 32, 32

# Label lookups keyed by the "LC_code" column of the label table.
labels_df = pd.read_csv(parameters["labels_path"])
id2label = {row['LC_code']: row['description'] for _, row in labels_df.iterrows()}
label2id = {description: code for code, description in id2label.items()}
label_mapper = np.vectorize(lambda label: label2id[label])

args["label_mapper"] = label_mapper

# Smoke test: run process_chunk on a random array of shape (n_features, patch_x, patch_y)
features_array = np.random.rand(n_features, patch_x, patch_y).astype(np.float32)
print("Features names:", feature_names)
print(f"Input dataset: {features_array.shape}", features_array)
print("ID2label: ", id2label)
a = process_chunk(
    features_array=features_array,
    feature_names=feature_names,
    scalers=scalers,
    svm_path=parameters['model_path'],
    args=args,
)

# Shape of every returned array.
for chunk_output in a:
    print(chunk_output.shape)

In [None]:

def process_and_write_chunk(
    dataset, x_start, y_start, chunk_size, feature_names, scalers, svm_path, args, output_paths
):
    """Classify one square window of ``dataset`` and write the result to the output rasters.

    The window starts at (``x_start``, ``y_start``) and spans ``chunk_size``
    pixels along each axis. For each of the four classifier variants the
    probabilities are scaled by 255, cast to uint8, stacked with the uint8
    label map as the last band and written to ``output_paths[<variant>]``.

    Args:
        dataset: xarray dataset holding one variable per feature.
        x_start: First x index of the window.
        y_start: First y index of the window.
        chunk_size: Window edge length in pixels.
        feature_names: Names of the variables to stack, in model order.
        scalers: Mapping of feature name to fitted scaler.
        svm_path: Path of the stored model, forwarded to ``process_chunk``.
        args: Extra objects forwarded to ``process_chunk``;
            ``args['band_labels']`` lists the bands to write.
        output_paths: Mapping of variant name to the GeoTIFF path to update.
    """
    window = {
        "x": slice(x_start, x_start + chunk_size),
        "y": slice(y_start, y_start + chunk_size),
    }
    patch = dataset.isel(**window)

    # Feature axis first, followed by the two spatial axes of the patch.
    features_array = np.stack([patch[name].values for name in feature_names])

    results = process_chunk(
        features_array=features_array,
        feature_names=feature_names,
        scalers=scalers,
        svm_path=svm_path,
        args=args,
    )

    # process_chunk returns the four label maps first, then the four
    # probability arrays in the same variant order.
    variant_names = (
        "predictions_SvmMc",
        "predictions_SvmMcCal",
        "predictions_SvmsBin",
        "predictions_SvmsBinSm",
    )
    label_maps = (results[0], results[1], results[2], results[3])
    probability_maps = (results[4], results[5], results[6], results[7])

    for variant, label_map, probability_map in zip(variant_names, label_maps, probability_maps):
        labels_u8 = label_map.astype(np.uint8)
        probabilities_u8 = (probability_map * 255).astype(np.uint8)

        # Probability bands first, label band last.
        stacked = np.concatenate(
            [probabilities_u8, labels_u8[:, :, np.newaxis]], axis=-1
        )
        write_chunk_to_tif(output_paths[variant], stacked, args['band_labels'], x_start, y_start)



def create_empty_output(name, template, dim_labels, dtype=np.uint8):
    """Write an empty (band, y, x) GeoTIFF named ``<name>.tif`` and return its path.

    Args:
        name: Output path without the ".tif" extension.
        template: xarray object providing the "y"/"x" sizes and coordinates
            and, through the ``rio`` accessor, the transform and CRS.
        dim_labels: Labels of the "band" dimension; one band per label.
        dtype: Data type of the raster.

    Returns:
        The path of the written file, ``f"{name}.tif"``.
    """
    shape = (len(dim_labels), template.sizes["y"], template.sizes["x"])
    # NaN only exists for floating dtypes; integer rasters are filled with 0
    # instead of relying on an undefined NaN-to-integer cast.
    fill_value = np.nan if np.issubdtype(dtype, np.floating) else 0
    data = np.full(shape, fill_value, dtype=dtype)
    # Named ``empty_image`` so the ``da`` alias of dask.array is not shadowed.
    empty_image = xr.DataArray(
        data,
        dims=("band", "y", "x"),
        coords={"band": dim_labels, "y": template["y"], "x": template["x"]},
        attrs={"transform": template.rio.transform()},
    ).rio.write_crs(template.rio.crs)
    output_path = f"{name}.tif"
    empty_image.rio.to_raster(output_path)
    return output_path

def write_chunk_to_tif(tif_path, chunk_array, bands, x_start, y_start):
    """Write ``chunk_array`` into an existing raster, one band per entry of ``bands``.

    Args:
        tif_path: Path of the raster, opened in "r+" (update) mode.
        chunk_array: Array indexed as [row, column, band].
        bands: Iterable with one entry per band to write; only its length
            matters, band ``i`` of ``chunk_array`` goes to raster band ``i + 1``.
        x_start: Column offset of the chunk inside the raster.
        y_start: Row offset of the chunk inside the raster.
    """
    with rasterio.open(tif_path, "r+", lock=False) as dst:
        # The target window is the same for every band of the chunk.
        n_rows, n_cols = chunk_array.shape[0], chunk_array.shape[1]
        target_window = Window(x_start, y_start, n_cols, n_rows)
        for band_pos, _ in enumerate(bands):
            band_values = chunk_array[:, :, band_pos].astype(dst.dtypes[0])
            dst.write(band_values, indexes=band_pos + 1, window=target_window)

with dask.distributed.Client(
    processes=False,
    threads_per_worker=(os.cpu_count() or 2),
) as client:

    print(f"Dask dashboard: {client.dashboard_link}")

    args = {}

    # Rebuild a scikit-learn PCA object from the parameters stored with the model.
    if svm.pca_metadata is not None:
        pca_model = PCA()
        pca_model.components_ = svm.pca_metadata["params"]["components_"]
        pca_model.n_components_ = svm.pca_metadata["params"]["n_components_"]
        pca_model.explained_variance_ = svm.pca_metadata["params"]["explained_variance_"]
        pca_model.singular_values_ = svm.pca_metadata["params"]["singular_values_"]
        pca_model.mean_ = svm.pca_metadata["params"]["mean_"]
        pca_model.n_samples_ = svm.pca_metadata["params"]["n_samples_"]
        pca_model.noise_variance_ = svm.pca_metadata["params"]["noise_variance_"]
        pca_model.n_features_in_ = svm.pca_metadata["params"]["n_features_in_"]
        pca_model.feature_names_in_ = svm.pca_metadata["params"]["feature_names_in_"]

        args["pca"] = pca_model

    feature_names = list(dataset_selected.data_vars)
    n_features = len(feature_names)
    patch_x, patch_y = 32, 32

    # Label lookups keyed by the "LC_code" column of the label table.
    labels_df = pd.read_csv(parameters["labels_path"])
    id2label = {row['LC_code']: row['description'] for _, row in labels_df.iterrows()}
    label2id = {label: code for code, label in id2label.items()}
    label_mapper = np.vectorize(lambda label: label2id[label])

    args["label_mapper"] = label_mapper

    chunk_size = 512
    x_size = dataset.sizes["x"]
    y_size = dataset.sizes["y"]
    # One band per SVM class followed by a final "labels" band.
    band_dim_labels = svm.classes.tolist() + ["labels"]

    # Create the empty output images
    output_paths = {
        name: create_empty_output(name, dataset, band_dim_labels) for name in [
            "predictions_SvmMc", "predictions_SvmMcCal", "predictions_SvmsBin", "predictions_SvmsBinSm"
        ]
    }

    args['band_labels'] = band_dim_labels
    # Chunks are processed and written one after another, row by row; nothing is
    # queued as a task, so the unused ``tasks``/``task`` names were dropped.
    for y_start in range(0, y_size, chunk_size):
        for x_start in range(0, x_size, chunk_size):
            # Report the chunk origin (the old message read like a y-to-x range).
            print(f"Processing chunk at y_start={y_start}, x_start={x_start}")
            process_and_write_chunk(
                dataset=dataset_selected,
                x_start=x_start,
                y_start=y_start,
                chunk_size=chunk_size,
                feature_names=feature_names,
                scalers=scalers,
                svm_path=parameters["model_path"],
                args=args,
                output_paths=output_paths
            )



In [None]:
# Print the shape of every array held in ``a``.
for chunk_output in a:
    print(chunk_output.shape)

In [None]:
# Draft of a whole-dataset prediction through xr.apply_ufunc.
# NOTE(review): this cell looks unfinished — see the notes next to the
# apply_ufunc call and the final dask.compute below.
with dask.distributed.Client(
    processes=False,
    threads_per_worker=(os.cpu_count() or 2),
) as client:
    
    print(f"Dask dashboard: {client.dashboard_link}")

    print("\n\nInitial xarray dataset flattened:\n\n",dataset_flattened)
    
    args = {}
    
    # Rebuild a scikit-learn PCA object from the parameters stored with the model.
    if svm.pca_metadata is not None:
        pca_model = PCA()
        pca_model.components_ = svm.pca_metadata["params"]["components_"]
        pca_model.n_components_ = svm.pca_metadata["params"]["n_components_"]
        pca_model.explained_variance_ = svm.pca_metadata["params"]["explained_variance_"]
        pca_model.singular_values_ = svm.pca_metadata["params"]["singular_values_"]
        pca_model.mean_ = svm.pca_metadata["params"]["mean_"]
        pca_model.n_samples_ = svm.pca_metadata["params"]["n_samples_"]
        pca_model.noise_variance_ = svm.pca_metadata["params"]["noise_variance_"]
        pca_model.n_features_in_ = svm.pca_metadata["params"]["n_features_in_"]
        pca_model.feature_names_in_ = svm.pca_metadata["params"]["feature_names_in_"]

        args["pca"] = pca_model

    # Re-chunk the selected features into square chunks of the configured size.
    chunk_size = parameters["chunk_size"]
    dataset_selected = dataset_selected.chunk({"x": chunk_size, "y": chunk_size})
    feature_names = list(dataset_selected.data_vars.keys())

    # NOTE(review): ``tasks`` and ``chunk_positions`` are never filled in this
    # cell, and ``i``/``limited`` are not used after being set.
    tasks = []
    i = 0
    limited = parameters["chunks_limit"] is not None
    limit = parameters["chunks_limit"]
    chunk_positions = []  # Meant to hold the chunk positions in the same order as tasks

    print("Processing dataset chuncks:")
    print(f" - Limit: {limit}")
    
    # NOTE(review): ``dataset_selected`` is passed as the first positional
    # argument and no separate function is given, although apply_ufunc takes the
    # function to apply as its first argument. ``pca_model`` is also referenced
    # unconditionally, but it is only assigned above when svm.pca_metadata is set.
    predictions_da = xr.apply_ufunc(
        dataset_selected,
        kwargs={
            "svm": load_SVMS(parameters["model_path"]),
            "scalers": scalers,
            "apply_pca": True,
            "pca": pca_model,
        },
        input_core_dims=[],  # no core dimensions are declared for the inputs
        output_core_dims=[["band"]],  # output has an extra "band" dimension
        vectorize=True,
        dask="parallelized",
        output_dtypes=[np.float32],
        output_sizes={"band": 1 + len(svm.classes)},  # 1 prediction band + N probability bands
    )

            
    # ``tasks`` is still empty here, so this computes nothing and ``results``
    # is an empty tuple.
    print(f"Predictions - Computing {len(tasks)} Dask's task")
    results = dask.compute(*tasks)

In [None]:
# Assemble the per-chunk results held in ``results`` into full-size images and
# save them as GeoTIFFs, inside a threads-only Dask client.
with dask.distributed.Client(
    processes=False,
    threads_per_worker=(os.cpu_count() or 2),
) as client:
    
    print(f"Dask dashboard: {client.dashboard_link}")
    # One band per SVM class followed by a final "labels" band.
    band_dim_labels = svm.classes.tolist() + ["labels"]

    def create_output_image():
        """Return a zero-filled uint8 (band, y, x) image aligned with ``dataset``."""
        # Size the image from the dataset instead of a hard-coded 10980x10980,
        # so the data always matches the coordinate lengths.
        output_shape = (len(band_dim_labels), dataset.sizes["y"], dataset.sizes["x"])
        return xr.DataArray(
            # NaN cannot be represented in uint8; use an explicit 0 fill.
            np.zeros(output_shape, dtype=np.uint8),
            dims=("band", "y", "x"),
            coords={"band": band_dim_labels, "y": dataset["y"], "x": dataset["x"]},
        ).assign_coords({"spatial_ref": dataset.spatial_ref})
        
    output_images = {
        "predictions_SvmMc": create_output_image(),
        "predictions_SvmMcCal": create_output_image(),
        "predictions_SvmsBin": create_output_image(),
        "predictions_SvmsBinSm": create_output_image(),
    }

    print("Saving land cover maps to disk")

    # Ensure the number of results matches the chunk positions
    if len(results) != len(chunk_positions):
        print("Mismatch between computed results and expected chunk positions!")
        raise ValueError("Computed results do not match expected chunk positions.")

    # Assign values based on chunk positions. Every result is unpacked as three
    # label maps followed by three probability arrays; the softmax variant is
    # disabled, so "predictions_SvmsBinSm" keeps its initial fill.
    for i, ((predictions_SvmMc, 
            predictions_SvmMcCal, 
            predictions_SvmsBin, 
            #predictions_SvmsBinSm,
            predProb_SvmMc, 
            predProb_SvmMcCal,
            predProb_SvmsBin,
            #predProb_SvmsBinSm
            ), 
            (x_start, y_start)) in enumerate(zip(results, chunk_positions)):
        
        print(f"Writing results for chunk {i+1} at x={x_start}-{x_start+chunk_size}, y={y_start}-{y_start+chunk_size}")

        # Get Numpy arrays from Dask arrays
        pred_svm_mc = predictions_SvmMc.compute()
        pred_svm_mc_cal = predictions_SvmMcCal.compute()
        pred_svms_bin = predictions_SvmsBin.compute()
        #pred_svms_bin_sm = predictions_SvmsBinSm.compute()
        predProb_svm_mc = predProb_SvmMc.compute()
        predProb_svm_mc_cal = predProb_SvmMcCal.compute()
        predProb_svms_bin = predProb_SvmsBin.compute()
        #predProb_svms_bin_sm = predProb_SvmsBinSm.compute()

        # Write the label maps into the "labels" band of each image.
        # NOTE(review): this relies on .sel(...).isel(...).data being a view of
        # the image buffer so that the in-place assignment reaches it — confirm.
        output_images["predictions_SvmMc"].sel(band = 'labels').isel(
            y=slice(y_start, y_start + chunk_size), x=slice(x_start, x_start + chunk_size)
        ).data[:] = pred_svm_mc

        output_images["predictions_SvmMcCal"].sel(band = 'labels').isel(
            y=slice(y_start, y_start + chunk_size), x=slice(x_start, x_start + chunk_size)
        ).data[:] = pred_svm_mc_cal

        output_images["predictions_SvmsBin"].sel(band = 'labels').isel(
            y=slice(y_start, y_start + chunk_size), x=slice(x_start, x_start + chunk_size)
        ).data[:] = pred_svms_bin

        #output_images["predictions_SvmsBinSm"].sel(band = 'labels').isel(
            #y=slice(y_start, y_start + chunk_size), x=slice(x_start, x_start + chunk_size)
        #).data[:] = pred_svms_bin_sm

        # Write class-wise probabilities, scaled by 255 and cast to uint8
        for class_index, class_name in enumerate(svm.classes):
            output_images["predictions_SvmMc"].sel(band=class_name).isel(
                y=slice(y_start, y_start + chunk_size),
                x=slice(x_start, x_start + chunk_size)
            ).data[:] = (predProb_svm_mc[:, :, class_index] * 255).astype(np.uint8)

            output_images["predictions_SvmMcCal"].sel(band=class_name).isel(
                y=slice(y_start, y_start + chunk_size),
                x=slice(x_start, x_start + chunk_size)
            ).data[:] = (predProb_svm_mc_cal[:, :, class_index] * 255).astype(np.uint8)

            output_images["predictions_SvmsBin"].sel(band=class_name).isel(
                y=slice(y_start, y_start + chunk_size),
                x=slice(x_start, x_start + chunk_size)
            ).data[:] = (predProb_svms_bin[:, :, class_index] * 255).astype(np.uint8)
            
            #output_images["predictions_SvmsBinSm"].sel(band=class_name).isel(
            #    y=slice(y_start, y_start + chunk_size),
            #    x=slice(x_start, x_start + chunk_size)
            #).data[:] = (predProb_svms_bin_sm[:, :, class_index] * 255).astype(np.uint8)
            
    print("Writing complete. Now saving images.")

    storage_paths = []
    
    # One GeoTIFF per image: <classified_image_path>_<image name>.tif
    for name, image in output_images.items():
        
        output_path = f"{parameters['classified_image_path']}_{name}.tif"
        image.rio.to_raster(output_path)  
        storage_paths.append(output_path)
          
        print(f"Saved classified image: {output_path}")

    print("All classification images saved successfully.")


## Final procedure


In [None]:
# Classify the tile chunk by chunk inside an in-process Dask client
# (processes=False); the client is closed when the `with` block exits.
with dask.distributed.Client(
    processes=False,
    threads_per_worker=4,
) as client:

    print(f"Dask dashboard: {client.dashboard_link}")
    print("\n\nInitial xarray dataset flattened:\n\n",dataset_flattened)

    # Load the model before anything is derived from it, so that the PCA, the
    # band labels and the per-chunk feature selection all describe the same
    # model that is handed to process_chunk (previously the PCA and the band
    # labels were read from whatever `svm` an earlier cell had left behind).
    svm = load_SVMS(parameters['model_path'])
    args = {'svm': svm}

    # Rebuild a PCA estimator from the parameters stored with the model by
    # restoring each saved attribute onto a fresh PCA instance.
    if svm.pca is not None:
        pca_params = svm.pca["params"]
        pca_model = PCA()
        for attribute in (
            "components_",
            "n_components_",
            "explained_variance_",
            "singular_values_",
            "mean_",
            "n_samples_",
            "noise_variance_",
            "n_features_in_",
            "feature_names_in_",
        ):
            setattr(pca_model, attribute, pca_params[attribute])
        args["pca"] = pca_model

    transform = dataset_selected.rio.transform()
    crs = dataset_selected.rio.crs
    # NOTE(review): the raster size is hard-coded; presumably it matches the
    # pixel dimensions of dataset_selected - confirm before using other tiles.
    height, width = 10980, 10980

    prefix = f"{parameters['tile_id']}_{parameters['composites_year']}_{parameters['chunks_limit']}r"
    svms_to_use = parameters["svms_to_use"]

    # One output GeoTIFF per requested SVM variant (key -> file-name suffix).
    prediction_suffixes = {
        "svmMc": "SvmMc",
        "svmMcCal": "SvmMcCal",
        "svmsBin": "SvmsBin",
        "svmsBinSoftmax": "SvmsBinSoftmax",
    }
    output_paths = {
        variant: f"{parameters['output_path']}/{prefix}_predictions_{suffix}.tif"
        for variant, suffix in prediction_suffixes.items()
        if variant in svms_to_use
    }

    # One band per class plus a final "labels" band.
    band_labels = svm.classes.tolist() + ["labels"]
    print(f"Band labels ({len(band_labels)}): {band_labels}")

    for path in output_paths.values():
        create_empty_tif(path, width, height, len(band_labels), "uint8", transform, crs, band_labels)

    # Define the patch size
    chunk_size = parameters["chunk_size"]
    dataset_selected = dataset_selected.chunk({"x": chunk_size, "y": chunk_size})
    feature_names = list(dataset_selected.data_vars.keys())

    # Lookup tables between LC codes and class descriptions.
    labels_df = pd.read_csv(parameters["labels_path"])
    id2label = {row['LC_code']: row['description'] for _, row in labels_df.iterrows()}
    label2id = {label: lc_code for lc_code, label in id2label.items()}
    label_mapper = np.vectorize(lambda label: label2id[label])
    args["label_mapper"] = label_mapper

    # chunks_limit is optional: None means "process every chunk".
    limit = parameters["chunks_limit"]
    limited = limit is not None
    print("Processing dataset chunks:")
    print(f" - Limit: {limit}")

    # Flattened (x, y) chunk origins, x-major, so a single `break` is enough
    # to stop once the limit is reached.
    chunk_origins = (
        (x_start, y_start)
        for x_start in range(0, width, chunk_size)
        for y_start in range(0, height, chunk_size)
    )

    chunk_index = 0
    for x_start, y_start in chunk_origins:
        start = time.perf_counter()
        print(f"Processing chunk {chunk_index}")
        patch = dataset_selected.isel(
            x=slice(x_start, x_start + chunk_size),
            y=slice(y_start, y_start + chunk_size)
        )

        # Materialise only the features the model was trained on.
        features_array = da.stack([patch[var] for var in svm.features_selected]).compute()
        print(features_array.shape)
        args['task_id'] = chunk_index
        process_chunk(
            features_array, feature_names, scalers, args, x_start, y_start, output_paths, svms_to_use
        )

        elapsed = time.perf_counter() - start
        print(f"Chunk {chunk_index} processed in {elapsed:.2f} seconds")

        # Drop the chunk's array before the next one is computed.
        del features_array
        chunk_index += 1

        if limited and limit == chunk_index:
            break

    print(f"Finished processing all {chunk_index} chunk(s).")

## Plot land cover images and store them as .png

In [None]:

# Hard-coded locations of the classified maps for tile 21KUQ (2019) and of the
# map produced by the previous pipeline.
# NOTE(review): these paths are not derived from `parameters`; the "1r" part
# presumably corresponds to chunks_limit=1 - confirm they match the run that
# generated the files before relying on these plots.
svm_mc_path = "/media/datapart/lucazanolo/SVM/lc_maps/21KUQ_2019_1r_predictions_SvmMc.tif"
svm_mccal_path = "/media/datapart/lucazanolo/SVM/lc_maps/21KUQ_2019_1r_predictions_SvmMcCal.tif"
svms_binary_path = "/media/datapart/lucazanolo/SVM/lc_maps/21KUQ_2019_1r_predictions_SvmsBin.tif"
old_map_path = "/media/datapart/lucazanolo/data/training_points/ESACCI-HRLC-L4-MAP-CL01-A02T21KUQ-10m-P1Y-2019-fv01.1.tif"

# Plot the multiclass SVM map (the calibrated and binary variants are
# currently disabled) next to the previous pipeline's map; png_images_path is
# passed to both helpers.
mc_map = load_map_with_probs_and_plot(svm_mc_path, band_labels, labels_df, "Multiclass SVM prediction", png_images_path)    
#mccal_map = load_map_with_probs_and_plot(svm_mccal_path, band_labels, labels_df, "Multiclass Calibrated SVM prediction", png_images_path)    
#svmsbin_map = load_map_with_probs_and_plot(svms_binary_path, band_labels, labels_df, "Binary SVMs predictions", png_images_path)    
old_map = load_map_and_plot(old_map_path, labels_df, "Previous Pipeline predictions", png_images_path)

In [None]:
# Legend table with the land-cover classes.
labels_df = pd.read_csv(parameters["labels_path"])

# One band per class followed by the final "labels" band.
band_labels = [*svm.classes.tolist(), "labels"]

# File-name prefix shared by every map of this run.
prefix = "{}_{}_{}r".format(
    parameters['tile_id'], parameters['composites_year'], parameters['chunks_limit']
)

# Map title -> GeoTIFF path for each SVM variant.
output_paths = {
    f"predictions_{variant}": f"{parameters['output_path']}/{prefix}_predictions_{variant}.tif"
    for variant in ("SvmMc", "SvmMcCal", "SvmsBin")
}

# Plot every map, using its dictionary key as the title.
for map_title, tif_path in output_paths.items():
    _ = load_map_with_probs_and_plot(tif_path, band_labels, labels_df, map_title, png_images_path)

In [None]:
# Plot the baseline map only when the configuration provides one.
baseline_map_path = parameters['baseline_lc_map_path']
if baseline_map_path is not None:
    _ = load_map_and_plot(baseline_map_path, labels_df, "Previous Pipeline predictions", png_images_path)

## Inspect land cover images

In [None]:

# Legend table with the land-cover classes.
labels_df =  pd.read_csv(parameters["labels_path"])

# Map produced by the existing pipeline (hard-coded path, single "labels" band).
# NOTE(review): elsewhere in this notebook the baseline map is loaded with
# load_map_and_plot and the SVM maps with load_map_with_probs_and_plot; here
# the two helpers are used the other way round - confirm this is intended.
lc_map_old = load_map_with_probs_and_plot(
    lc_map_path="/media/datapart/lucazanolo/data/training_points/ESACCI-HRLC-L4-MAP-CL01-A02T21KUQ-10m-P1Y-2019-fv01.1.tif",
    labels=["labels"],
    labels_df=labels_df,
    title="Existing pipeline - LC Map"
)

# Maps from the new pipeline: multiclass SVM and its calibrated variant.
# NOTE(review): paths are hard-coded; "Noner" presumably comes from a run with
# chunks_limit=None - verify against the files on disk.
lc_map_new_svmMc = load_map_and_plot("/media/datapart/lucazanolo/SVM/lc_maps/best_svm/lc_maps/21KUQ_2019_Noner_predictions_SvmMc.tif", labels_df)
lc_map_new_svmMcCal = load_map_and_plot("/media/datapart/lucazanolo/SVM/lc_maps/best_svm/lc_maps/21KUQ_2019_Noner_predictions_SvmMcCal.tif", labels_df)

# Maps from the new pipeline: binary SVMs, without and with softmax.
lc_map_new_svms = load_map_and_plot("/media/datapart/lucazanolo/SVM/lc_maps/best_svms/lc_maps/21KUQ_2019_Noner_predictions_SvmsBin.tif", labels_df)
lc_map_new_svmssoft = load_map_and_plot("/media/datapart/lucazanolo/SVM/lc_maps/best_svms/lc_maps/21KUQ_2019_Noner_predictions_SvmsBinSoftmax.tif", labels_df)


In [None]:
def plot_lc_map_with_histogram(
    lc_array: np.ndarray,
    labels_df: pd.DataFrame,
    title: str = "",
    save_path: str = None,
    *,
    white_text_min_pixels: float = 15e6,
):
    """Plot a land-cover map above a bar chart of its class pixel counts.

    The map is rendered through ``map_lc_codes_to_rgba`` and the chart shows
    one horizontal bar per distinct code found in ``lc_array``, coloured like
    the map and labelled with the class description written inside the bar.

    Args:
        lc_array: Array of land-cover codes (assumed to be a 2-D raster, since
            the RGBA image derived from it is shown with ``imshow`` - TODO
            confirm).
        labels_df: Legend table with the columns ``LC_code``, ``description``,
            ``R``, ``G``, ``B`` and ``A``; colour channels are divided by 255.
        title: Title displayed above the map.
        save_path: When truthy, the figure is also written to this path
            (300 dpi, tight bounding box).
        white_text_min_pixels: Bars longer than this many pixels get white
            label text, shorter ones black. The default keeps the previous
            hard-coded value; lower it for rasters smaller than a full tile.
    """
    # Build color map
    color_map = {
        row.LC_code: np.array([row.R, row.G, row.B, row.A]) / 255.0
        for _, row in labels_df.iterrows()
    }

    # Generate RGBA image
    rgba_image = map_lc_codes_to_rgba(lc_array, color_map)

    # Compute class histogram
    unique_codes, counts = np.unique(lc_array, return_counts=True)

    descriptions = []
    bar_colors = []

    for code in unique_codes:
        legend_rows = labels_df[labels_df['LC_code'] == code]
        if legend_rows.empty:
            # Code missing from the legend: generic label, neutral grey bar.
            descriptions.append(f"Code {code}")
            bar_colors.append([0.5, 0.5, 0.5])
            continue
        # The first matching legend row wins if a code is listed twice.
        descriptions.append(legend_rows['description'].values[0])
        bar_colors.append(legend_rows[['R', 'G', 'B']].values[0] / 255.0)

    # Create combined figure with constrained_layout to align widths
    fig, (map_ax, hist_ax) = plt.subplots(
        2, 1,
        figsize=(14, 18),
        gridspec_kw={'height_ratios': [3, 2]},
        constrained_layout=True
    )

    # Plot LC map
    map_ax.imshow(rgba_image)
    map_ax.set_title(title, fontsize=18, pad=15)
    map_ax.axis("off")

    # Plot histogram
    bars = hist_ax.barh(range(len(descriptions)), counts, color=bar_colors)
    hist_ax.set_xlabel("Number of Pixels", fontsize=14)
    hist_ax.invert_yaxis()
    hist_ax.set_title("Predicted classes distribution", fontsize=16, pad=10)

    # Class names are written inside the bars, so the y ticks are removed.
    hist_ax.set_yticks([])
    hist_ax.set_yticklabels([])

    for description, bar in zip(descriptions, bars):
        width = bar.get_width()
        hist_ax.text(
            width * 0.01,                        # Slightly after the start of the bar
            bar.get_y() + bar.get_height() / 2,  # Vertical center of the bar
            f"  {description}",
            va='center', ha='left',
            fontsize=12,
            fontweight='bold',
            # Long bars sit behind the text, short ones leave it on the background.
            color='white' if width > white_text_min_pixels else 'black'
        )

    if save_path:
        # Save through the figure handle rather than pyplot's implicit
        # "current figure" state.
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Saved combined report to {save_path}")

    plt.show()




# Render one map + histogram report per loaded land-cover map; the PNG files
# are written relative to the current working directory.
plot_lc_map_with_histogram(lc_map_old, labels_df, title="21KUQ 2019 - LC map with existing pipeline\n", save_path="lcmap_old.png")
plot_lc_map_with_histogram(lc_map_new_svmMc, labels_df, title="21KUQ 2019 - LC map with new pipeline - Multiclass SVM\n", save_path="lcmap_new_svmMc.png")
plot_lc_map_with_histogram(lc_map_new_svmMcCal, labels_df, title="21KUQ 2019 - LC map with new pipeline - Multiclass SVM Calibrated\n", save_path="lcmap_new_svmMcCal.png")
# The binary-SVMs map comes from the new pipeline; its title previously said
# "existing pipeline" (copy-paste error).
plot_lc_map_with_histogram(lc_map_new_svms, labels_df, title="21KUQ 2019 - LC map with new pipeline - SVMs\n", save_path="lcmap_new_svms.png")
plot_lc_map_with_histogram(lc_map_new_svmssoft, labels_df, title="21KUQ 2019 - LC map with new pipeline - SVMs with Softmax\n", save_path="lcmap_new_svmssoft.png")
