In [None]:
## Load data 

In [None]:
import geopandas as gpd
import numpy as np

# Load grid data
grid = gpd.read_file("/path/to/your/data/data.gpkg")

# Conditions shared by both classes: vegetation <= 0.95, ghsl >= 0.5, osm <= 0.5.
eligible = (
    (grid["vegetation"] <= 0.95)
    & (grid["ghsl"] >= 0.5)
    & (grid["osm"] <= 0.5)
)

# Assign binary labels based on conditions:
#   1   -> eligible and favelas > 0.9
#   0   -> eligible and favelas == 0
#   NaN -> everything else (left unlabeled and dropped below)
grid["label"] = np.where(
    eligible & (grid["favelas"] > 0.9),
    1,
    np.where(eligible & (grid["favelas"] == 0), 0, np.nan),
)

# Keep only labeled data. Take an explicit copy so the column assignments
# below modify an independent frame rather than a slice of `grid`
# (this avoids pandas' SettingWithCopyWarning).
dataset = grid[grid["label"].notna()].copy()

# Load zones shapefile
zones = gpd.read_file("/path/to/your/data/zones.shp")

# The spatial join compares coordinates, so both layers must use the same CRS.
if grid.crs is not None and zones.crs is not None and zones.crs != grid.crs:
    zones = zones.to_crs(grid.crs)

# Assign each cell to a zone based on centroid position
dataset["centroid"] = dataset.geometry.centroid
points_zones = gpd.sjoin(
    dataset.set_geometry("centroid"),
    zones[["fid", "geometry"]],
    how="left",
    predicate="within",
)

# A centroid that falls inside several (overlapping) zone polygons yields one
# joined row per match, i.e. repeated index labels. Keep the first match so the
# index-aligned assignment below stays one-to-one.
points_zones = points_zones[~points_zones.index.duplicated(keep="first")]

dataset["zone"] = points_zones["fid"]
dataset = dataset.drop(columns=["centroid"])

# Cells whose centroid lies in no zone have a NaN zone after the left join.
dataset = dataset[dataset["zone"].notna()]

In [None]:
## Plot feature importances and correlation matrix 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Font configuration
font_path = "/usr/share/fonts/truetype/cmu/cmunrm.ttf"
font_manager.fontManager.addfont(font_path)
font_prop = font_manager.FontProperties(fname=font_path)
plt.rcParams["font.family"] = font_prop.get_name()

# Feature columns
feature_cols = [
    "vegetation", "slope", "profile_co", "entropy",
    "nodes", "roads", "mean_conne", "min_connex", "max_connex"
]

# Tick labels, listed in the same order as feature_cols. Defined once and
# shared by the vertical bar plot and both axes of the correlation matrix.
feature_labels = [
    "Vegetation", "Slope", "Profile convexity", "Entropy",
    "OSM nodes", "OSM roads", "Mean connectivity",
    "Minimum connectivity", "Maximum connectivity"
]

# The horizontal bar plot uses a different wording for "nodes" and "roads".
feature_labels_horizontal = [
    "Vegetation", "Slope", "Profile convexity", "Entropy",
    "Total street intersections", "Total street length",
    "Mean connectivity", "Minimum connectivity", "Maximum connectivity"
]

tick_positions = range(len(feature_cols))

# Prepare data. A vectorised conversion replaces the former row-by-row
# `apply(lambda row: row.tolist(), axis=1)`; dtype=float keeps the float64
# matrix that the list-of-lists conversion produced.
X = dataset[feature_cols].to_numpy(dtype=float)
y = dataset["label"].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
importances = model.feature_importances_

# Vertical bar plot
plt.figure(figsize=(6, 5))
plt.bar(tick_positions, importances, align="center", color="steelblue")
plt.xticks(tick_positions, feature_labels, rotation=90, fontsize=11)
plt.yticks(fontsize=11)
plt.ylabel("Feature Importance", fontsize=12)
plt.tight_layout()
plt.savefig("feature_importances.png", dpi=300, bbox_inches="tight")
plt.show()

# Correlation matrix (computed between feature columns over the full X,
# not only the training split)
correlation_matrix = np.corrcoef(X, rowvar=False)
plt.figure(figsize=(7, 5))
plt.imshow(correlation_matrix, cmap="Reds", interpolation="none")

cbar = plt.colorbar()
cbar.ax.tick_params(labelsize=11)

plt.xticks(tick_positions, feature_labels, rotation=90, fontsize=10)
plt.yticks(tick_positions, feature_labels, fontsize=10)
plt.grid(False)
plt.tight_layout()
plt.savefig("correlation_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

# Horizontal bar plot
plt.figure(figsize=(6, 4))
plt.barh(tick_positions, importances, align="center", color="steelblue")
plt.yticks(tick_positions, feature_labels_horizontal, fontsize=11)
plt.xticks(fontsize=11)
plt.xlabel("Feature Importance", fontsize=12)
plt.ylabel("Features", fontsize=12)
plt.tight_layout()
plt.savefig("feature_importances_horizontal.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
## Simple cross-validation 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.model_selection import cross_validate
import numpy as np

# Feature columns used for training
f_cols = [
    "vegetation", "slope", "profile_co", "entropy",
    "nodes", "roads", "mean_conne", "min_connex", "max_connex"
]

# Seeded generator for the shuffle below. The downsampling and the classifier
# were already seeded; the shuffle previously used the global NumPy RNG, so the
# fold composition (and therefore the scores) changed from run to run.
rng = np.random.RandomState(42)

# Extract features and labels
features = dataset[f_cols].to_numpy()
labels = dataset["label"].to_numpy()

# Separate by class
class_0 = features[labels == 0]
class_1 = features[labels == 1]

# Balance dataset by downsampling the majority class (without replacement)
# to the size of the other class. On a tie, class 1 is the one resampled,
# which only reorders it.
if len(class_0) > len(class_1):
    class_0 = resample(class_0, replace=False, n_samples=len(class_1), random_state=42)
else:
    class_1 = resample(class_1, replace=False, n_samples=len(class_0), random_state=42)

features_balanced = np.vstack([class_0, class_1])
labels_balanced = np.hstack([np.zeros(len(class_0)), np.ones(len(class_1))])

# Shuffle data to remove ordering bias: after stacking, all class-0 rows come
# first, and cross_validate with an integer cv does not shuffle by itself.
shuffle_idx = rng.permutation(len(labels_balanced))
features_balanced = features_balanced[shuffle_idx]
labels_balanced = labels_balanced[shuffle_idx]

# Initialize classifier
model = RandomForestClassifier(random_state=42)

# Define evaluation metrics
scoring = {
    "f1": "f1",
    "precision": "precision",
    "recall": "recall"
}

# Perform 5-fold cross-validation
scores = cross_validate(model, features_balanced, labels_balanced, cv=5, scoring=scoring)

# Display results (mean ± standard deviation across the 5 folds)
print(f"Precision: {np.mean(scores['test_precision']):.2f} ± {np.std(scores['test_precision']):.2f}")
print(f"Recall:    {np.mean(scores['test_recall']):.2f} ± {np.std(scores['test_recall']):.2f}")
print(f"F1-score:  {np.mean(scores['test_f1']):.2f} ± {np.std(scores['test_f1']):.2f}")

In [None]:
## Spatial cross-validation 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

# Get the list of unique zones and reverse order
zones = dataset["zone"].unique().tolist()
zones.reverse()

# Feature columns
f_cols = [
    "vegetation", "slope", "profile_co", "entropy",
    "nodes", "roads", "mean_conne", "min_connex", "max_connex"
]

# Seeded generator shared by the downsampling and the shuffling, so that this
# cell gives the same folds on every run (the classifier is already seeded).
rng = np.random.RandomState(42)

# One class-balanced fold per zone; folds[k] holds the samples of zones[k].
folds = []

for zone_id in zones:
    dataset_zone = dataset[dataset["zone"] == zone_id]
    X, y = dataset_zone[f_cols].values, dataset_zone["label"].values

    class_0 = X[y == 0]
    class_1 = X[y == 1]

    # Balance classes by downsampling the larger class without replacement
    if len(class_0) > len(class_1):
        class_0 = resample(class_0, replace=False, n_samples=len(class_1), random_state=rng)
    else:
        class_1 = resample(class_1, replace=False, n_samples=len(class_0), random_state=rng)

    X_balanced = np.vstack([class_0, class_1])
    y_balanced = np.hstack([np.zeros(len(class_0)), np.ones(len(class_1))])

    # Shuffle to remove order bias
    p = rng.permutation(len(y_balanced))
    X_balanced, y_balanced = X_balanced[p], y_balanced[p]

    folds.append([X_balanced, y_balanced])

# Initialize metric lists
f1_scores, precision_scores, recall_scores = [], [], []

# Cross-validation by leave-one-zone-out
for i, zone_id in enumerate(zones):
    X_test, y_test = folds[i]
    X_train = np.vstack([fold[0] for j, fold in enumerate(folds) if j != i])
    y_train = np.hstack([fold[1] for j, fold in enumerate(folds) if j != i])

    # Train classifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Report the id of the held-out zone. The fold position (i + 1) only
    # equals the zone id when `zones` happens to be exactly [1, 2, ..., N].
    print(f"Zone {int(zone_id)}: Precision={precision:.2f}, Recall={recall:.2f}, F1={f1:.2f}")

# Global performance summary (mean ± standard deviation across zones)
print("\nOverall performance:")
print(f"Precision: {np.mean(precision_scores):.2f} ± {np.std(precision_scores):.2f}")
print(f"Recall:    {np.mean(recall_scores):.2f} ± {np.std(recall_scores):.2f}")
print(f"F1-score:  {np.mean(f1_scores):.2f} ± {np.std(f1_scores):.2f}")

In [None]:
## Multiple spatial cross-validations 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import f1_score, precision_score, recall_score, cohen_kappa_score
import numpy as np

# Get the list of unique zones and reverse the order
zones = dataset["zone"].unique().tolist()
zones.reverse()

# Feature columns
f_cols = [
    "vegetation", "slope", "profile_co", "entropy",
    "nodes", "roads", "mean_conne", "min_connex", "max_connex"
]

# Single seeded generator driving the downsampling, the shuffling and the
# forests. Rounds still differ from one another (the generator keeps
# advancing), but the cell as a whole is reproducible.
rng = np.random.RandomState(42)

# Evaluation metrics per zone: entry k collects one value per round for zones[k]
f1_scores = [[] for _ in range(len(zones))]
precision_scores = [[] for _ in range(len(zones))]
recall_scores = [[] for _ in range(len(zones))]
kappa_scores = [[] for _ in range(len(zones))]

# Perform 10 rounds of leave-one-zone-out cross-validation
for _ in range(10):

    # Build one class-balanced fold per zone; folds[k] belongs to zones[k]
    folds = []

    for z in zones:
        dataset_zone = dataset[dataset["zone"] == z]
        X, y = dataset_zone[f_cols].values, dataset_zone["label"].values

        class_0 = X[y == 0]
        class_1 = X[y == 1]

        # Balance classes by downsampling the larger class without replacement
        if len(class_0) > len(class_1):
            class_0 = resample(class_0, replace=False, n_samples=len(class_1), random_state=rng)
        else:
            class_1 = resample(class_1, replace=False, n_samples=len(class_0), random_state=rng)

        X_balanced = np.vstack([class_0, class_1])
        y_balanced = np.hstack([np.zeros(len(class_0)), np.ones(len(class_1))])

        # Shuffle the data to avoid order bias
        p = rng.permutation(len(y_balanced))
        X_balanced, y_balanced = X_balanced[p], y_balanced[p]

        folds.append([X_balanced, y_balanced])

    # Train and test the model for each fold
    for i in range(len(folds)):
        X_test, y_test = folds[i][0], folds[i][1]

        # Use all other folds as training data
        X_train = np.vstack([fold[0] for j, fold in enumerate(folds) if j != i])
        y_train = np.hstack([fold[1] for j, fold in enumerate(folds) if j != i])

        # Train the classifier
        model = RandomForestClassifier(random_state=rng)
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Compute evaluation metrics
        precision_scores[i].append(precision_score(y_test, y_pred))
        recall_scores[i].append(recall_score(y_test, y_pred))
        f1_scores[i].append(f1_score(y_test, y_pred))
        kappa_scores[i].append(cohen_kappa_score(y_test, y_pred))

# Display performance metrics for each zone. The label is the actual zone id:
# the list position (i + 1) only matches it when `zones` is exactly [1, ..., N].
for i, zone_id in enumerate(zones):
    print(f"Zone {int(zone_id)} - Precision: {np.mean(precision_scores[i]):.2f} ± {np.std(precision_scores[i]):.2f}")
    print(f"Zone {int(zone_id)} - Recall: {np.mean(recall_scores[i]):.2f} ± {np.std(recall_scores[i]):.2f}")
    print(f"Zone {int(zone_id)} - F1-score: {np.mean(f1_scores[i]):.2f} ± {np.std(f1_scores[i]):.2f}")
    print(f"Zone {int(zone_id)} - Kappa: {np.mean(kappa_scores[i]):.2f} ± {np.std(kappa_scores[i]):.2f}\n")

# Compute overall performance metrics: mean and standard deviation of the
# per-zone means (each zone's mean is taken over the 10 rounds)
print(f"Overall Precision: {np.mean([np.mean(f) for f in precision_scores]):.2f} ± {np.std([np.mean(f) for f in precision_scores]):.2f}")
print(f"Overall Recall: {np.mean([np.mean(f) for f in recall_scores]):.2f} ± {np.std([np.mean(f) for f in recall_scores]):.2f}")
print(f"Overall F1-score: {np.mean([np.mean(f) for f in f1_scores]):.2f} ± {np.std([np.mean(f) for f in f1_scores]):.2f}")
print(f"Overall Kappa: {np.mean([np.mean(f) for f in kappa_scores]):.2f} ± {np.std([np.mean(f) for f in kappa_scores]):.2f}")

In [None]:
## Ensemble approach with spatial cross-validation  

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import f1_score, precision_score, recall_score
from tqdm import tqdm

# Get the unique zones and reverse the order
zones = dataset['zone'].unique().tolist()
zones.reverse()

# Per-model scores. Everything below is indexed by the zone's POSITION in
# `zones` (entry k <-> zones[k]), never by `zone_id - 1`: indexing by id is
# only correct when the ids are exactly 1..N in that order, and otherwise
# reads another zone's results or raises IndexError.
f1_scores = [[] for _ in range(len(zones))]
precision_scores = [[] for _ in range(len(zones))]
recall_scores = [[] for _ in range(len(zones))]

# Store predictions for ensemble evaluation; preds[k] belongs to zones[k].
# Each entry is a 2-D array with columns: id, true label, then one column of
# predictions per ensemble member.
preds = []

# Feature columns
feature_cols = [
    'vegetation', 'slope', 'profile_co', 'entropy', 'nodes',
    'roads', 'mean_conne', 'min_connex', 'max_connex'
]

# Seeded generator for all downsampling and shuffling, so the cell is
# reproducible. Ensemble members still differ because every draw advances it.
rng = np.random.RandomState(42)

# Iterate over each zone for spatial cross-validation
for k, test_zone in enumerate(zones):

    # Split dataset into training and testing based on zones
    train_dataset = dataset[dataset['zone'] != test_zone]
    test_dataset = dataset[dataset['zone'] == test_zone]

    X_train, y_train = train_dataset[feature_cols].values, train_dataset['label'].values
    X_test, y_test = test_dataset[feature_cols].values, test_dataset['label'].values
    test_ids = test_dataset['id'].values

    # Identify the two classes in the test set
    class_0 = X_test[y_test == 0]
    class_1 = X_test[y_test == 1]

    ids_class_0 = test_ids[y_test == 0]
    ids_class_1 = test_ids[y_test == 1]

    # Balance the test dataset through downsampling; features and ids are
    # resampled together so that they stay aligned row by row
    if len(class_0) > len(class_1):
        class_0, ids_class_0 = resample(
            class_0, ids_class_0, replace=False, n_samples=len(class_1), random_state=rng
        )
    else:
        class_1, ids_class_1 = resample(
            class_1, ids_class_1, replace=False, n_samples=len(class_0), random_state=rng
        )

    X_test_balanced = np.vstack([class_0, class_1])
    y_test_balanced = np.hstack([np.zeros(len(class_0)), np.ones(len(class_1))])
    ids_test_balanced = np.hstack([ids_class_0, ids_class_1])

    # Shuffle the test dataset
    p = rng.permutation(len(y_test_balanced))
    X_test_balanced, y_test_balanced, ids_test_balanced = X_test_balanced[p], y_test_balanced[p], ids_test_balanced[p]

    # The class split of the training data is the same for all ensemble
    # members, so compute it once instead of inside the loop
    train_class_0 = X_train[y_train == 0]
    train_class_1 = X_train[y_train == 1]

    # Predictions of each ensemble member on the balanced test set
    member_preds = []

    # Train 100 models for the ensemble approach
    for _ in tqdm(range(100), desc=f"Zone {int(test_zone)}"):

        # Balance the training dataset with a fresh random downsample
        if len(train_class_0) > len(train_class_1):
            bal_0 = resample(train_class_0, replace=False, n_samples=len(train_class_1), random_state=rng)
            bal_1 = train_class_1
        else:
            bal_0 = train_class_0
            bal_1 = resample(train_class_1, replace=False, n_samples=len(train_class_0), random_state=rng)

        X_train_balanced = np.vstack([bal_0, bal_1])
        y_train_balanced = np.hstack([np.zeros(len(bal_0)), np.ones(len(bal_1))])

        # Shuffle the training dataset
        p = rng.permutation(len(y_train_balanced))
        X_train_balanced, y_train_balanced = X_train_balanced[p], y_train_balanced[p]

        # Train the model
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train_balanced, y_train_balanced)

        # Predict on the balanced test set
        y_pred = model.predict(X_test_balanced)

        # Store predictions for ensemble evaluation
        member_preds.append(y_pred)

        # Compute and store evaluation metrics
        f1_scores[k].append(f1_score(y_test_balanced, y_pred))
        precision_scores[k].append(precision_score(y_test_balanced, y_pred))
        recall_scores[k].append(recall_score(y_test_balanced, y_pred))

    # Assemble the prediction matrix once per zone instead of re-allocating it
    # with column_stack after every model
    preds.append(np.column_stack([ids_test_balanced, y_test_balanced] + member_preds))

    # Print per-zone results (mean and standard deviation over the 100 models)
    print(f"Average Precision (Zone {int(test_zone)}): {np.mean(precision_scores[k]):.2f} +/- {np.std(precision_scores[k]):.2f}")
    print(f"Average Recall (Zone {int(test_zone)}): {np.mean(recall_scores[k]):.2f} +/- {np.std(recall_scores[k]):.2f}")
    print(f"Average F1-score (Zone {int(test_zone)}): {np.mean(f1_scores[k]):.2f} +/- {np.std(f1_scores[k]):.2f}\n")

# Print overall metrics (mean and standard deviation of the per-zone means)
print(f"Precision: {np.mean([np.mean(f) for f in precision_scores]):.2f} +/- {np.std([np.mean(f) for f in precision_scores]):.2f}")
print(f"Recall: {np.mean([np.mean(f) for f in recall_scores]):.2f} +/- {np.std([np.mean(f) for f in recall_scores]):.2f}")
print(f"F1-score: {np.mean([np.mean(f) for f in f1_scores]):.2f} +/- {np.std([np.mean(f) for f in f1_scores]):.2f}\n")

print("----- Final Ensemble Evaluation -----\n")

# Ensemble evaluation
ens_recall_scores = []
ens_precision_scores = []
ens_f1_scores = []

for k, zone_id in enumerate(zones):
    labels_k = preds[k][:, 1]
    votes_k = preds[k][:, 2:]

    # Use majority voting (if the average prediction is ≥ 0.5, classify as 1)
    final_preds_k = (votes_k.mean(axis=1) >= 0.5).astype(int)

    # Compute final ensemble metrics
    zone_precision = precision_score(labels_k, final_preds_k)
    zone_recall = recall_score(labels_k, final_preds_k)
    zone_f1 = f1_score(labels_k, final_preds_k)

    ens_precision_scores.append(zone_precision)
    ens_recall_scores.append(zone_recall)
    ens_f1_scores.append(zone_f1)

    print(f"Precision (Zone {int(zone_id)}): {zone_precision:.2f}")
    print(f"Recall (Zone {int(zone_id)}): {zone_recall:.2f}")
    print(f"F1-score (Zone {int(zone_id)}): {zone_f1:.2f}\n")

# Print overall ensemble results (mean and standard deviation across zones)
print(f"Final Precision: {np.mean(ens_precision_scores):.2f} +/- {np.std(ens_precision_scores):.2f}")
print(f"Final Recall: {np.mean(ens_recall_scores):.2f} +/- {np.std(ens_recall_scores):.2f}")
print(f"Final F1-score: {np.mean(ens_f1_scores):.2f} +/- {np.std(ens_f1_scores):.2f}\n")

In [None]:
## Ensemble approach with multiple spatial cross-validations 

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import f1_score, precision_score, recall_score, cohen_kappa_score
from tqdm import tqdm

# Get the unique zones and reverse their order
zones = dataset['zone'].unique().tolist()
zones.reverse()

# Feature columns
feature_cols = [
    'vegetation', 'slope', 'profile_co', 'entropy', 'nodes',
    'roads', 'mean_conne', 'min_connex', 'max_connex'
]

# Seeded generator for the downsampling, the shuffling and the forests, so the
# cell is reproducible. Rounds and ensemble members still differ from one
# another because every draw advances the generator.
rng = np.random.RandomState(42)

# Final ensemble metrics across all cross-validation rounds. All per-zone
# containers in this cell are indexed by the zone's POSITION in `zones`
# (entry k <-> zones[k]), never by `zone_id - 1`: indexing by id is only
# correct when the ids are exactly 1..N in that order.
final_f1_scores = [[] for _ in range(len(zones))]
final_precision_scores = [[] for _ in range(len(zones))]
final_recall_scores = [[] for _ in range(len(zones))]
final_kappa_scores = [[] for _ in range(len(zones))]

# Perform 10 rounds of cross-validation
for n in range(10):

    print(f"----- Cross-Validation Round {n+1} -----\n")

    # Per-model metrics for this round (collected but not printed in this cell)
    f1_scores = [[] for _ in range(len(zones))]
    precision_scores = [[] for _ in range(len(zones))]
    recall_scores = [[] for _ in range(len(zones))]
    kappa_scores = [[] for _ in range(len(zones))]

    # Store predictions for ensemble evaluation; preds[k] belongs to zones[k].
    # Columns: id, true label, then one prediction column per ensemble member.
    preds = []

    # Iterate over each zone for spatial cross-validation
    for k, test_zone in enumerate(zones):

        # Split dataset into training and testing sets
        train_dataset = dataset[dataset['zone'] != test_zone]
        test_dataset = dataset[dataset['zone'] == test_zone]

        X_train, y_train = train_dataset[feature_cols].values, train_dataset['label'].values
        X_test, y_test = test_dataset[feature_cols].values, test_dataset['label'].values
        test_ids = test_dataset['id'].values

        # Separate test samples by class
        class_0 = X_test[y_test == 0]
        class_1 = X_test[y_test == 1]

        ids_class_0 = test_ids[y_test == 0]
        ids_class_1 = test_ids[y_test == 1]

        # Balance the test set through downsampling; features and ids are
        # resampled together so that they stay aligned row by row
        if len(class_0) > len(class_1):
            class_0, ids_class_0 = resample(
                class_0, ids_class_0, replace=False, n_samples=len(class_1), random_state=rng
            )
        else:
            class_1, ids_class_1 = resample(
                class_1, ids_class_1, replace=False, n_samples=len(class_0), random_state=rng
            )

        X_test_balanced = np.vstack([class_0, class_1])
        y_test_balanced = np.hstack([np.zeros(len(class_0)), np.ones(len(class_1))])
        ids_test_balanced = np.hstack([ids_class_0, ids_class_1])

        # Shuffle the test dataset
        p = rng.permutation(len(y_test_balanced))
        X_test_balanced, y_test_balanced, ids_test_balanced = X_test_balanced[p], y_test_balanced[p], ids_test_balanced[p]

        # The class split of the training data is identical for every
        # ensemble member, so compute it once per zone
        train_class_0 = X_train[y_train == 0]
        train_class_1 = X_train[y_train == 1]

        # Predictions of each ensemble member on the balanced test set
        member_preds = []

        # Train 100 models for the ensemble approach
        for _ in tqdm(range(100), desc=f"Zone {int(test_zone)}"):

            # Balance the training dataset with a fresh random downsample
            if len(train_class_0) > len(train_class_1):
                bal_0 = resample(train_class_0, replace=False, n_samples=len(train_class_1), random_state=rng)
                bal_1 = train_class_1
            else:
                bal_0 = train_class_0
                bal_1 = resample(train_class_1, replace=False, n_samples=len(train_class_0), random_state=rng)

            X_train_balanced = np.vstack([bal_0, bal_1])
            y_train_balanced = np.hstack([np.zeros(len(bal_0)), np.ones(len(bal_1))])

            # Shuffle the training dataset
            p = rng.permutation(len(y_train_balanced))
            X_train_balanced, y_train_balanced = X_train_balanced[p], y_train_balanced[p]

            # Train the model
            model = RandomForestClassifier(n_estimators=100, random_state=rng)
            model.fit(X_train_balanced, y_train_balanced)

            # Predict on the balanced test set
            y_pred = model.predict(X_test_balanced)

            # Store predictions for ensemble evaluation
            member_preds.append(y_pred)

            # Compute and store evaluation metrics
            f1_scores[k].append(f1_score(y_test_balanced, y_pred))
            precision_scores[k].append(precision_score(y_test_balanced, y_pred))
            recall_scores[k].append(recall_score(y_test_balanced, y_pred))
            kappa_scores[k].append(cohen_kappa_score(y_test_balanced, y_pred))

        # Assemble the prediction matrix once per zone instead of re-allocating
        # it with column_stack after every model
        preds.append(np.column_stack([ids_test_balanced, y_test_balanced] + member_preds))

    print("\n")

    # Ensemble evaluation
    ens_recall_scores = []
    ens_precision_scores = []
    ens_f1_scores = []
    ens_kappa_scores = []

    for k in range(len(zones)):
        labels_k = preds[k][:, 1]
        votes_k = preds[k][:, 2:]

        # Use majority voting for final ensemble prediction
        # (mean vote >= 0.5 -> class 1)
        final_preds_k = (votes_k.mean(axis=1) >= 0.5).astype(int)

        # Compute each final ensemble metric once and record it both for this
        # round's summary and for the cross-round summary
        zone_precision = precision_score(labels_k, final_preds_k)
        zone_recall = recall_score(labels_k, final_preds_k)
        zone_f1 = f1_score(labels_k, final_preds_k)
        zone_kappa = cohen_kappa_score(labels_k, final_preds_k)

        ens_precision_scores.append(zone_precision)
        ens_recall_scores.append(zone_recall)
        ens_f1_scores.append(zone_f1)
        ens_kappa_scores.append(zone_kappa)

        final_precision_scores[k].append(zone_precision)
        final_recall_scores[k].append(zone_recall)
        final_f1_scores[k].append(zone_f1)
        final_kappa_scores[k].append(zone_kappa)

    # Print this round's ensemble results (mean ± standard deviation across zones)
    print(f"Precision: {np.mean(ens_precision_scores):.2f} ± {np.std(ens_precision_scores):.2f}")
    print(f"Recall: {np.mean(ens_recall_scores):.2f} ± {np.std(ens_recall_scores):.2f}")
    print(f"F1-score: {np.mean(ens_f1_scores):.2f} ± {np.std(ens_f1_scores):.2f}")
    print(f"Kappa: {np.mean(ens_kappa_scores):.2f} ± {np.std(ens_kappa_scores):.2f}\n")

print("----- Summary -----\n")

# Print final metrics for each zone (mean ± standard deviation over the 10
# rounds), labelled with the actual zone id
for k, zone_id in enumerate(zones):
    print(f"Zone {int(zone_id)} Precision: {np.mean(final_precision_scores[k]):.2f} ± {np.std(final_precision_scores[k]):.2f}")
    print(f"Zone {int(zone_id)} Recall: {np.mean(final_recall_scores[k]):.2f} ± {np.std(final_recall_scores[k]):.2f}")
    print(f"Zone {int(zone_id)} F1-score: {np.mean(final_f1_scores[k]):.2f} ± {np.std(final_f1_scores[k]):.2f}")
    print(f"Zone {int(zone_id)} Kappa: {np.mean(final_kappa_scores[k]):.2f} ± {np.std(final_kappa_scores[k]):.2f}\n")

In [None]:
## Sensitivity analysis 

In [None]:
import geopandas as gpd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import f1_score, precision_score, recall_score, cohen_kappa_score
import matplotlib.pyplot as plt
from matplotlib import font_manager

# Load the grid file
grid = gpd.read_file("/path/to/your/data/data.gpkg")

# Load zones shapefile once: it does not depend on the threshold, so there is
# no need to re-read it on every iteration of the loop below.
zones_layer = gpd.read_file("/path/to/your/data/zones.shp")

# The spatial join compares coordinates, so both layers must use the same CRS.
if grid.crs is not None and zones_layer.crs is not None and zones_layer.crs != grid.crs:
    zones_layer = zones_layer.to_crs(grid.crs)

# Feature columns
f_cols = [
    "vegetation", "slope", "profile_co", "entropy",
    "nodes", "roads", "mean_conne", "min_connex", "max_connex"
]

# Favela coverage thresholds to evaluate (0.1, 0.2, ..., 0.9). The same list
# drives the loop and the x-axis of the final plot, so the two cannot drift apart.
thresholds = [i / 10 for i in range(1, 10)]

# Conditions shared by both classes; independent of the threshold.
eligible = (
    (grid["vegetation"] <= 0.95)
    & (grid["ghsl"] >= 0.5)
    & (grid["osm"] <= 0.5)
)

# Seeded generator for the downsampling, the shuffling and the forests, so the
# sensitivity curve is reproducible. Repetitions still differ from one another
# because every draw advances the generator.
rng = np.random.RandomState(42)

# Lists to store sensitivity results (one value per threshold)
f1_scores_sensib = []
precision_scores_sensib = []
recall_scores_sensib = []
kappa_scores_sensib = []

# Loop over different threshold values
for p in thresholds:
    # Define labels based on varying favela coverage thresholds:
    # 1 if eligible and favelas > p, 0 if eligible and favelas == 0, NaN otherwise
    grid["label"] = np.where(
        eligible & (grid["favelas"] > p),
        1,
        np.where(eligible & (grid["favelas"] == 0), 0, np.nan),
    )

    dataset = grid[grid["label"].notna()].copy()

    # Assign each cell to a zone based on centroid location
    dataset["centroid"] = dataset.geometry.centroid
    points_zones = gpd.sjoin(
        dataset.set_geometry("centroid"),
        zones_layer[["fid", "geometry"]],
        how="left",
        predicate="within",
    )
    # A centroid inside several (overlapping) zones yields repeated index
    # labels; keep the first match so the aligned assignment stays one-to-one.
    points_zones = points_zones[~points_zones.index.duplicated(keep="first")]
    dataset["zone"] = points_zones["fid"]
    dataset = dataset.drop(columns=["centroid"])
    dataset = dataset[dataset["zone"].notna()]

    zones = dataset["zone"].unique().tolist()
    zones.reverse()

    # Performance metrics per zone: entry k collects one value per repetition
    # for zones[k]
    f1_scores = [[] for _ in range(len(zones))]
    precision_scores = [[] for _ in range(len(zones))]
    recall_scores = [[] for _ in range(len(zones))]
    kappa_scores = [[] for _ in range(len(zones))]

    # Perform 5 repetitions of leave-one-zone-out cross-validation
    # (each repetition draws a new class-balanced sample per zone)
    for _ in range(5):
        folds = []

        for zone_id in zones:
            dataset_zone = dataset[dataset["zone"] == zone_id]
            X, y = dataset_zone[f_cols].values, dataset_zone["label"].values

            class_0 = X[y == 0]
            class_1 = X[y == 1]

            # Balance the dataset by downsampling the larger class
            if len(class_0) > len(class_1):
                class_0 = resample(class_0, replace=False, n_samples=len(class_1), random_state=rng)
            else:
                class_1 = resample(class_1, replace=False, n_samples=len(class_0), random_state=rng)

            X_balanced = np.vstack([class_0, class_1])
            y_balanced = np.hstack([np.zeros(len(class_0)), np.ones(len(class_1))])

            # Shuffle dataset
            perm_idx = rng.permutation(len(y_balanced))
            X_balanced, y_balanced = X_balanced[perm_idx], y_balanced[perm_idx]

            folds.append([X_balanced, y_balanced])

        # Cross-validation training and evaluation
        for i in range(len(folds)):
            X_test, y_test = folds[i]
            X_train = np.vstack([fold[0] for j, fold in enumerate(folds) if j != i])
            y_train = np.hstack([fold[1] for j, fold in enumerate(folds) if j != i])

            model = RandomForestClassifier(random_state=rng)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            precision_scores[i].append(precision_score(y_test, y_pred))
            recall_scores[i].append(recall_score(y_test, y_pred))
            f1_scores[i].append(f1_score(y_test, y_pred))
            kappa_scores[i].append(cohen_kappa_score(y_test, y_pred))

    # Compute average metrics for current threshold (mean of the per-zone means)
    mean_precision = np.mean([np.mean(f) for f in precision_scores])
    mean_recall = np.mean([np.mean(f) for f in recall_scores])
    mean_f1 = np.mean([np.mean(f) for f in f1_scores])
    mean_kappa = np.mean([np.mean(f) for f in kappa_scores])

    print(f"Threshold {p:.1f}")
    print(f"Precision: {mean_precision:.2f}")
    print(f"Recall: {mean_recall:.2f}")
    print(f"F1-score: {mean_f1:.2f}")
    print(f"Kappa: {mean_kappa:.2f}\n")

    # Store results for sensitivity curve
    precision_scores_sensib.append(mean_precision)
    recall_scores_sensib.append(mean_recall)
    f1_scores_sensib.append(mean_f1)
    kappa_scores_sensib.append(mean_kappa)

# --- Plot sensitivity analysis ---
font_path = "/usr/share/fonts/truetype/cmu/cmunrm.ttf"
font_manager.fontManager.addfont(font_path)
font_prop = font_manager.FontProperties(fname=font_path)

# Thresholds expressed in percent (10, 20, ..., 90) for the x-axis
threshold_percent = [round(t * 100) for t in thresholds]

plt.rcParams["font.family"] = font_prop.get_name()
plt.figure(figsize=(6, 4))
plt.plot(threshold_percent, f1_scores_sensib, marker="o", linestyle="-", label="F1-score")
plt.plot(threshold_percent, kappa_scores_sensib, marker="o", linestyle="-", label="Kappa")

plt.xlabel("Favela Coverage Proportion (%)", fontsize=12)
plt.ylabel("Performance Metrics", fontsize=12)
plt.legend(loc="lower right", fontsize=11)
plt.grid()
plt.tight_layout()
plt.savefig("sensitivity_analysis.png", dpi=400, bbox_inches="tight")
plt.show()