In [105]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from skimage.filters import threshold_otsu
from scipy.stats import zscore
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

In [106]:
# Folder layout: the project root is two levels above the working directory,
# and each data folder is a direct child of that root.
cwd = os.path.join(os.pardir, os.pardir)
cwd_Images_Raw, cwd_Images_Processed, cwd_Images_Results = (
    os.path.join(cwd, folder_name)
    for folder_name in (
        "Sentinel-2 Images Raw",
        "Sentinel-2 Images Processed",
        "Results",
    )
)

In [107]:
# Load the "Homogeneity" and "Representativeness" sheets from the results workbook.
# The workbook is opened once and both sheets are parsed from the same handle, so the
# path is written in a single place and the file is not read from disk twice.
workbook_path = os.path.join(cwd_Images_Results, "Final (101 Sites) HI, Roman, Info.xlsx")
with pd.ExcelFile(workbook_path) as workbook:
    # header=1: the column names of the "Homogeneity" sheet are taken from its second row.
    df_HI = pd.read_excel(workbook, sheet_name="Homogeneity", header=1)
    df_Roman = pd.read_excel(workbook, sheet_name="Representativeness")

# Attach each site's 'CV 900' value to its representativeness rows.
# pd.merge defaults to an inner join, so sites missing from either sheet are dropped.
df = pd.merge(df_Roman, df_HI[['Site', 'CV 900']], on='Site')
df = (
    df[['Site', 'Spatial Representativeness', 'RAW Score', 'ST Score', 'Roman Metrics', 'CV 900']]
    .sort_values(['Site', 'Spatial Representativeness'])
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Site,Spatial Representativeness,RAW Score,ST Score,Roman Metrics,CV 900
0,AT-Mmg,100-300,9.436081,0.676006,RAW,0.065007
1,AT-Mmg,100-600,27.656817,0.433034,RAW,0.065007
2,AT-Mmg,100-900,5.195374,0.259116,RAW,0.065007
3,AT-Mmg,300-600,13.563862,0.734345,RAW,0.065007
4,AT-Mmg,300-900,3.173046,0.466587,RAW,0.065007


In [108]:
# ROI labels and score columns analysed throughout the notebook.
list_ROI = ['100-300', '100-600', '100-900', '300-600', '300-900']
list_score = ['RAW Score', 'ST Score']

# Flattened (ROI, score) grid in ROI-major order: every ROI is repeated once per
# score, and the score list is cycled once per ROI. These two lists label the rows
# of the threshold tables built later.
list_ROI_alt = [roi_label for roi_label in list_ROI for _ in list_score]
list_score_alt = list_score * len(list_ROI)

In [109]:
# Distribution plot: one histogram (with KDE overlay) per ROI / score combination,
# saved as "<ROI> <score>.png" under Results/Plots/SR Distribution.

# Seaborn theme is global state, so it only needs to be set once.
sns.set_theme(style="whitegrid")

# Outlier cut-off: values whose |z-score| is at or beyond this many standard
# deviations from the mean are excluded from the plot.
threshold = 2

# Make sure the output folder exists before the first savefig call.
distribution_plot_dir = os.path.join(cwd_Images_Results, "Plots", "SR Distribution")
os.makedirs(distribution_plot_dir, exist_ok=True)

for ROI in list_ROI:
    for score in list_score:
        # Select the rows of this ROI; ST scores are additionally restricted to
        # rows whose 'Roman Metrics' value is 'RAW or ST'.
        selection = df['Spatial Representativeness'] == ROI
        if score == 'ST Score':
            selection = selection & (df['Roman Metrics'] == 'RAW or ST')
        data = df.loc[selection, score].to_numpy()

        # Drop outliers based on the z-score of each value.
        z_scores = zscore(data)
        data = data[np.abs(z_scores) < threshold]
        if data.size == 0:
            # Nothing left to plot (no rows selected, or all z-scores are NaN).
            print(f"No data to plot for {ROI} | {score}; skipping.")
            continue

        # Ten equal-width bins from 0 up to the data maximum rounded UP, so that
        # no retained value falls outside the last bin and the upper edge can
        # never collapse to 0 for positive scores below 0.5.
        bin_edges = np.linspace(0, np.ceil(np.max(data)), 11)

        # A dedicated figure per plot, closed after saving so that figures do not
        # accumulate and no empty figures are rendered at the end of the cell.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.histplot(data, bins=bin_edges, kde=True, color='skyblue', edgecolor='black', ax=ax)

        # Vertical grid lines aligned with the bin edges.
        ax.grid(True, axis='x', linestyle='--', color='gray')
        ax.set_xticks(bin_edges)

        ax.set_title(f'Distribution of {ROI} | {score}', fontsize=16)
        ax.set_xlabel('Value', fontsize=14)
        ax.set_ylabel('Frequency', fontsize=14)

        fig.savefig(os.path.join(distribution_plot_dir, f"{ROI} {score}.png"))
        plt.close(fig)

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

In [110]:
# Median thresholds of RAW and ST: one median per (ROI, score) pair, in the same
# ROI-major order as list_ROI_alt / list_score_alt.
list_median_threshold = []
for ROI in list_ROI:
    for score in list_score:
        # Select the rows of this ROI; ST scores are additionally restricted to
        # rows whose 'Roman Metrics' value is 'RAW or ST'.
        selection = df['Spatial Representativeness'] == ROI
        if score == 'ST Score':
            selection = selection & (df['Roman Metrics'] == 'RAW or ST')
        data = df.loc[selection, score].to_numpy()

        median_threshold = np.median(data)
        list_median_threshold.append(median_threshold)

# Build the table first and write it afterwards: DataFrame.to_csv(path) returns
# None, so chaining it onto the constructor would bind the name to None.
df_median_threshold = pd.DataFrame({
    "ROI": list_ROI_alt,
    "Score": list_score_alt,
    "Median Threshold": list_median_threshold
})
df_median_threshold.to_csv(os.path.join(cwd_Images_Results, "SP - Median Thresholds.csv"), index=False)

In [111]:
# Otsu thresholds of ST: computed for every score other than 'RAW Score'.
# 'RAW Score' rows get None so the list stays aligned with
# list_ROI_alt / list_score_alt.
list_otsu_threshold = []
for ROI in list_ROI:
    for score in list_score:
        if score == 'RAW Score':
            otsu_threshold = None
        else:
            # Only rows of this ROI whose 'Roman Metrics' value is 'RAW or ST'.
            selection = (df['Spatial Representativeness'] == ROI) & (df['Roman Metrics'] == 'RAW or ST')
            data = df.loc[selection, score].to_numpy()
            otsu_threshold = threshold_otsu(data)
        list_otsu_threshold.append(otsu_threshold)

# Build the table first and write it afterwards: DataFrame.to_csv(path) returns
# None, so chaining it onto the constructor would bind the name to None.
df_otsu_threshold = pd.DataFrame({
    "ROI": list_ROI_alt,
    "Score": list_score_alt,
    "Otsu Threshold": list_otsu_threshold
})
df_otsu_threshold.to_csv(os.path.join(cwd_Images_Results, "SP - Otsu Thresholds.csv"), index=False)

In [112]:
# DBSCAN of RAW and ST.
# Every (ROI, score) series is clustered and plotted; a threshold is derived for
# 'RAW Score' only. Other scores get None so the list stays aligned with
# list_ROI_alt / list_score_alt.

# Make sure the output folder exists before the first savefig call.
dbscan_plot_dir = os.path.join(cwd_Images_Results, "Plots", "SR DBSCAN")
os.makedirs(dbscan_plot_dir, exist_ok=True)

list_DBSCAN_threshold = []
for ROI in list_ROI:
    for score in list_score:
        # Select the rows of this ROI; ST scores are additionally restricted to
        # rows whose 'Roman Metrics' value is 'RAW or ST'.
        selection = df['Spatial Representativeness'] == ROI
        if score == 'ST Score':
            selection = selection & (df['Roman Metrics'] == 'RAW or ST')
        # DBSCAN expects a 2-D (n_samples, n_features) array.
        data = df.loc[selection, score].to_numpy().reshape(-1, 1)

        # Apply DBSCAN (eps and min_samples may need adjusting for other data).
        dbscan = DBSCAN(eps=0.1, min_samples=5)
        labels = dbscan.fit_predict(data)

        # Visualize the clusters on a dedicated figure, closed after saving so
        # that no empty figure is left open at the end of the cell.
        fig, ax = plt.subplots()
        ax.scatter(range(len(data)), data, c=labels, cmap='viridis')
        ax.set_xlabel('Index')
        ax.set_ylabel('Value')
        ax.set_title(f'DBSCAN of {ROI} | {score}')
        fig.savefig(os.path.join(dbscan_plot_dir, f"{ROI} {score}.png"))
        plt.close(fig)

        if score == 'RAW Score':
            # Threshold = the smallest of the per-cluster maxima, ignoring noise
            # points (label -1). If every point is noise the result is NaN.
            df_temp = pd.DataFrame({'value': data.flatten(), 'cluster': labels})
            cluster_max_values = (
                df_temp[df_temp['cluster'] != -1]
                .groupby('cluster')['value']
                .max()
            )
            dbscan_threshold = cluster_max_values.min()
        else:
            dbscan_threshold = None

        # Single append per iteration keeps the list in (ROI, score) order.
        list_DBSCAN_threshold.append(dbscan_threshold)

# Build the table first and write it afterwards: DataFrame.to_csv(path) returns
# None, so chaining it onto the constructor would bind the name to None.
df_DBSCAN_threshold = pd.DataFrame({
    "ROI": list_ROI_alt,
    "Score": list_score_alt,
    "DBSCAN Threshold": list_DBSCAN_threshold
})
df_DBSCAN_threshold.to_csv(os.path.join(cwd_Images_Results, "SP - DBSCAN Thresholds.csv"), index=False)

<Figure size 640x480 with 0 Axes>

In [116]:
# All-in-one threshold table: one row per (ROI, score) pair, carrying the median,
# Otsu and DBSCAN thresholds side by side.
threshold_columns = {
    "ROI": list_ROI_alt,
    "Score": list_score_alt,
    "Median Threshold": list_median_threshold,
    "Otsu Threshold": list_otsu_threshold,
    "DBSCAN Threshold": list_DBSCAN_threshold,
}
df_threshold = pd.DataFrame(threshold_columns)

# Persist the combined table next to the per-method CSV files, then display it.
aio_csv_path = os.path.join(cwd_Images_Results, "SP - AIO Thresholds.csv")
df_threshold.to_csv(aio_csv_path, index=False)
df_threshold

Unnamed: 0,ROI,Score,Median Threshold,Otsu Threshold,DBSCAN Threshold
0,100-300,RAW Score,1.543226,,0.837649
1,100-300,ST Score,0.667581,0.668609,
2,100-600,RAW Score,0.825489,,0.615398
3,100-600,ST Score,0.429721,0.533471,
4,100-900,RAW Score,0.496673,,1.153229
5,100-900,ST Score,0.350794,0.413828,
6,300-600,RAW Score,3.050407,,0.485376
7,300-600,ST Score,0.739677,0.682778,
8,300-900,RAW Score,1.310907,,1.257602
9,300-900,ST Score,0.592676,0.57361,
