#  <u>Exploring clusters in 500m buffers with KMeans</u> - clean and running

In [None]:
!pip install yellowbrick

In [None]:
# Importing Python packages and modules
import pandas as pd
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import folium
import warnings
import os
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
from matplotlib import colors


In [None]:
# Buffer metrics with latitude and longitude, loaded from CSV via geopandas.
# NOTE(review): the numeric columns are converted with pd.to_numeric further down,
# so they presumably arrive as strings from this reader - confirm.
buffer_metrics_lat_long = gpd.read_file("buffer_metrics_lat_long.csv")

#### Exploring the buffer metrics

In [None]:
# Spot-checking 10 random rows to see which buffer sizes and which 'Data'
# categories (random points vs. strandings points) are present
buffer_metrics_lat_long.sample(10)

In [None]:
# Column dtypes and non-null counts (printed), followed by summary statistics
# (shown as the cell output)
buffer_metrics_lat_long.info()
buffer_metrics_lat_long.describe()

## Looking at 500m buffer with strandings only
#### See the 500m Buffer exploration notebook for why the 500m buffer was chosen

In [None]:
# Keeping only the strandings records (everything that is not a random point)
# measured with the 500m buffer
is_stranding = buffer_metrics_lat_long['Data'] != 'Random points'
is_500m_buffer = buffer_metrics_lat_long['Buffer size'] == "500m"

# .copy() so later column assignments do not touch the full table
buffer_metrics_500 = buffer_metrics_lat_long[is_stranding & is_500m_buffer].copy()

In [None]:
# Spot-checking 10 random rows to confirm the filter was applied
buffer_metrics_500.sample(10)

In [None]:
# Checking for nulls (non-null count per column)
buffer_metrics_500.info()
# Checking that 'Buffer size' and 'Data' each have exactly one unique value after filtering
buffer_metrics_500.describe()

In [None]:
# Converting the coordinate and metric columns to numeric dtypes.
# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.
numeric_columns = [
    'latitude',
    'longitude',
    'Road length',
    'Building count',
    'Bathymetry mean',
    'Other points',
]

for column in numeric_columns:
    buffer_metrics_500[column] = pd.to_numeric(buffer_metrics_500[column], errors='coerce')

----------------------------------------------------------------------------------------------------------------------------

#### Log-transform

In [None]:
# Working on a copy so the transformed columns are not added to buffer_metrics_500
df_log = buffer_metrics_500.copy()

In [None]:
# Log-transforming (log(1 + x)) building count, road length, bathymetry mean and
# other points to reduce skewness. Each result goes into a new *_trans column.
# NOTE(review): np.log1p yields NaN for inputs below -1 - confirm that
# 'Bathymetry mean' is stored as a non-negative value.
log_columns = {
    'Building_trans': 'Building count',
    'Road_trans': 'Road length',
    'Bathymetry_trans': 'Bathymetry mean',
    'Other_points_trans': 'Other points',
}

for transformed_name, raw_name in log_columns.items():
    df_log[transformed_name] = np.log1p(df_log[raw_name])

#### Scale Features

#### I chose to use Robust Scaler because it handles outliers well while keeping scale consistency.

In [None]:
# Columns used for clustering: the four log-transformed metrics plus the coordinates
features_to_scale = ['Building_trans', 'Road_trans', 'Bathymetry_trans', 'Other_points_trans', 'latitude', 'longitude']

# Reducing df_log to a plain DataFrame that holds only those columns
df_log = pd.DataFrame(df_log, columns=features_to_scale)

# Scaling every feature with RobustScaler and writing the result back in place.
# Note that latitude and longitude are scaled as well.
scaler = RobustScaler()
scaled_values = scaler.fit_transform(df_log[features_to_scale])
df_log[features_to_scale] = scaled_values

In [None]:
# Previewing the scaled feature columns
df_log.head()

In [None]:
# Putting the original identifier and metric columns next to the log-transformed,
# scaled columns. Both parts get a fresh 0..n-1 index, so rows are matched by position.
cols_to_add = ['Data', 'Buffer size', 'Point ID', 'Other points', 'Road length', 'Building count', 'Bathymetry mean']

original_part = buffer_metrics_500[cols_to_add].reset_index(drop=True)
scaled_part = df_log.reset_index(drop=True)

df_log = pd.concat([original_part, scaled_part], axis=1)


In [None]:
# Spot-checking 10 random rows of the combined (original + scaled) table
df_log.sample(10)

In [None]:
# Rows x columns of the combined table. A bare `df_log.shape` would be silently
# discarded here because it is not the last expression in the cell, so print it.
print(df_log.shape)
# Column dtypes and non-null counts
df_log.info()

In [None]:
# Summary statistics for the combined table
df_log.describe()

-------------------------------------------------------------------------------------------------------------------------------------------------------

# KElbowVisualizer

In [None]:
# Suppressing the known "KMeans ... memory leak" warning
warnings.filterwarnings("ignore", message=".*KMeans is known to have a memory leak.*")

# Making sure the output folder exists before anything is saved into it
os.makedirs("Method_results_images", exist_ok=True)

# KMeans++ initialisation, random_state=0 for reproducibility, 10 initialisations per k
model = KMeans(init='k-means++', random_state=0, n_init=10)
# Setting up the elbow visualizer; the tuple (1, 12) is evaluated as k = 1..11
visualizer = KElbowVisualizer(model, k=(1, 12))
# Feature matrix: rows with a missing value in any clustering feature are dropped
X = df_log[['Building_trans', 'Road_trans', 'Bathymetry_trans', 'Other_points_trans', 'latitude', 'longitude']].dropna()
# Fitting the visualizer
visualizer.fit(X)
# Saving through show(outpath=...) so the figure is finalized (title, axis labels,
# legend) BEFORE it is written; a plain plt.savefig() ahead of show() stores the
# unfinished plot.
visualizer.show(outpath="Method_results_images/K_Means_KelbowVisualizer.png", dpi=150, bbox_inches="tight")
# Showing the plot in the notebook
plt.show()


#### Elbow point at k=4, with a distortion score of 1896.828

In [None]:
# Confirming that no missing values remain in the clustering matrix X
# (it was built with .dropna())
print(X.isna().sum())

-------------------------------------------------------------------------------------------------------------------------------------------------------

# Mini Batch KMeans

In [None]:
# Use k=4 based on the elbow plot.
# NOTE(review): the elbow was computed with KMeans, the final model is MiniBatchKMeans.
cluster_features = ['Building_trans', 'Road_trans', 'Bathymetry_trans', 'Other_points_trans', 'latitude', 'longitude']

# Rows with a missing clustering feature cannot be assigned to a cluster
df_log_clean = df_log.dropna(subset=cluster_features).copy()

# Fitting on df_log_clean's own columns (rather than on X from the elbow cell) ensures
# the labels line up row-for-row with the frame they are written to, even if X is
# stale or was built from a different selection.
kmeans = MiniBatchKMeans(n_clusters=4, random_state=42)
df_log_clean['cluster'] = kmeans.fit_predict(df_log_clean[cluster_features])

# Preview the result
df_log_clean.head()

--------------------------------------------------------------------------------------------------------------------------------------------------------

# Investigating the 500m buffer clusters

In [None]:
# Cluster labels that were assigned
df_log_clean['cluster'].unique()

#### Checking cluster size

In [None]:
# Number of points in each cluster
df_log_clean['cluster'].value_counts()

#### Comparing cluster means

In [None]:
# Mean of each original (untransformed) metric per cluster, plus the cluster size n
metric_columns = ['Building count', 'Road length', 'Bathymetry mean', 'Other points']

cluster_table = (
    df_log_clean
      .groupby('cluster')
      .agg(
          n=('cluster', 'size'),
          # keyword unpacking is needed because the output names contain spaces
          **{column: (column, 'mean') for column in metric_columns})
      .sort_index()
      .round(2))

display(cluster_table)

# Saving - the output folder is created first so to_html does not fail when it is missing
os.makedirs("Method_results_images", exist_ok=True)

cluster_table.to_html("Method_results_images/K_Means_Cluster_Breakdown_Table.html", float_format="%.2f")


#### The smallest cluster, with only 1155 records, has the lowest building count and the lowest amount of roads, while the other 3 clusters show evidence of some urbanisation in the area (roads or buildings).

| Cluster | Summary                                                                                                                                                                                                |
| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| **0**   | **Moderate human presence & road length**, high bathymetry mean (\~23.6), many other points. Possibly **semi-developed coastal** areas with decent road access and moderate depth.                     |
| **1**   | **Almost no buildings/roads**, moderate bathymetry (\~13.8), very low “other points”. Likely **remote/natural** areas with minimal human influence.                                                    |
| **2**   | **High human presence (buildings & roads)**, shallow bathymetry (\~9.8), moderate “other points”. This looks like **urbanised, shallow-water** areas — possibly harbours or nearshore urban stretches. |
| **3**   | **Low-moderate human presence**, moderate roads, bathymetry (\~14.8). Could be **suburban/coastal mixed-use** areas.                                                                                   |


-------------------------

## Visualising feature distribution per cluster

In [None]:
# Stacked bar chart showing, for each cluster, the share that each variable's mean
# contributes to the cluster's row total.
# (matplotlib.pyplot is already imported as plt in the imports cell at the top.)
# NOTE(review): the four variables appear to be in different units, so the shares
# are only a rough visual comparison.

# Variables to include
vars_to_plot = ["Building count", "Road length", "Bathymetry mean", "Other points"]

# Normalise each cluster's row of means so that it sums to 1
cluster_means = cluster_table[vars_to_plot]
norm_df = cluster_means.div(cluster_means.sum(axis=1), axis=0)

# Plot: one bar per cluster, one stacked segment per variable
fig, ax = plt.subplots(figsize=(8, 5))
bottom = np.zeros(len(norm_df))
cluster_ticks = norm_df.index.astype(str)

for col in vars_to_plot:
    ax.bar(cluster_ticks, norm_df[col], bottom=bottom, label=col)
    # the next segment starts where this one ended
    bottom += norm_df[col].values

ax.set_ylabel("Proportion of total")
ax.set_xlabel("Cluster")
ax.set_title("Normalised composition of each cluster")
ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1))
plt.tight_layout()
plt.savefig("Method_results_images/K_Means_Cluster_Stacked_Barchart.png", dpi=150, bbox_inches="tight")
plt.show()


In [None]:
# Radar chart comparing the cluster profiles across the four variables
vars_to_plot = ["Building count", "Road length", "Bathymetry mean", "Other points"]
radar_df = cluster_table.sort_index()[vars_to_plot].copy()

# Min–max scale each variable to [0, 1] so the shapes are comparable;
# a variable with zero range produces NaN, which is replaced by 0
value_range = radar_df.max() - radar_df.min()
radar_scaled = ((radar_df - radar_df.min()) / value_range).fillna(0.0)

# One spoke per variable; the first angle is repeated at the end to close the polygon
labels = vars_to_plot
num_vars = len(labels)
angles = np.linspace(0, 2*np.pi, num_vars, endpoint=False)
angles = np.append(angles, angles[0])

fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111, polar=True)

for clust, row in radar_scaled.iterrows():
    # repeat the first value so the outline closes on itself
    values = np.append(row.values, row.values[0])
    ax.plot(angles, values, linewidth=2, label=f"Cluster {clust}")
    ax.fill(angles, values, alpha=0.1)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(labels)
ax.set_yticks([0.0, 0.5, 1.0])
ax.set_yticklabels(["0", "0.5", "1.0"])
ax.set_ylim(0, 1)
ax.set_title("Cluster profiles (min–max scaled)")
ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
plt.tight_layout()
# Saving the figure
plt.savefig("Method_results_images/K_Means_Cluster_Radar_Chart.png", dpi=150, bbox_inches="tight")
plt.show()


In [None]:
# Heatmap of the cluster means, standardised per variable across the clusters
vars_to_plot = ["Building count","Road length","Bathymetry mean","Other points"]

# z-score of each cluster mean relative to the mean and standard deviation of the cluster means
cluster_means = cluster_table[vars_to_plot]
Z = (cluster_means - cluster_means.mean()) / cluster_means.std()

plt.imshow(Z.values, aspect="auto")
plt.yticks(range(len(Z)), [f"Cluster {i}" for i in Z.index])
plt.xticks(range(len(vars_to_plot)), vars_to_plot, rotation=45)
plt.colorbar(label="z-score")
plt.title("Cluster feature heatmap")
plt.tight_layout()
plt.savefig("Method_results_images/K_Means_Cluster_Feature_Heatmap.png", dpi=150, bbox_inches="tight")
plt.show()




--------------------

## Mapping clusters

In [None]:
# Static map of the cluster locations.
# The latitude/longitude columns in df_log_clean were RobustScaler-transformed together
# with the clustering features, so plotting them directly would show scaled values
# instead of real coordinates. Undo the scaling for the plot only.
unscaled = pd.DataFrame(
    scaler.inverse_transform(df_log_clean[features_to_scale]),
    columns=features_to_scale,
    index=df_log_clean.index)

# Same rows and cluster labels as df_log_clean, but with the real coordinates
cluster_locations = df_log_clean.assign(
    latitude=unscaled['latitude'],
    longitude=unscaled['longitude'])

plt.figure(figsize=(8, 8))
sns.scatterplot(
    data=cluster_locations,
    x='longitude',
    y='latitude',
    hue='cluster',
    palette='Set2')
plt.title("Cluster Locations")
plt.gca().set_aspect('equal', adjustable='box')
plt.savefig("Method_results_images/K_Means_cluser_map.png", dpi=150, bbox_inches="tight")
plt.show()


In [None]:
# Diagnostics before attaching the cluster labels: which columns exist, what the
# index is called, and whether any cluster-like column is already present
print(buffer_metrics_500.columns.tolist())
print("index name:", buffer_metrics_500.index.name)
print(buffer_metrics_500.filter(like='cluster').head())  # shows any cluster-like columns


In [None]:
# If 'cluster' is the name of the index rather than a column, move it back into a column
if 'cluster' not in buffer_metrics_500.columns and buffer_metrics_500.index.name == 'cluster':
    buffer_metrics_500 = buffer_metrics_500.reset_index()

In [None]:
# Attaching the cluster label of every point to buffer_metrics_500 (which still holds
# the unscaled coordinates) and drawing the clusters on an interactive folium map.

# Build a clean lookup of (Point ID -> cluster)
clu = (df_log_clean[['Point ID', 'cluster']]
       .drop_duplicates('Point ID'))

# Making sure dtypes match for the join key
buffer_metrics_500['Point ID'] = pd.to_numeric(buffer_metrics_500['Point ID'], errors='coerce')
clu['Point ID'] = pd.to_numeric(clu['Point ID'], errors='coerce')

# Merging (each Point ID should map to exactly one cluster).
# A 'cluster' column left over from an earlier run of this cell is dropped first;
# otherwise the merge would produce cluster_x / cluster_y and the dropna below
# would raise a KeyError when the cell is re-run.
buffer_metrics_500 = (buffer_metrics_500
    .drop(columns=['cluster'], errors='ignore')
    .merge(clu, on='Point ID', how='left', validate='m:1')
    .dropna(subset=['cluster']))
buffer_metrics_500['cluster'] = buffer_metrics_500['cluster'].astype(int)

# Sanity check
assert 'cluster' in buffer_metrics_500.columns


# Colour mapping per cluster
clusters = sorted(buffer_metrics_500['cluster'].unique())
palette = sns.color_palette("husl", n_colors=len(clusters))
hex_colors = [colors.to_hex(c) for c in palette]
color_map = dict(zip(clusters, hex_colors))

# Labels (keyed by cluster id)
labels = {
    2: 'High human footprint (shallow)',
    0: 'High roads, moderate buildings (deeper)',
    3: 'Low–moderate human footprint',
    1: 'Remote / undeveloped'}

def layer_name(c):
    """Return the layer title for cluster c; falls back to "Cluster X" if no label exists."""
    return f"Cluster {c} – {labels.get(c, '')}".strip(' –')

# Creating a map centred on the mean coordinate of the points
map_center = [buffer_metrics_500['latitude'].mean(), buffer_metrics_500['longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=5, tiles='CartoDB positron', control_scale=True)

# One FeatureGroup per cluster so each cluster can be toggled separately
layers = {
    c: folium.FeatureGroup(name=layer_name(c), show=True)
    for c in clusters}

# Add markers to their cluster layer
for _, row in buffer_metrics_500.iterrows():
    c = int(row['cluster'])
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=2,
        color=color_map[c],
        fill=True,
        fill_opacity=0.8,
        popup=(f"Cluster: {c} | "
               f"Buildings: {row['Building count']}, "
               f"Road length: {row['Road length']:.1f}, "
               f"Bathymetry mean: {row['Bathymetry mean']:.2f}, "
               f"Other points: {row['Other points']}")
    ).add_to(layers[c])

# Add the cluster layers to the map
for fg in layers.values():
    fg.add_to(m)

# Clickable layer control (checkboxes for each cluster)
folium.LayerControl(collapsed=False).add_to(m)

# Title (the map shows K-Means clusters; the previous text referred to species)
m.get_root().html.add_child(folium.Element("""
<div style="position: fixed; top: 10px; left: 50%; transform: translateX(-50%);
 z-index: 9999; background: white; padding: 6px 10px; border: 1px solid #777;
 border-radius: 6px; font-weight: 600; box-shadow: 0 1px 3px rgba(0,0,0,.2);">
 Strandings by K-Means Cluster (500m buffers)
</div>
"""))

# --- Add a small static colour legend (just for reference) ---
legend_html = """
<div style="
    position: fixed;
    bottom: 20px; left: 20px; z-index: 9999;
    background: white; padding: 10px 12px; border: 1px solid #ccc;
    border-radius: 6px; box-shadow: 0 2px 6px rgba(0,0,0,0.15);
    font-size: 13px;">
  <div style="font-weight:600; margin-bottom:6px;">Clusters</div>
  {}
</div>
""".format(
    "".join(
        f'<div style="display:flex; align-items:center; margin:3px 0;">'
        f'<span style="display:inline-block; width:14px; height:14px; '
        f'background:{color_map[c]}; margin-right:6px; border:1px solid #888;"></span>'
        f'{layer_name(c)}</div>'
        for c in clusters))

m.get_root().html.add_child(folium.Element(legend_html))

m


In [None]:
# Exporting the interactive map to an HTML file
m.save("Method_results_images/K_Means_UK_Ireland_Cluster_Map.html")

#### Link to K-means map [Netlify map](https://cluster-map-uk.netlify.app/)

#### Pairplot of cluster features

In [None]:
# Hiding seaborn FutureWarnings to keep the output tidy
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

# Pairwise scatter plots of the original metrics, coloured by cluster
pairplot_vars = ['Building count', 'Road length', 'Bathymetry mean', 'Other points']
sns.pairplot(df_log_clean, vars=pairplot_vars, hue='cluster', palette='Set2')
plt.savefig("Method_results_images/K_Means_Cluster_Pairplot.png", dpi=150, bbox_inches="tight")
plt.show()


#### Principal Component Analysis (PCA) for dimensionality reduction

In [None]:
# Projecting the four log-transformed, scaled metrics (coordinates excluded)
# onto two principal components
pca_features = ['Building_trans', 'Road_trans', 'Bathymetry_trans', 'Other_points_trans']

pca = PCA(n_components=2)
X_pca = pca.fit_transform(df_log_clean[pca_features])

# Storing the two component scores next to the cluster labels
df_log_clean['pca_1'] = X_pca[:, 0]
df_log_clean['pca_2'] = X_pca[:, 1]

# Visualising the clusters in PCA space
plt.figure(figsize=(8,6))
sns.scatterplot(data=df_log_clean, x='pca_1', y='pca_2', hue='cluster', palette='Set2')

plt.title("Clusters visualized with PCA")
plt.savefig("Method_results_images/K_Means_Clusters_PCA.png", dpi=150, bbox_inches="tight")
plt.show()


#### The PCA visualization shows that clusters 0, 2, and 3 overlap substantially, forming a gradient of sites with broadly similar levels of human presence. In contrast, cluster 1 is clearly separated along the first principal component, representing the remote and undeveloped areas, which form a distinct subgroup in the dataset.



-----------------------------------------------------

In [None]:
# PCA loadings: contribution of each input variable to PC1 and PC2.
# The rows are labelled with the original variable names, in the same order as the
# *_trans columns the PCA was fitted on.
features = ['Building count','Road length','Bathymetry mean','Other points']
loadings = pd.DataFrame(pca.components_.T, index=features, columns=['PC1','PC2'])
print(loadings.sort_values('PC1'))

# Save - the output folder is created first so to_html does not fail when it is missing
os.makedirs("Method_results_images", exist_ok=True)

# Writing the loadings (not cluster_table, which the breakdown-table cell already
# saves) to the PC1/PC2 file
loadings.to_html("Method_results_images/K_Means_Cluster_PC1_PC2_Table.html", float_format="%.2f")


#### The PCA loadings show that PC1 is dominated by road length, with smaller contributions from building count and other points, meaning this axis primarily reflects variation in human infrastructure. PC2, in contrast, is driven mainly by other points and bathymetry, capturing ecological and environmental differences. This supports the PCA scatterplot interpretation, where the remote and undeveloped cluster (cluster 1) separates clearly along PC1 due to its very low levels of infrastructure.

In [None]:
# Share of the total variance captured by each principal component, as a percentage
expl = pca.explained_variance_ratio_
variance_by_pc = {}
for i, v in enumerate(expl):
    variance_by_pc[f'PC{i+1}'] = f'{v*100:.1f}%'
print(variance_by_pc)

# Variable with the largest absolute loading on each component
abs_load = loadings.abs()
print(abs_load.idxmax())


#### The PCA loadings show that PC1 is dominated by road length, with smaller contributions from building count and other points, meaning this axis primarily reflects variation in human infrastructure. PC2, in contrast, is driven mainly by other points and bathymetry, capturing ecological and environmental differences. This supports the PCA scatterplot interpretation, where the remote and undeveloped cluster (cluster 1) separates clearly along PC1 due to its very low levels of infrastructure.

In [None]:
# Exporting the loadings table (defined in the loadings cell), rounded to 3 decimals
loadings.round(3).to_html("Method_results_images/K_Means_PCA_Loadings.html")

# Explained-variance table: one row per principal component, values in percent
pc_names = [f'PC{i+1}' for i in range(len(pca.explained_variance_ratio_))]
explained_pct = (pca.explained_variance_ratio_*100).round(1)
expl_tbl = pd.DataFrame({
    'PC': pc_names,
    'Explained variance (%)': explained_pct})

expl_tbl.to_html("Method_results_images/K_Means_PCA_Explained_Variance.html", index=False)


## Continues with the Logistic Regression notebook