## Import modules

In [53]:
import geopandas as gpd
import pandas as pd
import numpy as np
import os
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Set input and output folders

In [70]:
# Folder holding the input data.
input_folder = 'data'
# Folder for the outputs. Built with os.path.join so the separator matches the
# host OS and the literal contains no invalid "\W" escape sequence.
out_folder = os.path.join('output', 'Wagga')

## Load points data

In [None]:
# Path to the input point shapefile inside the input folder.
add_points = os.path.join(input_folder, "Final_Wagga.shp")

# Read the points, then reproject them to EPSG:4326.
gdf_points = gpd.read_file(add_points)
gdf_points = gdf_points.to_crs('epsg:4326')

# Preview the first rows.
gdf_points.head()

## Identify ground-surveyed group
- using standard step distance of 0.28 m

In [None]:
step_m = 0.28  # standard step height, in metres
# Absolute tolerance for floating-point round-off; far smaller than the
# 1e-4 resolution that 'Floor_height' is rounded to below.
tolerance = 1e-12

# Floor height above ground level, limited to 4 decimal places.
gdf_points['Floor_height'] = (
    gdf_points['Floor_Leve'] - gdf_points['Ground_Lev']
).round(4)

# A height is a whole number of steps when its remainder is close to 0 OR
# close to step_m. Binary floats make e.g. 0.84 % 0.28 evaluate to roughly
# 0.28 instead of 0, so testing only "remainder < tolerance" would miss
# genuine multiples of the step height.
remainder = gdf_points['Floor_height'] % step_m
is_step_multiple = (remainder < tolerance) | ((step_m - remainder) < tolerance)

# Ground_surveyed = 0 where the floor height is a whole number of steps
# (presumably estimated by counting steps -- confirm), 1 otherwise.
# Rows with a missing height keep the default of 1.
gdf_points['Ground_surveyed'] = 1
gdf_points.loc[is_step_multiple, 'Ground_surveyed'] = 0
gdf_points.explore(column='Ground_surveyed')

## Spatially cluster the points

In [None]:
# Number of spatial clusters to form
n_clusters = 5

# Collect the x/y coordinates of every point geometry into a 2-column array
coords = np.array([(pt.x, pt.y) for pt in gdf_points.geometry])

# Fit KMeans with a fixed seed and store each point's cluster label
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(coords)
gdf_points['cluster'] = kmeans.labels_

# Show the points coloured by cluster
gdf_points.explore(column='cluster')


## Bin continuous attributes
- Ground elevation

In [None]:
n_bins = 3  # Number of quantile bins; bin_labels supplies one label per bin
bin_labels = ['Low', 'Medium', 'High']

# Split ground elevation into quantile bins with (roughly) equal point counts
gdf_points['Ground_Level_bin'] = pd.qcut(
    gdf_points['Ground_Lev'],
    q=n_bins,
    labels=bin_labels,
    duplicates='drop',
)

# Report how many points fall in each bin
bin_counts = gdf_points['Ground_Level_bin'].value_counts()
print(bin_counts)

In [None]:
# min_height = gdf_points['Floor_height'].min()
# max_height = gdf_points['Floor_height'].max()
# bins = np.linspace(min_height, max_height, n_bins + 1)
# # Create equal-width bins for 'Floor_height' (pd.cut, unlike pd.qcut, is not quantile-based)
# gdf_points['Floor_height_bin'] = pd.cut(gdf_points['Floor_height'], bins=n_bins, labels=['Low','Medium','High'])

# # Check the distribution of the bins
# print(gdf_points['Floor_height_bin'].value_counts())

## Perform stratified sampling using
- Spatial clustering
- Whether or not ground-surveyed
- Age (before or after 1960)
- Wall material
- Usage
- Binned ground elevation

In [None]:
frac = 0.25  # fraction of each stratum to draw
# Columns whose unique value combinations define the strata.
columns = ['cluster', 'Ground_surveyed', 'AGE', 'WALL_M', 'USAGE', 'Ground_Level_bin']

# Sample every stratum that has more than one point.
# observed=True makes the groupby visit only category combinations that occur
# in the data ('Ground_Level_bin' is categorical) and avoids relying on the
# deprecated default of that argument.
# NOTE: the per-group sample size is frac * len(group) rounded, so very small
# groups can contribute zero rows.
sampled_dfs = [
    group.sample(frac=frac, random_state=42)
    for _, group in gdf_points.groupby(columns, observed=True)
    if len(group) > 1
]

# Fail with a clear message instead of pandas' generic concat error.
if not sampled_dfs:
    raise ValueError(
        "No stratum contains more than one point; nothing to sample."
    )

# Concatenate the sampled groups into a single GeoDataFrame
sampled_gdf = gpd.GeoDataFrame(pd.concat(sampled_dfs))
sampled_gdf.explore(column='cluster')

In [None]:
# Count sampled points for each (ground-elevation bin, wall material) pair
combo_counts = sampled_gdf[['Ground_Level_bin', 'WALL_M']].value_counts()
print(combo_counts)

## Export sampled points

In [71]:
# Make sure the output folder exists before writing; the file writer needs
# the target directory to be present.
os.makedirs(out_folder, exist_ok=True)

# Write the sampled points as GeoJSON.
output_file = os.path.join(out_folder, "Final_Wagga_training_samples.geojson")
sampled_gdf.to_file(output_file, driver="GeoJSON")