In [1]:
import geopandas as gpd
from sklearn.model_selection import train_test_split
import os
import pandas as pd

In [3]:

# Stratified train/validation split of the Wagga training samples.
#
# Reads the training-sample GeoJSON, builds a composite stratification key from
# several attribute columns, splits the rows 80/20 while preserving the
# proportions of that key, tags every row with a 'split' attribute and writes
# the result to a new GeoJSON next to the input.
#
# NOTE: rows removed by the filters below (missing stratification values or
# strata with a single member) are absent from the exported file as well,
# because the output is built only from the train and validation parts.

# Load the GeoJSON file.
# Build the folder with os.path.join instead of a literal backslash: '\W' is an
# invalid escape sequence (warned about by newer Python versions) and a
# hard-coded backslash only works as a separator on Windows.
input_folder = os.path.join('output', 'Wagga')
input_geojson = os.path.join(input_folder, "Final_Wagga_training_samples.geojson")
geo_df = gpd.read_file(input_geojson)

# Define the attributes for stratification
stratify_columns = ['cluster', 'Ground_surveyed', 'AGE', 'WALL_M', 'USAGE', 'Ground_Level_bin']

# Fail early, with a readable message, when an expected attribute is absent.
missing_columns = [col for col in stratify_columns if col not in geo_df.columns]
if missing_columns:
    raise KeyError(
        f"Stratification columns missing from {input_geojson}: {missing_columns}"
    )

# Rows lacking any stratification value cannot be assigned to a stratum.
geo_df = geo_df.dropna(subset=stratify_columns)

# Create a new column for stratification by combining the stratify columns
# (values are cast to str and joined with '_', one key per row).
geo_df['stratify'] = geo_df[stratify_columns].astype(str).agg('_'.join, axis=1)

# Check class distribution: a stratum needs at least two members so that it
# can be represented on both sides of the split.
class_counts = geo_df['stratify'].value_counts()
problematic_classes = class_counts[class_counts < 2].index

# Handle problematic classes (Option A: Remove them)
geo_df = geo_df[~geo_df['stratify'].isin(problematic_classes)]

# Give a clear error instead of an opaque failure inside train_test_split.
if geo_df.empty:
    raise ValueError(
        "No samples left to split after removing incomplete rows and "
        "single-member strata."
    )

# Split the data into training and validation sets
train, val = train_test_split(
    geo_df,
    test_size=0.2,  # Adjust validation size (20% here)
    stratify=geo_df['stratify'],
    random_state=42  # For reproducibility
)

# Add a new column to indicate the split.
# .assign returns new frames, so nothing is written into objects derived from
# geo_df (in-place assignment here can trigger SettingWithCopyWarning).
train = train.assign(split='train')
val = val.assign(split='validation')

# Combine the datasets back into a single GeoDataFrame
combined = gpd.GeoDataFrame(pd.concat([train, val], ignore_index=True))

# Drop the auxiliary 'stratify' column so it is not exported
combined = combined.drop(columns=['stratify'])

# Export the combined dataset to GeoJSON
output_geojson = os.path.join(input_folder, "Final_Wagga_training_samples_split.geojson")
combined.to_file(output_geojson, driver="GeoJSON")

print(f"GeoJSON with split attribute saved to: {output_geojson}")


GeoJSON with split attribute saved to: output\Wagga\Final_Wagga_training_samples_split.geojson
