In [26]:
import os
import json
import fiona
import geemap
import numpy as np
import pandas as pd
import geopandas as gpd
import xml.etree.ElementTree as ET

from collections import Counter

from shapely.geometry import shape
from shapely.geometry import Point

# Gee and EE 
import ee

# Clustering (best for testing** due to speed)
from sklearn.cluster import KMeans

# tif file creation
import rasterio
from rasterio.transform import from_origin
from rasterio.features import rasterize

# Plotting and Vis 
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

import re

In [2]:
# Initialise
ee.Authenticate()
ee.Initialize(project="jameswilliamchamberlain")

In [3]:
# basemap 
basemap_url = 'https://mt1.google.com/vt/lyrs=y&x={x}&y={y}&z={z}'

# centeral point of Samarra 
df_sites = pd.DataFrame({
    "longitude": [43.823543],
    "latitude": [34.340989],
    "name": ["Samarra Archaeological City"],
    "category": ["Cultural"],
    "date inscribed": ["2007"],
    "region": ["Arab States"],
    "url": ["https://whc.unesco.org/en/list/276"],
    "iso": [["IQ"]]
})

# Chunks of Samarra Archaeological City 
with fiona.open("chunks_new.shp") as src:
    chunks = gpd.GeoDataFrame.from_features(src, crs=src.crs)

In [4]:
## TEMP REMOVE ADDITIONAL STUFF FOR QUICKER TESTING 
# tile_43641125_34108721_43915744_34336837 only 

chunks = chunks[chunks['file_name'] == 'tile_43641125_34108721_43915744_34336837']

In [5]:
m = geemap.Map()

if df_sites.empty:
    print("No sites found for the specified URL.")
else:
    m.add_points_from_xy(df_sites, x="longitude", y="latitude", layer_name="Sites")
    center_points = df_sites[['longitude', 'latitude']].mean().values
    m.setCenter(center_points[0], center_points[1], 10)

m.add_basemap(basemap_url, name="Google Satellite", attribution="Google")

# add chunks from aoi 
m.add_gdf(chunks, layer_name="AOI", style={"color": "red", "fillColor": "red", "fillOpacity": 0.1})

m

Map(center=[34.340989, 43.823543], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=Sear…

In [None]:
def create_subregions(chunks, sift_percentage_lon=0.5, sift_percentage_lat=0.415):
    """
        Shifts all polygons to 8 positions based on half the length in longitude and latitude, to create subregions - one for each direction from the center.

        Assumes aoi contains all similar polygons, and are similar to a square.

        Parameters:
            aoi (GeoDataFrame): The area of interest containing geometries.
            sift_percentage (float): Percentage of the length to shift the center point. Default is 0.5 (50%) for half the length.
    """

    if chunks.empty:
        return {}
    
    first_polygon = chunks.geometry.iloc[0]

    # take top two points of the polygon and get the length between them
    top_points = first_polygon.exterior.coords[:2]
    length_lon = abs(top_points[0][0] - top_points[1][0])

    # calculate the shift amount
    shift_amount_lon = length_lon * sift_percentage_lon 
    shift_amount_lat = length_lon * sift_percentage_lat

    shift_directions = {
        "left": (-shift_amount_lon, 0),
        "right": (shift_amount_lon, 0),
        "up": (0, shift_amount_lat),
        "down": (0, -shift_amount_lat),
        "top_left": (-shift_amount_lon, shift_amount_lat),
        "top_right": (shift_amount_lon, shift_amount_lat),
        "bottom_left": (-shift_amount_lon, -shift_amount_lat),
        "bottom_right": (shift_amount_lon, -shift_amount_lat),
    }

    subregions = []
        
    # Create subregions by shifting the geometries in all directions
    for _, (dx, dy) in shift_directions.items():
        gdf_shifted = chunks.copy()
        gdf_shifted["geometry"] = gdf_shifted["geometry"].translate(dx, dy)
        subregions.append(gdf_shifted)

    # Combine all into one GeoDataFrame
    subregions = pd.concat(subregions, ignore_index=True)
    return gpd.GeoDataFrame(subregions, crs=chunks.crs)

def clip(chunks, aoi):
    """
        Clips the chunks or subregions to the area of interest (aoi).

        Parameters:
            chunks (GeoDataFrame):      The chunks or subregions to be clipped.
            aoi (GeoDataFrame):         The area of interest (aoi) to clip the chunks against.
    """

    # clip to aoi 
    if chunks.empty or aoi.empty:
        return gpd.GeoDataFrame(columns=chunks.columns.tolist(), crs=chunks.crs)
    clipped = gpd.clip(chunks, aoi)
    clipped = clipped[clipped.geometry.notnull()]  # Remove any null geometries

    return clipped.reset_index(drop=True)

subregions = create_subregions(chunks)
subregions = clip(subregions, aoi=chunks.dissolve())

# plot as one 
m.add_gdf(subregions, layer_name="Subregions", style={"color": "blue", "fillColor": "blue", "fillOpacity": 0.1})

# Reference Points

In [7]:
def collect_points_from_geemap(map_obj, label):
    """
    Collect all drawn point features from a geemap.Map that uses ee.Feature objects,
    and return them as a GeoDataFrame with a label.

    Parameters:
        map_obj (geemap.Map): The interactive map.
        label (str): Label to assign to all collected points.

    Returns:
        GeoDataFrame: With geometry and 'label' columns.
    """

    features = map_obj.draw_features

    if not features:
        return gpd.GeoDataFrame(columns=["geometry", "label"], geometry="geometry")

    points = []
    for f in features:
        try:
            geom = f.geometry()  # call the method
            if geom.getInfo()["type"] == "Point":
                coords = geom.coordinates().getInfo()  # [lon, lat]
                points.append(Point(coords))
        except Exception as e:
            print("Skipping feature due to error:", e)

    if not points:
        return gpd.GeoDataFrame(columns=["geometry", "label"], geometry="geometry")

    gdf = gpd.GeoDataFrame(geometry=points)
    gdf["label"] = label
    gdf.set_crs("EPSG:4326", inplace=True)
    return gdf

def gen_basemap(basemap_url=None, aoi=gpd.GeoDataFrame(), polygons=None):
    """
        Generates a basemap with the specified URL and adds polygons if provided.

        Parameters:
            basemap_url (str): The URL of the basemap to be used.
            polygons (GeoDataFrame, optional): Polygons to be added to the map. Defaults to None.
            aoi (GeoDataFrame): The area of interest to be displayed on the map. (can be a set of polygons or a single polygon)
        
        Returns:
            geemap.Map: A geemap map object with the specified basemap and polygons.
    """
    m = geemap.Map()

    if not basemap_url:
        basemap_url = 'https://mt1.google.com/vt/lyrs=y&x={x}&y={y}&z={z}'
        
    m.add_basemap(basemap_url, name="Google Satellite", attribution="Google")

    # center on aoi
    if not aoi.empty:
        m.add_gdf(aoi, layer_name="AOI", style={"color": "red", "fillColor": "red", "fillOpacity": 0.1})
        center_points = aoi.geometry.unary_union.centroid.coords[0]
        m.setCenter(center_points[0], center_points[1], 10)
    
    if polygons is not None:
        m.add_gdf(polygons, layer_name="Polygons", style={"color": "red", "fillColor": "red", "fillOpacity": 0.1})
    
    return m

In [8]:
# Example of 3 maps for urban, agricultrual, water and waste
urban_map = gen_basemap(basemap_url, aoi=chunks)
agricultural_map = gen_basemap(basemap_url, aoi=chunks)
water_map = gen_basemap(basemap_url, aoi=chunks)
wasteland_map = gen_basemap(basemap_url, aoi=chunks)
# wasteland_map = geemap.Map()

In [9]:
urban_map

Map(center=[34.22277923307027, 43.77843455433748], controls=(WidgetControl(options=['position', 'transparent_b…

In [10]:
agricultural_map

Map(center=[34.22277923307027, 43.77843455433748], controls=(WidgetControl(options=['position', 'transparent_b…

In [11]:
# water_map = water_waste_map
water_map

Map(center=[34.22277923307027, 43.77843455433748], controls=(WidgetControl(options=['position', 'transparent_b…

In [12]:
wasteland_map

Map(center=[34.22277923307027, 43.77843455433748], controls=(WidgetControl(options=['position', 'transparent_b…

In [28]:
# convert to labelled points in GeoDataFrame

water_points = collect_points_from_geemap(water_map, label="Water")
agricultural_points = collect_points_from_geemap(agricultural_map, label="Agricultural")
urban_points = collect_points_from_geemap(urban_map, label="Urban")
wasteland_map = collect_points_from_geemap(wasteland_map, label="Wasteland")


AttributeError: 'GeoDataFrame' object has no attribute 'draw_features'

In [14]:
def save_points_to_csv(points, filename):
    points.to_csv(filename, index=False)
    print(f"Points saved to {filename}")

def load_points_from_csv(dir):
    try:
        points = gpd.read_file(dir)
        print(f"Points loaded from {dir}")
        return points
    except Exception as e:
        print(f"Error loading points from {dir}: {e}")
        return gpd.GeoDataFrame(columns=["geometry", "label"], geometry="geometry")

# save_points_to_csv(water_points, "test1_water_points.csv")
# save_points_to_csv(agricultural_points, "test1_agricultural_points.csv")
# save_points_to_csv(urban_points, "test1_urban_points.csv")
# save_points_to_csv(wasteland_map, "test1_wasteland_points.csv")

In [30]:
# merge into one GeoDataFrame
points_list = [water_points, agricultural_points, urban_points, wasteland_map]
labels = pd.concat(points_list, ignore_index=True)

print(len(labels))

21


# Clustering
Here we create a Sparse matrix

then with the sparse matrix compare and label based on that

In [16]:
pth = "data/"

# collect paths for all csv files in the folder
paths = []

for root, dirs, files in os.walk(pth):
    for file in files:
        if file.endswith(".csv"):
            paths.append(os.path.join(root, file))

print(f"Found {len(paths)} CSV files.")

# select 2024 
path = [p for p in paths if "2019" in p]
path


Found 6 CSV files.


['data/2019.csv']

In [17]:
def prep_data(dir):
    """Creates two dataframes from a CSV file, one ready for clustering with additional columns removed and the other with all columns left intact."""
    df1 = pd.read_csv(dir)
    df1 = df1.dropna()
    df2_clear = df1.copy()
    df2_clear = df2_clear.drop(columns=["system:index", ".geo"])
    df2_clear = df2_clear.set_index("file_name")
    df1 = df1.set_index("file_name")
    df2_clear = df2_clear.apply(pd.to_numeric, errors='coerce')

    return df1, df2_clear

df1, df2_clear = prep_data(path[0])

In [18]:

class tif_utils:
    """
        A set of utility functions for GeoTIFF files 
    """



    




# # predictions to tif
# output_dir = "/test1_1_predictions.tif"
# export_to_tif(predictions, bands=["numeric_label"], output_dir=output_dir, res=50, UTM_ESPG=32638, EPSG=4326)

In [None]:
def kmeans_clustering(df, k=10):
    """sklearn kmeans"""

    kmeans = KMeans(n_clusters=k, random_state=42) # TODO: make random_state random
    kmeans.fit(df)
    
    return kmeans.labels_, df.index.tolist()


class cluster:
    """
        Takes a set of polygons and a set of points with attached geometry 

        and runs clustering over all points in the polygons, and returns a set of clusters based on the points.
    """

    sparse_matrix = pd.DataFrame()

    def __init__(self, subregions, df_data, mapping, passes=6, aoi=None, index_column="file_name", points=gpd.GeoDataFrame()):
        """
            Initialises the cluster object with subregions, data, number of subregion passes, the area of interest (aoi), index column, and points.
        """

        # execution variables 
        self.passes = passes 

        # Data 
        self.data = df_data
        self.points = points
        self.mapping = mapping

        # Subregions 
        if aoi is None:
            aoi = subregions.dissolve()

        subregions_poly = create_subregions(chunks)
        subregions_poly = clip(subregions, aoi=aoi)
        self.subregions = subregions_poly

        self.index_column = index_column
        self.UTM_ESPG = 32638
        self.EPSG = 4326

        self.labels = None
        self.sparse_matrix = pd.DataFrame()
    
    def clip_dataframe(self, polygon, df):
        """
            Clips the DataFrame based on the .geo column and a given polygon.   
        
        Args:
            polygon (shapely.geometry.Polygon): The polygon to clip the DataFrame to.
            df (pd.DataFrame): The DataFrame containing the geometries to be clipped.
            
        Returns:
            pd.DataFrame: The clipped DataFrame.
        """

        # based on .geo column drop all rows that do not intersect with the polygon
        df['geometry'] = df['.geo'].apply(lambda x: shape(json.loads(x)))
        gdf = gpd.GeoDataFrame(df, geometry='geometry', crs=self.UTM_ESPG)
        gdf = gdf[gdf.geometry.within(polygon)]
        gdf = gdf.drop(columns=['geometry'])

        # convert back to DataFrame
        df_clipped = pd.DataFrame(gdf)

        return df_clipped
    
    def clip_polygon(chunks, aoi):
        """
            Clips the chunks or subregions to the area of interest (aoi).

            Parameters:
                chunks (GeoDataFrame):      The chunks or subregions to be clipped.
                aoi (GeoDataFrame):         The area of interest (aoi) to clip the chunks against.
        """

        # clip to aoi 
        if chunks.empty or aoi.empty:
            return gpd.GeoDataFrame(columns=chunks.columns.tolist(), crs=chunks.crs)
        clipped = gpd.clip(chunks, aoi)
        clipped = clipped[clipped.geometry.notnull()]  # Remove any null geometries

        return clipped.reset_index(drop=True)
    
    def split_df(self, df):
        """
            Creates two dataframes from a CSV file, one ready for clustering with additional columns removed and the other with all columns left intact.
        """

        df1 = df.copy()

        df1 = df1.dropna()
        df2_clear = df1.copy()
        df2_clear = df2_clear.drop(columns=["system:index", ".geo"])
        df2_clear = df2_clear.set_index("file_name")
        df1 = df1.set_index("file_name")
        df2_clear = df2_clear.apply(pd.to_numeric, errors='coerce')

        return df1, df2_clear
    
    def create_subregions(chunks, sift_percentage=0.5):
        """
            TODO: Replace with updated version above
        """

        if chunks.empty:
            return {}
        
        first_polygon = chunks.geometry.iloc[0]

        # take top two points of the polygon and get the length between them
        top_points = first_polygon.exterior.coords[:2]
        length_lon = abs(top_points[0][0] - top_points[1][0])

        # calculate the shift amount
        shift_amount = length_lon * sift_percentage

        shift_directions = {
            "left": (-shift_amount, 0),
            "right": (shift_amount, 0),
            "up": (0, shift_amount),
            "down": (0, -shift_amount),
            "top_left": (-shift_amount, shift_amount),
            "top_right": (shift_amount, shift_amount),
            "bottom_left": (-shift_amount, -shift_amount),
            "bottom_right": (shift_amount, -shift_amount),
        }

        subregions = []

        # Create subregions by shifting the geometries in all directions
        for _, (dx, dy) in shift_directions.items():
            gdf_shifted = chunks.copy()
            gdf_shifted["geometry"] = gdf_shifted["geometry"].translate(dx, dy)
            subregions.append(gdf_shifted)

        # Combine all into one GeoDataFrame
        subregions = pd.concat(subregions, ignore_index=True)
        return gpd.GeoDataFrame(subregions, crs=chunks.crs)
    
    @staticmethod
    def convert_df_to_geodf(df, geo_col='.geo', crs="EPSG:32638"):
        """
        Converts a DataFrame with a '.geo' column (GeoJSON strings) to a GeoDataFrame.
        """
        df = df.copy()

        if df.empty:
            return gpd.GeoDataFrame(columns=df.columns.tolist(), crs=crs)
        if df is type(gpd.DataFrame):
            return df 

        # Only parse if the entry is a string
        def safe_parse(x):
            if isinstance(x, str):
                try:
                    return shape(json.loads(x))
                except Exception as e:
                    print(f"[WARNING] Bad geometry skipped: {x[:30]}... ({e})")
            return None

        df['geometry'] = df[geo_col].apply(safe_parse)
        df = df[df['geometry'].notnull()]  # drop invalid rows

        return gpd.GeoDataFrame(df, geometry='geometry', crs=crs)
    
    ##################################################################################################################################################################
    # ================================================================== SAVE AND RESTORE ========================================================================== #
    ##################################################################################################################################################################

    def save_state(self, filname_prefix="temp_test/test2", filename_postfix="_state"):
        """
            Saves the current state of the cluster object to files for later use, only preserving class specifc data and not the spectral/time-series data.
        """


        # Save the Sparse Matrix (if exists)
        if self.sparse_matrix.empty:
            print("No sparse matrix to save.")
        else:
            df_sparse = self.sparse_matrix.copy()
            df_sparse = df_sparse.reset_index(drop=False) # Preserve index as its a uniuqe identifier external to class
            df_sparse.to_csv(f"{filname_prefix}{filename_postfix}_sparse_matrix.csv", index=False)


        # Save the Points and Labelled Data
        if self.points.empty:
            print("No points to save.")
        else:
            self.points.to_file(f"{filname_prefix}{filename_postfix}_points.geojson", driver='GeoJSON')


        # Save Labelled Data 
        if self.labels.empty:
            print("No assigned labels to save.")
        else:
            df_labels = self.labels.copy()
            df_labels = df_labels.reset_index(drop=False)
            df_labels = df_labels[['file_name', 'label']]  # Keep only core columns (.geo comes back later with data)
            df_labels.to_csv(f"{filname_prefix}{filename_postfix}_labels.csv", index=False) 


        # Save Subregions
        subregions = self.subregions.copy()
        if subregions.empty:
            print("No subregions to save.")
        else:
            subregions.to_file(f"{filname_prefix}{filename_postfix}_subregions.geojson", driver='GeoJSON')  

    def reload_state(self, filname_prefix="temp_test/test2", filename_postfix="_state"):
        """
            Reloads the state of the cluster object from files.
        """

    
        # Load Sparse Matrix (if exists)
        df_sparse = pd.read_csv(f"{filname_prefix}{filename_postfix}_sparse_matrix.csv")
        df_sparse = df_sparse.set_index("file_name")
        self.sparse_matrix = df_sparse
        

        # Load Points (if exists)
        dir_points = f"{filname_prefix}{filename_postfix}_points.geojson"
        with fiona.open(dir_points) as src:
                points = gpd.GeoDataFrame.from_features(src, crs=src.crs)
        self.points = points


        # Load Labels (if exists)
        data = self.data.copy()
        data = data.reset_index(drop=False)
        gdf_labels = pd.read_csv(f"{filname_prefix}{filename_postfix}_labels.csv")
        gdf_labels = gdf_labels.merge(data, on='file_name', how='inner') # ensures re-useabilty of labells on other data 
        gdf_labels['label'] = gdf_labels['label'].replace({np.NaN: None, "NaN": None, "None": None})
        gdf_labels = gdf_labels.set_index("file_name") 
        self.labels = gdf_labels

        
        # Load Subregions (if exists)
        dir_subregions = f"{filname_prefix}{filename_postfix}_subregions.geojson"
        with fiona.open(dir_subregions) as src:
            subregions = gpd.GeoDataFrame.from_features(src, crs=src.crs)
        self.subregions = subregions
        
        # return df_sparse, points, gdf_labels, subregions

    ##################################################################################################################################################################
    # ================================================================== EXPORT FUNCTIONS ========================================================================== #
    ##################################################################################################################################################################

    # generate tif 
    @staticmethod
    def os_path_chcker(output_dir, postfix=".tif", NAME_CODE_LIM=8, FLAG_APPEND_POSTFIX=True):
        """
            Ensures that the output directory is valid, exists and can be written to.

            This is a simple function that does three things:
                1. Checks if the directory exists, if not creates it.
                2. Checks if the file name is valid (and not empty, else creates a unique filename).
                3. (Optional) Appends a postfix to the file name if it does not already end with it.

        Args:
            output_dir (str): The directory path to check.
            postfix (str, optional): The postfix to append to the file name if it does not already end with it. Default is ".tif".
            NAME_CODE_LIM (int, optional): The length of the hex code to generate for filename. Default is 8.
            FLAG_APPEND_POSTFIX (bool, optional): Whether to append the postfix if the file name does not end with it. Default is True.
        """


        # path existance check
        if not os.path.exists(os.path.dirname(output_dir)):
            os.makedirs(os.path.dirname(output_dir))
            print(f"Created directory: {os.path.dirname(output_dir)}")

        # file name existance check
        flag_filename = not os.path.basename(output_dir) or output_dir.endswith("/")
        while flag_filename:
            print("Generating new file name...")
            # assign unique name to the tif file
            new_file_name = f"{os.urandom(NAME_CODE_LIM).hex()}{postfix}"
            if not os.path.exists(output_dir + new_file_name):
                output_dir += new_file_name
                flag_filename = False
                print(f"New File Name: {output_dir}")

        # check labelled correctly 
        if FLAG_APPEND_POSTFIX and not output_dir.endswith(postfix):
            print(f"Output directory {output_dir} does not end with .tif, appending .tif")
            output_dir += {postfix}
            
        return output_dir

    def export_to_tif(self, df, bands, output_dir, res=50, UTM_ESPG=32638, EPSG=4326):
        """
            Exports a DataFrame generated by PlotToSat to a GeoTIFF file. 

            Note: df files MUST CONTAIN:
            - `.geo` column with GeoJSON geometries.
            - `file_name` column with unique identifiers for each row.
            - 1 Unique band labelled in `bands` list - this CAN BE LABELLED CLUSTERS. 

            Expected formats:
                Time-Series data should be in a DataFrame with 'month_band' columns (e.g., '0_B1', '1_B2', etc.).
                if a band is provideda e.g.m B1 but there are no pre-fix values this data will be assumed to be a single band AS-IS, and will be included in the output just as that band alone. 

            Args:
                df (pd.DataFrame): PlotToSat Style pandas DataFrame containing time-series data or single band data. Must have file_name and `.geo` geometry columns.
                bands (list): List of band names to include in the output. e.g., ['B1', 'B2', 'B3'] or ['SingleBand', etc.] can be mixed with single band data.
                output_dir (str): Output file directory for the GeoTIFF.
                res (int, optional): Resolution of the output raster in meters. Default is 50m.
                UTM_ESPG (int, optional): EPSG code for the UTM coordinate reference system. Default is 32638.
                


                # EPSG (int, optional): EPSG code for the coordinate reference system.
        """

        # Ensure output directory is valid, exists and can be written to.
        file = self.os_path_chcker(output_dir, postfix=".tif", NAME_CODE_LIM=8, FLAG_APPEND_POSTFIX=True)


        # Sort the Bands into single and time-series band data. 
        column_heads = df.columns.tolist()

        
        # all available bands in the data
        band_columns = [col for col in column_heads if any(col.endswith(band) for band in bands)]


        # Acceptable bands to process 
        acceptable_lst = []
        for col in bands:
            if any(col.startswith(f"{i}_") for i in range(12)):
                # must be non-prefix and only one band of that ts 
                if col not in acceptable_lst and any(coli for coli in band_columns if coli == col):
                    acceptable_lst.append(col)
                # acceptable_lst.append(col)
            else:
                # must exist in band_columns and take all 
                appended = False
                for i in range(12):
                    if any(coli for coli in band_columns if coli == f"{i}_{col}"):
                        if f"{i}_{col}" not in acceptable_lst:
                            # add only if not already in list     
                            acceptable_lst.append(f"{i}_{col}")
                            appended = True

                if appended == False:
                    # if not appended must be a unique column 
                    acceptable_lst.append(col)
        
        band_columns = acceptable_lst

    
        geometry = df[".geo"].apply(lambda x: shape(json.loads(x)))
        gdf = gpd.GeoDataFrame(df[band_columns].copy(), geometry=geometry, crs=f"EPSG:{EPSG}") # EPSG not UTM_ESPG else will raise an error

        gdf_utm = gdf.to_crs(epsg=UTM_ESPG)

        minx, miny, maxx, maxy = gdf_utm.total_bounds
        width = int((maxx - minx) / res)
        height = int((maxy - miny) / res)

        transform = from_origin(minx, maxy, res, res)

        if width <= 0 or height <= 0:
            raise ValueError(f"Invalid raster dimensions (width={width}, height={height}). Check CRS and resolution.")


        # Build Raster Stack
        rasters = []
        for band in band_columns:
            values = gdf_utm[band]
            shapes = ((geom, val) for geom, val in zip(gdf_utm.geometry, values))
            raster = rasterize(
                shapes=shapes,
                out_shape=(height, width),
                transform=transform,
                dtype="float32",
                fill=np.NaN,  # Fill with NaN values 
            )

            rasters.append(raster)

        raster_stack = raster_stack = np.stack(rasters, axis=0) if len(rasters) >= 2 else rasters


        # Save rasters to GeoTIFF 
        with rasterio.open(
            file,
            "w",
            driver="GTiff",
            height=height,
            width=width,
            count=len(band_columns),
            dtype="float32",
            crs=f"EPSG:{UTM_ESPG}",
            transform=transform,
            nodata=np.NaN,  # Set NoData value to NaN
        ) as dst:
            for i, band in enumerate(band_columns):
                dst.write(raster_stack[i], i + 1)
                dst.set_band_description(i + 1, band) # Keep band discription!!!


    def __convert_to_map__(self):
        """
        Converts the sparse DataFrame of multiple cluster label columns to a single-column label prediction
        based on majority voting from known labels.

        Args:
            df_sparse (pd.DataFrame): DataFrame with 'cluster_label_*' columns and index as file_name.
            df_data (pd.DataFrame): Original data containing '.geo' and any other metadata.
            gdf_labelled (GeoDataFrame): GeoDataFrame with true labels and index as file_name.

        Returns:
            pd.DataFrame: DataFrame indexed by file_name, with predicted label (or None if unassignable).
        """

        df_sparse = self.sparse_matrix.copy()
        df_data = self.data.copy()
        gdf_labelled = self.labels.copy()

        # Ensure labels column exists
        if 'label' not in gdf_labelled.columns:
            raise ValueError("gdf_labelled must contain a 'label' column with ground truth labels.")

        predictions = pd.Series(index=df_sparse.index, dtype=object)

        for col in df_sparse.columns:
            if not col.startswith("cluster_label_"):
                continue

            # Get cluster IDs and their associated true labels
            cluster_ids = df_sparse[col]
            known_labels = gdf_labelled['label']

            # Build mapping: cluster_id -> list of known labels
            cluster_to_labels = {}
            for file_name, cluster_id in cluster_ids.items():
                if file_name in known_labels and pd.notna(known_labels[file_name]):
                    cluster_to_labels.setdefault(cluster_id, []).append(known_labels[file_name])

            # Compute majority label for each cluster
            cluster_to_majority = {}
            for cluster_id, labels in cluster_to_labels.items():
                if not labels:
                    cluster_to_majority[cluster_id] = None
                else:
                    label_counts = Counter(labels)
                    most_common = label_counts.most_common()
                    top_label = most_common[0][0] if len(most_common) == 1 or most_common[0][1] != most_common[1][1] \
                        else np.random.choice([l for l, c in most_common if c == most_common[0][1]])
                    cluster_to_majority[cluster_id] = top_label

            # Assign predicted label per row
            for idx in df_sparse.index:
                cluster_id = df_sparse.at[idx, col]
                label = cluster_to_majority.get(cluster_id, None)
                if pd.isna(predictions.at[idx]) and label is not None:
                    predictions.at[idx] = label

        # Create output DataFrame
        result_df = pd.DataFrame({'predicted_label': predictions})

        # Join metadata like `.geo` if needed
        if '.geo' in df_data.columns:
            result_df = result_df.join(df_data['.geo'])

        return result_df


    def create_map(self, filename=""):
        predictions = self.__convert_to_map__()

        # tif creation
        predictions['numeric_label'] = predictions['predicted_label'].map(self.mapping)
        output_path = self.os_path_chcker(filename)
        self.export_to_tif(predictions, bands=['numeric_label'], output_dir=output_path)

        return predictions

    ##################################################################################################################################################################
    # =============================================================== PRE-PROCESSING =============================================================================== #
    ##################################################################################################################################################################

    # def update_row_labels(self, gdf_labels):
    #     df_gdf = self.gdf_labels.copy()
    #     labels_gdf = gdf_labels.copy()

    #     df_gdf['class'] = None
    #     for _, label_row in labels_gdf.iterrows():
    #         print(f"progress: {_}/{len(labels_gdf)} labels processed.")
    #         label = label_row['label']
            
    #         # Check each point until assigned or dropped
    #         for idx, point in df_gdf.iterrows():
    #             if label_row.geometry.intersects(point.geometry):
    #                 df_gdf.at[idx, 'class'] = label
    #                 break

    #     self.gdf_labels = df_gdf

    #     return df_gdf


    def build_row_labels(self, label_row='label'):
        """
        Builds a DataFrame with labelled polygons based on the points in the GeoDataFrame.

        Args:
            df (GeoDataFrame): The GeoDataFrame containing the points.
            labels (GeoDataFrame): The GeoDataFrame containing the polygons and their labels.

        Returns:
            pd.DataFrame: A DataFrame with the labels for each point.
        """


        # Prepare and Ensure in GeoDataFrame format
        df_gdf = self.data.copy()
        if df_gdf is type(pd.DataFrame):
            df_gdf = self.convert_df_to_geodf(df_gdf, geo_col='.geo', crs=f"EPSG:{self.UTM_ESPG}")
        labels_gdf = self.points.copy()

        # check for intersections and assign labels
        df_gdf['label'] = None
        for _, label_row in labels_gdf.iterrows():
            print(f"progress: {_}/{len(labels_gdf)} labels processed.")
            label = label_row['label']
            
            # Check each point until assigned or dropped
            for idx, point in df_gdf.iterrows():
                if label_row.geometry.intersects(point.geometry):
                    df_gdf.at[idx, 'label'] = label
                    break

        self.labels = df_gdf

        return df_gdf
    
    ##################################################################################################################################################################
    # ================================================================ Label Recomendations ======================================================================== #
    ##################################################################################################################################################################

    def create_recommendations(self, filename="/"):
        """
            Based on the Points Creates a map of recommendations for labelling the points in the subregions. 

            The higher the value the higher the recommendation to label the point. 
        """

        # resolution (m)
        res = 50

        # labels 
        labels = self.points.copy()
        labelled_data = self.labels.copy()
        data = self.data.copy()

        # # height map from truth labels 
        # max_height = 100.0
        # data['distance_value'] = 0.0 # old is 0.0

        # for _, label_geometry in labels.iterrows():
        #     print(f"progress: {_}/{len(labels)} labels processed.")
        #     label = label_geometry['label']
        #     if pd.isna(label):
        #         continue
            
        #     # Calculate distance map where lowest values are the points closest to the label geometry
        #     distances = data.geometry.distance(label_geometry.geometry)
        #     # data['distance_value'] += (max_height - distances) / max_height
        #     data['distance_value'] += distances / max_height

        # # normalise by the number of labels
        # if len(labels) > 0:
        #     data['distance_value'] /= len(labels)

        # # flip vlaues by max to create a recommendation map
        # data['distance_value'] = max_height - data['distance_value']

        max_height = 100.0
        data['distance_value'] = 0.0 

        for _, label_geometry in labels.iterrows():
            print(f"progress: {_}/{len(labels)} labels processed.")

            distances = data.geometry.distance(label_geometry.geometry) 

            # if distance is greater than 500m away ignore so set to 0 
            distances[distances > 500] = 0.0

            # normal around the point from 100 to 0.0 at edge
            data['distance_value'] += (max_height - distances) / max_height

            # take max value when compared between data['distance_value'] and the new value
            data['distance_value'] = data['distance_value'].combine(data['distance_value'], max)  
            
        try:
            self.export_to_tif(data, bands=['distance_value'], output_dir=self.os_path_chcker(filename, postfix=".tif", NAME_CODE_LIM=8, FLAG_APPEND_POSTFIX=True), res=50, UTM_ESPG=self.UTM_ESPG, EPSG=self.EPSG)
        except Exception as e:
            print(f"Error: {e}, \n Could not export recommendations to TIF, Returing DataFrame, so you can try again (This may be filename related!)")

        return data
            
    ##################################################################################################################################################################
    # ========================================================================== CLUSTERING ======================================================================== #
    ##################################################################################################################################################################
        
    def fit(self, cluster_fn, index_column='file_name'):
        """
            Fits the clustering model to the data.
            This method should be implemented in subclasses.
        """

        subregions_data = []

        for _, subregion in self.subregions.iterrows():
            # clip the data to the subregion
            # based on .geo column drop all rows that do not intersect with the polygon

            df_clipped = self.clip_dataframe(subregion.geometry, self.data)
            df_clipped = df_clipped.dropna()
            df_clipped = df_clipped.drop(columns=["system:index", ".geo"]) # unessusary columns for clustering
            # df_clipped = df_clipped.set_index("file_name")
            df_clipped = df_clipped.apply(pd.to_numeric, errors='coerce')

            if df_clipped.empty:
                print("No data points found in this subregion.")
                continue

            subregions_data.append(df_clipped)

        matrix = pd.DataFrame()
        index_number = 0

        for subregion_df in subregions_data:
            print(f"Clustering {len(subregion_df)} points from subregion.")

            for i in range(self.passes):
                # Perform clustering
                labels, indecies = cluster_fn(subregion_df)

                col_name = f'cluster_{index_number}'

                # Temporary DataFrame
                temp_df = pd.DataFrame({col_name: labels, 'file_name': indecies})

                # Group in case of duplicates
                temp_grouped = temp_df.groupby('file_name')[col_name].first()

                # Convert to DataFrame for merging
                temp_grouped = temp_grouped.to_frame()

                print(f"temp columns: {temp_grouped.columns}")

                # Merge with main matrix
                matrix = matrix.join(temp_grouped, how='outer') if not matrix.empty else temp_grouped

                index_number += 1
                print(len(matrix.columns), "columns in the matrix after clustering.")
                print(len(matrix), "rows in the matrix after clustering.")

        matrix = matrix.join(self.data[[".geo"]], how='outer')
        self.sparse_matrix = matrix

        return matrix
    

In [122]:
mapping = {
            None: 0,
            np.NaN: 0,
            "NaN": 0,
            "None": 0,
            "Water": 1,
            "Agricultural": 2,
            "Urban": 3,
            "Wasteland": 4,
        }

In [123]:
c = cluster(chunks, df1, points=labels, mapping=mapping, passes=6)

In [125]:
c.reload_state(filname_prefix="temp_test/test3", filename_postfix="_state")

In [None]:
c.sparse_matrix.head()

Unnamed: 0_level_0,cluster_label_0,.geo
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1
tile_43637469_34558614_43638574_34559528,,"{""type"":""Polygon"",""coordinates"":[[[43.63748527..."
tile_43637484_34557712_43638589_34558626,,"{""type"":""Polygon"",""coordinates"":[[[43.63749867..."
tile_43637499_34556811_43638603_34557725,,"{""type"":""Polygon"",""coordinates"":[[[43.63751207..."
tile_43637514_34555909_43638618_34556823,,"{""type"":""Polygon"",""coordinates"":[[[43.63752986..."
tile_43637528_34555008_43638633_34555922,,"{""type"":""Polygon"",""coordinates"":[[[43.63754326..."


In [130]:
c.labels

AttributeError: 'cluster' object has no attribute 'labels'

In [None]:
mapping = {
            None: 0,
            np.NaN: 0,
            "NaN": 0,
            "None": 0,
            "Water": 1,
            "Agricultural": 2,
            "Urban": 3,
            "Wasteland": 4,
        }


# # THIS WORKS 
c = cluster(chunks, df1, points=labels, mapping=mapping, passes=6)
c.build_row_labels()
# c.reload("temp_test/test1_data.geojson", "temp_test/test1_labels.geojson", "temp_test/test1_sparse.csv", "temp_test/test1_points.geojson")
# c.create_recommendations()
# # c.create_map("/temptest2.tif")

# NOTE: test1_sparse.csv does NOT HAVE CORRECT DATA 



# c.build_row_labels() # TODO: add passes 
# self.data = gpd.read_file(dir_data)
# self.gdf_labels = gpd.read_file(dir_labels)
# self.df_sparse = pd.read_csv(dir_sparse, index_col=0)
# self.points = gpd.read_file(dir_points)

progress: 0/21 labels processed.
progress: 1/21 labels processed.
progress: 2/21 labels processed.
progress: 3/21 labels processed.
progress: 4/21 labels processed.
progress: 5/21 labels processed.
progress: 6/21 labels processed.
progress: 7/21 labels processed.
progress: 8/21 labels processed.
progress: 9/21 labels processed.
progress: 10/21 labels processed.
progress: 11/21 labels processed.
progress: 12/21 labels processed.
progress: 13/21 labels processed.
progress: 14/21 labels processed.
progress: 15/21 labels processed.
progress: 16/21 labels processed.
progress: 17/21 labels processed.
progress: 18/21 labels processed.
progress: 19/21 labels processed.
progress: 20/21 labels processed.


Unnamed: 0_level_0,system:index,0_B1,0_B11,0_B12,0_B2,0_B3,0_B4,0_B5,0_B6,0_B7,...,9_B4,9_B5,9_B6,9_B7,9_B8,9_B8A,9_B9,.geo,geometry,label
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tile_43997486_34112017_43998137_34112113,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,769.925866,3322.198887,2644.624125,1128.620894,1753.336923,2010.568300,2634.353258,3302.266379,3455.818704,...,3334.350796,3648.235249,3713.545965,3826.787522,3871.171013,3920.213786,3837.507039,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,...",
tile_44104382_34112843_44105034_34112938,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,298.404905,2157.473569,1582.494005,528.488556,831.356676,934.839237,1352.791826,1881.520708,2069.254223,...,662.603379,1236.062398,2438.234332,2861.378801,3031.939292,3114.423978,2432.149682,"{""type"":""Polygon"",""coordinates"":[[[44.10438435...","POLYGON ((44.10438435337108 34.11293465494744,...",
tile_43998136_34112022_43999221_34112122,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,528.454970,2495.875662,1713.649631,640.474810,1152.119767,1171.487171,1786.087047,3059.746338,3359.432014,...,2124.370565,2445.911989,2515.677113,2636.289053,2605.584472,2756.973423,3091.114321,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,...",
tile_43999220_34112031_44000305_34112130,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,335.551290,2071.815893,1344.805779,429.478019,894.811249,770.613622,1463.251290,2839.394324,3180.367699,...,1854.067183,2060.790668,2105.297921,2202.409155,2225.273773,2299.559797,2382.651482,"{""type"":""Polygon"",""coordinates"":[[[43.99922097...",POLYGON ((43.99922097074953 34.112123037328075...,
tile_44000304_34112040_44001389_34112139,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,285.086948,1878.116894,1177.896530,406.417493,840.937113,721.051941,1302.629802,2778.296159,3145.126910,...,1850.829881,2059.086564,2108.126792,2208.764339,2278.053815,2330.043238,2451.489202,"{""type"":""Polygon"",""coordinates"":[[[44.00030452...","POLYGON ((44.00030452142709 34.11213195754417,...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tile_43910796_34410765_43911896_34411676,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,668.574031,3216.564180,2568.033046,1062.455223,1676.717285,1928.949963,2485.217906,3227.529617,3400.955893,...,2976.241628,3257.947338,3345.804999,3482.857085,3577.533450,3578.926340,3614.595852,"{""type"":""Polygon"",""coordinates"":[[[43.91080588...","POLYGON ((43.910805889628 34.41076691363211, 4...",
tile_43910784_34411666_43911884_34412578,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,598.349524,2971.679810,2268.055660,864.023121,1434.253042,1580.016358,2230.677262,3098.499345,3286.429584,...,2695.897213,2963.722553,3035.604837,3159.457128,3250.330399,3288.109756,3387.261634,"{""type"":""Polygon"",""coordinates"":[[[43.91079693...","POLYGON ((43.91079693382084 34.41166767820974,...",
tile_43910773_34412568_43911872_34413479,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,323.005126,2383.341164,1565.403920,463.461825,965.500234,888.312974,1656.546377,3127.376917,3492.560795,...,2306.326864,2529.101203,2555.877340,2659.343720,2718.644720,2766.098965,2802.713800,"{""type"":""Polygon"",""coordinates"":[[[43.91078360...","POLYGON ((43.9107836093242 34.41256834144177, ...",
tile_43910761_34413470_43911861_34414381,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,230.458565,2142.639905,1354.634793,379.240066,876.236170,753.080647,1475.939422,2946.870295,3318.420955,...,2376.841045,2593.260119,2615.950813,2716.379853,2742.852885,2768.013015,2788.673096,"{""type"":""Polygon"",""coordinates"":[[[43.91077465...",POLYGON ((43.910774653512924 34.41346909008774...,


In [None]:
c.fit(kmeans_clustering)

Clustering 61916 points from subregion.
temp columns: Index(['cluster_label_0'], dtype='object')
1 columns in the matrix after clustering.
61916 rows in the matrix after clustering.


Unnamed: 0_level_0,cluster_label_0,.geo
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1
tile_43637469_34558614_43638574_34559528,,"{""type"":""Polygon"",""coordinates"":[[[43.63748527..."
tile_43637484_34557712_43638589_34558626,,"{""type"":""Polygon"",""coordinates"":[[[43.63749867..."
tile_43637499_34556811_43638603_34557725,,"{""type"":""Polygon"",""coordinates"":[[[43.63751207..."
tile_43637514_34555909_43638618_34556823,,"{""type"":""Polygon"",""coordinates"":[[[43.63752986..."
tile_43637528_34555008_43638633_34555922,,"{""type"":""Polygon"",""coordinates"":[[[43.63754326..."
...,...,...
tile_44187802_33891657_44188892_33892565,,"{""type"":""Polygon"",""coordinates"":[[[44.18780074..."
tile_44187810_33890755_44188900_33891664,,"{""type"":""Polygon"",""coordinates"":[[[44.18780972..."
tile_44187819_33889853_44188909_33890762,,"{""type"":""Polygon"",""coordinates"":[[[44.18781862..."
tile_44187827_33888951_44188917_33889860,,"{""type"":""Polygon"",""coordinates"":[[[44.18782753..."


In [None]:
c.data.head()

Unnamed: 0_level_0,system:index,0_B1,0_B11,0_B12,0_B2,0_B3,0_B4,0_B5,0_B6,0_B7,...,9_B3,9_B4,9_B5,9_B6,9_B7,9_B8,9_B8A,9_B9,.geo,geometry
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tile_43997486_34112017_43998137_34112113,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,769.925866,3322.198887,2644.624125,1128.620894,1753.336923,2010.5683,2634.353258,3302.266379,3455.818704,...,2541.523502,3334.350796,3648.235249,3713.545965,3826.787522,3871.171013,3920.213786,3837.507039,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,..."
tile_44104382_34112843_44105034_34112938,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,298.404905,2157.473569,1582.494005,528.488556,831.356676,934.839237,1352.791826,1881.520708,2069.254223,...,801.668174,662.603379,1236.062398,2438.234332,2861.378801,3031.939292,3114.423978,2432.149682,"{""type"":""Polygon"",""coordinates"":[[[44.10438435...","POLYGON ((44.10438435337108 34.11293465494744,..."
tile_43998136_34112022_43999221_34112122,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,528.45497,2495.875662,1713.649631,640.47481,1152.119767,1171.487171,1786.087047,3059.746338,3359.432014,...,1593.018742,2124.370565,2445.911989,2515.677113,2636.289053,2605.584472,2756.973423,3091.114321,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,..."
tile_43999220_34112031_44000305_34112130,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,335.55129,2071.815893,1344.805779,429.478019,894.811249,770.613622,1463.25129,2839.394324,3180.367699,...,1394.102079,1854.067183,2060.790668,2105.297921,2202.409155,2225.273773,2299.559797,2382.651482,"{""type"":""Polygon"",""coordinates"":[[[43.99922097...",POLYGON ((43.99922097074953 34.112123037328075...
tile_44000304_34112040_44001389_34112139,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,285.086948,1878.116894,1177.89653,406.417493,840.937113,721.051941,1302.629802,2778.296159,3145.12691,...,1386.38706,1850.829881,2059.086564,2108.126792,2208.764339,2278.053815,2330.043238,2451.489202,"{""type"":""Polygon"",""coordinates"":[[[44.00030452...","POLYGON ((44.00030452142709 34.11213195754417,..."


In [None]:
c.labels.head()

Unnamed: 0_level_0,system:index,0_B1,0_B11,0_B12,0_B2,0_B3,0_B4,0_B5,0_B6,0_B7,...,9_B4,9_B5,9_B6,9_B7,9_B8,9_B8A,9_B9,.geo,geometry,label
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tile_43997486_34112017_43998137_34112113,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,769.925866,3322.198887,2644.624125,1128.620894,1753.336923,2010.5683,2634.353258,3302.266379,3455.818704,...,3334.350796,3648.235249,3713.545965,3826.787522,3871.171013,3920.213786,3837.507039,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,...",
tile_44104382_34112843_44105034_34112938,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,298.404905,2157.473569,1582.494005,528.488556,831.356676,934.839237,1352.791826,1881.520708,2069.254223,...,662.603379,1236.062398,2438.234332,2861.378801,3031.939292,3114.423978,2432.149682,"{""type"":""Polygon"",""coordinates"":[[[44.10438435...","POLYGON ((44.10438435337108 34.11293465494744,...",
tile_43998136_34112022_43999221_34112122,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,528.45497,2495.875662,1713.649631,640.47481,1152.119767,1171.487171,1786.087047,3059.746338,3359.432014,...,2124.370565,2445.911989,2515.677113,2636.289053,2605.584472,2756.973423,3091.114321,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,...",
tile_43999220_34112031_44000305_34112130,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,335.55129,2071.815893,1344.805779,429.478019,894.811249,770.613622,1463.25129,2839.394324,3180.367699,...,1854.067183,2060.790668,2105.297921,2202.409155,2225.273773,2299.559797,2382.651482,"{""type"":""Polygon"",""coordinates"":[[[43.99922097...",POLYGON ((43.99922097074953 34.112123037328075...,
tile_44000304_34112040_44001389_34112139,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,285.086948,1878.116894,1177.89653,406.417493,840.937113,721.051941,1302.629802,2778.296159,3145.12691,...,1850.829881,2059.086564,2108.126792,2208.764339,2278.053815,2330.043238,2451.489202,"{""type"":""Polygon"",""coordinates"":[[[44.00030452...","POLYGON ((44.00030452142709 34.11213195754417,...",


In [None]:
temp_labells_2 = c.labels

temp_labells_2 = temp_labells_2.reset_index(drop=False)
# only keep ['file_name', 'label', '.geo'] and save as csv as standard pandas not GeoDataFrame
temp_labells_2 = temp_labells_2[['file_name', 'label']]
# temp_labells_2['file_name'] = temp_labells_2['file_name'].astype(str) 

temp_labells_2.head()


Unnamed: 0,file_name,label
0,tile_43997486_34112017_43998137_34112113,
1,tile_44104382_34112843_44105034_34112938,
2,tile_43998136_34112022_43999221_34112122,
3,tile_43999220_34112031_44000305_34112130,
4,tile_44000304_34112040_44001389_34112139,


In [None]:
# reload with data 
temp_data_labells_2 = c.data.copy()
temp_data_labells_2 = temp_data_labells_2.reset_index(drop=False)

temp_labells_2 = temp_labells_2.merge(temp_data_labells_2, on='file_name', how='left')
temp_labells_2.set_index('file_name', inplace=True)

In [89]:
temp_labells_2.head()

Unnamed: 0_level_0,label,system:index_x,0_B1_x,0_B11_x,0_B12_x,0_B2_x,0_B3_x,0_B4_x,0_B5_x,0_B6_x,...,9_B3_y,9_B4_y,9_B5_y,9_B6_y,9_B7_y,9_B8_y,9_B8A_y,9_B9_y,.geo_y,geometry_y
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tile_43997486_34112017_43998137_34112113,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,769.925866,3322.198887,2644.624125,1128.620894,1753.336923,2010.5683,2634.353258,3302.266379,...,2541.523502,3334.350796,3648.235249,3713.545965,3826.787522,3871.171013,3920.213786,3837.507039,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,..."
tile_44104382_34112843_44105034_34112938,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,298.404905,2157.473569,1582.494005,528.488556,831.356676,934.839237,1352.791826,1881.520708,...,801.668174,662.603379,1236.062398,2438.234332,2861.378801,3031.939292,3114.423978,2432.149682,"{""type"":""Polygon"",""coordinates"":[[[44.10438435...","POLYGON ((44.10438435337108 34.11293465494744,..."
tile_43998136_34112022_43999221_34112122,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,528.45497,2495.875662,1713.649631,640.47481,1152.119767,1171.487171,1786.087047,3059.746338,...,1593.018742,2124.370565,2445.911989,2515.677113,2636.289053,2605.584472,2756.973423,3091.114321,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,..."
tile_43999220_34112031_44000305_34112130,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,335.55129,2071.815893,1344.805779,429.478019,894.811249,770.613622,1463.25129,2839.394324,...,1394.102079,1854.067183,2060.790668,2105.297921,2202.409155,2225.273773,2299.559797,2382.651482,"{""type"":""Polygon"",""coordinates"":[[[43.99922097...",POLYGON ((43.99922097074953 34.112123037328075...
tile_44000304_34112040_44001389_34112139,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,285.086948,1878.116894,1177.89653,406.417493,840.937113,721.051941,1302.629802,2778.296159,...,1386.38706,1850.829881,2059.086564,2108.126792,2208.764339,2278.053815,2330.043238,2451.489202,"{""type"":""Polygon"",""coordinates"":[[[44.00030452...","POLYGON ((44.00030452142709 34.11213195754417,..."


In [None]:
c.points.head()

Unnamed: 0,geometry,label
0,POINT (43.84178 34.25368),Water
1,POINT (43.81311 34.29368),Water
2,POINT (43.81045 34.31297),Water
3,POINT (43.78648 34.34544),Water
4,POINT (43.76761 34.38654),Water


In [None]:
c.subregions.head()

Unnamed: 0,geometry,count,file_name,label,tile_statu
0,"POLYGON ((43.91285 34.33684, 43.91574 34.11141...",6250000,tile_43641125_34108721_43915744_34336837,3000000,acceptable


In [None]:
def save_state(c, filname_prefix="temp_test/test2", filename_postfix="_state"):
    """
        Saves the current state of the cluster object to files for later use, only preserving class specifc data and not the spectral/time-series data.
    """


    # Save the Sparse Matrix (if exists)
    if c.df_sparse.empty:
        print("No sparse matrix to save.")
    else:
        df_sparse = c.df_sparse.copy()
        df_sparse = df_sparse.reset_index(drop=False) # Preserve index as its a uniuqe identifier external to class
        df_sparse.to_csv(f"{filname_prefix}{filename_postfix}_sparse_matrix.csv", index=False)


    # Save the Points and Labelled Data
    if c.points.empty:
        print("No points to save.")
    else:
        c.points.to_file(f"{filname_prefix}{filename_postfix}_points.geojson", driver='GeoJSON')


    # Save Labelled Data 
    if c.gdf_labels.empty:
        print("No assigned labels to save.")
    else:
        df_labels = c.gdf_labels.copy()
        df_labels = df_labels.reset_index(drop=False)
        df_labels = df_labels[['file_name', 'label']]  # Keep only core columns (.geo comes back later with data)
        df_labels.to_csv(f"{filname_prefix}{filename_postfix}_labels.csv", index=False) 


    # Save Subregions
    subregions = c.subregions.copy()
    if subregions.empty:
        print("No subregions to save.")
    else:
        subregions.to_file(f"{filname_prefix}{filename_postfix}_subregions.geojson", driver='GeoJSON')     

def reload_state(c, filname_prefix="temp_test/test2", filename_postfix="_state"):
    """
        Reloads the state of the cluster object from files.
    """

    # Load Sparse Matrix (if exists)
    df_sparse = pd.read_csv(f"{filname_prefix}{filename_postfix}_sparse_matrix.csv")
    df_sparse = df_sparse.set_index("file_name")
    # c.df_sparse = df_sparse
    

    # Load Points (if exists)
    dir_points = f"{filname_prefix}{filename_postfix}_points.geojson"
    with fiona.open(dir_points) as src:
            points = gpd.GeoDataFrame.from_features(src, crs=src.crs)
    # c.points = points


    # Load Labels (if exists)
    data = c.data.copy()
    data = data.reset_index(drop=False)
    gdf_labels = pd.read_csv(f"{filname_prefix}{filename_postfix}_labels.csv")
    gdf_labels = gdf_labels.merge(data, on='file_name', how='inner') # ensures re-useabilty of labells on other data 
    gdf_labels['label'] = gdf_labels['label'].replace({np.NaN: None, "NaN": None, "None": None})
    gdf_labels = gdf_labels.set_index("file_name") 
    # c.gdf_labels = gdf_labels

    
    # Load Subregions (if exists)
    dir_subregions = f"{filname_prefix}{filename_postfix}_subregions.geojson"
    with fiona.open(dir_subregions) as src:
        subregions = gpd.GeoDataFrame.from_features(src, crs=src.crs)
    # c.subregions = subregions
    
    return df_sparse, points, gdf_labels, subregions

save_state(c, filname_prefix="temp_test/test3", filename_postfix="_state")
temp_test3_sparse, temp_test3_points, temp_test4_labells, subregions = reload_state(self, filname_prefix="temp_test/test3", filename_postfix="_state")

In [None]:
c.labels.head()

Unnamed: 0_level_0,system:index,0_B1,0_B11,0_B12,0_B2,0_B3,0_B4,0_B5,0_B6,0_B7,...,9_B4,9_B5,9_B6,9_B7,9_B8,9_B8A,9_B9,.geo,geometry,label
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tile_43997486_34112017_43998137_34112113,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,769.925866,3322.198887,2644.624125,1128.620894,1753.336923,2010.5683,2634.353258,3302.266379,3455.818704,...,3334.350796,3648.235249,3713.545965,3826.787522,3871.171013,3920.213786,3837.507039,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,...",
tile_44104382_34112843_44105034_34112938,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,298.404905,2157.473569,1582.494005,528.488556,831.356676,934.839237,1352.791826,1881.520708,2069.254223,...,662.603379,1236.062398,2438.234332,2861.378801,3031.939292,3114.423978,2432.149682,"{""type"":""Polygon"",""coordinates"":[[[44.10438435...","POLYGON ((44.10438435337108 34.11293465494744,...",
tile_43998136_34112022_43999221_34112122,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,528.45497,2495.875662,1713.649631,640.47481,1152.119767,1171.487171,1786.087047,3059.746338,3359.432014,...,2124.370565,2445.911989,2515.677113,2636.289053,2605.584472,2756.973423,3091.114321,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,...",
tile_43999220_34112031_44000305_34112130,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,335.55129,2071.815893,1344.805779,429.478019,894.811249,770.613622,1463.25129,2839.394324,3180.367699,...,1854.067183,2060.790668,2105.297921,2202.409155,2225.273773,2299.559797,2382.651482,"{""type"":""Polygon"",""coordinates"":[[[43.99922097...",POLYGON ((43.99922097074953 34.112123037328075...,
tile_44000304_34112040_44001389_34112139,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,285.086948,1878.116894,1177.89653,406.417493,840.937113,721.051941,1302.629802,2778.296159,3145.12691,...,1850.829881,2059.086564,2108.126792,2208.764339,2278.053815,2330.043238,2451.489202,"{""type"":""Polygon"",""coordinates"":[[[44.00030452...","POLYGON ((44.00030452142709 34.11213195754417,...",


In [118]:
temp_test4_labells.head()

Unnamed: 0_level_0,label,system:index,0_B1,0_B11,0_B12,0_B2,0_B3,0_B4,0_B5,0_B6,...,9_B3,9_B4,9_B5,9_B6,9_B7,9_B8,9_B8A,9_B9,.geo,geometry
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tile_43997486_34112017_43998137_34112113,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,769.925866,3322.198887,2644.624125,1128.620894,1753.336923,2010.5683,2634.353258,3302.266379,...,2541.523502,3334.350796,3648.235249,3713.545965,3826.787522,3871.171013,3920.213786,3837.507039,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,..."
tile_44104382_34112843_44105034_34112938,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,298.404905,2157.473569,1582.494005,528.488556,831.356676,934.839237,1352.791826,1881.520708,...,801.668174,662.603379,1236.062398,2438.234332,2861.378801,3031.939292,3114.423978,2432.149682,"{""type"":""Polygon"",""coordinates"":[[[44.10438435...","POLYGON ((44.10438435337108 34.11293465494744,..."
tile_43998136_34112022_43999221_34112122,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,528.45497,2495.875662,1713.649631,640.47481,1152.119767,1171.487171,1786.087047,3059.746338,...,1593.018742,2124.370565,2445.911989,2515.677113,2636.289053,2605.584472,2756.973423,3091.114321,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,..."
tile_43999220_34112031_44000305_34112130,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,335.55129,2071.815893,1344.805779,429.478019,894.811249,770.613622,1463.25129,2839.394324,...,1394.102079,1854.067183,2060.790668,2105.297921,2202.409155,2225.273773,2299.559797,2382.651482,"{""type"":""Polygon"",""coordinates"":[[[43.99922097...",POLYGON ((43.99922097074953 34.112123037328075...
tile_44000304_34112040_44001389_34112139,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,285.086948,1878.116894,1177.89653,406.417493,840.937113,721.051941,1302.629802,2778.296159,...,1386.38706,1850.829881,2059.086564,2108.126792,2208.764339,2278.053815,2330.043238,2451.489202,"{""type"":""Polygon"",""coordinates"":[[[44.00030452...","POLYGON ((44.00030452142709 34.11213195754417,..."


In [109]:
temp_test4_labells.head()

Unnamed: 0_level_0,label,system:index,0_B1,0_B11,0_B12,0_B2,0_B3,0_B4,0_B5,0_B6,...,9_B3,9_B4,9_B5,9_B6,9_B7,9_B8,9_B8A,9_B9,.geo,geometry
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tile_43997486_34112017_43998137_34112113,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,769.925866,3322.198887,2644.624125,1128.620894,1753.336923,2010.5683,2634.353258,3302.266379,...,2541.523502,3334.350796,3648.235249,3713.545965,3826.787522,3871.171013,3920.213786,3837.507039,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,..."
tile_44104382_34112843_44105034_34112938,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,298.404905,2157.473569,1582.494005,528.488556,831.356676,934.839237,1352.791826,1881.520708,...,801.668174,662.603379,1236.062398,2438.234332,2861.378801,3031.939292,3114.423978,2432.149682,"{""type"":""Polygon"",""coordinates"":[[[44.10438435...","POLYGON ((44.10438435337108 34.11293465494744,..."
tile_43998136_34112022_43999221_34112122,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,528.45497,2495.875662,1713.649631,640.47481,1152.119767,1171.487171,1786.087047,3059.746338,...,1593.018742,2124.370565,2445.911989,2515.677113,2636.289053,2605.584472,2756.973423,3091.114321,"{""type"":""Polygon"",""coordinates"":[[[43.99813739...","POLYGON ((43.99813739573809 34.11211410383254,..."
tile_43999220_34112031_44000305_34112130,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,335.55129,2071.815893,1344.805779,429.478019,894.811249,770.613622,1463.25129,2839.394324,...,1394.102079,1854.067183,2060.790668,2105.297921,2202.409155,2225.273773,2299.559797,2382.651482,"{""type"":""Polygon"",""coordinates"":[[[43.99922097...",POLYGON ((43.99922097074953 34.112123037328075...
tile_44000304_34112040_44001389_34112139,,1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_1_...,285.086948,1878.116894,1177.89653,406.417493,840.937113,721.051941,1302.629802,2778.296159,...,1386.38706,1850.829881,2059.086564,2108.126792,2208.764339,2278.053815,2330.043238,2451.489202,"{""type"":""Polygon"",""coordinates"":[[[44.00030452...","POLYGON ((44.00030452142709 34.11213195754417,..."


In [21]:
# OLD OLD OLD OLD OLD OLD OLD OLD 

# def save_processed_label_data(geodf, filename):
#     """Saves the processed GeoDataFrame to a file."""
#     if isinstance(geodf, gpd.GeoDataFrame):
#         geodf.to_file(filename, driver='GeoJSON')
#         print(f"GeoDataFrame saved to {filename}")
#     elif isinstance(geodf, pd.DataFrame):
#         geodf.to_csv(filename, index=True)
#         print(f"DataFrame saved to {filename}")

# # save_processed_label_data(c.gdf_labels, "temp_test/test1_labels.geojson")
# save_processed_label_data(c.df_sparse, "temp_test/test1_sparse.csv")
# save_processed_label_data(c.data, "temp_test/test1_data.geojson")
# save_processed_label_data(c.points, "temp_test/test1_points.geojson")
# save_processed_label_data(c.subregions, "temp_test/test1_subregions.geojson")


# OLDER OLDER OLDER OLDER OLDER 
# # save c.df_sparse to a file #TODOFINDMEREF
# save_processed_label_data(c.df_sparse, "data/df_sparse_data.json")
# save_processed_label_data(c.gdf_labels, "data/labels.geojson")
###save_processed_label_data(c.data, "data/subregions.geojson") # may be useless 

In [22]:
# wasteland_update_map = gen_basemap(basemap_url, aoi=chunks)
# wasteland_update_map

In [23]:
# wasteland_update_labels = collect_points_from_geemap(wasteland_update_map, label="Wasteland")
# c.update_row_labels(wasteland_update_labels)
# c.create_map("/temptest3.tif")

In [24]:
class ts_cluster():

    yearly_clusters = []

    def __init__(self, ts_point_labels, data_dir, mapping, passes=6):
        """
            Initializes the ts_cluster object with time-series point labels, data directory, mapping, and number of passes.
        """
        self.ts_point_labels = ts_point_labels          # expected labels in format year_start, year_end, label and point geometry 
        self.data_dir = data_dir                        # expected to have file_dir and year columns in pd.DataFrame format  
        self.mapping = mapping                          # mapping for labels to numeric values 
        self.passes = passes                            # int 
    

    def __create_shared_label_list__(self):
        """
            Creates a list of shared labels acrross ALL time-series points, such that it can act as a base for labels in the clustering process.
        """

        shared_labels = pd.DataFrame(columns=["year_start", "year_end", "label", "geometry"])
        non_shared_labels = pd.DataFrame(columns=["year_start", "year_end", "label", "geometry"])

        min_year = self.data_dir['year'].min()
        max_year = self.data_dir['year'].max()

        # if years between (incluside) are in ts_point_labels, then add to shared_labels else add to non_shared_labels
        for year in range(min_year, max_year + 1):
            if year in self.ts_point_labels['year_start'].values:
                shared_labels = shared_labels.append(self.ts_point_labels[self.ts_point_labels['year_start'] == year], ignore_index=True)

        # else 
        non_shared_labels = self.ts_point_labels[~self.ts_point_labels['year_start'].isin(shared_labels['year_start'])]

    

In [25]:
# x_1 = 2019
# x_2 = 2024

# for y in range(x_1, x_2 + 1):
#     print(f"Processing year: {y}")