In [1]:
import os
import json
import fiona
import geemap
import numpy as np
import pandas as pd
import geopandas as gpd
import xml.etree.ElementTree as ET

from collections import Counter

from shapely.geometry import shape
from shapely.geometry import Point

# Gee and EE 
import ee

# Clustering (best for testing** due to speed)
from sklearn.cluster import KMeans

# tif file creation
import rasterio
from rasterio.transform import from_origin
from rasterio.features import rasterize

# Plotting and Vis 
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

import re

In [2]:
# Initialise Earth Engine: authenticate this session, then bind the client
# to the Cloud project that the requests below run under.
# NOTE(review): the project id is hard-coded; consider moving it to a config cell.
ee.Authenticate()
ee.Initialize(project="jameswilliamchamberlain")

In [3]:
# Basemap tile URL template (registered on the map below as "Google Satellite")
basemap_url = 'https://mt1.google.com/vt/lyrs=y&x={x}&y={y}&z={z}'

# Central point of Samarra, stored as a one-row table
# (used below to add a marker and to centre the initial map view)
df_sites = pd.DataFrame({
    "longitude": [43.823543],
    "latitude": [34.340989],
    "name": ["Samarra Archaeological City"],
    "category": ["Cultural"],
    "date inscribed": ["2007"],
    "region": ["Arab States"],
    "url": ["https://whc.unesco.org/en/list/276"],
    "iso": [["IQ"]]
})

# Chunks of Samarra Archaeological City, read from a shapefile into a GeoDataFrame
# that keeps the CRS declared by the file
with fiona.open("chunks_new.shp") as src:
    chunks = gpd.GeoDataFrame.from_features(src, crs=src.crs)

In [4]:
## TEMP REMOVE ADDITIONAL STUFF FOR QUICKER TESTING 
# tile_43641125_34108721_43915744_34336837 only 

# chunks = chunks[chunks['file_name'] == 'tile_43641125_34108721_43915744_34336837']

In [5]:
m = geemap.Map()

# Mark the site(s) and centre the view on their mean coordinate
if not df_sites.empty:
    m.add_points_from_xy(df_sites, x="longitude", y="latitude", layer_name="Sites")
    site_centre = df_sites[['longitude', 'latitude']].mean().values
    m.setCenter(site_centre[0], site_centre[1], 10)
else:
    print("No sites found for the specified URL.")

m.add_basemap(basemap_url, name="Google Satellite", attribution="Google")

# Overlay the AOI chunks as translucent red polygons
aoi_style = {"color": "red", "fillColor": "red", "fillOpacity": 0.1}
m.add_gdf(chunks, layer_name="AOI", style=aoi_style)

m

Map(center=[34.340989, 43.823543], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=Sear…

In [6]:
def create_subregions(chunks, sift_percentage_lon=0.5, sift_percentage_lat=0.415, max_overlap=0.5):
    """
        Builds overlapping subregions by translating every chunk polygon in 8 directions
        (left, right, up, down and the four diagonals).

        The shift distance is derived from the FIRST polygon only: the longitude extent of its first
        exterior edge is multiplied by `sift_percentage_lon` for the x shift and by `sift_percentage_lat`
        for the y shift. Both shifts are scaled from the longitude length, so `sift_percentage_lat`
        is expressed relative to the tile width, not its height.

        A shifted polygon is dropped when its intersection with a polygon of an already generated
        direction covers more than `max_overlap` of that existing polygon.

        Assumes chunks contains similar polygons that are close to a square.

        Parameters:
            chunks (GeoDataFrame):          The chunk polygons to shift.
            sift_percentage_lon (float):    Fraction of the tile width used as the longitude shift. Default 0.5.
            sift_percentage_lat (float):    Fraction of the tile width used as the latitude shift. Default 0.415.
            max_overlap (float):            Overlap fraction above which a shifted polygon is dropped. Default 0.5.

        Returns:
            GeoDataFrame: All kept shifted polygons (in the CRS of chunks), or an empty dict when chunks is empty.
    """

    if chunks.empty:
        return {}

    first_polygon = chunks.geometry.iloc[0]

    # take the first two points of the polygon and get the longitude length between them
    top_points = first_polygon.exterior.coords[:2]
    length_lon = abs(top_points[0][0] - top_points[1][0])

    if length_lon == 0:
        # The first edge is vertical, which would give a zero shift and 8 exact copies of the input.
        # Fall back to the bounding-box width of the polygon.
        min_x, _, max_x, _ = first_polygon.bounds
        length_lon = max_x - min_x

    # calculate the shift amount
    shift_amount_lon = length_lon * sift_percentage_lon
    shift_amount_lat = length_lon * sift_percentage_lat

    shift_directions = {
        "left": (-shift_amount_lon, 0),
        "right": (shift_amount_lon, 0),
        "up": (0, shift_amount_lat),
        "down": (0, -shift_amount_lat),
        "top_left": (-shift_amount_lon, shift_amount_lat),
        "top_right": (shift_amount_lon, shift_amount_lat),
        "bottom_left": (-shift_amount_lon, -shift_amount_lat),
        "bottom_right": (shift_amount_lon, -shift_amount_lat),
    }

    subregions = []

    # Create subregions by shifting the geometries in all directions
    for dx, dy in shift_directions.values():
        gdf_shifted = chunks.copy()
        gdf_shifted["geometry"] = gdf_shifted["geometry"].translate(dx, dy)

        # Compare against every direction generated so far (no-op for the first direction)
        for region in subregions:
            # Iterates the geometry column as it was when this loop started; rows dropped below
            # are only excluded from the comparison against the NEXT region.
            for polygon in gdf_shifted.geometry:
                for region_polygon in region.geometry:
                    if not polygon.intersects(region_polygon):
                        continue

                    # calculate % of the existing polygon covered by the shifted one
                    region_area = region_polygon.area
                    intersection_area = polygon.intersection(region_polygon).area
                    intersection_percentage = intersection_area / region_area if region_area > 0 else 0

                    if intersection_percentage > max_overlap:
                        # if too high of an overlap skip
                        print(f"Dropping polygon from subregion due to high overlap: {intersection_percentage:.2%} with existing region.")
                        gdf_shifted = gdf_shifted[gdf_shifted.geometry != polygon]
                        break

        subregions.append(gdf_shifted)

    # Combine all directions into one GeoDataFrame with a fresh 0..n-1 index
    combined = pd.concat(subregions, ignore_index=True)

    return gpd.GeoDataFrame(combined, crs=chunks.crs)

# kept despite not being used as it may be useful in the future
# def clip(chunks, aoi):
#     """
#         Clips the chunks or subregions to the area of interest (aoi).
#     """

#     # clip to aoi 
#     if chunks.empty or aoi.empty:
#         return gpd.GeoDataFrame(columns=chunks.columns.tolist(), crs=chunks.crs)
#     clipped = gpd.clip(chunks, aoi)
#     clipped = clipped[clipped.geometry.notnull()]

#     return clipped.reset_index(drop=True)


# Generate the shifted subregions, then append the original (unshifted) chunks
# so that `subregions` holds both sets of polygons.
# NOTE(review): `subregions` is reassigned in place, so re-running only the second line
# appends `chunks` again; re-run the whole cell instead.
subregions = create_subregions(chunks)
subregions = pd.concat([subregions, chunks], ignore_index=True)

# plot all subregions as one layer
m.add_gdf(subregions, layer_name="Subregions", style={"color": "blue", "fillColor": "blue", "fillOpacity": 0.1})

Dropping polygon from subregion due to high overlap: 98.80% with existing region.
Dropping polygon from subregion due to high overlap: 98.54% with existing region.
Dropping polygon from subregion due to high overlap: 98.78% with existing region.
Dropping polygon from subregion due to high overlap: 99.16% with existing region.
Dropping polygon from subregion due to high overlap: 99.14% with existing region.
Dropping polygon from subregion due to high overlap: 98.89% with existing region.
Dropping polygon from subregion due to high overlap: 98.87% with existing region.
Dropping polygon from subregion due to high overlap: 50.70% with existing region.
Dropping polygon from subregion due to high overlap: 50.56% with existing region.
Dropping polygon from subregion due to high overlap: 50.84% with existing region.
Dropping polygon from subregion due to high overlap: 51.01% with existing region.
Dropping polygon from subregion due to high overlap: 50.89% with existing region.
Dropping polygon

# Reference Points

In [7]:
def collect_points_from_geemap(map_obj, label):
    """
    Collect all drawn point features from a geemap.Map that uses ee.Feature objects,
    and return them as a GeoDataFrame with a label.

    Non-point features are ignored. A feature whose geometry cannot be fetched is reported
    and skipped, so one bad feature does not discard the rest.

    Parameters:
        map_obj (geemap.Map): The interactive map.
        label (str): Label to assign to all collected points.

    Returns:
        GeoDataFrame: With geometry and 'label' columns, in EPSG:4326 (also when empty).
    """

    features = map_obj.draw_features

    points = []
    for f in features or []:
        try:
            # One getInfo() call returns both the geometry type and its coordinates,
            # instead of one server round trip for each.
            geom_info = f.geometry().getInfo()
            if geom_info["type"] == "Point":
                points.append(Point(geom_info["coordinates"]))  # [lon, lat]
        except Exception as e:
            print("Skipping feature due to error:", e)

    if not points:
        # Same CRS as the populated result, so both can be concatenated without a CRS mismatch
        return gpd.GeoDataFrame(columns=["geometry", "label"], geometry="geometry", crs="EPSG:4326")

    gdf = gpd.GeoDataFrame(geometry=points, crs="EPSG:4326")
    gdf["label"] = label
    return gdf

def gen_basemap(basemap_url=None, aoi=gpd.GeoDataFrame(), polygons=None):
    """
        Generates a basemap with the specified URL and adds the AOI and polygons if provided.

        Parameters:
            basemap_url (str, optional): The URL of the basemap to be used. Falls back to the Google tile URL when empty.
            aoi (GeoDataFrame, optional): The area of interest to be displayed on the map (can be a set of polygons
                                          or a single polygon). None or an empty frame means no AOI layer and no re-centring.
            polygons (GeoDataFrame, optional): Polygons to be added to the map. Defaults to None.

        Returns:
            geemap.Map: A geemap map object with the specified basemap and polygons.
    """
    basemap = geemap.Map()

    if not basemap_url:
        basemap_url = 'https://mt1.google.com/vt/lyrs=y&x={x}&y={y}&z={z}'

    basemap.add_basemap(basemap_url, name="Google Satellite", attribution="Google")

    # The default `aoi` frame is created once at definition time; it is only read here, never mutated.
    # aoi=None is treated like an empty AOI instead of failing on `.empty`.
    if aoi is not None and not aoi.empty:
        basemap.add_gdf(aoi, layer_name="AOI", style={"color": "red", "fillColor": "red", "fillOpacity": 0.1})

        # center on the centroid of the merged AOI geometry (x, y order)
        center_points = aoi.geometry.unary_union.centroid.coords[0]
        basemap.setCenter(center_points[0], center_points[1], 10)

    if polygons is not None:
        basemap.add_gdf(polygons, layer_name="Polygons", style={"color": "red", "fillColor": "red", "fillOpacity": 0.1})

    return basemap

# Clustering
Here we create a sparse matrix.

We then use that sparse matrix to compare and assign labels.

In [8]:
import os
import cluster as cl 
from cluster import kmeans_clustering
import ee
import geemap
from ipyleaflet import CircleMarker
import ipywidgets as widgets
import pandas as pd 

In [9]:
pth = "data/"

# Walk the data folder recursively and keep the path of every CSV file found
paths = [
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk(pth)
    for file_name in file_names
    if file_name.endswith(".csv")
]

print(f"Found {len(paths)} CSV files.")


Found 26 CSV files.


In [10]:
def prep_data(dir):
    """
    Loads a CSV and returns two views of it, both indexed by "file_name" and with
    any row containing a missing value removed.

    Returns:
        (DataFrame, DataFrame): the full table with every remaining column intact, and a
        clustering-ready copy without "system:index" / ".geo" whose values are coerced to
        numbers (unparseable values become NaN).
    """
    full = pd.read_csv(dir).dropna().set_index("file_name")

    numeric_only = (
        full
        .drop(columns=["system:index", ".geo"])
        .apply(pd.to_numeric, errors='coerce')
    )

    return full, numeric_only

# df1, df2_clear = prep_data(path[0])

# Integer code for each label name. Every spelling of "missing"
# (None, NaN and their string forms) maps to 0.
mapping = {
    None: 0,
    np.nan: 0,  # `np.NaN` was removed in NumPy 2.0; `np.nan` works on every NumPy version
    "NaN": 0,
    "None": 0,
    "Water": 1,
    "Wetland": 2,
    "Agricultural": 3,
    "Urban": 4,
    "Wasteland": 5,
}

In [11]:
# Label names offered in the labelling dropdown
# NOTE(review): 'Other' has no entry in `mapping` or `colour_dict` — confirm this is intended.
label_list = ['Urban', 'Agricultural', 'Water', 'Wasteland', 'Wetland', 'Other']

# Years to visualise: 2019 through 2024 inclusive
date_range = list(range(2019, 2025))

# Earth Engine image collection used for the yearly composites
collection_name = "COPERNICUS/S2_SR_HARMONIZED"

# Area of interest, read with the CRS declared by the file
with fiona.open("aoi.geojson") as src:
    aoi = gpd.GeoDataFrame.from_features(src, crs=src.crs)
 

In [12]:
# Empty labels table that `append_label_to_points` extends with each batch of drawn points.
# NOTE(review): created without a CRS, while the appended rows are EPSG:4326.
labels = gpd.GeoDataFrame(columns=["label", "geometry", "start_year", "end_year"], geometry="geometry")

In [13]:
m = geemap.Map()

def map_polygon(polygon, collection_name, layer_name, yyyymmdd1="2024-01-01", yyyymmdd2="2024-12-29", num_tasks=10):
    """
        Adds a median composite of `collection_name`, clipped to `polygon`, as a layer on the
        notebook-level map `m`, then re-centres the map on the polygon.

        Args:
            polygon:                 Earth Engine object used both as the bounds filter and as the clip region.
            collection_name (str):   Earth Engine image collection id.
            layer_name (str):        Name of the layer added to the map.
            yyyymmdd1 (str):         Start date passed to filterDate.
            yyyymmdd2 (str):         End date passed to filterDate.
            num_tasks (int):         Unused.
    """

    # Display settings: B4/B3/B2 stretched between 0 and 3000
    rgb_vis = {'min': 0, 'max': 3000, 'bands': ['B4', 'B3', 'B2']}

    # Load in Year Data to Aid in Label Creation (scenes with under 20% cloudy pixels only)
    low_cloud = ee.Filter.lt("CLOUDY_PIXEL_PERCENTAGE", 20)
    scenes = ee.ImageCollection(collection_name)
    scenes = scenes.filterDate(yyyymmdd1, yyyymmdd2)
    scenes = scenes.filterBounds(polygon)
    scenes = scenes.filter(low_cloud)
    composite = scenes.median().clip(polygon)

    m.addLayer(composite, rgb_vis, layer_name)

    # center map on the polygon
    centroid = polygon.geometry().centroid().coordinates().getInfo()
    m.setCenter(centroid[0], centroid[1], 10)

# Dropdown used to pick the label assigned to the drawn points
label_dropdown = widgets.Dropdown(
    options=label_list,
    description='Label:',
    value=label_list[0],
)

# Plots Each Year
# The AOI does not change between years, so convert it to an Earth Engine object once
# instead of once per loop iteration.
aoi_ee = geemap.geopandas_to_ee(aoi)
for year in date_range:
    map_polygon(aoi_ee, collection_name, f"Year {year}", yyyymmdd1=f"{year}-01-01", yyyymmdd2=f"{year}-12-29")

# Start year of the label. Options come from `date_range`, so the slider always matches
# the years that were plotted as layers.
date_slider = widgets.SelectionSlider(
    options=[str(year) for year in date_range],
    value=str(date_range[0]),
    description='Date:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True
)

# Number of YEARS the label stays valid, counted from the start year
# (end_year = start_year + span - 1); defaults to 3.
time_frame_slider = widgets.SelectionSlider(
    options=[str(span) for span in range(1, 13)],
    value='3',
    description='timeframe:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True
)

# Colour per label name; 'Other' has no entry here (other is grey by default)
colour_dict = dict(
    Urban="blue",
    Agricultural="green",
    Water="cyan",
    Wasteland="brown",
    Wetland="lightblue",
)

def append_label_to_points(b):
    """
    Appends the drawn points to the global `labels` GeoDataFrame, tagged with the selected
    label and year range, then clears the drawing layer.

    Only Point features are appended; other geometry types and features that raise while
    being read are reported and skipped.

    Args:
        b: The clicked button (passed by ipywidgets, unused).
    """

    global labels

    selected_label = label_dropdown.value
    start_year = int(date_slider.value)
    span = int(time_frame_slider.value)
    end_year = start_year + span - 1

    # Extract drawn features
    all_drawn_features = m.draw_features
    if not all_drawn_features:
        print("No features drawn.")
        return

    points = []
    for feature in all_drawn_features:
        try:
            # One getInfo() call gives both the geometry type and the coordinates
            geom_info = feature.geometry().getInfo()
            if geom_info["type"] != "Point":
                print(f"Skipping non-point feature of type {geom_info['type']}.")
                continue
            points.append(Point(geom_info["coordinates"]))  # [lon, lat]
        except Exception as e:
            print("Skipping feature due to error:", e)

    # Clear All Drawn Features
    m.draw_control.clear()

    if not points:
        # Nothing valid to add; leave `labels` untouched rather than concatenating an empty frame
        print("No point features to label.")
        return

    # create dataframe from points
    new_labels = gpd.GeoDataFrame({
        'label': [selected_label] * len(points),
        'start_year': [start_year] * len(points),
        'end_year': [end_year] * len(points),
        'geometry': points
    }, crs="EPSG:4326")

    # Append to the existing labels GeoDataFrame
    labels = pd.concat([labels, new_labels], ignore_index=True)
    

# Label Range
label_output = widgets.Label()

def update_label(*args):
    """Refreshes the readout with the year range implied by the two sliders."""
    first_year = int(date_slider.value)
    last_year = first_year + int(time_frame_slider.value) - 1
    label_output.value = f"Selected range: {first_year} to {last_year}"

# Recompute the readout whenever either slider changes value
for slider in (date_slider, time_frame_slider):
    slider.observe(update_label, names='value')

# Show the initial range before any interaction
update_label()

# Button that runs `append_label_to_points` when clicked
append_btn = widgets.Button(description="Append Label to Points")
append_btn.on_click(append_label_to_points)

# Layout: the two sliders on the first row; label dropdown, range readout and button on the second
# display(widgets.HBox([date_slider, month_slider, time_frame_slider, render_btn]))
display(widgets.VBox([
    widgets.HBox([date_slider, time_frame_slider]),
    widgets.HBox([label_dropdown, label_output, append_btn]),
]))

# Render the map as the cell output
m

VBox(children=(HBox(children=(SelectionSlider(continuous_update=False, description='Date:', options=('2019', '…

Map(center=[34.22411402596497, 43.91430818231603], controls=(WidgetControl(options=['position', 'transparent_b…

In [14]:
labels

Unnamed: 0,label,geometry,start_year,end_year


In [15]:
# Unconditionally raises, so "Run All" stops at this cell.
# NOTE(review): presumably a manual stop before the clustering cells — confirm, and consider removing
# it so the notebook survives Restart & Run All.
raise ValueError("BREAK POINT")

ValueError: BREAK POINT

In [16]:
import cluster as cl

# One row per year (2019-2024): the CSV path for that year and the year itself
data_dir = pd.DataFrame(
    [(f"data/{year}.csv", year) for year in range(2019, 2025)],
    columns=["dir", "year"],
)

class ts_cluster():
    """
        Time-series wrapper that holds one cluster object per year between start_year and end_year.

        Point labels are split into "common" labels, whose year range spans the whole period and which
        are matched to tiles once and shared by every year, and "uncommon" labels, which are only
        applied to the years they overlap.
    """

    # Class-level defaults. Every instance rebinds these in __init__ / instansiate_clusters / load_states,
    # so the mutable objects below are not shared between instances.
    cluster_list = []

    yearly_labels_dict = {}
    common_labels = pd.DataFrame(columns=["start_year", "end_year", "label", "geometry"])

    data_dir = pd.DataFrame(columns=["dir", "year"])

    start_year = 2019
    end_year = 2024

    # Subregions handed to every cluster; remembered by instansiate_clusters / load_states.
    subregions = None

    def __init__(self, ts_point_labels, data_dir, mapping, start_year, end_year, index_column='file_name', passes=6, full_data=True):
        """
            Initializes the ts_cluster object with time-series point labels, data directory, mapping, and number of passes.

            Args:
                ts_point_labels (GeoDataFrame):  Point labels with 'label', 'geometry', 'start_year' and 'end_year' columns.
                data_dir (DataFrame):            One row per year with the columns 'dir' (CSV path) and 'year'.
                mapping (dict):                  Label-name to integer-code mapping passed to every cluster.
                start_year (int):                First year to cluster (inclusive). Anything int() accepts.
                end_year (int):                  Last year to cluster (inclusive). Anything int() accepts.
                index_column (str):              Column that identifies a tile in the data files.
                passes (int):                    Number of passes passed to every cluster.
                full_data (bool):                True when every yearly file lists every tile, so the tile
                                                 geometries only need to be read from the first file.

            Raises:
                ValueError: If start_year is greater than end_year.
        """

        # Convert first, then validate and iterate on the converted values, so string years
        # (e.g. straight from a widget) behave the same as ints.
        self.start_year = int(start_year)
        self.end_year = int(end_year)

        if self.start_year > self.end_year:
            raise ValueError("start_year must be less than or equal to end_year")

        self.full_data = full_data
        self.index_column = index_column

        self.data_dir = data_dir

        # runtime vars
        self.mapping = mapping
        self.passes = passes
        self.subregions = None
        self.cluster_list = []

        common_labels, uncommon_labels = self.__reduce_to_common__(ts_point_labels)
        self.common_labels = common_labels
        self.yearly_labels_dict = {
            year: self.__split_year__(year, uncommon_labels)
            for year in range(self.start_year, self.end_year + 1)
        }

    @staticmethod
    def __reduce_to_year__(year, label_df):
        """
            Returns a DataFrame with only entries that overlap with the specified year.

            Kept as an alias of __split_year__; it takes no `self`, so it has to be a staticmethod
            to be callable on an instance.

        Args:
            year (int):                The year to filter the labels DataFrame.
            label_df (GeoDataFrame):   The labels DataFrame to filter.

        Returns:
            GeoDataFrame:              A DataFrame containing only the labels that overlap with the specified year.
        """

        return ts_cluster.__split_year__(year, label_df)


    def __reduce_to_common__(self, labels_df):
        """
        Splits the labels DataFrame into two parts:
        1. Common labels whose year range covers the whole start_year..end_year period.
        2. Uncommon labels that start later or end earlier, and may be year specific.

        Args:
            labels_df (GeoDataFrame):   The labels DataFrame to split ('start_year' / 'end_year' columns required).

        Returns:
            (GeoDataFrame, GeoDataFrame): The common labels and the uncommon labels, each with a fresh index.
        """
        common_df = labels_df[(labels_df['start_year'] <= self.start_year) & (labels_df['end_year'] >= self.end_year)]
        uncommon_df = labels_df[(labels_df['start_year'] > self.start_year) | (labels_df['end_year'] < self.end_year)]
        return common_df.reset_index(drop=True), uncommon_df.reset_index(drop=True)


    @staticmethod
    def __split_year__(year, label_df):
        """
            Returns a DataFrame of where any label that overlaps a given year is returned.
        """

        year_df = label_df[(label_df['start_year'] <= year) & (label_df['end_year'] >= year)]
        # year_df['year'] = year
        return year_df.reset_index(drop=True)

    def _resolve_subregions(self, subregions=None):
        """
            Returns the subregions to hand to a cluster and remembers them on the instance.

            Order of precedence: the explicit argument, the value stored by an earlier call, then a
            module-level `subregions` variable (earlier versions of this class read that global directly).
        """
        if subregions is not None:
            self.subregions = subregions
        if self.subregions is None:
            self.subregions = globals().get("subregions")
        if self.subregions is None:
            raise ValueError("No subregions available: pass `subregions` to instansiate_clusters or load_states.")
        return self.subregions

    def _data_path_for_year(self, year):
        """Returns the CSV path registered in this instance's data_dir for the given year."""
        matches = self.data_dir.loc[self.data_dir['year'] == year, 'dir'].values
        if len(matches) == 0:
            raise ValueError(f"No data file registered in data_dir for year {year}.")
        return matches[0]

    def _new_cluster(self, cluster_class, subregions, year, aoi, index_column):
        """Reads the data file of one year and builds the cluster object for it."""
        data = pd.read_csv(self._data_path_for_year(year))

        # (self, subregions, df_data, mapping, passes=6, aoi=None, index_column="file_name", points=gpd.GeoDataFrame()):
        return cluster_class(subregions=subregions,
                             df_data=data,
                             mapping=self.mapping,
                             points=self.common_labels,
                             aoi=aoi,
                             index_column=index_column,
                             passes=self.passes,
                             supress_warnings=True) # ONLY warnings that exist at the current time are errors that should only occur outside of this

    def __build_all_geometries__(self):
        """
            Builds a DataFrame with one row per tile (index column and '.geo') from the data files.

            When full_data is True only the first file is read.
        """
        frames = []
        for df_dir in self.data_dir['dir']:
            # print(f"Loading Geometries {df_dir}...")

            df = pd.read_csv(df_dir)
            df = df[[self.index_column, '.geo']]
            print(f"Loaded {len(df)} geometries from {df_dir}.")

            frames.append(df.dropna(subset=[self.index_column, '.geo']))

            # no need to check all if we know the data IS FULL!
            if self.full_data:
                break

        if not frames:
            return pd.DataFrame(columns=[self.index_column, '.geo'])

        # Concatenate once (instead of growing a frame per file) and keep the first occurrence of each tile
        uniques = pd.concat(frames, ignore_index=True)
        return uniques.drop_duplicates(subset=[self.index_column])

    def __build_common_labels__(self, subregions=None, cluster_class=None):
        """
            Builds a common cluster from the common labels DataFrame. That ALL years share to avoid redundant processing.

            Args:
                subregions (GeoDataFrame):  Subregions for the helper cluster; see _resolve_subregions for the fallback.
                cluster_class (type):       Cluster class to instantiate. Defaults to cl.Cluster.

            Returns:
                DataFrame: One row per tile with the index column and its 'label'.
        """

        if self.common_labels.empty:
            self.base_labels = pd.DataFrame(columns=[self.index_column, "label"]) # empty DataFrame
            return self.base_labels

        if cluster_class is None:
            cluster_class = cl.Cluster

        uniques_gdf = self.__build_all_geometries__()
        # '.geo' holds GeoJSON text; parse it into shapely geometries
        uniques_gdf['geometry'] = uniques_gdf['.geo'].apply(lambda x: shape(json.loads(x)))
        uniques_gdf = gpd.GeoDataFrame(uniques_gdf, geometry='geometry', crs="EPSG:4326")

        # The helper cluster is created without data or mapping; its data and points are injected below
        cl_cluster = cluster_class(subregions=self._resolve_subregions(subregions), df_data=None, mapping=None, index_column=self.index_column)
        cl_cluster.data = uniques_gdf.set_index(self.index_column)
        cl_cluster.points = self.common_labels.drop(columns=['start_year', 'end_year'])

        cl_cluster.build_row_labels()

        # Select only the necessary columns
        base_labels = cl_cluster.labels.copy()
        base_labels = base_labels.reset_index(drop=False)
        self.base_labels = pd.DataFrame(base_labels, columns=[self.index_column, 'label'])
        return self.base_labels

    def instansiate_clusters(self, cluster_class, subregions, aoi=None, index_column="file_name"):
        """
            Instantiates clusters for each year in the data directory.

            Each cluster is loaded with the shared common labels, then updated with the labels that
            only overlap its own year.

            Args:
                cluster_class (type):       Cluster class to instantiate for every year.
                subregions (GeoDataFrame):  Subregions handed to every cluster.
                aoi (GeoDataFrame):         Optional area of interest handed to every cluster.
                index_column (str):         Index column handed to the cluster constructor.
        """

        subregions = self._resolve_subregions(subregions)
        base_labelles = self.__build_common_labels__(subregions=subregions, cluster_class=cluster_class)

        cluster_list = []

        for year in range(self.start_year, self.end_year + 1):
            print(f"Instantiating cluster for year {year}...")
            cl_cluster = self._new_cluster(cluster_class, subregions, year, aoi, index_column)

            # load in the common shared labels (a copy, so one year cannot alter another year's labels)
            cl_cluster.load_labels_df(df=base_labelles.copy())

            # update labels with the non-shared labels
            uncommon_year_df = self.yearly_labels_dict[year]
            uncommon_year_df = uncommon_year_df.drop(columns=['start_year', 'end_year'])
            print(f"TRYING row labels for year {year} with {len(uncommon_year_df)} uncommon labels. columns: {uncommon_year_df.columns.tolist()}")
            cl_cluster.build_row_labels(self.index_column, additional_labels=uncommon_year_df, update=True)

            cluster_list.append(cl_cluster)

        self.cluster_list = cluster_list


    def fit(self, save_state=""):
        """
            Fits all clusters in the cluster list.

            Args:
                save_state (str):         The prefix for the filename to save the state of the clusters. If empty, no state is saved.
        """

        for i, cl_cluster in enumerate(self.cluster_list):
            print(f"Fittingitted cluster for year {self.start_year + i}")

            cl_cluster.fit()

        # Truthiness instead of `is not ""`: identity against a literal is implementation dependent
        if save_state:
            self.save_states(filename_prefix=save_state)

    def update_labels(self, labels_df):
        """
            Updates the labels of all clusters in the cluster list with the provided labels DataFrame.

            Args:
                labels_df (GeoDataFrame):  The DataFrame containing the new labels to update.
        """

        for i, cl_cluster in enumerate(self.cluster_list):
            print(f"Updating labels for cluster {self.start_year + i}")
            cl_cluster.update_row_labels(labels_gdf=labels_df, label_row=self.index_column)

    def create_map(self, filename_prefix="/cluster/clusters", formats=["tif"]):
        """
            Creates a map file for every cluster in the cluster list.

            Args:
                filename_prefix (str):    Prefix of the output files, e.g. 'cluster' to create 'cluster_2024.tif'. Must not be empty.
                formats (list):           Output formats. Only "tif" / ".tif" is handled; it is read, never modified.

            Raises:
                ValueError: If filename_prefix is empty.
        """

        # Validate once, before any work is done
        if not filename_prefix:
            raise ValueError("filename_prefix must be a non-empty string to save the state of the clusters. e.g. 'cluster_' to create 'cluster_2024.tif'.")

        write_tif = "tif" in formats or ".tif" in formats

        for i, cl_cluster in enumerate(self.cluster_list):
            print(f"Predicting / Creating Map's based on Clusters for year {self.start_year + i}")

            if write_tif:
                cl_cluster.create_map(filename=f"{filename_prefix}_{self.start_year + i}.tif")

    def build_recomendation(self, filename_prefix="clusters"):
        """
            Builds recommendations for all clusters, creating their predictions first when missing.

            Args:
                filename_prefix (str):    Prefix of the exported recommendation files.
        """

        for i, cl_cluster in enumerate(self.cluster_list):
            print(f"Building recommendations for cluster {self.start_year + i} with {len(cl_cluster.labels)} tiles.")

            # build the recommendations
            if cl_cluster.predictions.empty:
                print("Creating predictions for cluster first...")
                cl_cluster.create_predictions() # MUST be done first else will not work
            # NOTE(review): the leading "/" makes this path start at the filesystem root, unlike create_map,
            # which uses the prefix as given — confirm this is intended before changing it.
            cl_cluster.create_recommendations(export_filename=f"/{filename_prefix}_{self.start_year + i}_recomendation.tif")


    def save_states(self, filename_prefix="clusters/cluster_"):
        """
            Saves the state of all clusters in the cluster list to files.

            Args:
                filename_prefix (str):    Prefix of the state files; '<year>_state' is passed as the postfix.
        """

        for i, cl_cluster in enumerate(self.cluster_list):
            year = self.start_year + i
            print(f"Saving state of cluster {year} with {len(cl_cluster.labels)} labels.")
            cl_cluster.save_state(filname_prefix=filename_prefix, filename_postfix=f"{year}_state")


    def load_states(self, cluster_class=cl.Cluster, filename_prefix="clusters/cluster", aoi=None, subregions=None):
        """
            Reloads clusters from saved states.

            Args:
                cluster_class (type):       Cluster class to instantiate for every year.
                filename_prefix (str):      Prefix of the state files written by save_states.
                aoi (GeoDataFrame):         Optional area of interest handed to every cluster.
                subregions (GeoDataFrame):  Subregions handed to every cluster; see _resolve_subregions for the fallback.
        """

        subregions = self._resolve_subregions(subregions)

        cluster_list = []

        for year in range(self.start_year, self.end_year + 1):
            print(f"Instantiating cluster for year {year}...")
            cl_cluster = self._new_cluster(cluster_class, subregions, year, aoi, self.index_column)

            # reload the data from the state files
            cl_cluster.reload_state(filname_prefix=filename_prefix, filename_postfix=f"{year}_state")

            cluster_list.append(cl_cluster)

        self.cluster_list = cluster_list


In [None]:
# Unconditionally raises, so "Run All" stops before the fitting cells below.
# NOTE(review): presumably a manual stop — confirm, and consider removing it so the notebook
# survives Restart & Run All.
raise ValueError("BREAK POINT")

# FITTING 581 

In [17]:
import shapely.wkt as wkt

# Reload the saved labels: the 'geometry' column holds WKT text, which is parsed back into
# shapely geometries before the table becomes a GeoDataFrame in EPSG:4326.
labels = pd.read_csv("labels_backup_581.csv")
labels = gpd.GeoDataFrame(
    labels.assign(geometry=labels['geometry'].apply(wkt.loads)),
    geometry='geometry',
).set_crs("EPSG:4326", allow_override=True)
labels

Unnamed: 0,label,geometry,start_year,end_year
0,Urban,POINT (43.85488 34.18178),2019,2025
1,Urban,POINT (43.85759 34.18650),2019,2025
2,Urban,POINT (43.86857 34.18203),2019,2025
3,Urban,POINT (43.87488 34.18054),2019,2025
4,Urban,POINT (43.87346 34.18693),2019,2025
...,...,...,...,...
576,Agricultural,POINT (43.74195 34.06483),2019,2025
577,Agricultural,POINT (43.74487 34.13285),2019,2025
578,Wasteland,POINT (43.66342 34.14457),2019,2025
579,Agricultural,POINT (43.67689 34.15274),2019,2025


In [18]:
# Split the reloaded labels into common / per-year sets for 2019-2024, then build one cluster per year
# from the yearly CSVs listed in `data_dir`.
cluster_ts = ts_cluster(ts_point_labels=labels, data_dir=data_dir, mapping=mapping, start_year=2019, end_year=2024, passes=3)
cluster_ts.instansiate_clusters(cluster_class=cl.Cluster, subregions=subregions, aoi=aoi, index_column="file_name")

Loaded 375600 geometries from data/2019.csv.
Error: 'NoneType' object has no attribute 'set_index', this may affect and break processing. Please check your input DataFrame columns and index_column.
Labels after assignment: 549 total with labels, 375051 without labels.
Instantiating cluster for year 2019...
TRYING row labels for year 2019 with 10 uncommon labels. columns: ['label', 'geometry']
Additional labels provided, merging with existing labels.
Labels after assignment: 559 total with labels, 375041 without labels.
Instantiating cluster for year 2020...
TRYING row labels for year 2020 with 13 uncommon labels. columns: ['label', 'geometry']
Additional labels provided, merging with existing labels.
Labels after assignment: 561 total with labels, 375039 without labels.
Instantiating cluster for year 2021...
TRYING row labels for year 2021 with 13 uncommon labels. columns: ['label', 'geometry']
Additional labels provided, merging with existing labels.
Labels after assignment: 561 total

In [19]:
# Fit every yearly cluster; the non-empty save_state prefix makes fit() save each cluster's state afterwards
cluster_ts.fit(save_state="product/gen_cluster581/cluster_")

Fittingitted cluster for year 2019
Fittingitted cluster for year 2020
Fittingitted cluster for year 2021
Fittingitted cluster for year 2022
Fittingitted cluster for year 2023
Fittingitted cluster for year 2024
No data points found in this subregion.
No data points found in this subregion.
No data points found in this subregion.
Saving state of cluster 2019 with 375600 labels.
Saving state of cluster 2020 with 375600 labels.
Saving state of cluster 2021 with 375600 labels.
Saving state of cluster 2022 with 375599 labels.
Saving state of cluster 2023 with 375600 labels.
Saving state of cluster 2024 with 312398 labels.


In [20]:
# Create maps: one "<prefix>_<year>.tif" per yearly cluster
cluster_ts.create_map(filename_prefix="product/gen_cluster581/cluster_")

Predicting / Creating Map's based on Clusters for year 2019
Predicting / Creating Map's based on Clusters for year 2020
Predicting / Creating Map's based on Clusters for year 2021
Predicting / Creating Map's based on Clusters for year 2022
Predicting / Creating Map's based on Clusters for year 2023
Predicting / Creating Map's based on Clusters for year 2024


In [21]:
# Create recommendations: one file per yearly cluster.
# NOTE(review): build_recomendation formats the name as "/<prefix>_<year>_recomendation.tif", so this
# prefix yields a path under "/product/..." and repeats "recomendation" in the file name.
cluster_ts.build_recomendation(filename_prefix="product/gen_cluster581/cluster_recomendation_")

Building recommendations for cluster 2019 with 375600 tiles.
Created directory: /product/gen_cluster581
Building recommendations for cluster 2020 with 375600 tiles.
Building recommendations for cluster 2021 with 375600 tiles.
Building recommendations for cluster 2022 with 375599 tiles.
Building recommendations for cluster 2023 with 375600 tiles.
Building recommendations for cluster 2024 with 312398 tiles.


In [None]:
# Push the current `labels` table into every yearly cluster (calls update_row_labels on each)
cluster_ts.update_labels(labels)

In [None]:
def create_recommendations(cluster, export_filename="recommendation.tif", aim=10, aim_weight=0.25, aim_max_multiplier=10.0, class_proportion_weight=0.75):
    """
        Score every tile for how useful it would be to label next and export the score as a raster.

        For each cluster-run column of `cluster.sparse_matrix`, a tile receives
        `1 - (largest label proportion of the cluster it was assigned to)`, so tiles sitting in
        clusters with mixed labels score high. The per-run scores are averaged per tile (NaN entries
        are ignored), boosted for tiles that were assigned in fewer than `aim` runs, and capped at 1.0.
        The result is written through `cluster.export_to_tif` as the single band 'recommendation'.

        Args:
            cluster:                            Object providing `sparse_matrix` and `predictions` (DataFrames that
                                                both carry a '.geo' column; `predictions` also needs 'predicted_label'),
                                                `cluster_label_proportions(clusters, labels)`, `export_to_tif(...)`,
                                                `UTM_ESPG` and `EPSG`.
            export_filename (str):              Target passed to `export_to_tif` as `output_dir`.
            aim (int):                          Number of non-NaN cluster assignments per tile to minimally aim for.
                                                Values <= 0 disable the boost (multiplier stays 1.0).
            aim_weight (float):                 Weight of the aim-boosted term in the final recommendation.
            aim_max_multiplier (float):         Multiplier applied to the aim term for a tile with no assignments at all;
                                                scales linearly down to 1.0 once `aim` assignments are reached.
            class_proportion_weight (float):    Weight of the plain label-proportion term in the final recommendation.

        Returns:
            None. The scored table is handed to `cluster.export_to_tif`; `cluster.sparse_matrix` is not modified.
    """

    # Data: keep the geometry aside and work on the cluster assignments only
    geometry = cluster.sparse_matrix['.geo']
    cluster_ids = cluster.sparse_matrix.drop(columns=['.geo'])
    predicted_labels = cluster.predictions['predicted_label']

    # How many runs actually assigned each tile to a cluster (counted on the raw ids)
    assigned_runs = cluster_ids.notna().sum(axis=1)

    # Boost factor: 1.0 when the aim is met, growing linearly to aim_max_multiplier at zero assignments
    if aim > 0:
        shortfall = np.clip((aim - assigned_runs) / aim, 0.0, 1.0)
    else:
        # nothing to aim for - avoid dividing by zero and leave the score un-boosted
        shortfall = 0.0
    recommendation_to_aim = 1.0 + (aim_max_multiplier - 1.0) * shortfall

    # Review labels within each cluster run and their distribution
    scores = cluster_ids.copy()
    for run in cluster_ids.columns:
        # proportions of labels per cluster, reduced to the dominant label's share
        proportions = cluster.cluster_label_proportions(cluster_ids[run], predicted_labels)
        dominant_share = proportions.max(axis=1)

        # `map` looks every id up in the proportions index. Ids that are missing there
        # become NaN (and are skipped by the mean below) instead of surviving as raw
        # cluster ids, which `replace` allowed and which produced `1 - id` scores.
        scores[run] = 1.0 - cluster_ids[run].map(dominant_share)

    # Average over the runs, then blend the aim-boosted and the plain term (CAP TO 1.0)
    mean_score = scores.mean(axis=1)
    blended = mean_score * (recommendation_to_aim * aim_weight) + mean_score * class_proportion_weight
    scores['recommendation'] = np.minimum(blended, 1.0)

    # reattach geometry
    scores['.geo'] = geometry

    cluster.export_to_tif(scores, bands=['recommendation'], output_dir=export_filename, res=50, UTM_ESPG=cluster.UTM_ESPG, EPSG=cluster.EPSG)

# Export the recommendation raster for the first cluster of the series,
# giving the aim term and the class-proportion term equal weight.
cluster_1 = cluster_ts.cluster_list[0]
create_recommendations(
    cluster_1,
    export_filename="product/_test_to_del/recommendation_2019_max_final_test_point5each.tif",
    aim=10,
    aim_weight=0.5,
    aim_max_multiplier=10.0,
    class_proportion_weight=0.5,
)

In [26]:
# Inspect the predictions table of the first cluster
# (indexed by file_name; columns predicted_label, .geo, numeric_label).
cluster_ts.cluster_list[0].predictions

Unnamed: 0_level_0,predicted_label,.geo,numeric_label
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tile_43637469_34558614_43638574_34559528,Urban,"{""type"":""Polygon"",""coordinates"":[[[43.63748527...",4
tile_43637484_34557712_43638589_34558626,,"{""type"":""Polygon"",""coordinates"":[[[43.63749867...",0
tile_43637499_34556811_43638603_34557725,Wasteland,"{""type"":""Polygon"",""coordinates"":[[[43.63751207...",5
tile_43637514_34555909_43638618_34556823,Wasteland,"{""type"":""Polygon"",""coordinates"":[[[43.63752986...",5
tile_43637528_34555008_43638633_34555922,Wasteland,"{""type"":""Polygon"",""coordinates"":[[[43.63754326...",5
...,...,...,...
tile_44187802_33891657_44188892_33892565,Agricultural,"{""type"":""Polygon"",""coordinates"":[[[44.18780074...",3
tile_44187810_33890755_44188900_33891664,Agricultural,"{""type"":""Polygon"",""coordinates"":[[[44.18780972...",3
tile_44187819_33889853_44188909_33890762,Agricultural,"{""type"":""Polygon"",""coordinates"":[[[44.18781862...",3
tile_44187827_33888951_44188917_33889860,Agricultural,"{""type"":""Polygon"",""coordinates"":[[[44.18782753...",3


In [None]:
# List the distinct values in the 'label' column of the first cluster's labels.
cluster_ts.cluster_list[0].labels['label'].unique()

In [None]:
# 2019: reload previously exported predictions and attach them to the first cluster.
predictions = pd.read_csv("cluster581/cluster581_2019_predictions.csv")

# A plain `to_csv` writes the `file_name` index as an ordinary column, so a bare
# `read_csv` comes back with a RangeIndex. Restore the tile index (when the column
# is present) so the reloaded table is keyed like the in-memory predictions.
if "file_name" in predictions.columns:
    predictions = predictions.set_index("file_name")

cluster_ts.cluster_list[0].predictions = predictions

In [None]:
# Class-proportion-only recommendation for the first cluster (the aim term is weighted 0.0).
# Previously tried: aim=10, aim_weight=0.25, aim_max_multiplier=2.0, class_proportion_weight=0.75
cluster_ts.cluster_list[0].create_recommendations(
    export_filename="cluster581/cluster581_2019_recommendations_classbased.tif",
    aim=10,
    aim_weight=0.0,
    aim_max_multiplier=2.0,
    class_proportion_weight=1.0,
)


In [None]:
# Deliberate stop: raising here halts a "Run All" before the scratch cells below.
# NOTE(review): remove or replace with a guard flag before sharing the notebook.
raise ValueError("BREAK POINT")

In [None]:
# Display the `predictions` DataFrame (assigned from the 2019 CSV a few cells up).
predictions

In [None]:
# Create the cluster maps for the whole time series, requesting only the "tif" format.
cluster_ts.create_map(filename_prefix="cluster581_notestver/cluster_", formats=["tif"])

In [None]:
# cluster_ts.fit(save_state="cluster_test_10_")
# cluster_ts.fit(save_state="cluster581/cluster_test2fit_")

In [None]:
# cluster_ts.cluster_list[5].create_map(filename="/cluster581/cluster_test2fit_2024.tif")

In [None]:
# Keep only the first row for every repeated index value in the sixth cluster's
# sparse matrix, store the result back on the cluster, then display it.
cluster_ts.cluster_list[5].sparse_matrix = cluster_ts.cluster_list[5].sparse_matrix.loc[
    ~cluster_ts.cluster_list[5].sparse_matrix.index.duplicated(keep='first')
]
cluster_ts.cluster_list[5].sparse_matrix

In [None]:
# Drop duplicates based on the index, as some rows might be the same (keeps the first occurrence).
# NOTE(review): `sparse` is not defined in the nearby cells - confirm it is created
# earlier in the notebook; re-running this cell also overwrites `sparse` in place.
sparse = sparse[~sparse.index.duplicated(keep='first')]
sparse

In [None]:
# Write the sixth cluster's predictions to CSV (default `to_csv`, so the index is written as a column).
cluster_ts.cluster_list[5].predictions.to_csv("cluster581/cluster581_2024_predictions.csv")

In [None]:
# Create the map for the sixth cluster only, requesting only the "tif" format.
cluster_ts.cluster_list[5].create_map(filename_prefix="cluster581/cluster_test2fit_2024", formats=["tif"])

In [None]:
# Create the cluster maps for the whole time series under the "test2fit" prefix, "tif" format only.
cluster_ts.create_map(filename_prefix="cluster581/cluster_test2fit_", formats=["tif"])

In [None]:
# Build the labelling recommendations for the whole time series under the "test2fit" prefix.
cluster_ts.build_recomendation(filename_prefix="cluster581/cluster581_recomendation_test2fit_")