In [1]:
import pandas as pd
import geopandas as gpd
# from sklearn.cluster import DBSCAN
import os
import glob
import numpy as np
from concurrent.futures import ProcessPoolExecutor

# Parallel Processing

In [2]:
from sklearn.cluster import DBSCAN

def segment_points(gdf, distance_threshold):
    """
    Cluster point geometries by great-circle proximity using DBSCAN.

    Parameters:
    - gdf: (Geo)DataFrame whose `geometry` entries expose `.x` and `.y`
      (used here as longitude and latitude; assumed to be decimal degrees).
    - distance_threshold: Maximum neighbour distance, expressed in degrees of arc
      (the same unit as the coordinates). It is converted to radians internally.

    Returns:
    - The 'cluster' column holding one integer label per row. The column is also
      written onto `gdf` itself, as before.
    """
    # Nothing to cluster: DBSCAN rejects an empty sample array, so short-circuit.
    if len(gdf) == 0:
        gdf['cluster'] = np.array([], dtype=int)
        return gdf['cluster']

    # scikit-learn's haversine metric requires [latitude, longitude] in RADIANS and
    # measures distances in radians too, so both the points and eps are converted.
    points = np.radians([[geom.y, geom.x] for geom in gdf.geometry])
    eps_radians = np.radians(distance_threshold)

    # min_samples=1 makes every point a core point, so no row is labelled as noise.
    dbscan = DBSCAN(eps=eps_radians, min_samples=1, algorithm='ball_tree', metric='haversine')

    gdf['cluster'] = dbscan.fit_predict(points)
    return gdf['cluster']

def distribute_clusters_into_groups(gdf, n_groups):
    """
    Distributes clusters into groups, ensuring each cluster is entirely contained within
    a single group and aiming for a similar number of rows in each group.

    Clusters are visited from largest to smallest and each one is handed to the group
    that currently holds the fewest rows (lowest group number wins ties).

    Parameters:
    - gdf: GeoDataFrame that includes a 'cluster' column.
    - n_groups: The number of groups to distribute the clusters into (must be >= 1).

    Returns:
    - A copy of the gdf with an additional column 'group' indicating the group assignment.
      The input frame is left untouched.

    Raises:
    - ValueError: if n_groups is smaller than 1.
    """
    if n_groups < 1:
        raise ValueError(f"n_groups must be at least 1, got {n_groups}")

    # value_counts() already returns the clusters ordered from largest to smallest.
    cluster_sizes = gdf['cluster'].value_counts()

    group_sizes = [0] * n_groups  # running row count per group
    cluster_to_group = {}

    for cluster_id, size in cluster_sizes.items():
        # list.index returns the first occurrence, i.e. the lowest-numbered lightest group.
        lightest = group_sizes.index(min(group_sizes))
        cluster_to_group[cluster_id] = lightest
        group_sizes[lightest] += size

    # Work on a copy so the caller's frame is not modified, as the contract states.
    result = gdf.copy()
    result['group'] = result['cluster'].map(cluster_to_group)

    return result

def fetch_most_recent_results(target_folder, prefix='*'):
    """
    Find the `<prefix>_<number>.pkl` file with the highest trailing number.

    Parameters:
    - target_folder: Folder to search (not recursive).
    - prefix: Leading part of the file name; it is inserted into a glob pattern,
      so the default '*' matches any prefix.

    Returns:
    - The path of the matching file with the largest numeric suffix, or the string
      'None' when no usable file exists. The sentinel is a string (not None) in
      every "nothing found" case so the value can be passed straight through as a
      command-line argument.
    """
    search_pattern = os.path.join(target_folder, f"{prefix}_*.pkl")

    best_file = None
    best_num = -1
    for path in glob.glob(search_pattern):
        # The number is whatever sits between the last '_' and the first '.' after it.
        suffix = os.path.basename(path).split('_')[-1].split('.')[0]
        try:
            num = int(suffix)
        except ValueError:
            # Skip files where the number cannot be parsed
            continue
        if num > best_num:
            best_num = num
            best_file = path

    # Previously this path returned a real None when files matched but none parsed,
    # while the empty case returned 'None'; keep a single sentinel for both.
    if best_file is None:
        return 'None'
    return best_file

# Actual Runs

In [3]:
# Load the sample locations and keep only the first three rows for this run.
selected = pd.read_csv('./assets/examples/sample_curitiba.csv')
# .copy() detaches the slice from the full frame so the column assignments that
# follow (geometry here, 'cluster' later) do not raise SettingWithCopyWarning.
selected = selected.iloc[:3].copy()
# Point geometries are built only for the rows that are kept (x = longitude, y = latitude).
selected['geometry'] = gpd.points_from_xy(x=selected['LONGITUDE'], y=selected['LATITUDE'])

In [5]:
# Label every location with a proximity cluster.
cluster_labels = segment_points(selected, 0.00009*20)
selected['cluster'] = cluster_labels

# Spread whole clusters over (at most) three work groups.
num_groups = 3
grouped = distribute_clusters_into_groups(selected, num_groups)
location_distribution = grouped['group'].value_counts().values.tolist()
print(f"Location distribution between groups {location_distribution}")

# When fewer groups actually received rows, shrink the group count to match.
num_groups = min(num_groups, len(location_distribution))

os.makedirs('./tmp_inputs', exist_ok=True)

# One CSV per group, capped at the first 1000 rows and without the geometry column.
for group in grouped['group'].unique():
    in_group = grouped['group'] == group
    subset = grouped.loc[in_group].head(1000)
    subset.drop(columns='geometry').to_csv(f'./tmp_inputs/group_{str(group)}.csv')

def run_script(file_path, target_path, world_path):
    """
    Run `main.py` (resolved against the current working directory) as a child process.

    Parameters:
    - file_path: First positional argument forwarded to main.py.
    - target_path: Second positional argument forwarded to main.py.
    - world_path: Third positional argument forwarded to main.py.

    Raises:
    - subprocess.CalledProcessError: if the script exits with a non-zero status.
    """
    import subprocess
    import sys

    # sys.executable is the interpreter running this notebook kernel; a bare "python"
    # would be looked up on PATH and may belong to a different environment.
    subprocess.run([sys.executable, "main.py", file_path, target_path, world_path], check=True)

# Every worker writes to the same world-model folder and is handed the same starting
# savestate: the highest-numbered 'consolidated_*.pkl' in that folder, or the string
# 'None' when fetch_most_recent_results finds nothing.
target_path = './world_models/sample_curitiba'
target_paths = [target_path] * num_groups
world_path = [fetch_most_recent_results(target_path, 'consolidated')] * num_groups
print(f'spinning up {num_groups} processess')

# Group ids are assumed to be 0..num_groups-1, matching the CSV names written earlier.
file_paths = [f'./tmp_inputs/group_{str(group)}.csv' for group in range(num_groups)]

# Run the script in parallel for each file
with ProcessPoolExecutor() as executor:
    # Executor.map only re-raises a worker's exception when its result is retrieved.
    # Draining the iterator makes a failing main.py run surface here instead of
    # being dropped silently.
    list(executor.map(run_script, file_paths, target_paths, world_path))

Location distribution between groups [3]
spinning up 1 processess
Params passed to Resize transform:
	width:  640
	height:  640
	resize_target:  True
	keep_aspect_ratio:  False
	ensure_multiple_of:  14
	resize_method:  minimal
Using pretrained resource local::./checkpoints/depth_anything_metric_depth_outdoor.pt
Loaded successfully


In [38]:
from core.world import World
import copy
import os

def get_latest_group_files(num_groups, folder_name):
	"""
	Return a list of paths for the highest version number file for each group within the specified folder.

	Only files named `group_<group>_<version>.pkl` with integer `<group>` and
	`<version>` are considered; any other file is ignored.

	Parameters:
		num_groups (int): Number of groups to process; groups numbered >= num_groups are left out.
		folder_name (str): Folder containing the group files.

	Returns:
		list of str: Paths to the files with the highest version number for each group,
		ordered by group number. Groups without any file are simply absent.
	"""
	# Maps group number -> (file name, version) of the best candidate seen so far
	latest_files = {}

	for file in os.listdir(folder_name):
		if not (file.startswith("group_") and file.endswith(".pkl")):
			continue

		try:
			# A wrong number of '_' separated parts fails the unpacking and a
			# non-numeric part fails int(); both raise ValueError.
			_, group_number, version = file.split("_")
			group_number = int(group_number)
			version = int(version.split(".")[0])  # Remove the file extension
		except ValueError:
			# Skip stray files instead of aborting the whole scan
			continue

		# Keep this file if it is the highest version seen for the group
		if group_number not in latest_files or version > latest_files[group_number][1]:
			latest_files[group_number] = (file, version)

	# Generate the final list of file paths, sorted by group number
	return [os.path.join(folder_name, latest_files[group_number][0])
			for group_number in sorted(latest_files)
			if group_number < num_groups]

# Merge the latest per-group savestates into a single consolidated World savestate.
# Relies on `target_path` being defined by an earlier cell.
folder_name = f'{target_path}/intermediary_savestates'
# NOTE(review): hard-coded here, so any `num_groups` computed by an earlier cell is overwritten.
num_groups = 3
files = get_latest_group_files(num_groups, folder_name)
print(files)

savestates = []
gdfs = []
buffered_gdfs = []
locations = []
# A single World instance is reused as the loading target for every file.
tmp_world = World()

for file in files:
    tmp_world.loadstate(file)
    # Deep-copy so each stored snapshot is independent of the next loadstate() call.
    savestates.append(copy.deepcopy(tmp_world))

# Collect the three pieces of state that get merged from every snapshot.
for world in savestates:
    gdfs.append(world.gdf)
    buffered_gdfs.append(world.buffered_geometries)
    locations.append(world.locations)

# NOTE: pd.concat raises ValueError on an empty list, i.e. when no savestate file was found.
final_gdf = pd.concat(gdfs)
final_buffered = pd.concat(buffered_gdfs)
# Flatten the per-world location lists into one list.
final_locations = [item for sublist in locations for item in sublist]

final_world = World()
final_world.gdf = final_gdf
final_world.buffered_geometries = final_buffered
# Locations are registered one by one through add_Location360 below rather than
# by assigning the list directly:
# final_world.locations = final_locations

for location in final_locations:
	final_world.add_Location360(location.lat, location.lon, obj=location)

# tmp_world holds the last file loaded, so the buffer distance comes from that savestate.
final_world.buffer_distance = tmp_world.buffer_distance

# Indices of the locations flagged as walked; their count becomes the numeric suffix of
# the file name, which fetch_most_recent_results() compares to pick the newest file.
visited = [index for index, location in enumerate(final_world.locations) if location.walked == True]
final_world.savestate(f'{target_path}/consolidated_{len(visited)}.pkl')

['./world_models/backbone_v2/intermediary_savestates/group_0_65.pkl', './world_models/backbone_v2/intermediary_savestates/group_1_65.pkl', './world_models/backbone_v2/intermediary_savestates/group_2_64.pkl']


# Load Model

In [6]:
from core.world import World
# Restore a specific consolidated savestate (suffix 1997) from the world-model folder.
world = World()
consolidated_path = f'{target_path}/consolidated_1997.pkl'
world.loadstate(consolidated_path)
world

<core.world.World at 0x7f31d5797690>

In [None]:
# Kepler.gl map configuration; it is handed to plot_objects(), which assigns it to the
# map's `config` attribute.
# NOTE(review): the tooltip section also names the datasets 'Postes Vtal' and 'Starters',
# but plot_objects() in this notebook only adds a dataset called 'objects'.
world_config = {'version': 'v1',
 # visState -> filters: two filters on the 'objects' dataset,
 # `target` (select, value True) and `image_type` (multiSelect, ['final_step']).
 'config': {'visState': {'filters': [{'dataId': ['objects'],
     'id': 'o3yn4nfz',
     'name': ['target'],
     'type': 'select',
     'value': True,
     'enlarged': False,
     'plotType': 'histogram',
     'animationWindow': 'free',
     'yAxis': None,
     'speed': 1},
    {'dataId': ['objects'],
     'id': 'ieifk7u9d',
     'name': ['image_type'],
     'type': 'multiSelect',
     'value': ['final_step'],
     'enlarged': False,
     'plotType': 'histogram',
     'animationWindow': 'free',
     'yAxis': None,
     'speed': 1}],
   # Two geojson layers: 'objects' (colour field `obj_type`) and 'Postes Vtal'.
   'layers': [{'id': 'oyar70g',
     'type': 'geojson',
     'config': {'dataId': 'objects',
      'label': 'objects',
      'color': [41, 76, 181],
      'highlightColor': [252, 242, 26, 255],
      'columns': {'geojson': 'geometry'},
      'isVisible': True,
      'visConfig': {'opacity': 0.5,
       'strokeOpacity': 0.8,
       'thickness': 2.5,
       'strokeColor': None,
       'colorRange': {'name': 'ColorBrewer Dark2-7',
        'type': 'qualitative',
        'category': 'ColorBrewer',
        'colors': ['#1b9e77',
         '#d95f02',
         '#7570b3',
         '#e7298a',
         '#66a61e',
         '#e6ab02',
         '#a6761d']},
       'strokeColorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#5A1846',
         '#900C3F',
         '#C70039',
         '#E3611C',
         '#F1920E',
         '#FFC300']},
       'radius': 10,
       'sizeRange': [0, 10],
       'radiusRange': [0, 50],
       'heightRange': [0, 500],
       'elevationScale': 5,
       'enableElevationZoomFactor': True,
       'stroked': False,
       'filled': True,
       'enable3d': False,
       'wireframe': False},
      'hidden': False,
      'textLabel': [{'field': None,
        'color': [255, 255, 255],
        'size': 18,
        'offset': [0, 0],
        'anchor': 'start',
        'alignment': 'center'}]},
     'visualChannels': {'colorField': {'name': 'obj_type', 'type': 'integer'},
      'colorScale': 'quantize',
      'strokeColorField': None,
      'strokeColorScale': 'quantile',
      'sizeField': None,
      'sizeScale': 'linear',
      'heightField': None,
      'heightScale': 'linear',
      'radiusField': None,
      'radiusScale': 'linear'}},
    {'id': 'ittjroo',
     'type': 'geojson',
     'config': {'dataId': 'Postes Vtal',
      'label': 'Postes Vtal',
      'color': [210, 0, 0],
      'highlightColor': [252, 242, 26, 255],
      'columns': {'geojson': 'geometry'},
      'isVisible': True,
      'visConfig': {'opacity': 0.5,
       'strokeOpacity': 0.8,
       'thickness': 0.5,
       'strokeColor': None,
       'colorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#5A1846',
         '#900C3F',
         '#C70039',
         '#E3611C',
         '#F1920E',
         '#FFC300']},
       'strokeColorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#5A1846',
         '#900C3F',
         '#C70039',
         '#E3611C',
         '#F1920E',
         '#FFC300']},
       'radius': 10,
       'sizeRange': [0, 10],
       'radiusRange': [0, 50],
       'heightRange': [0, 500],
       'elevationScale': 5,
       'enableElevationZoomFactor': True,
       'stroked': False,
       'filled': True,
       'enable3d': False,
       'wireframe': False},
      'hidden': False,
      'textLabel': [{'field': None,
        'color': [255, 255, 255],
        'size': 18,
        'offset': [0, 0],
        'anchor': 'start',
        'alignment': 'center'}]},
     'visualChannels': {'colorField': None,
      'colorScale': 'quantile',
      'strokeColorField': None,
      'strokeColorScale': 'quantile',
      'sizeField': None,
      'sizeScale': 'linear',
      'heightField': None,
      'heightScale': 'linear',
      'radiusField': None,
      'radiusScale': 'linear'}}],
   # Tooltip fields listed per dataset name.
   'interactionConfig': {'tooltip': {'fieldsToShow': {'objects': [{'name': 'id',
        'format': None},
       {'name': 'distance', 'format': None},
       {'name': 'image_type', 'format': None},
       {'name': 'target', 'format': None}],
      'Postes Vtal': [{'name': 'Name', 'format': None},
       {'name': 'descriptio', 'format': None},
       {'name': 'timestamp', 'format': None},
       {'name': 'begin', 'format': None},
       {'name': 'end', 'format': None}],
      'Starters': [{'name': 'LATITUDE', 'format': None},
       {'name': 'LONGITUDE', 'format': None}]},
     'compareMode': False,
     'compareType': 'absolute',
     'enabled': True},
    'brush': {'size': 0.5, 'enabled': False},
    'geocoder': {'enabled': False},
    'coordinate': {'enabled': False}},
   'layerBlending': 'normal',
   'splitMaps': [],
   'animationConfig': {'currentTime': None, 'speed': 1}},
  # Initial viewport: centre coordinates and zoom level.
  'mapState': {'bearing': 0,
   'dragRotate': False,
   'latitude': -25.437284343802318,
   'longitude': -49.306400994975846,
   'pitch': 0,
   'zoom': 16.19470706392811,
   'isSplit': False},
  # Base map style ('dark') and which base-map layer groups are visible.
  'mapStyle': {'styleType': 'dark',
   'topLayerGroups': {},
   'visibleLayerGroups': {'label': True,
    'road': True,
    'border': False,
    'building': True,
    'water': True,
    'land': True,
    '3d building': False},
   'threeDBuildingColor': [9.665468314072013,
    17.18305478057247,
    31.1442867897876],
   'mapStyles': {}}}}

In [None]:
from keplergl import KeplerGl

def plot_objects(world, map_=None, config=None):
    """
    Add the world's object table to a Kepler.gl map as the dataset 'objects'.

    Parameters:
    - world: Object exposing a `gdf` (Geo)DataFrame attribute.
    - map_: Existing map to add the data to; a new KeplerGl(height=600) is created
      only when this is None.
    - config: Optional map configuration; assigned to `map_.config` when truthy.

    Returns:
    - The map that received the data (the one passed in, or the newly created one).
    """
    # Compare against None explicitly so a caller-supplied map is never replaced
    # just because the object happens to evaluate as falsy.
    if map_ is None:
        map_ = KeplerGl(height=600)

    # drop() returns a new frame, so world.gdf is left untouched without an extra copy.
    # errors='ignore' lets frames that lack some of these columns be plotted as well.
    # presumably these columns hold Python objects the map cannot serialise — TODO confirm
    objects = world.gdf.drop(columns=['obj', 'image', 'starter'], errors='ignore')
    map_.add_data(data=objects, name='objects')

    if config:
        map_.config = config

    return map_

# Build a fresh map for the loaded world with the saved configuration and display it.
world_map = plot_objects(world, map_=None, config=world_config)
world_map