In [1]:
import numpy as np
import pandas as pd

This notebook will serve as an example to showcase the different flows implemented.

## Find optimal assignment using Haversine distance
This is the most lightweight way to generate assignments, as it uses the Haversine distance: the shortest ("as the crow flies") distance between two GPS points. For a more accurate, road-based distance, see the map-based section further below.
We will generate a dataset of 100 enumerators and 2000 targets and simulate the optimization.

In [3]:
# Seed NumPy's global generator so every run draws the same coordinates.
np.random.seed(0)

# Enumerators: 100 rows with IDs E001..E100 and coordinates drawn uniformly
# over the whole globe. The order of the uniform() draws below (enumerator
# latitudes, enumerator longitudes, target latitudes, target longitudes) fixes
# the generated values, so it must not be rearranged.
enumerator_data = dict(
    enum_id=["E" + str(n).zfill(3) for n in range(1, 101)],
    enum_lat=np.random.uniform(low=-90, high=90, size=100),
    enum_long=np.random.uniform(low=-180, high=180, size=100),
)

# Targets: 2000 rows with IDs T0001..T2000, drawn the same way.
target_data = dict(
    target_id=["T" + str(n).zfill(4) for n in range(1, 2001)],
    target_lat=np.random.uniform(low=-90, high=90, size=2000),
    target_long=np.random.uniform(low=-180, high=180, size=2000),
)

# Materialise both tables as DataFrames.
df_enum, df_target = pd.DataFrame(enumerator_data), pd.DataFrame(target_data)

# Report the shapes as a quick sanity check on the row counts.
print("Enumerators DataFrame shape: {}".format(df_enum.shape))
print("Targets DataFrame shape: {}".format(df_target.shape))

Enumerators DataFrame shape: (100, 3)
Targets DataFrame shape: (2000, 3)


In [4]:
from surveyscout.utils import LocationDataset

# Column names passed positionally after each DataFrame
# (presumably id, latitude, longitude — confirm against LocationDataset).
enum_columns = ("enum_id", "enum_lat", "enum_long")
target_columns = ("target_id", "target_lat", "target_long")

# Wrap the raw DataFrames in the dataset objects the flows take as input.
enum_locations = LocationDataset(df_enum, *enum_columns)
target_locations = LocationDataset(df_target, *target_columns)

### Basic min distance flow with haversine

This flow implements the basic min distance model where we specify our parameters and the model will find the optimal results. Here are the parameters of the model:
- min_target: The minimum number of targets each enumerator is required to visit.
- max_target: The maximum number of targets each enumerator is allowed to visit.
- max_distance: The maximum cost (distance) assignable to a surveyor to visit a single target.
- max_total_distance: The initial maximum total cost (distance) assignable to a surveyor.

In [5]:
from surveyscout.flows import basic_min_distance_flow

# Constraint values for this run, gathered in one place so they are easy to
# tweak; they are forwarded unchanged to the flow as keyword arguments.
basic_constraints = {
    "min_target": 5,
    "max_target": 30,
    "max_distance": 10000,
    "max_total_distance": 100000,
}

# No routing option is passed here, so the flow runs with its default.
results = basic_min_distance_flow(
    enum_locations=enum_locations,
    target_locations=target_locations,
    **basic_constraints,
)

Optimal value:  2142243.4940421954


In [6]:
# Preview the first rows of the assignments returned by basic_min_distance_flow.
results.head()

Unnamed: 0,target_id,enum_id,cost
0,T0001,E060,1.0
1,T0002,E063,1.0
2,T0003,E042,1.0
3,T0004,E061,1.0
4,T0005,E035,1.0


### Recursive min distance flow with haversine
This flow recursively adjusts the parameters until a solution is found.
The parameters are as follows:
- min_target: The minimum number of targets each enumerator is required to visit.
- max_target: The maximum number of targets each enumerator is allowed to visit.
- max_distance: The maximum cost (distance) assignable to a surveyor to visit a single target.
- max_total_distance: The initial maximum total cost (distance) assignable to a surveyor.
- max_perc: The initial percentile to determine the maximum surveyor-to-target cost (default is 80).
- param_increment: The value by which the parameter bounds and percentiles are adjusted during the recursion if no solution is found (default is 5).

In [7]:
from surveyscout.flows import recursive_min_distance_flow

# Starting constraints plus the step used when the flow adjusts them;
# forwarded unchanged to the flow as keyword arguments.
recursive_constraints = {
    "min_target": 15,
    "max_target": 35,
    "max_distance": 10000,
    "max_total_distance": 100000,
    "param_increment": 5,
}

# The flow returns a pair: the assignment table and the parameter set.
results_df, params = recursive_min_distance_flow(
    enum_locations=enum_locations,
    target_locations=target_locations,
    **recursive_constraints,
)

Optimal value:  2244424.350659192


In [8]:
# Show the parameter set returned by recursive_min_distance_flow alongside the results.
print(params)

{'min_target': 15, 'max_target': 35, 'max_cost': 10000, 'max_total_cost': 100000}


In [9]:
# Preview the first rows of the assignments returned by recursive_min_distance_flow.
results_df.head()

Unnamed: 0,target_id,enum_id,cost
0,T0001,E047,1.0
1,T0002,E063,1.0
2,T0003,E042,1.0
3,T0004,E031,1.0
4,T0005,E035,1.0


# Map-based Min distance flow

We will generate random enumerator and target data within the city of Chennai, India.

In [5]:
# Set a random seed for reproducibility
np.random.seed(22)

# Approximate bounding box (decimal degrees) around the city of Chennai,
# Tamil Nadu — a city-sized area, not the whole state.
min_lat = 12.9190
max_lat = 13.2400
min_lon = 80.1234
max_lon = 80.3220

# Dataset sizes, named once so the ID lists and coordinate arrays cannot drift
# apart. Kept small here, presumably to limit the cost of the map-based runs.
N_ENUMERATORS = 10
N_TARGETS = 200

# Generate the enumerators with random coordinates inside the bounding box.
# The order of the uniform() draws (enumerator latitudes, enumerator
# longitudes, target latitudes, target longitudes) fixes the generated values.
enumerator_data = {
    "enum_id": [f"E{i:03d}" for i in range(1, N_ENUMERATORS + 1)],  # E001, E002, ...
    "enum_lat": np.random.uniform(min_lat, max_lat, N_ENUMERATORS),
    "enum_long": np.random.uniform(min_lon, max_lon, N_ENUMERATORS),
}

# Generate the targets with random coordinates inside the same bounding box.
target_data = {
    "target_id": [f"T{i:04d}" for i in range(1, N_TARGETS + 1)],  # T0001, T0002, ...
    "target_lat": np.random.uniform(min_lat, max_lat, N_TARGETS),
    "target_long": np.random.uniform(min_lon, max_lon, N_TARGETS),
}

# Create the DataFrames
df_enum = pd.DataFrame(enumerator_data)
df_target = pd.DataFrame(target_data)

# Display the shape of the created DataFrames to confirm the number of rows
print(f"Enumerators DataFrame shape: {df_enum.shape}")
print(f"Targets DataFrame shape: {df_target.shape}")

Enumerators DataFrame shape: (10, 3)
Targets DataFrame shape: (200, 3)


### Min distance flow with OSRM
To use OSRM we will specify one more parameter:
- routing: The API used to get the route distance between points. Can be either 'haversine' or 'osrm'

In [6]:
from surveyscout.flows import basic_min_distance_flow

# Re-wrap the DataFrames: df_enum / df_target now hold the Chennai data, so the
# location datasets built earlier in the notebook are stale.
enum_locations = LocationDataset(df_enum, "enum_id", "enum_lat", "enum_long")
target_locations = LocationDataset(
    df_target, "target_id", "target_lat", "target_long"
)

# Select OSRM through the `routing` keyword, which is the name this notebook
# documents and the one the recursive OSRM call uses. This cell previously
# passed `cost_function="osrm"`, which matched neither.
# NOTE(review): confirm the keyword against the installed surveyscout version.
results = basic_min_distance_flow(
    enum_locations=enum_locations,
    target_locations=target_locations,
    min_target=5,
    max_target=100,
    max_distance=1000,
    max_total_distance=10000,
    routing="osrm",
)

NameError: name 'basic_min_distance_flow' is not defined

In [31]:
# Preview the first rows of the assignments returned by the OSRM-based basic flow.
results.head()

Unnamed: 0,target_id,enum_id,value
0,T0001,E010,1.0
1,T0002,E004,1.0
2,T0003,E006,1.0
3,T0004,E002,1.0
4,T0005,E003,1.0


### Recursive min distance Flow with OSRM
To use OSRM we will specify one more parameter:
- routing: The API used to get the route distance between points. Can be either 'haversine' or 'osrm'

In [32]:
from surveyscout.flows import recursive_min_distance_flow

# Starting constraints, adjustment step and routing choice for the OSRM run;
# forwarded unchanged to the flow as keyword arguments.
osrm_recursive_constraints = {
    "min_target": 15,
    "max_target": 35,
    "max_distance": 100,
    "max_total_distance": 1000,
    "param_increment": 5,
    "routing": "osrm",
}

# The flow returns a pair: the assignment table and the parameter set.
results_df, params = recursive_min_distance_flow(
    enum_locations=enum_locations,
    target_locations=target_locations,
    **osrm_recursive_constraints,
)

Optimal value:  1336.003


In [33]:
# Show the parameter set returned by the OSRM-based recursive flow.
print(params)

{'min_target': 15, 'max_target': 35, 'max_cost': 100, 'max_total_cost': 1000}


In [34]:
# Preview the first rows of the assignments returned by the OSRM-based recursive flow.
results_df.head()

Unnamed: 0,target_id,enum_id,value
0,T0001,E010,1.0
1,T0002,E008,1.0
2,T0003,E006,1.0
3,T0004,E002,1.0
4,T0005,E003,1.0
