# Overview

This notebooks demonstrates how to perform pin drop sampling in the Philippines using barangays as PSUs. 

In [1]:
%load_ext autoreload
%autoreload 2

In [38]:
import geopandas as gpd
from pathlib import Path
import pandas as pd
from datetime import datetime
import folium
from tqdm import tqdm
tqdm.pandas()
from pin_drop_sampling2.utils import get_s2_cell_id, count_neighbors_in_radius, get_nearest_point_on_road, dist_in_meters

# Set file locations
Set the location of the file with the PSU boundaries and population counts and the directory for the rooftop data files below.

In [48]:
DB_DIR = Path.home() / 'IDinsight Dropbox' / 'Random Walk Testing' 
PSU_FILE = DB_DIR / '01_Raw data'/ '03_Census' / 'Philippines' / 'barangay_w_borders.parquet'
ROOFTOP_DIR = DB_DIR /'01_Raw data'/ '01_Rooftop'/'Philippines'
OUTPUT_DIR = DB_DIR / '03_Output' / '05_HPLS qual'

timestamp = datetime.now().strftime("%Y%m%d_%H")

# Sample PSUs

The code below samples num_brgs_per barangays in each PSU using PPS sampling without replacement.

In [5]:
psus = gpd.read_parquet(PSU_FILE)
psus.head()

num_brgys_per = 3

def pps_sample(group):
    # Normalize the weights for the group
    probabilities = group['brgy_pop'] / group['brgy_pop'].sum()
    
    num_to_sample = min(num_brgys_per, group.shape[0])

    # Sample without replacement using the normalized weights
    sampled_group = group.sample(n=num_to_sample, weights=probabilities, replace=False)
    return sampled_group

sampled_barangays = psus.groupby('prov_code', group_keys=False).apply(pps_sample)
# save the sampled barangays
sampled_barangays.to_parquet(OUTPUT_DIR / f'samp_bars_{timestamp}.parquet')

  sampled_barangays = psus.groupby('prov_code', group_keys=False).apply(pps_sample)


In [12]:
barangays_missing_geometry = sampled_barangays[sampled_barangays['geometry'].isna()]
print(f'There are {len(barangays_missing_geometry)} barangays with missing geometry')

# drop barangays with missing geometry
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
sampled_barangays = sampled_barangays.dropna(subset=['geometry'])

There are 0 barangays with missing geometry


# Generate dataset of rooftops in sampled barangays
The code below filters the rooftop data and generates a single dataset with only the rooftops within the sampled barangays. Note that a) this can take quite a bit of time and b) if any barangays happen to straddle more than one s2 cell only a portion of rooftops will be included. 

In [None]:
# get the s2 cell id for each barangay
sampled_barangays['s2_cell_id'] = sampled_barangays.apply(lambda x: get_s2_cell_id(x.geometry.centroid, 4), axis=1)

# create empty gdf to store rooftops
all_rooftops = gpd.GeoDataFrame()

# loop over each unique value of s2_cell_id. I loop over s2_cell_id first because loading the rooftop data for each s2 cell 
# takes a lot of time so I want to do it only once for each s2 cell
pd.options.mode.chained_assignment = None  # turn off annoying copy of a df warning
for s2_cell_id in sampled_barangays['s2_cell_id'].unique():
    print(f"\nProcessing s2 cell {s2_cell_id}")
    # get the barangays in this s2 cell
    barangays_in_s2_cell= sampled_barangays[sampled_barangays['s2_cell_id'] == s2_cell_id]
    # load the rooftop data for this s2 cell
    rooftops_gdf = gpd.read_parquet(ROOFTOP_DIR /f'{s2_cell_id}.parquet')
    # replace geometry column with the centroid of the geometry
    rooftops_gdf['geometry'] = rooftops_gdf.geometry.centroid

    for item, row in barangays_in_s2_cell.iterrows():
        # print a single dot without the newline character
        print('.', end='')
        # filter rooftops to only include those within the barangay
        temp_rooftops = rooftops_gdf[rooftops_gdf.geometry.within(row.geometry)]
        # set the psid for the rooftops
        temp_rooftops['PSGC'] = row['PSGC']
        temp_rooftops.to_crs(epsg=4326, inplace=True)
        # try to append temp_rooftops to all_rooftops and catch a value error. if there is an error, print the crs of the two dataframes
        try:
            all_rooftops = gpd.GeoDataFrame(pd.concat([all_rooftops, temp_rooftops], ignore_index=True))
        except ValueError:
            print(f"Error with s2 cell {s2_cell_id}")
            print(f"temp_rooftops crs: {temp_rooftops.crs}")
            print(f"all_rooftops crs: {all_rooftops.crs}")
            
# save all rooftops to a parquet file in case I close this notebook or the kernel gets messed up
all_rooftops.to_parquet(OUTPUT_DIR / f'all_roofs_samp_bars_{timestamp}.parquet')

# [Optional] Load rooftops in sampled barangays
The code above is pretty time intensive. The line below loads in a previously created sample of rooftops. This may be useful if, for example, you don't have time to run all the steps in the notebook at once.

In [4]:
all_rooftops = gpd.read_parquet(OUTPUT_DIR / 'samp_roofs_20241029_142812.parquet')

# Identify isolated rooftops
We identify and filter out isolated rooftops with no other rooftops around. These rooftops may not have people living in or near them and could result in very high travel costs.

In [5]:
# identify isolated points to drop from sampling
all_rooftops['neighbors'] = count_neighbors_in_radius(all_rooftops)
all_rooftops['isolated'] = (all_rooftops['neighbors'] < 5)

In [6]:
# remove isolated points from the rooftop data
# all_rooftops_wo_isolated = all_rooftops[~all_rooftops['isolated']]
all_rooftops_wo_isolated = all_rooftops

# Sample 4 rooftops from each barangay

In [7]:
# sample up to 4 rooftops per barangay (or all if there are less than 4)
sampled_points = all_rooftops_wo_isolated.groupby('PSGC', group_keys=False).apply(lambda x: x.sample(n=min(4, x.shape[0])))

  sampled_points = all_rooftops_wo_isolated.groupby('PSGC', group_keys=False).apply(lambda x: x.sample(n=min(4, x.shape[0])))


# Get nearest points on road 

In [44]:
# for each point in the sampled points, get the nearest point on the road
sampled_points['nearest_point_on_road'] = sampled_points.progress_apply(lambda x: get_nearest_point_on_road(x.geometry), axis=1)

100%|██████████| 1366/1366 [03:13<00:00,  7.06it/s]


In [45]:
sampled_points['distance_to_road_m'] = sampled_points.apply(lambda x: dist_in_meters(x.geometry, x.nearest_point_on_road), axis=1)

# Save sample outputs

In [68]:
temp_samp_points = sampled_points[['PSGC', 'geometry', 'nearest_point_on_road', 'distance_to_road_m']]
temp_samp_points['orig_lat'] = temp_samp_points['geometry'].y
temp_samp_points['orig_lon'] = temp_samp_points['geometry'].x
temp_samp_points['road_lat'] = temp_samp_points['nearest_point_on_road'].y
temp_samp_points['road_lon'] = temp_samp_points['nearest_point_on_road'].x
temp_samp_points = temp_samp_points.drop(columns=['geometry', 'nearest_point_on_road'])
temp_samp_points['google_maps_directions_link'] = temp_samp_points.apply(
    lambda row: f"https://www.google.com/maps/dir/{row['orig_lat']},{row['orig_lon']}/{row['road_lat']},{row['road_lon']}",
    axis=1
)
# replace google_maps_directions_link with a link to just the original point if nearest_point_on_road is null
temp_samp_points.loc[temp_samp_points['road_lat'].isna(), 'google_maps_directions_link'] = temp_samp_points.loc[
    temp_samp_points['road_lat'].isna(), ['orig_lat', 'orig_lon']
].apply(lambda x: f"https://www.google.com/maps?q={x[0]},{x[1]}", axis=1)


# save the sampled points as csv
temp_samp_points.to_excel(OUTPUT_DIR / f'samp_points_{timestamp}.xlsx', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 