# Overview

This notebook samples rooftops for the TFGH project. While some of the code in this notebook is specific to the TFGH project, much of the code can be reused for other projects. Note that our PSUs are barangays. 

Main steps in this notebook:

1. Load list of sampled PSUs and filter rooftop data for rooftops located in sampled PSUs
3. For each sampled PSU, sample 10 rooftops
4. For each sampled rooftop, try to get the nearest point on a road
5. Save outputs as kmls


In [19]:
import geopandas as gpd
from pathlib import Path
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from shapely.geometry import LineString
from datetime import datetime
from pin_drop_sampling2.utils import count_neighbors_in_radius, get_nearest_point_on_road, dist_in_meters, get_nearest_point_on_road_batch

DB_DIR = Path.home() / 'IDinsight Dropbox' / 'Random Walk Testing' 
ROOFTOP_DIR = DB_DIR /'01_Raw data'/ '01_Rooftop'/'Philippines'
OUTPUT_DIR = DB_DIR / '03_Output' / '10_TFGH'

NUM_ROOFTOPS_PER_BRGY = 10

timestamp = datetime.now().strftime("%Y%m%d_%H")


# Filter rooftop data for rooftops located in sampled PSUs


In [3]:
sampled_brgys = pd.read_excel(DB_DIR / '01_Raw data' / '07_Sampled PSUs' / '[DQ] TFGH-CDC BeSD Master Database.xlsx', sheet_name= 'LIVE Brgy Database')
sampled_brgys.rename(columns={'barangay_id': 'PSGC'}, inplace=True)
# barangays_w_borders = gpd.read_parquet(DB_DIR / '01_Raw data'/'02_Admin boundary data'/'Philippines' / 'barangays_w_borders.parquet')

rooftops_in_sampled_brgys = gpd.GeoDataFrame()
for file in list(ROOFTOP_DIR.glob('*w_brgys.parquet')):
    print(f"Processing {file.stem}")
    df = gpd.read_parquet(file)
    temp_rooftops = df[df['PSGC'].isin(sampled_brgys['PSGC'])]
    rooftops_in_sampled_brgys = gpd.GeoDataFrame(pd.concat([rooftops_in_sampled_brgys, temp_rooftops], ignore_index=True))


Processing 3715469692580659200_w_brgys
Processing 3625397700033249280_w_brgys
Processing 3733484091090141184_w_brgys
Processing 3778520087363846144_w_brgys
Processing 3724476891835400192_w_brgys
Processing 3616390500778508288_w_brgys
Processing 3670433696306954240_w_brgys
Processing 3679440895561695232_w_brgys


# For each sampled PSU, sample 10 rooftops

In [4]:
sampled_rooftops = rooftops_in_sampled_brgys.groupby('PSGC', group_keys=False).apply(lambda x: x.sample(n=min(NUM_ROOFTOPS_PER_BRGY, x.shape[0]), random_state = 42))
print(len(sampled_rooftops))
sampled_rooftops.head(1)

1500


  sampled_rooftops = rooftops_in_sampled_brgys.groupby('PSGC', group_keys=False).apply(lambda x: x.sample(n=min(NUM_ROOFTOPS_PER_BRGY, x.shape[0]), random_state = 42))


Unnamed: 0,geometry,index_right,PSGC,s2_cell_id
27756,POINT (123.70513 13.15706),1206.0,500503015.0,3.724477e+18


# For all sampled rooftops, try to get the nearest point on a road

In [8]:
sampled_rooftops['nearest_point_on_road'] = sampled_rooftops.progress_apply(lambda x: get_nearest_point_on_road(x.geometry), axis=1)
sampled_rooftops['nearest_point_on_road'].isnull().sum()

100%|██████████| 1500/1500 [03:16<00:00,  7.62it/s]


In [11]:
sampled_rooftops['distance_to_road_m'] = sampled_rooftops.apply(lambda x: dist_in_meters(x.geometry, x.nearest_point_on_road), axis=1)

# Save sampled rooftops as csv with google map links

In [16]:
temp_samp_points = sampled_rooftops[['PSGC', 'geometry', 'nearest_point_on_road', 'distance_to_road_m']]
temp_samp_points['orig_lat'] = temp_samp_points['geometry'].y
temp_samp_points['orig_lon'] = temp_samp_points['geometry'].x
temp_samp_points['road_lat'] = temp_samp_points['nearest_point_on_road'].y
temp_samp_points['road_lon'] = temp_samp_points['nearest_point_on_road'].x
temp_samp_points = temp_samp_points.drop(columns=['geometry', 'nearest_point_on_road'])
temp_samp_points['google_maps_directions_link'] = temp_samp_points.apply(
    lambda row: f"https://www.google.com/maps/dir/{row['orig_lat']},{row['orig_lon']}/{row['road_lat']},{row['road_lon']}",
    axis=1
)
# replace google_maps_directions_link with a link to just the original point if nearest_point_on_road is null
temp_samp_points.loc[temp_samp_points['road_lat'].isna(), 'google_maps_directions_link'] = temp_samp_points.loc[
    temp_samp_points['road_lat'].isna(), ['orig_lat', 'orig_lon']
].apply(lambda x: f"https://www.google.com/maps?q={x[0]},{x[1]}", axis=1)


# save the sampled points as csv
temp_samp_points.to_csv(OUTPUT_DIR / f'samp_points_{timestamp}.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
  ].apply(lambda x: f"https://www.google.com/maps?q={x[0]},{x[1]}", axis=1)


# Save sampled rooftops as kml files

In [20]:
sampled_points_on_road = sampled_rooftops[~(sampled_rooftops['nearest_point_on_road'].isnull())]
sampled_points_off_road = sampled_rooftops[sampled_rooftops['nearest_point_on_road'].isnull()]

# save lines between original points and nearest points on road
sampled_points_on_road['line'] = sampled_points_on_road.apply(lambda row: LineString([row['geometry'], row['nearest_point_on_road']]), axis=1)
lines_gdf = sampled_points_on_road[['line', 'PSGC',]]
lines_gdf.to_file(OUTPUT_DIR / 'points_on_road_lines.kml', driver='KML')

# save sampled points on road and urban as kml
sampled_points_on_road[['nearest_point_on_road', 'PSGC']].to_file(OUTPUT_DIR / 'points_on_road.kml', driver='KML')

# save sampled points off road as kml
sampled_points_off_road[['geometry', 'PSGC']].to_file(OUTPUT_DIR / 'points_off_road.kml', driver='KML')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
  write(
  write(
