# Get barangay ID for each rooftop
I spatially join the rooftops and barangay files so that each rooftop is assigned to a single barangay and then save the rooftop file. This makes it a lot easier to sample later. Note that the saved file only has two columns -- PSGC (i.e. barangay ID) and geometry (rooftop centroid). Stripping the dataset down to two columns speeds up processing quite a bit.

In [4]:
import geopandas as gpd
from pathlib import Path

In [5]:
# Project folders, all resolved relative to the user's local Dropbox checkout.
DB_DIR = Path.home().joinpath('IDinsight Dropbox', 'Random Walk Testing')
ROOFTOP_DIR = DB_DIR.joinpath('01_Raw data', '01_Rooftop', 'Philippines')
OUTPUT_DIR = DB_DIR.joinpath('03_Output', '05_HPLS qual')

# Barangay polygons, loaded once and reused by the cells below.
barangays_w_borders = gpd.read_parquet(
    DB_DIR.joinpath(
        '01_Raw data',
        '02_Admin boundary data',
        'Philippines',
        'barangays_w_borders.parquet',
    )
)

In [None]:
# S2 cell IDs of the rooftop files to process, manually ordered by size.
# This helps with testing since it takes several minutes just to load the
# largest rooftop file. The list is hard-coded on purpose rather than derived
# from barangays_w_borders['s2_cell_id'].
s2_cells = [
    3778520087363846144,
    3733484091090141184,
    3616390500778508288,
    3679440895561695232,
    3625397700033249280,
    3670433696306954240,
    3724476891835400192,
    3715469692580659200,
]

# only keep columns geometry, PSGC, and s2_cell_id
barangays_w_borders = barangays_w_borders[['geometry', 'PSGC', 's2_cell_id']]
# accessing .sindex forces the spatial index to be built now rather than lazily
barangays_w_borders.sindex

for s2_cell_id in s2_cells:
    print(f"\nProcessing s2 cell {s2_cell_id}")
    rooftops_gdf = gpd.read_parquet(ROOFTOP_DIR / f'{s2_cell_id}.parquet')

    # only keep the geometry column
    rooftops_gdf = rooftops_gdf[['geometry']]

    # replace each rooftop geometry with its centroid (computed in the CRS the
    # file was stored in), then reproject the points to EPSG:4326
    rooftops_gdf = rooftops_gdf.set_geometry(rooftops_gdf.geometry.centroid)
    rooftops_gdf = rooftops_gdf.to_crs(epsg=4326)

    # create spatial index if it doesn't exist
    rooftops_gdf.sindex

    # Left join: every rooftop is kept; rooftops that fall within no barangay
    # get a null PSGC.
    # NOTE(review): assumes barangays_w_borders is also in EPSG:4326 -- confirm.
    rooftops_in_barangays = gpd.sjoin(rooftops_gdf, barangays_w_borders, how='left', predicate='within')

    # A left join emits one row per (rooftop, matching barangay) pair, so any
    # surplus of rows over rooftops means some rooftops matched more than one
    # barangay (e.g. overlapping polygons). Report it instead of silently
    # saving duplicated rooftops; the saved file itself is left unchanged.
    n_extra_rows = len(rooftops_in_barangays) - len(rooftops_gdf)
    if n_extra_rows:
        print(f"WARNING: {n_extra_rows} extra rows from rooftops matched to more than one barangay")

    # save the rooftops in the barangays
    # NOTE(review): sjoin also carries over the barangay index (as
    # `index_right`) and `s2_cell_id`, so the saved file holds more than
    # PSGC + geometry -- confirm whether downstream code needs them before
    # trimming.
    rooftops_in_barangays.to_parquet(ROOFTOP_DIR / f'{s2_cell_id}_w_brgys.parquet')


# Inspect rooftop files for missing barangay data
Check the number of rooftops with missing barangay data and create a map showing the location of rooftops with missing barangay data.

In [7]:
# S2 cell IDs of the joined rooftop files (`<id>_w_brgys.parquet`) to inspect.
s2_cells = [3778520087363846144, 3733484091090141184, 3616390500778508288, 3679440895561695232, 3625397700033249280, 3670433696306954240, 3724476891835400192, 3715469692580659200]


# Running totals across all files, used for the overall share at the end.
total_missing = 0
total_rooftops = 0

for s2_cell in s2_cells:
    temp = gpd.read_parquet(ROOFTOP_DIR / f'{s2_cell}_w_brgys.parquet')
    # Rows with a null PSGC are rooftops without a barangay. Count them with
    # the vectorised .sum() (the builtin sum() iterates row by row in Python);
    # int() keeps the totals as plain Python ints.
    missing_count = int(temp["PSGC"].isna().sum())
    total_count = len(temp)

    total_missing += missing_count
    total_rooftops += total_count

    # an empty file has no meaningful share; report NaN rather than crash
    share_missing = missing_count / total_count if total_count else float('nan')
    print(f'{s2_cell}: {share_missing}')

total_share_missing = total_missing / total_rooftops if total_rooftops else float('nan')
print(f'Total share of rooftops with no barangay data: {total_share_missing}')

3778520087363846144: 0.002885308968502044
3733484091090141184: 0.0033855941831401953
3616390500778508288: 0.017769951891763204
3679440895561695232: 0.007441745800161212
3625397700033249280: 0.021392945755108278
3670433696306954240: 0.0033330940452083973
3724476891835400192: 0.021807979168979544
3715469692580659200: 0.001561929329977971


# Create map of rooftops with missing barangays [CODE NO LONGER NEEDED. MISSING BRGY DATA SOLVED]

In [None]:
# for the one rooftop file that has a high proportion of rooftops without barangay data, plot the location of rooftops without barangay data
import folium

file_w_lots_nas = gpd.read_parquet(ROOFTOP_DIR / '3724476891835400192_w_brgys.parquet')
rooftops_no_brgy = file_w_lots_nas[file_w_lots_nas['PSGC'].isna()]

# thin the points: keep every 500th rooftop
gdf_sampled = rooftops_no_brgy.iloc[::500, :]

# Compute a center for the map based on the mean coordinates (or define your own center)
mean_lat = gdf_sampled.geometry.y.mean()
mean_lon = gdf_sampled.geometry.x.mean()

# Create a Folium map
m = folium.Map(location=[mean_lat, mean_lon], zoom_start=5)

# Add sampled points as markers. Iterating the geometry column directly
# avoids building a full row Series per point as iterrows() does.
for point in gdf_sampled.geometry:
    folium.CircleMarker(
        location=[point.y, point.x],  # folium expects [lat, lon]
        radius=2,
        color='blue',
        fill=True,
        fill_opacity=0.7
    ).add_to(m)

# str() keeps this working on folium/branca versions whose save() does not
# accept pathlib.Path objects
m.save(str(OUTPUT_DIR / 'rooftops_no_brgy.html'))

# a notebook only renders the value of a cell's last expression, so the map
# has to come last to be displayed
m