# Overview

This notebook randomly samples up to 20 rooftops from each barangay (grouped by PSGC code) and saves the sample as Parquet and CSV files.

In [21]:
import pandas as pd
import geopandas as gpd
from pathlib import Path

In [22]:
# Project folders, all resolved relative to the current user's home directory.
DB_DIR = Path.home().joinpath('IDinsight Dropbox', 'Random Walk Testing')
ROOFTOP_DIR = DB_DIR.joinpath('01_Raw data', '01_Rooftop', 'Philippines')  # input parquet files
OUTPUT_DIR = DB_DIR.joinpath('03_Output', '05_HPLS qual')  # where the samples are written

# Run timestamp (YYYYMMDD_HHMMSS), embedded in the output file names.
timestamp = f"{pd.Timestamp.now():%Y%m%d_%H%M%S}"

In [23]:
# Sample up to 20 rooftops per barangay (grouped by PSGC) within every
# "*w_brgys.parquet" file in ROOFTOP_DIR, then stack the per-file samples.
# A barangay that appears in more than one file contributes up to 20 rows per
# file, so `sampled` can hold more than 20 rows for a single PSGC.
per_file_samples = []

# glob() yields paths in an arbitrary, filesystem-dependent order. Sorting keeps
# the row order of `sampled` stable across machines, which matters because any
# later seeded sampling from `sampled` depends on that row order.
for parquet_file in sorted(ROOFTOP_DIR.glob("*w_brgys.parquet")):
    # Log before reading so a failing file is identifiable from the output.
    print(f"Processing {parquet_file.name}")

    df = gpd.read_parquet(parquet_file)

    # Keep only PSGC and geometry; groups smaller than 20 are kept whole.
    # The fixed random_state makes each per-group draw repeatable.
    temp_sampled = (
        df.groupby("PSGC")[['PSGC', 'geometry']]
        .apply(lambda group: group.sample(n=min(len(group), 20), random_state=42))
    )
    per_file_samples.append(temp_sampled)

# Fail here with a clear message instead of an opaque KeyError on "PSGC" later.
if not per_file_samples:
    raise FileNotFoundError(f"No '*w_brgys.parquet' files found in {ROOFTOP_DIR}")

# Concatenate once at the end rather than re-copying a growing frame on every
# iteration (and without seeding the result from an empty GeoDataFrame).
sampled = pd.concat(per_file_samples)


Processing 3715469692580659200_w_brgys.parquet
Processing 3625397700033249280_w_brgys.parquet
Processing 3733484091090141184_w_brgys.parquet
Processing 3778520087363846144_w_brgys.parquet
Processing 3724476891835400192_w_brgys.parquet
Processing 3616390500778508288_w_brgys.parquet
Processing 3670433696306954240_w_brgys.parquet
Processing 3679440895561695232_w_brgys.parquet


In [24]:
# A very small share of barangays straddle S2 cell boundaries, so the same PSGC
# can have been sampled in more than one file. Flatten the index, then draw
# again so that every PSGC ends up with at most 20 rooftops.
# NOTE(review): the first pass already caps each file's share at 20, so for a
# straddling barangay this second draw is not proportional to how many rooftops
# it has in each file — confirm this is acceptable for the study design.
sampled = sampled.reset_index(drop=True)

rooftops_by_psgc = sampled.groupby("PSGC")[['PSGC', 'geometry']]
final_sampled = rooftops_by_psgc.apply(
    lambda rows: rows.sample(n=min(20, len(rows)), random_state=42)
)

In [None]:
# Write the final sample to OUTPUT_DIR twice: as Parquet (geometry kept) and as
# CSV (geometry replaced by lat/lon columns).

# Create only the leaf output folder. Without parents=True this still raises if
# the rest of the folder tree is missing, so a wrong DB_DIR is not masked.
OUTPUT_DIR.mkdir(exist_ok=True)

final_sampled.to_parquet(OUTPUT_DIR / f'sampled_rooftops_{timestamp}.parquet')

# Build the CSV table on a copy instead of adding columns to `final_sampled`.
# Mutating it in place made this cell non-idempotent: a re-run wrote a Parquet
# file that also contained lat/lon.
# NOTE(review): .x/.y are only defined for point geometries, and calling them
# lon/lat presumes a geographic CRS — confirm both against the input files.
final_sampled_csv = (
    final_sampled
    .assign(lat=final_sampled.geometry.y, lon=final_sampled.geometry.x)
    .drop(columns=['geometry'])
)
final_sampled_csv.to_csv(OUTPUT_DIR / f'sampled_rooftops_{timestamp}.csv', index=False)