# Overview
This notebook creates a barangay dataset with barangay borders (in cases where we have data on the barangay boundary data) and PCWHS. 

I use barangay data (with PCWHS designation) provided by Gio.

I use barangay data from [here](https://github.com/altcoder/philippines-psgc-shapefiles) since it appears to be higher quality than the dataset on [HDX](https://data.humdata.org/dataset/cod-ab-phl/resource/12457689-6a86-4474-8032-5ca9464d38a8)



In [11]:
import geopandas as gpd
from pathlib import Path
import pandas as pd
from datetime import datetime
from tqdm import tqdm
tqdm.pandas()
from pin_drop_sampling2.utils import get_s2_cell_id

# Import and clean barangay borders

In [12]:
DB_DIR = Path.home() / 'IDinsight Dropbox' / 'Random Walk Testing' 
ROOFTOP_DIR = DB_DIR /'01_Raw data'/ '01_Rooftop'/'Philippines'
OUTPUT_DIR = DB_DIR / '03_Output' / '05_HPLS qual'

timestamp = datetime.now().strftime("%Y%m%d_%H")

In [13]:
# import barangay census data, rename the PSGC column, and convert to numeric
barangay_census = pd.read_stata(Path.home() / 'IDinsight Dropbox' / 'DOH HPLS Phase 2 CB Qual - ETF'/'1 Capacity Building'/'1 Sample Size Calculations'/'psgc_barangays.dta')
barangay_census.rename(columns={'digitPSGC':'PSGC'}, inplace=True)
barangay_census['PSGC'] = pd.to_numeric(barangay_census['PSGC'], errors='coerce')

# import altcoder barangay borders data
barangay_borders_altcoder = gpd.read_file(DB_DIR / '01_Raw data' / '02_Admin boundary data' / 'Philippines' / 'PH_Adm4_BgySubMuns.shp'/'PH_Adm4_BgySubMuns.shp.shp')
barangay_borders_altcoder.rename(columns={'adm4_psgc':'PSGC'}, inplace=True)
barangay_borders_altcoder.to_crs(epsg=4326, inplace=True)
barangay_borders_altcoder = barangay_borders_altcoder[['PSGC', 'geometry']]

# import hdx barangay borders data and clean it up. I DON'T USE THIS BUT AM KEEPING IT HERE FOR REFERENCE
barangay_borders_hdx = gpd.read_file(DB_DIR / '01_Raw data' / '02_Admin boundary data' / 'Philippines' / 'phl_adm_psa_namria_20231106_shp'/'phl_admbnda_adm4_psa_namria_20231106.shp')
barangay_borders_hdx['PSGC'] = pd.to_numeric(barangay_borders_hdx['ADM4_PCODE'].str[2:], errors='coerce')
barangay_borders_hdx = barangay_borders_hdx[['PSGC', 'geometry']]
# merge the two datasets
barangays = barangay_census.merge(barangay_borders_altcoder, on="PSGC", how ='left')

# print the length of barangays, barangay_census, and barangay_borders
print(f"Merged: {len(barangays)}, Borders: {len(barangay_borders_altcoder)}, Census: {len(barangay_census)}, Rows from census without borders: {sum(barangays.geometry.isna())}")

# create barangays_w_borders by removing rows with no geometry
barangays_w_borders = barangays[~barangays.geometry.isna()]

# create barangays_wo_borcders by keeping rows with no geometry
barangays_wo_borders = barangays[barangays.geometry.isna()]

Merged: 42001, Borders: 42017, Census: 42001, Rows from census without borders: 0


In [15]:
# turn it into a gdf with appropriate CRS
barangays_w_borders = gpd.GeoDataFrame(barangays_w_borders, geometry='geometry', crs='EPSG:4326')
barangays_w_borders.to_crs(epsg=4326, inplace=True)

# get the s2 cell id for each barangay
barangays_w_borders['s2_cell_id'] = barangays_w_borders.apply(lambda x: get_s2_cell_id(x.geometry.centroid, 4), axis=1)
# save barangays_w_borders to file
barangays_w_borders.to_parquet(DB_DIR / '01_Raw data'/'02_Admin boundary data'/'Philippines' / 'barangays_w_borders.parquet')

# if len(barangays_wo_borders) > 0 save barangays_wo_borders to file
if len(barangays_wo_borders) > 0:
    barangays_wo_borders[['PSGC', 'brgy_name', 'reg_code', 'reg_name', 'prov_code', 'prov_name']].to_csv(DB_DIR / '01_Raw data'/'02_Admin boundary data'/'Philippines' / 'barangays_wo_borders.csv')

# [OLD - NO LONGER NEEDED] Deal with Negros island barangays
There used to be an issue merging the population data from Gio with the boundary data because the Negros region split and thus all of the PSGC codes changes. It seems like Gio has updated the population data with a new file and now it works fine without this code.

In [5]:
# create dataset of census data without borders and drop the geometry column
census_no_map = barangays[barangays.geometry.isna()]
census_no_map.drop(columns=['geometry'], inplace=True)

# create dataset of borders without census data
map_no_census = barangay_borders_altcoder[~barangay_borders_altcoder.PSGC.isin(barangays.PSGC)]

# for any PSGC codes that are in the 60s or 70s, replace 60 or 70 with 180
map_no_census['PSGC'] = map_no_census['PSGC'].astype(str)
map_no_census['PSGC'] = map_no_census['PSGC'].apply(lambda x: '180' + x[2:] if x[:2] in ['60', '70'] else x)
map_no_census['PSGC'] = pd.to_numeric(map_no_census['PSGC'])

# merge the two datasets
merged_negros = census_no_map.merge(map_no_census, on='PSGC', how='inner')

# append merged_negros to barangays_w_borders
barangays_w_borders = pd.concat([barangays_w_borders, merged_negros], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  census_no_map.drop(columns=['geometry'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
