In [1]:
import os
import geopandas as gpd
import numpy as np
import pandas as pd
import fsspec
import logging

from deafrica_conflux.id_field import guess_id_field
from deafrica_conflux.cli.logs import logging_setup
from deafrica_conflux.io import check_if_s3_uri, check_dir_exists

In [2]:
# Placeholder AWS credentials that the Analysis Sandbox sets by default
# in the environment variables.
aws_default_config = {
    #'AWS_NO_SIGN_REQUEST': 'YES',
    'AWS_SECRET_ACCESS_KEY': 'fake',
    'AWS_ACCESS_KEY_ID': 'fake',
}

# A public bucket can only be read once these credentials are removed from
# the environment; otherwise the following error occurs:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
for key in aws_default_config:
    # pop() with a default does nothing when the variable is not set.
    os.environ.pop(key, None)

In [3]:
# Notebook parameters.
# Vector file the waterbody polygons are read from.
polygons_vector_file = "s3://deafrica-waterbodies-dev/0-2-0/shapefile/continentalwaterbodiesv0_2_0.shp"
# Column name passed to guess_id_field when choosing the polygon ID column.
use_id = "UID"
# Verbosity level passed to logging_setup.
verbose =  1
# Location the WOFS regions are read from.
wofs_ls_regions_file_path = "https://explorer.digitalearth.africa/api/regions/wofs_ls"
# Directory the per-region parquet files are written to.
output_directory = "s3://deafrica-waterbodies-dev/0-2-0/test_directory/"

In [4]:
# Set up logger.
# logging_setup is the project helper imported from deafrica_conflux.cli.logs;
# it receives the verbosity level chosen in the parameters.
logging_setup(verbose)
# Logger used by the remaining cells to report progress.
_log = logging.getLogger(__name__)

In [5]:
%%time
# Load the continental waterbody polygons from the vector file.
polygons_gdf = gpd.read_file(polygons_vector_file)

# Work out which column identifies each polygon.
id_field = guess_id_field(polygons_gdf, use_id)
_log.info(f"Guessed ID field: {id_field}")

# Index the polygons by that column (rebinding the name instead of
# mutating the frame in place).
polygons_gdf = polygons_gdf.set_index(id_field)

[2023-10-26 14:07:47,739] {id_field.py:64} INFO - Values in the column UID are unique.
[2023-10-26 14:07:47,740] {<timed exec>:6} INFO - Guessed ID field: UID
CPU times: user 2min 54s, sys: 2.87 s, total: 2min 57s
Wall time: 3min 36s


In [6]:
%%time
# Fetch the WOFS regions, reproject them to the CRS of the polygons and
# index them by their region code.
wofs_ls_regions = (
    gpd.read_file(wofs_ls_regions_file_path)
    .to_crs(polygons_gdf.crs)
    .set_index("region_code")
)

CPU times: user 1.13 s, sys: 4.05 ms, total: 1.14 s
Wall time: 1.59 s


In [7]:
%%time
# Split each row in the wofs_ls_regions into a GeoDataFrame of its own.
# Selecting with a one-element list (iloc[[i]]) keeps each piece a one-row
# frame rather than a Series. This replaces np.array_split, which relies on
# the deprecated DataFrame.swapaxes, raises on an empty frame and hands back
# slices that trigger chained-assignment warnings when a column is added later.
regions = [wofs_ls_regions.iloc[[i]] for i in range(len(wofs_ls_regions))]
assert len(regions) == len(wofs_ls_regions)

CPU times: user 6.12 s, sys: 0 ns, total: 6.12 s
Wall time: 6.11 s


In [8]:
def get_intersecting_polygons_ids(region, polygons_gdf):
    """
    Record which polygons intersect a region.

    Parameters
    ----------
    region : geopandas.GeoDataFrame
        The region geometries to test against. Not modified.
    polygons_gdf : geopandas.GeoDataFrame
        The polygons to test; its index supplies the polygon IDs.

    Returns
    -------
    geopandas.GeoDataFrame
        A copy of `region` with an extra column `intersecting_polygons_ids`
        holding the IDs of the intersecting polygons as one comma-separated
        string (an empty string when nothing intersects).
    """
    joined = gpd.sjoin(polygons_gdf, region, how="inner", predicate="intersects")

    # A polygon is listed once per region row it intersects; keep each ID
    # once, in order of first appearance.
    polygon_ids = joined.index.unique()

    # Work on a copy so the caller's frame is left untouched and no
    # chained-assignment warning is raised when `region` is a slice.
    region = region.copy()
    # str() lets the join work for non-string (e.g. integer) IDs as well.
    region["intersecting_polygons_ids"] = ",".join(str(polygon_id) for polygon_id in polygon_ids)
    return region

In [9]:
%%time
# Attach the IDs of the intersecting polygons to every region, one region
# at a time. The recorded run of this cell took about 14 minutes.
regions_ = [get_intersecting_polygons_ids(region, polygons_gdf) for region in regions]

CPU times: user 14min 4s, sys: 7.4 s, total: 14min 12s
Wall time: 14min 12s


In [10]:
%%time
# Keep only the regions whose ID string is non-empty, i.e. the regions
# that intersect at least one polygon.
filtered_regions = [
    candidate
    for candidate in regions_
    if candidate["intersecting_polygons_ids"].iloc[0]
]

CPU times: user 143 ms, sys: 0 ns, total: 143 ms
Wall time: 142 ms


In [11]:
# Stack the remaining per-region frames back into a single table.
filtered_regions_gdf =  pd.concat(filtered_regions)

In [12]:
# Display the filtered regions together with their intersecting polygon IDs.
filtered_regions_gdf

Unnamed: 0_level_0,label,count,geometry,intersecting_polygons_ids
region_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
157064,157064,347,"POLYGON ((53.77571 -6.83266, 53.77104 -6.83167...","mnvzhyp5w,mnvzjp44s"
157069,157069,29,"POLYGON ((50.90030 -12.06436, 50.90113 -12.060...","mjdffzxqv,mjdfgheym,mjdfgk052,mjdfj35k6,mjdfjk..."
157070,157070,349,"POLYGON ((48.61911 -15.11534, 49.01864 -13.400...","mj69xrcen,mj69z0yep,mj69z1m5n,mj69z282x,mj69z5..."
157071,157071,404,"POLYGON ((48.68508 -14.84417, 49.52783 -15.023...","mhgx9nff3,mhgx9p0q3,mhgxb1gs8,mhgxbwdss,mhgxc0..."
157072,157072,326,"POLYGON ((51.27630 -18.39161, 51.27299 -18.390...","mhep3dz02,mhepfhhgg,mhepfw4wj,mhepfx29v,mhepfx..."
...,...,...,...,...
206047,206047,712,"POLYGON ((-17.94439 19.82604, -17.94341 19.830...","eehtejfnd,eehtepg5f,eehtg1y99,eehtg45y8,eehtgb..."
206048,206048,306,"POLYGON ((-17.06474 16.29312, -17.78804 16.446...","edukb616m,edukb69ch,edukbkkru,edukbmwpw,edukcm..."
206049,206049,311,"POLYGON ((-18.14720 15.00996, -19.70731 15.232...","edeuevuud,edeugk6n7,edeuvd8ed,edeuzbh4s,edevkz..."
206050,206050,695,"POLYGON ((-17.59234 13.40720, -17.59939 13.408...","edecpgwx7,edecpmsbq,edecpmspn,edecps9z1,edecpx..."


In [13]:
# Create the output directory when it does not exist yet.
if not check_dir_exists(output_directory):
    # Use the S3 backend for S3 URIs and the local file system otherwise.
    fs = fsspec.filesystem("s3" if check_if_s3_uri(output_directory) else "file")
    fs.mkdirs(output_directory, exist_ok=True)
    _log.info(f"Created directory {output_directory}")

[2023-10-26 14:22:08,222] {credentials.py:620} INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [14]:
def export_polygons(row, polygons_gdf, output_directory):
    """
    Write the polygons that intersect one region to a parquet file.

    Parameters
    ----------
    row : pandas.Series
        One region row. Its name is used as the region ID and its
        `intersecting_polygons_ids` value must be a comma-separated
        string of polygon IDs.
    polygons_gdf : geopandas.GeoDataFrame
        All polygons, indexed by polygon ID.
    output_directory : str
        Directory (local path or S3 URI) to write the file into.

    Returns
    -------
    str
        Path of the file written, `<output_directory>/<region_id>.parquet`.
    """
    # Imported locally: posixpath always joins with "/", which keeps S3 URIs
    # valid on every operating system (os.path.join uses "\\" on Windows).
    import posixpath

    region_id = row.name
    polygon_ids = row.intersecting_polygons_ids.split(",")

    output_fp = posixpath.join(output_directory, f"{region_id}.parquet")
    # reset_index() turns the polygon ID index back into a regular column
    # before the frame is written.
    polygons_gdf.loc[polygon_ids].reset_index().to_parquet(output_fp)
    _log.info(f"Polygons for region {region_id} written to {output_fp}")
    return output_fp

In [15]:
%%time
# Write one parquet file per region and record where each file was written.
filtered_regions_gdf["polygon_file_paths"] = filtered_regions_gdf.apply(
    export_polygons,
    axis=1,
    args=(polygons_gdf, output_directory),
)

[2023-10-26 14:22:08,655] {364763189.py:7} INFO - Polygons for region 157064 written to s3://deafrica-waterbodies-dev/0-2-0/test_directory/157064.parquet
[2023-10-26 14:22:08,867] {364763189.py:7} INFO - Polygons for region 157069 written to s3://deafrica-waterbodies-dev/0-2-0/test_directory/157069.parquet
[2023-10-26 14:22:09,089] {364763189.py:7} INFO - Polygons for region 157070 written to s3://deafrica-waterbodies-dev/0-2-0/test_directory/157070.parquet
[2023-10-26 14:22:09,318] {364763189.py:7} INFO - Polygons for region 157071 written to s3://deafrica-waterbodies-dev/0-2-0/test_directory/157071.parquet
[2023-10-26 14:22:09,584] {364763189.py:7} INFO - Polygons for region 157072 written to s3://deafrica-waterbodies-dev/0-2-0/test_directory/157072.parquet
[2023-10-26 14:22:09,816] {364763189.py:7} INFO - Polygons for region 157073 written to s3://deafrica-waterbodies-dev/0-2-0/test_directory/157073.parquet
[2023-10-26 14:22:10,053] {364763189.py:7} INFO - Polygons for region 157074