# Combine

Combine together the subsets as a single data layer (geojson?)

1. make a list of all the subsets (geojson)
2. read in the 1st subset
    a. add column for uid (granuleid, or name of the file)? What does the user need to find this
    Would the user know what collection this came from?
3. read next subset (loop over remaining)
    a. add column for uid (granuleid, or name of the file)? What does the user need to find this
    b. append to the 1st subset https://geopandas.org/en/stable/docs/user_guide/mergingdata.html
4. save final geodataframe as new geojson

In [1]:
!pip install returns

[0m

In [6]:
import os
import os.path
from pathlib import Path
from typing import Any, Callable, Iterable, Mapping, TypeVar, Union

import geopandas as gpd
import pandas as pd
from gedi_utils import chext, converge, df_assign
from profilehooks import timecall
from returns.curry import curry, partial
from returns.functions import tap
from returns.io import impure_safe, IOResult, IOResultE, IOSuccess
from returns.iterables import Fold
from returns.pipeline import flow
from returns.pointfree import bimap, bind_ioresult, map_
from returns.result import Success
from returns.unsafe import unsafe_perform_io

_E = TypeVar('_E', bound=Exception)

In [9]:
@curry
def append_error_message(extra_message: str, e: _E) -> _E:
    message, *other_args = e.args if e.args else tuple(None, )
    new_message = f'{message}: {extra_message}' if message else extra_message
    e.args = (new_message, *other_args)
    
    return e
    

@curry
def gdf_to_file(
    path: Union[str, os.PathLike],
    props: Mapping[str, Any],
    gdf: gpd.GeoDataFrame
) -> IOResultE[None]:
    # Unfortunately, using mode='a' when the target file does not exist throws an
    # exception rather than simply creating a new file.  Therefore, in that case, we
    # switch to mode='w' to avoid the error.
    mode = props.get('mode')
    props = dict(props, mode='w' if mode == 'a' and not os.path.exists(path) else mode)
    
    return impure_safe(gdf.to_file)(path, **props).alt(append_error_message(f'{props}'))


@curry
def combine_to_gpkg(
    dest_path: Union[str, os.PathLike],
    src_paths: Iterable[Union[str, os.PathLike]]
) -> IOResultE[None]:
    def gpd_read_with_filename(path: str) -> IOResultE[gpd.GeoDataFrame]:
        return flow(
            impure_safe(gpd.read_file)(path),
            bimap(
                df_assign('filename', chext('.h5', os.path.basename(path))),
                append_error_message(f'source {path}')
            ),
        )
    
    return flow(
        src_paths,
        partial(map, gpd_read_with_filename),
        partial(map, bind_ioresult(gdf_to_file(dest_path)({'mode': 'a'}))),
        partial(map, IOResult.swap),
        partial(Fold.collect_all, acc=IOSuccess(())),
        IOResult.swap,
    )

In [10]:
dirpath = Path('/') / 'projects' / 'my-public-bucket' / 'gedi-l4a' / 'gabon'
gpkg_filename = Path('/') / 'projects' / 'tmp' / 'gabon.gpkg'

# if os.path.exists(gpkg_filename):
#     os.remove(gpkg_filename)

# result = timecall(combine_to_gpkg)(gpkg_filename, dirpath.glob('*.fgb'))
# result = timecall(combine_to_gpkg)(gpkg_filename, [
#     '/projects/my-public-bucket/gedi-l4a/gabon/GEDI04_A_2019114135421_O02061_04_T00905_02_002_01_V002.fgb',
#     '/projects/my-public-bucket/gedi-l4a/gabon/GEDI04_A_2019115021551_O02069_01_T03606_02_002_01_V002.fgb',
#     '/projects/my-public-bucket/gedi-l4a/gabon/GEDI04_A_2019115021551_O02069_02_T03606_02_002_01_V002.fgb'
# ])
# unsafe_perform_io(result)


  combine_to_gpkg (/tmp/ipykernel_11222/1208387644.py:25):
    47.074 seconds



<Failure: ()>

There were 3 files that were initially incorrect and had to be regenerated:

```plain
/projects/my-public-bucket/gedi-l4a/gabon/GEDI04_A_2019114135421_O02061_04_T00905_02_002_01_V002.fgb
/projects/my-public-bucket/gedi-l4a/gabon/GEDI04_A_2019115021551_O02069_01_T03606_02_002_01_V002.fgb
/projects/my-public-bucket/gedi-l4a/gabon/GEDI04_A_2019115021551_O02069_02_T03606_02_002_01_V002.fgb
```

This was the original output:

```plain
<Failure: (ValueError("Record does not match collection schema: dict_keys(['BEAM', 'agbd', 'agbd_se', 'filename', 'l4_quality_flag', 'lat_lowestmode', 'lon_lowestmode', 'sensitivity']) != ['BEAM', 'agbd', 'agbd_se', 'sensitivity', 'l4_quality_flag', 'filename']: {'mode': 'a'}"), ValueError("Record does not match collection schema: dict_keys(['BEAM', 'agbd', 'agbd_se', 'filename', 'l4_quality_flag', 'lat_lowestmode', 'lon_lowestmode', 'sensitivity']) != ['BEAM', 'agbd', 'agbd_se', 'sensitivity', 'l4_quality_flag', 'filename']: {'mode': 'a'}"), ValueError("Record does not match collection schema: dict_keys(['BEAM', 'agbd', 'agbd_se', 'filename', 'l4_quality_flag', 'lat_lowestmode', 'lon_lowestmode', 'sensitivity']) != ['BEAM', 'agbd', 'agbd_se', 'sensitivity', 'l4_quality_flag', 'filename']: {'mode': 'a'}"))>
```

The error messages do not include the filenames, so the code needs to be enhanced to capture the corresponding filenames.  The filenames above were obtained from output generated during execution (and each was followed by an error message).