# 0.1.4: Build GBIF trait maps

The final step before training models using Earth observation (EO) data is to link the TRY trait data with the GBIF species observations and then grid them. In this way, we can have matching trait rasters to be paired with our EO data.

## Imports and config

In [1]:
from pathlib import Path

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import dask_geopandas as dgpd
import geopandas as gpd
import numpy as np
import pandas as pd
from src.conf.conf import get_config
from src.conf.environment import log

%load_ext autoreload
%autoreload 2

# Display all columns when printing a pandas DataFrame
pd.set_option("display.max_columns", None)

cfg = get_config()

cluster = LocalCluster(dashboard_address=":39143")
client = Client(cluster)

## Load GBIF and filter by PFT

Let's load the GBIF data and select only the "Tree" PFT.

In [2]:
def filter_pft(df: pd.DataFrame, pft_set: str, pft_col: str = "pft") -> pd.DataFrame:
    pfts = pft_set.split("_")
    if not any(pft in ["Shrub", "Tree", "Grass"] for pft in pfts):
        raise ValueError(f"Invalid PFT designation: {pft_set}")

    return df[df[pft_col].isin(pfts)]


gbif = (
    dd.read_parquet(Path(cfg.gbif.interim.dir, cfg.gbif.interim.subsampled))
    .pipe(filter_pft, "Tree")
    .repartition(npartitions=60)
    .sample(frac=0.01)
    .set_index("speciesname")
)

## Load TRY filtered mean trait data

In [3]:
mn_traits = (
    dd.read_parquet(Path(cfg.trydb.interim.dir, cfg.trydb.interim.filtered))
    .repartition(npartitions=60)
    .set_index("speciesname")
)

## Link mean trait values with GBIF data

In [4]:
merged = gbif.join(mn_traits, how="inner").reset_index()

In [14]:
# compute the number of unique indices
print(
    f"Pct matched species: {merged.index.nunique().compute() / gbif.index.nunique():.2%}"
)

Pct matched species: 61.89%


## Rasterize merged trait values

In [5]:
def global_grid_data(
    df: pd.DataFrame,
    traits: list[str],
    lon: str = "decimallongitude",
    lat: str = "decimallatitude",
    res: int | float = 0.5,
    stat: str = "mean",
):
    """
    Modified from:
    https://sojwolf.github.io/iNaturalist_traits/Chapter_6_Compare_trait_maps_sPlot_iNat.html#grid-mean-trait-values-at-different-resolutions
    """
    # convert resolution in degrees into step size
    step = int((360 / res) + 1)

    bins_x = np.linspace(-180, 180, step)
    bins_y = np.linspace(-90, 90, int(((step - 1) / 2) + 1)))

    df["x_bin"] = df[lon].map_partitions(
        pd.cut, bins=bins_x, meta=pd.Series(dtype="category", name="x_bin")
    )
    df["y_bin"] = df[lat].map_partitions(
        pd.cut, bins=bins_y, meta=pd.Series(dtype="category", name="y_bin")
    )

    df["x_bin"] = df["x_bin"].map(
        lambda x: ((x.left + x.right) / 2),
        meta=pd.Series(dtype="category", name="x_bin"),
    )
    df["y_bin"] = df["y_bin"].map(
        lambda x: ((x.left + x.right) / 2),
        meta=pd.Series(dtype="category", name="y_bin"),
    )

    # group by 'x_bin' and 'y_bin' and calculate the mean of all variables
    grouped_df = df.groupby(["x_bin", "y_bin"])[traits].mean().reset_index()

    return grouped_df

In [5]:
res = 0.5
lon = "decimallongitude"
lat = "decimallatitude"
traits = [col for col in merged.columns if col.startswith("X")]

# convert resolution in degrees into step size
step = int((360 / res) + 1)

bins_x = np.linspace(-180, 180, step)
bins_y = np.linspace(-90, 90, int(((step - 1) / 2) + 1))


In [6]:
df = merged.copy()
# group latitude and longitude coordinates into bins
# df["x_bin"] = df.map_partitions(
#     pd.cut, df[lon], bins=bins_x, meta=pd.Series(dtype="category", name="x_bin")
# )
# df["y_bin"] = df.map_partitions(
#     pd.cut, df[lat], bins=bins_y, meta=pd.Series(dtype="category", name="y_bin")
# )

df["x_bin"] = df[lon].map_partitions(
    pd.cut, bins=bins_x, meta=pd.Series(dtype="category", name="x_bin")
)
df["y_bin"] = df[lat].map_partitions(
    pd.cut, bins=bins_y, meta=pd.Series(dtype="category", name="y_bin")
)

In [19]:
binned = df.compute()

In [7]:
# df = df.dropna(subset=["x_bin", "y_bin"])

# raster coordinates are in center of raster cell
df["x_bin"] = df["x_bin"].map(
    lambda x: ((x.left + x.right) / 2), meta=pd.Series(dtype="category", name="x_bin")
)
df["y_bin"] = df["y_bin"].map(
    lambda x: ((x.left + x.right) / 2), meta=pd.Series(dtype="category", name="y_bin")
)

# df = df.drop(columns=["x_bin", "y_bin"])
# group by 'x_bin' and 'y_bin' and calculate the mean of all variables
grouped_df = df.groupby(["x_bin", "y_bin"])[traits].mean().reset_index()

grouped_df = grouped_df.compute()

  self._meta = self.obj._meta.groupby(




In [8]:
grouped_df.dropna()

Unnamed: 0,x_bin,y_bin,X4,X6,X11,X13,X14,X15,X18,X21,X26,X27,X46,X47,X50,X55,X78,X95,X138,X144,X145,X146,X163,X169,X223,X224,X237,X281,X282,X289,X1080,X3112,X3113,X3114,X3120
3269,-175.25,-21.25,0.569337,1.972586,15.370954,449.539291,22.975328,1.864955,15.915791,0.216165,218.235354,7.331268,0.222534,0.326093,1.709823,664.670476,3.948594,88.723634,5566.593951,104.012288,4.746386,21.947743,2.239603,14.693139,28.947630,2.385356,7.890435,59.068821,500.206847,1150.865245,1745.356198,9232.734102,9218.125960,17650.540170,3.142359
4434,-173.75,65.75,0.462235,0.201120,12.664687,508.046182,19.083941,1.245758,0.098124,0.019702,0.548580,2.575721,0.206528,0.325303,1.774142,9.713457,-5.184818,92.470312,45.777636,31.014818,0.864417,27.582798,0.029724,462.886371,26.121042,1.208883,3.182376,61.457423,491.853771,574.579679,1105.876873,60.731046,229.184270,108.694375,3.508671
8335,-168.25,64.75,0.421943,0.140086,12.956570,475.797643,24.325818,2.414240,0.249802,0.090855,0.049918,0.778593,0.168948,0.315096,1.940315,9.405572,-4.350750,66.694891,763.211107,47.009809,1.801611,19.193023,0.029020,711.597819,43.943043,1.288748,1.636962,47.136362,436.888394,522.854045,2487.585396,126.693578,136.237565,235.012470,3.250676
13359,-161.25,68.75,0.462235,0.201120,12.664687,508.046182,19.083941,1.245758,0.098124,0.019702,0.548580,2.575721,0.206528,0.325303,1.774142,9.713457,-5.184818,92.470312,45.777636,31.014818,0.864417,27.582798,0.029724,462.886371,26.121042,1.208883,3.182376,61.457423,491.853771,574.579679,1105.876873,60.731046,229.184270,108.694375,3.508671
15979,-157.75,21.75,0.629438,1.264862,9.282858,486.851385,14.493004,1.482781,14.346453,0.215739,8.460650,3.484125,0.380863,0.426404,1.693509,256.897078,1.801760,90.290489,958.732719,134.039531,3.896124,49.323670,0.613837,18.918036,25.103260,2.537516,3.602290,42.251447,504.872606,1146.980222,2543.413246,2379.485520,2725.432639,3838.161652,1.929482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514973,175.25,-36.25,0.573788,1.819138,10.665787,463.571900,13.974459,1.045903,8.301112,0.225915,89.220419,4.786983,0.331270,0.347822,1.549724,313.279675,-0.078803,84.785691,9375.114314,93.789889,2.614457,41.559851,0.869664,106.935667,27.137057,6.823957,8.169148,25.693408,449.459283,925.741687,2533.021416,3688.151119,2487.386806,2825.098797,3.153538
515156,175.25,-39.25,0.546094,1.386742,10.230105,474.418247,14.314606,1.065100,8.861077,0.369441,103.833423,4.880458,0.309988,0.339535,1.673944,199.480233,-0.940683,83.059650,3462.487147,65.316027,2.130266,41.314590,0.586847,124.207658,31.470961,7.768926,8.495024,24.789521,456.609999,911.233183,2680.472417,1481.110918,1614.456104,1822.304369,3.295592
515187,175.75,-40.25,0.507803,1.349148,11.594427,473.322846,16.628457,1.278653,10.073231,0.488639,46.288030,3.371737,0.274759,0.318959,1.542021,179.011886,-0.573889,80.944644,5238.750448,68.447600,2.361401,37.588057,0.593797,98.282473,33.011402,6.004482,6.306552,35.520964,371.393312,757.354425,3170.307241,1916.112398,1435.640763,2533.230571,3.610609
515770,176.25,-43.75,0.802922,0.216543,9.993999,485.673288,16.143686,1.200853,6.922842,0.067042,50.296305,5.262157,0.293427,0.354325,1.675774,140.220613,0.663207,89.211634,1031.757910,40.025691,1.196487,34.371926,0.424474,30.927499,41.068287,3.543645,3.696145,56.282222,824.096719,1682.111998,1005.067256,1238.272775,1158.628253,3059.873741,2.980288


In [6]:
cols = [col for col in merged.columns if col.startswith("X")]
grid_data = global_grid_data(merged.copy(), cols)

ValueError: Metadata inference failed in `map`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
AttributeError("'float' object has no attribute 'left'")

Traceback:
---------
  File "/home/dl1070/micromamba/envs/traits/lib/python3.12/site-packages/dask/dataframe/utils.py", line 195, in raise_on_meta_error
    yield
  File "/home/dl1070/micromamba/envs/traits/lib/python3.12/site-packages/dask_expr/_expr.py", line 3983, in _emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dl1070/micromamba/envs/traits/lib/python3.12/site-packages/dask/utils.py", line 1241, in __call__
    return getattr(__obj, self.method)(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dl1070/micromamba/envs/traits/lib/python3.12/site-packages/pandas/core/series.py", line 4691, in map
    new_values = self._map_values(arg, na_action=na_action)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dl1070/micromamba/envs/traits/lib/python3.12/site-packages/pandas/core/base.py", line 919, in _map_values
    return arr.map(mapper, na_action=na_action)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dl1070/micromamba/envs/traits/lib/python3.12/site-packages/pandas/core/arrays/categorical.py", line 1563, in map
    na_val = mapper(np.nan) if callable(mapper) else mapper.get(np.nan, np.nan)
             ^^^^^^^^^^^^^^
  File "/tmp/ipykernel_17772/1128391950.py", line 29, in <lambda>
    df["x_bin"] = df["x_bin"].map(lambda x: ((x.left + x.right) / 2))
                                              ^^^^^^


In [4]:
client.close()
cluster.close()