# 0.1.4: Build GBIF trait maps

The final step before training models using Earth observation (EO) data is to link the TRY trait data with the GBIF species observations and then grid them. In this way, we can have matching trait rasters to be paired with our EO data.

## Imports and config

In [6]:
from pathlib import Path

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import dask_geopandas as dgpd
import geopandas as gpd
import numpy as np
import pandas as pd
from src.conf.conf import get_config
from src.conf.environment import log

%load_ext autoreload
%autoreload 2

# Display all columns when printing a pandas DataFrame
pd.set_option("display.max_columns", None)

cfg = get_config()

cluster = LocalCluster(dashboard_address=":39143")
client = Client(cluster)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Perhaps you already have a cluster running?
Hosting the HTTP server on port 33039 instead


## Load GBIF and filter by PFT

Let's load the GBIF data and select only the "Tree" PFT.

In [2]:
def filter_pft(df: pd.DataFrame, pft_set: str, pft_col: str = "pft") -> pd.DataFrame:
    pfts = pft_set.split("_")
    if not any(pft in ["Shrub", "Tree", "Grass"] for pft in pfts):
        raise ValueError(f"Invalid PFT designation: {pft_set}")

    return df[df[pft_col].isin(pfts)]


gbif = (
    dd.read_parquet(Path(cfg.gbif.interim.dir, cfg.gbif.interim.subsampled))
    .pipe(filter_pft, "Tree")
    .repartition(npartitions=60)
    .sample(frac=0.01)
    .set_index("speciesname")
)

## Load TRY filtered mean trait data

In [3]:
mn_traits = (
    dd.read_parquet(Path(cfg.trydb.interim.dir, cfg.trydb.interim.filtered))
    .repartition(npartitions=60)
    .set_index("speciesname")
)

## Link mean trait values with GBIF data

In [5]:
merged = gbif.join(mn_traits, how="inner").reset_index()

In [14]:
# compute the number of unique indices
print(
    f"Pct matched species: {merged.index.nunique().compute() / gbif.index.nunique():.2%}"
)

Pct matched species: 61.89%


## Rasterize merged trait values

In [None]:
def global_grid_data(df, long, lat, deg, variables):
    """
    Source: https://sojwolf.github.io/iNaturalist_traits/Chapter_6_Compare_trait_maps_sPlot_iNat.html#grid-mean-trait-values-at-different-resolutions
    """
    # create new dataframe to save the average value of each grid cell and variable
    grouped_df = dd.from_pandas(pd.DataFrame(), npartitions=df.npartitions)

    # convert degree into step size
    step = int((360 / deg) + 1)

    bins_x = np.linspace(-180, 180, step)
    bins_y = np.linspace(-90, 90, int(((step - 1) / 2) + 1))

    # group latitude and longitude coordinates into bins
    # create new columns 'x_bin' and 'y_bin'
    df["x_bin"] = df.map_partitions(pd.cut, long, bins=bins_x)
    df["y_bin"] = df.map_partitions(pd.cut, lat, bins=bins_y)

    # raster coordinates are in center of raster cell
    df["x_bin"] = df["x_bin"].map(lambda x: ((x.left + x.right) / 2))
    df["y_bin"] = df["y_bin"].map(lambda x: ((x.left + x.right) / 2))

    grouped_df = df.drop_duplicates(subset=["x_bin", "y_bin"]).persist()

    for v in variables:

        sub_df = df[["y_bin", "x_bin", v]]
        grouped_v = sub_df.groupby(["x_bin", "y_bin"])[v].mean().reset_index()

        grouped_df = dd.merge(grouped_df, grouped_v, on=["x_bin", "y_bin"], how="left")

    return grouped_df.compute()

In [7]:
client.close()
cluster.close()