# 2.1.3: Product benchmarking with sPlot


These trait maps are certainly not the first to be made, and there are several other great attempts at the challenging task of producing global trait maps. This raises an important question for both trait map creators and consumers: which maps should be used and for which purposes?

Here we propose the use of the global vegetation plot dataset sPlot as a benchmark against which existing trait products can be compared, as sPlot is the only global-scale dataset that contains plot-level trait estimates, generally avoiding the biases and pitfalls that come with crowd-sourced species observations.

The trait maps we will be comparing against sPlot are:

- Boonman et al., 2020
- Butler et al., 2017
- Dong et al., 2023
- Schiller et all, 2021
- Madani et al., 2018
- Moreno et al. 2018
- Vallicrosa et al., 2022
- van Bodegom et al., 2014
- Wolf et al., 2022


## Imports and config


In [1]:
import pandas as pd

from src.conf.conf import get_config
from src.conf.environment import log

cfg = get_config()

Get correlations with sPlot for each product.


In [None]:
from pathlib import Path
import xarray as xr

from src.utils.dataset_utils import get_trait_map_fns
from src.utils.raster_utils import open_raster
from src.utils.spatial_utils import lat_weights, weighted_pearson_r


def raster_correlation(
    fn_left: Path, fn_right: Path, resolution: int | float
) -> tuple[str, float]:
    """Calculate the weighted Pearson correlation coefficient between a pair of trait maps."""
    log.info("Loading and filtering data for %s...", fn_right.stem)
    r_left = open_raster(fn_left).sel(band=1)
    r_right = open_raster(fn_right).sel(band=1)

    # Ensure the rasters are aligned
    r_right = r_right.rio.reproject_match(r_left)

    df_left = (
        r_left.to_dataframe(name=f"left_{fn_left.stem}")
        .drop(columns=["band", "spatial_ref"])
        .dropna()
    )
    df_right = (
        r_right.to_dataframe(name=f"right_{fn_right.stem}")
        .drop(columns=["band", "spatial_ref"])
        .dropna()
    )

    log.info("Joining dataframes (%s)...", fn_right.stem)
    df = df_left.join(df_right, how="inner")

    lat_unique = df.index.get_level_values("y").unique()

    log.info("Calculating weights (%s)...", fn_right.stem)
    weights = lat_weights(lat_unique, resolution)

    log.info(
        "Calculating weighted Pearson correlation coefficient (%s)...", fn_right.stem
    )
    r = weighted_pearson_r(df, weights)

    log.info("Weighted Pearson correlation coefficient: %s", r)

    return fn_right.stem, r

In [None]:
from src.utils.dataset_utils import get_trait_maps_dir


def all_products_paths() -> list[Path]:
    """Get the paths to all products."""
    products_dir = Path("data/interim/other_trait_maps")
    data = []
    for subdir in products_dir.iterdir():
        if subdir.is_dir():
            for file in subdir.glob("**/*"):
                if file.is_file():
                    data.append(file)
    return data


def gather_results() -> pd.DataFrame:
    """Gather the results of the raster correlation analysis into a DataFrame."""
    splot_corr_path = Path("results/product_comparison.parquet")
    if splot_corr_path.exists():
        log.info("Loading existing results...")
        splot_corr = pd.read_parquet(splot_corr_path)
    else:
        splot_corr = pd.DataFrame(columns=["trait_id", "author", "r", "resolution"])

    for fn in all_products_paths():
        res = fn.parent.stem
        if res != cfg.model_res:
            continue
        trait_id, author = fn.stem.split("_")
        splot_path = get_trait_maps_dir("splot") / f"{trait_id}.tif"
        _, r = raster_correlation(splot_path, fn, cfg.target_resolution)

        row = {"trait_id": trait_id, "author": author, "r": r, "resolution": res}
        splot_corr = pd.concat([splot_corr, pd.DataFrame([row])])

    return splot_corr

In [None]:
gather_results()

[94m2024-11-12 12:09:27 UTC - src.conf.environment - INFO - Loading and filtering data for X11_bodegom...[0m
[94m2024-11-12 12:09:27 UTC - src.conf.environment - INFO - Joining dataframes (X11_bodegom)...[0m
[94m2024-11-12 12:09:27 UTC - src.conf.environment - INFO - Calculating weights (X11_bodegom)...[0m
[94m2024-11-12 12:09:27 UTC - src.conf.environment - INFO - Calculating weighted Pearson correlation coefficient (X11_bodegom)...[0m
[94m2024-11-12 12:09:27 UTC - src.conf.environment - INFO - Weighted Pearson correlation coefficient: 0.20405069814434493[0m
  splot_corr = pd.concat([splot_corr, pd.DataFrame([row])])
[94m2024-11-12 12:09:27 UTC - src.conf.environment - INFO - Loading and filtering data for X14_moreno...[0m
[94m2024-11-12 12:09:27 UTC - src.conf.environment - INFO - Joining dataframes (X14_moreno)...[0m
[94m2024-11-12 12:09:27 UTC - src.conf.environment - INFO - Calculating weights (X14_moreno)...[0m
[94m2024-11-12 12:09:27 UTC - src.conf.environment -

Unnamed: 0,trait_id,author,r,resolution
0,X11,bodegom,0.204051,2
0,X14,moreno,0.170192,2
0,X14,schiller,0.334315,2
0,X50,butler,0.355791,2
0,X14,vallicrosa,0.259626,2
0,X11,butler,0.268479,2
0,X50,schiller,0.499278,2
0,X14,butler,0.26871,2
0,X50,boonman,0.378582,2
0,X14,boonman,0.134811,2


## Compare COMB maps and others


In [2]:
import numpy as np

pd.set_option("display.max_columns", None)


trait_ids = ["X3117_mean", "X14_mean", "X50_mean"]
all_results = (
    pd.read_parquet("results/all_results.parquet")[
        ["trait_id", "resolution", "trait_set", "pearsonr", "transform"]
    ]
    .query(
        "trait_set == 'splot_gbif' and trait_id in @trait_ids"
        " and resolution.str.contains('km')"
        " and transform == 'power'"
    )
    .rename({"pearsonr": "r"}, axis=1)
    .drop(columns=["trait_set", "transform"])
    .assign(author="COMB")
    # Replace "_mean" with "" in trait_id
    .assign(trait_id=lambda df: df.trait_id.str.replace("_mean", ""))
    .astype({"trait_id": str, "author": str, "resolution": str, "r": np.float64})
)
dtypes = {"trait_id": str, "author": str, "resolution": str, "r": np.float64}
other_prods = pd.read_csv("results/product_comparison.csv", dtype=dtypes).query(
    "resolution.str.contains('km')"
)

In [3]:
merged = pd.concat([all_results, other_prods], ignore_index=True).astype(
    {"resolution": str}
)
merged

Unnamed: 0,trait_id,resolution,r,author
0,X50,1km,0.626219,COMB
1,X3117,1km,0.625903,COMB
2,X14,1km,0.560474,COMB
3,X50,22km,0.675551,COMB
4,X3117,22km,0.650637,COMB
...,...,...,...,...
70,X14,22km,0.307189,wolf
71,X3117,22km,0.379869,moreno
72,X14,1km,0.264727,moreno
73,X14,1km,0.291614,vallicrosa


In [10]:
other_prods_deg = pd.read_csv("results/product_comparison.csv", dtype=dtypes).query(
    "not resolution.str.contains('km')"
)


In [16]:
merged.query("author == 'moreno'")

Unnamed: 0,trait_id,resolution,r,author
15,X14,1km,0.258842,moreno
17,X14,22km,0.30708,moreno
22,X14,55km,0.123524,moreno
38,X14,111km,0.213585,moreno
54,X14,222km,0.206925,moreno


In [4]:
merged["resolution"] = merged.resolution.map(lambda x: int(x.split("km")[0]))
merged

Unnamed: 0,trait_id,resolution,r,author
0,X50,1,0.626219,COMB
1,X3117,1,0.625903,COMB
2,X14,1,0.560474,COMB
3,X50,22,0.675551,COMB
4,X3117,22,0.650637,COMB
...,...,...,...,...
70,X14,22,0.307189,wolf
71,X3117,22,0.379869,moreno
72,X14,1,0.264727,moreno
73,X14,1,0.291614,vallicrosa


In [5]:
# Set resolution as a top-level header and trait_id as the second-level header, where the column values are the r values and the rows are the authors
from src.utils.trait_utils import get_trait_name_from_id


pivot = merged.pivot_table(
    index=["author", "resolution"], columns=["trait_id"], values="r"
)[["X3117", "X14", "X50"]]

# Rearrange the second level of columns

pivot.columns = [get_trait_name_from_id(trait_id)[0] for trait_id in pivot.columns]
# pivot.columns = pd.MultiIndex.from_tuples(
#     [(get_trait_name_from_id(trait_id)[0], res) for trait_id, res in pivot.columns]
# )

author_mapping = {
    "COMB": "COMB",
    "bodegom": "van Bodegom et al., 2014",
    "boonman": "Boonman et al., 2020",
    "butler": "Butler et al., 2017",
    "madani": "Madani et al., 2018",
    "moreno": "Moreno et al., 2018",
    "schiller": "Schiller et al., 2021",
    "vallicrosa": "Vallicrosa et al., 2022",
    "wolf": "Wolf et al., 20222",
}

resolution_mapping = {
    1: "1 km",
    22: "22 km",
    55: "55 km",
    111: "111 km",
    222: "222 km"
}

# The indices are now author, resolution. Map them using the above mappings
pivot.index = pivot.index.map(lambda x: (author_mapping[x[0]], resolution_mapping[x[1]]))

pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,SLA,Leaf N (mass),Leaf N (area)
author,resolution,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
COMB,1 km,0.625903,0.560474,0.626219
COMB,22 km,0.650637,0.615938,0.675551
COMB,55 km,0.631815,0.585532,0.654913
COMB,111 km,0.586428,0.539611,0.604782
COMB,222 km,0.519012,0.461865,0.592721
"van Bodegom et al., 2014",55 km,0.334855,,
"van Bodegom et al., 2014",111 km,0.231922,,
"van Bodegom et al., 2014",222 km,0.236022,,
"Boonman et al., 2020",55 km,0.431195,0.112172,0.437659
"Boonman et al., 2020",111 km,0.372396,0.084981,0.415514


In [6]:
# print(pivot.to_latex(float_format="%.2f", na_rep="--"))
pivot_highlighted = pivot.style.format(precision=2, na_rep="-")
print(pivot_highlighted.to_latex())

\begin{tabular}{llrrr}
 &  & SLA & Leaf N (mass) & Leaf N (area) \\
author & resolution &  &  &  \\
\multirow[c]{5}{*}{COMB} & 1 km & 0.63 & 0.56 & 0.63 \\
 & 22 km & 0.65 & 0.62 & 0.68 \\
 & 55 km & 0.63 & 0.59 & 0.65 \\
 & 111 km & 0.59 & 0.54 & 0.60 \\
 & 222 km & 0.52 & 0.46 & 0.59 \\
\multirow[c]{3}{*}{van Bodegom et al., 2014} & 55 km & 0.33 & - & - \\
 & 111 km & 0.23 & - & - \\
 & 222 km & 0.24 & - & - \\
\multirow[c]{3}{*}{Boonman et al., 2020} & 55 km & 0.43 & 0.11 & 0.44 \\
 & 111 km & 0.37 & 0.08 & 0.42 \\
 & 222 km & 0.37 & 0.12 & 0.41 \\
\multirow[c]{3}{*}{Butler et al., 2017} & 55 km & 0.29 & 0.20 & 0.39 \\
 & 111 km & 0.36 & 0.29 & 0.37 \\
 & 222 km & 0.29 & 0.25 & 0.33 \\
\multirow[c]{3}{*}{Madani et al., 2018} & 55 km & 0.10 & - & - \\
 & 111 km & 0.25 & - & - \\
 & 222 km & 0.25 & - & - \\
\multirow[c]{5}{*}{Moreno et al., 2018} & 1 km & 0.38 & 0.26 & - \\
 & 22 km & 0.38 & 0.31 & - \\
 & 55 km & 0.38 & 0.12 & - \\
 & 111 km & 0.40 & 0.21 & - \\
 & 222 km & 0.44 & 0.

## Compare CIT maps and others


In [7]:
import numpy as np

pd.set_option("display.max_columns", None)


trait_ids = ["X3117_mean", "X14_mean", "X50_mean"]
all_results = (
    pd.read_parquet("results/all_results.parquet")[
        ["trait_id", "resolution", "trait_set", "pearsonr", "transform"]
    ]
    .query(
        "trait_set == 'gbif' and trait_id in @trait_ids"
        " and resolution.str.contains('km')"
        " and transform == 'power'"
    )
    .rename({"pearsonr": "r"}, axis=1)
    .drop(columns=["trait_set", "transform"])
    .assign(author="CIT")
    # Replace "_mean" with "" in trait_id
    .assign(trait_id=lambda df: df.trait_id.str.replace("_mean", ""))
    .astype({"trait_id": str, "author": str, "resolution": str, "r": np.float64})
)
dtypes = {"trait_id": str, "author": str, "resolution": str, "r": np.float64}
other_prods = pd.read_csv("results/product_comparison.csv", dtype=dtypes).query(
    "resolution.str.contains('km')"
)

In [8]:
merged = pd.concat([all_results, other_prods], ignore_index=True).astype(
    {"resolution": str}
)
merged

Unnamed: 0,trait_id,resolution,r,author
0,X50,1km,0.547147,CIT
1,X3117,1km,0.528975,CIT
2,X14,1km,0.487974,CIT
3,X50,22km,0.528902,CIT
4,X3117,22km,0.451353,CIT
...,...,...,...,...
70,X14,22km,0.307189,wolf
71,X3117,22km,0.379869,moreno
72,X14,1km,0.264727,moreno
73,X14,1km,0.291614,vallicrosa


In [9]:
merged["resolution"] = merged.resolution.map(lambda x: int(x.split("km")[0]))
merged

Unnamed: 0,trait_id,resolution,r,author
0,X50,1,0.547147,CIT
1,X3117,1,0.528975,CIT
2,X14,1,0.487974,CIT
3,X50,22,0.528902,CIT
4,X3117,22,0.451353,CIT
...,...,...,...,...
70,X14,22,0.307189,wolf
71,X3117,22,0.379869,moreno
72,X14,1,0.264727,moreno
73,X14,1,0.291614,vallicrosa


In [11]:
# Set resolution as a top-level header and trait_id as the second-level header, where the column values are the r values and the rows are the authors
from src.utils.trait_utils import get_trait_name_from_id


pivot = merged.pivot_table(
    index=["author", "resolution"], columns=["trait_id"], values="r"
)[["X3117", "X14", "X50"]]

# Rearrange the second level of columns

pivot.columns = [get_trait_name_from_id(trait_id)[0] for trait_id in pivot.columns]
# pivot.columns = pd.MultiIndex.from_tuples(
#     [(get_trait_name_from_id(trait_id)[0], res) for trait_id, res in pivot.columns]
# )

author_mapping = {
    "CIT": "This study (CIT)",
    "bodegom": "van Bodegom et al., 2014",
    "boonman": "Boonman et al., 2020",
    "butler": "Butler et al., 2017",
    "madani": "Madani et al., 2018",
    "moreno": "Moreno et al., 2018",
    "schiller": "Schiller et al., 2021",
    "vallicrosa": "Vallicrosa et al., 2022",
    "wolf": "Wolf et al., 20222",
}

resolution_mapping = {
    1: "1 km",
    22: "22 km",
    55: "55 km",
    111: "111 km",
    222: "222 km"
}

# The indices are now author, resolution. Map them using the above mappings
pivot.index = pivot.index.map(lambda x: (author_mapping[x[0]], resolution_mapping[x[1]]))

pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,SLA,Leaf N (mass),Leaf N (area)
author,resolution,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
This study (CIT),1 km,0.528975,0.487974,0.547147
This study (CIT),22 km,0.451353,0.485632,0.528902
This study (CIT),55 km,0.43713,0.472444,0.537008
This study (CIT),111 km,0.419786,0.43911,0.522001
This study (CIT),222 km,0.410106,0.370959,0.514599
"van Bodegom et al., 2014",55 km,0.334855,,
"van Bodegom et al., 2014",111 km,0.231922,,
"van Bodegom et al., 2014",222 km,0.236022,,
"Boonman et al., 2020",55 km,0.431195,0.112172,0.437659
"Boonman et al., 2020",111 km,0.372396,0.084981,0.415514


In [12]:
# print(pivot.to_latex(float_format="%.2f", na_rep="--"))
pivot_highlighted = pivot.style.format(precision=2, na_rep="-")
print(pivot_highlighted.to_latex())

\begin{tabular}{llrrr}
 &  & SLA & Leaf N (mass) & Leaf N (area) \\
author & resolution &  &  &  \\
\multirow[c]{5}{*}{This study (CIT)} & 1 km & 0.53 & 0.49 & 0.55 \\
 & 22 km & 0.45 & 0.49 & 0.53 \\
 & 55 km & 0.44 & 0.47 & 0.54 \\
 & 111 km & 0.42 & 0.44 & 0.52 \\
 & 222 km & 0.41 & 0.37 & 0.51 \\
\multirow[c]{3}{*}{van Bodegom et al., 2014} & 55 km & 0.33 & - & - \\
 & 111 km & 0.23 & - & - \\
 & 222 km & 0.24 & - & - \\
\multirow[c]{3}{*}{Boonman et al., 2020} & 55 km & 0.43 & 0.11 & 0.44 \\
 & 111 km & 0.37 & 0.08 & 0.42 \\
 & 222 km & 0.37 & 0.12 & 0.41 \\
\multirow[c]{3}{*}{Butler et al., 2017} & 55 km & 0.29 & 0.20 & 0.39 \\
 & 111 km & 0.36 & 0.29 & 0.37 \\
 & 222 km & 0.29 & 0.25 & 0.33 \\
\multirow[c]{3}{*}{Madani et al., 2018} & 55 km & 0.10 & - & - \\
 & 111 km & 0.25 & - & - \\
 & 222 km & 0.25 & - & - \\
\multirow[c]{5}{*}{Moreno et al., 2018} & 1 km & 0.38 & 0.26 & - \\
 & 22 km & 0.38 & 0.31 & - \\
 & 55 km & 0.38 & 0.12 & - \\
 & 111 km & 0.40 & 0.21 & - \\
 & 222 km