# 2.0.0: Trait map correlation with sPlot sparse grids

On a global scale, sPlot is one of the best benchmarks we have when evaluating the accuracy of trait extrapolations. A simple way of evaluating the quality of our models is to calculate the correlation between the extrapolated trait values and the corresponding gridded sPlot trait values using Pearson's correlation coefficient. However, since our values are stored in geographic coordinates, we should make sure to weight each value according to its grid cell's actual area on Earth.

## Imports and config

In [1]:
from pathlib import Path

import pandas as pd
import rioxarray as riox
import statsmodels.api as sm
import xarray as xr
from pyproj import Proj
from shapely.geometry import shape

from src.conf.conf import get_config
from src.conf.environment import log

# Load the project configuration; later cells read directory names, the PFT,
# the model resolution and the target grid resolution from it.
cfg = get_config()

## Define latitude weights

In [7]:
def lat_weights(lat_unique: pd.Series, deg: int | float) -> dict:
    """Calculate weights for each latitude band based on area of grid cells."""

    weights = {}

    for j in lat_unique:
        p1 = (0, j + (deg / 2))
        p2 = (deg, j + (deg / 2))
        p3 = (deg, j - (deg / 2))
        p4 = (0, j - (deg / 2))
        co = {"type": "Polygon", "coordinates": [[p1, p2, p3, p4]]}

        lat_1 = p1[1]
        lat_2 = p3[1] if abs(p1[1] + p3[1]) >= abs(p1[1]) + abs(p3[1]) else 0
        lat_0 = (lat_1 + lat_2) / 2
        lon_0 = deg / 2

        print(lat_1, lat_2, lat_0, lon_0)

        projection_string = (
            f"+proj=aea +lat_1={lat_1} +lat_2={lat_2} +lat_0={lat_0} +lon_0={lon_0}"
        )
        pa = Proj(projection_string)
        lon, lat = zip(*co["coordinates"][0])
        x, y = pa(lon, lat)
        cop = {"type": "Polygon", "coordinates": [list(zip(x, y))]}

        area = shape(cop).area / 1000000  # Convert to square kilometers
        weights[j] = area

    max_area = max(weights.values())
    weights = {k: v / max_area for k, v in weights.items()}

    return weights

In [11]:
def lat_weights(lat_unique: pd.Series, deg: int | float) -> dict:
    """Calculate weights for each latitude band based on area of grid cells."""
    weights = {}

    for j in lat_unique:
        # Define the corners of the grid cell
        p1 = (0, j + (deg / 2))
        p2 = (deg, j + (deg / 2))
        p3 = (deg, j - (deg / 2))
        p4 = (0, j - (deg / 2))
        co = {"type": "Polygon", "coordinates": [[p1, p2, p3, p4]]}

        # Use the average latitude for the projection's central latitude
        lat_0 = j
        lon_0 = deg / 2

        # Define the projection string using the average latitude
        projection_string = (
            f"+proj=aea +lat_1={p1[1]} +lat_2={p3[1]} +lat_0={lat_0} +lon_0={lon_0}"
        )
        pa = Proj(projection_string)
        lon, lat = zip(*co["coordinates"][0])
        x, y = pa(lon, lat)
        cop = {"type": "Polygon", "coordinates": [list(zip(x, y))]}

        # Calculate the area of the grid cell in square kilometers
        area = shape(cop).area / 1000000
        weights[j] = area

    # Normalize the weights by the maximum area
    max_area = max(weights.values())
    weights = {k: v / max_area for k, v in weights.items()}

    return weights

## Calculate weighted $r$

In [3]:
def weighted_pearson_r(df: pd.DataFrame, weights: dict) -> float:
    """Calculate the weighted Pearson correlation between the first two columns of ``df``.

    Each row is weighted by looking up its "y" index-level value in ``weights``.

    Args:
        df: DataFrame whose index has a level named "y"; its first two columns
            are the variables to correlate. It is not modified.
        weights: Mapping from "y" index values to row weights.

    Returns:
        The off-diagonal entry of the weighted correlation matrix of the two
        columns.
    """
    # Look the weights up per row without adding a column to the caller's frame.
    row_weights = df.index.get_level_values("y").map(weights)

    model = sm.stats.DescrStatsW(df.iloc[:, :2], row_weights)
    return model.corrcoef[0, 1]

## Load the data

In [4]:
# Gridded sPlot trait rasters, ordered by the integer that follows the last
# "X" in each file stem.
splot_fns = sorted(
    Path(
        cfg.interim_dir,
        cfg.splot.interim.dir,
        cfg.splot.interim.traits,
        cfg.PFT,
        cfg.model_res,
    ).glob("*.tif"),
    key=lambda fn: int(fn.stem.rsplit("X", 1)[-1]),
)
# Extrapolated trait rasters, ordered by the integer that follows the last "X"
# in the part of each file stem before the first underscore.
extrap_fns = sorted(
    Path(
        cfg.processed.dir,
        cfg.PFT,
        cfg.model_res,
        cfg.datasets.Y.use,
        cfg.processed.predict_dir,
    ).glob("*.tif"),
    key=lambda fn: int(fn.stem.partition("_")[0].rsplit("X", 1)[-1]),
)

In [12]:
# nchunks = 6
# x_chunks = (360 / cfg.target_resolution) // nchunks
# y_chunks = (180 / cfg.target_resolution) // nchunks
# chunks = {"x": x_chunks, "y": y_chunks}

def _raster_to_frame(fn: Path, prefix: str) -> pd.DataFrame:
    """Read band 1 of the raster at ``fn`` into a single-value-column DataFrame.

    The value column is named ``f"{prefix}_{fn.stem}"``. The "band" and
    "spatial_ref" columns are dropped and rows containing NaN are removed.
    """
    return (
        riox.open_rasterio(fn)
        .sel(band=1)
        .to_dataframe(name=f"{prefix}_{fn.stem}")
        .drop(columns=["band", "spatial_ref"])
        .dropna()
    )


# Only the first sPlot/extrapolation file pair is processed here.
for splot_fn, extrap_fn in zip(splot_fns[:1], extrap_fns[:1]):
    log.info("Loading and filtering data...")
    splot = _raster_to_frame(splot_fn, "splot")
    extrap = _raster_to_frame(extrap_fn, "extrap")

    log.info("Joining dataframes...")
    # Inner join on the shared index keeps only cells present in both rasters.
    df = splot.join(extrap, how="inner")

    # `Index.values` is a property, not a method; calling it raises TypeError.
    lat_unique = df.index.get_level_values("y").unique().to_numpy()

    log.info("Calculating weights...")
    weights = lat_weights(lat_unique, cfg.target_resolution)

    log.info("Calculating weighted Pearson correlation coefficient...")
    r = weighted_pearson_r(df, weights)
    log.info(f"Weighted Pearson correlation coefficient: {r}")

2024-06-17 14:43:21 UTC - src.conf.environment - INFO - Loading and filtering data...


2024-06-17 14:43:48 UTC - src.conf.environment - INFO - Joining dataframes...
2024-06-17 14:44:20 UTC - src.conf.environment - INFO - Calculating weights...
2024-06-17 14:44:23 UTC - src.conf.environment - INFO - Calculating weighted Pearson correlation coefficient...
2024-06-17 14:44:23 UTC - src.conf.environment - INFO - Weighted Pearson correlation coefficient: 0.5984358563948847
