# 2.0.3: Generating stats for public sharing

When sharing any scientific product externally, it is vital to include performance metrics, both to build trust and to help users understand the product's limitations. Here we aggregate some key statistics for each trait — the Pearson correlation coefficient $r$ with sPlot traits, and the model $R^2$ and normalized RMSE — so that users of our trait maps can determine how much confidence to place in them when integrating them into their own work.

## Imports and config

In [1]:
from pathlib import Path

import pandas as pd

from src.conf.conf import get_config
from src.conf.environment import log
from src.utils.dataset_utils import get_splot_corr_fn

# Project configuration; passed to the helpers below to resolve the sPlot
# correlation file and the models directory, and read directly for the name of
# the per-model evaluation-results file (cfg.train.eval_results).
cfg = get_config()

## sPlot correlation

In [9]:
# Resolve the location of the sPlot correlation table from the config, then
# load it as a DataFrame.
splot_corr_fn = get_splot_corr_fn(cfg)
splot_corr = pd.read_csv(splot_corr_fn)

Sort the traits by their numeric ID, then print the correlations in markdown format

In [7]:
def _trait_number(names):
    """Return the integer trait number embedded in each trait name.

    Trait names are in the format "X<number>_mean", e.g. "X154_mean". The piece
    following the first "X" is taken, everything from the next "_" onwards is
    dropped, and the remainder is cast to int so that the sort is numeric
    rather than lexicographic.
    """
    after_prefix = names.str.split("X").str[1]
    return after_prefix.str.split("_").str[0].astype(int)


# Order the rows by trait number and display the sorted table.
splot_corr = splot_corr.sort_values(by="trait", key=_trait_number)
splot_corr

Unnamed: 0,trait,r
27,X4_mean,0.571463
30,X6_mean,0.56649
1,X11_mean,0.601323
3,X13_mean,0.464824
7,X14_mean,0.555827
8,X15_mean,0.557473
11,X18_mean,0.469383
12,X21_mean,0.390792
16,X26_mean,0.229498
17,X27_mean,0.412792


In [6]:
# Print only the correlation column as a markdown table (two decimals, no index
# column). Rows appear in the current order of splot_corr, i.e. sorted by the
# number in the trait name ("X<number>_mean", e.g. "X154_mean") by the cell above.
r_markdown = splot_corr["r"].to_markdown(index=False, floatfmt=".2f")
print(r_markdown)

|    r |
|-----:|
| 0.57 |
| 0.57 |
| 0.60 |
| 0.46 |
| 0.56 |
| 0.56 |
| 0.47 |
| 0.39 |
| 0.23 |
| 0.41 |
| 0.44 |
| 0.45 |
| 0.63 |
| 0.46 |
| 0.62 |
| 0.42 |
| 0.40 |
| 0.51 |
| 0.57 |
| 0.58 |
| 0.41 |
| 0.61 |
| 0.33 |
| 0.43 |
| 0.39 |
| 0.44 |
| 0.43 |
| 0.32 |
| 0.56 |
| 0.51 |
| 0.46 |
| 0.49 |
| 0.43 |


## $R^2$ and normalized RMSE

In [27]:
from src.utils.autogluon_utils import get_best_model_ag
from src.utils.dataset_utils import get_models_dir


def _trait_number(names):
    """Return the integer trait number embedded in each trait name.

    Trait names are in the format "X<number>_mean", e.g. "X154_mean". The piece
    following the first "X" is taken, everything from the next "_" onwards is
    dropped, and the remainder is cast to int so that the sort is numeric
    rather than lexicographic.
    """
    after_prefix = names.str.split("X").str[1]
    return after_prefix.str.split("_").str[0].astype(int)


# Models directory; each sub-directory is treated as one trait, and its name
# becomes the value of the "trait" column.
traits = get_models_dir(cfg)

rows = []

# The position of an entry is never used, so iterate the directory directly
# instead of through enumerate().
for trait in traits.iterdir():
    if not trait.is_dir():
        # Only directories are treated as trait models; skip stray files.
        continue
    best_model = get_best_model_ag(trait)
    eval_results = pd.read_csv(best_model / cfg.train.eval_results)
    # Only the first row of the evaluation file is used. abs() presumably
    # undoes a sign flip applied to error metrics when they were written.
    # NOTE(review): abs() on r2 would also hide a genuinely negative R^2 (a
    # model worse than predicting the mean) -- confirm this is intended.
    rows.append(
        [
            trait.stem,
            abs(eval_results.r2.iloc[0]),
            abs(eval_results.norm_root_mean_squared_error.iloc[0]),
        ]
    )

# One row per trait, ordered by trait number, then displayed.
results = pd.DataFrame(rows, columns=["trait", "r2", "norm_rmse"])
results = results.sort_values(by="trait", key=_trait_number)
results

Unnamed: 0,trait,r2,norm_rmse
6,X4_mean,0.21031,0.135603
18,X6_mean,0.245935,0.066348
26,X11_mean,0.246041,0.111703
22,X13_mean,0.116414,0.114993
27,X14_mean,0.148676,0.135951
20,X15_mean,0.201462,0.116903
2,X18_mean,0.172379,0.118602
10,X21_mean,0.113808,0.071028
8,X26_mean,0.150879,0.0528
1,X27_mean,0.259003,0.089846


In [28]:
print(results[["r2", "norm_rmse"]].to_markdown(index=False, floatfmt=".2f"))

|   r2 |   norm_rmse |
|-----:|------------:|
| 0.21 |        0.14 |
| 0.25 |        0.07 |
| 0.25 |        0.11 |
| 0.12 |        0.11 |
| 0.15 |        0.14 |
| 0.20 |        0.12 |
| 0.17 |        0.12 |
| 0.11 |        0.07 |
| 0.15 |        0.05 |
| 0.26 |        0.09 |
| 0.21 |        0.11 |
| 0.19 |        0.16 |
| 0.24 |        0.11 |
| 0.15 |        0.05 |
| 0.20 |        0.12 |
| 0.04 |        0.12 |
| 0.02 |        0.04 |
| 0.07 |        0.10 |
| 0.15 |        0.09 |
| 0.10 |        0.06 |
| 0.11 |        0.06 |
| 0.19 |        0.09 |
| 0.06 |        0.10 |
| 0.16 |        0.13 |
| 0.23 |        0.10 |
| 0.06 |        0.10 |
| 0.06 |        0.09 |
| 0.06 |        0.12 |
| 0.12 |        0.11 |
| 0.14 |        0.06 |
| 0.10 |        0.08 |
| 0.12 |        0.07 |
| 0.15 |        0.12 |
