# 2.0.3: Aggregating model statistics across resolutions

We have now trained models on sPlot, sPlot + GBIF (combined), and GBIF trait data across five resolutions: $0.01^\circ$, $0.2^\circ$, $0.5^\circ$, $1^\circ$, and $2^\circ$. To make comparison easier, it will be helpful to aggregate all of the model statistics across traits, trait sets, and resolutions.

In this project, however, each resolution and PFT combination is managed in a separate branch using Data Version Control (DVC). Therefore, the script below needs to be run once on each branch, while the final aggregated stats are checked into a file that is tracked universally with Git.

## Imports and config

In [2]:
import pandas as pd

from src.conf.conf import get_config
from src.conf.environment import log

# Load the project configuration for the currently checked-out branch; later
# cells read cfg.PFT, cfg.model_res and cfg.train.arch from it.
cfg = get_config()

## Model performance

### Create table structure

Sample results file:

In [5]:
from src.utils.dataset_utils import get_model_performance


# Fetch the model performance table for one example trait / trait set. Its
# columns are used below as the template for the aggregated results table.
sample = get_model_performance("X11_mean", "splot_gbif")
sample

Unnamed: 0,r2,pearsonr,pearsonr_wt,root_mean_squared_error,norm_root_mean_squared_error,mean_squared_error,mean_absolute_error,median_absolute_error,transform
0,0.206451,0.458351,0.466838,6.771176,0.179225,45.848823,4.983749,3.824726,none
1,0.226459,0.485136,0.489755,0.344032,0.182446,0.118358,0.257796,0.193528,log


In [7]:
# Identifier columns that describe which configuration and run produced a row,
# mapped to the dtype each column should carry.
id_dtypes = {
    "pft": str,
    "resolution": str,
    "trait_id": str,
    "trait_set": str,
    "automl": bool,
    "model_arch": str,
    "run_id": str,
}

# Start from an empty frame that holds only the identifier columns ...
all_results = pd.DataFrame({name: [] for name in id_dtypes}).astype(id_dtypes)

# ... then append one empty, correctly typed column per column of the sample
# results table.
for metric, metric_dtype in sample.dtypes.items():
    all_results[metric] = pd.Series(dtype=metric_dtype)

all_results

Unnamed: 0,pft,resolution,trait_id,trait_set,automl,model_arch,run_id,r2,pearsonr,pearsonr_wt,root_mean_squared_error,norm_root_mean_squared_error,mean_squared_error,mean_absolute_error,median_absolute_error,transform


### Append all existing results in current configuration to `all_results`

In [18]:
from src.utils.dataset_utils import get_latest_run, get_models_dir

# Materialise the glob result: a bare generator is exhausted after one pass, so
# any later reuse of `all_models` would silently iterate over nothing.
all_models = list(get_models_dir().glob("X*"))

# Settings that are identical for every model in the active configuration.
arch = cfg.train.arch
is_automl = arch == "autogluon"
model_arch = cfg[arch].included_model_types[0]

# Collect one performance table per (trait, trait set) and concatenate once at
# the end. Calling pd.concat (and drop_duplicates) inside the loop re-copies
# the growing frame on every iteration.
trait_frames = []

for model_dir in all_models:
    trait_id = model_dir.name

    for trait_set_dir in get_latest_run(model_dir / arch).iterdir():
        trait_set = trait_set_dir.name
        trait_frames.append(
            get_model_performance(trait_id, trait_set).assign(
                pft=cfg.PFT,
                resolution=cfg.model_res,
                trait_id=trait_id,
                automl=is_automl,
                model_arch=model_arch,
                # The parent of the trait-set directory is the run directory.
                run_id=trait_set_dir.parent.name,
                trait_set=trait_set,
            )[all_results.columns]
        )

# Keeping the existing `all_results` in the concat makes the cell safe to
# re-run: rows that were already aggregated are removed as duplicates.
# ignore_index=True on drop_duplicates leaves a contiguous 0..n-1 index.
all_results = pd.concat(
    [all_results, *trait_frames], ignore_index=True
).drop_duplicates(ignore_index=True)

all_results

Unnamed: 0,pft,resolution,trait_id,trait_set,automl,model_arch,run_id,r2,pearsonr,pearsonr_wt,root_mean_squared_error,norm_root_mean_squared_error,mean_squared_error,mean_absolute_error,median_absolute_error,transform
0,Shrub_Tree_Grass,001,X163_mean,splot,True,GBM,20240827_032644,0.168091,0.414057,0.440022,0.367905,0.211601,0.135354,0.204053,0.141102,none
1,Shrub_Tree_Grass,001,X163_mean,splot,True,GBM,20240827_032644,0.233690,0.503460,0.525381,0.188455,0.193087,0.035515,0.134164,0.103798,log
2,Shrub_Tree_Grass,001,X163_mean,gbif,True,GBM,20240827_032644,-0.082184,0.288847,0.308069,0.419613,0.241340,0.176075,0.290659,0.239805,none
3,Shrub_Tree_Grass,001,X163_mean,gbif,True,GBM,20240827_032644,-0.216775,0.360967,0.375796,0.242961,0.243514,0.059030,0.196692,0.175843,log
4,Shrub_Tree_Grass,001,X163_mean,splot_gbif,True,GBM,20240827_032644,0.135023,0.393680,0.418676,0.375146,0.215765,0.140734,0.224991,0.168629,none
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,Shrub_Tree_Grass,001,X144_mean,splot,True,GBM,20240819_103918,0.278095,0.557968,0.550733,0.440050,0.179920,0.193644,0.322025,0.237315,log
194,Shrub_Tree_Grass,001,X144_mean,gbif,True,GBM,20240819_103918,0.049655,0.225602,0.227758,57.810262,0.199103,3342.026338,38.769266,29.106018,none
195,Shrub_Tree_Grass,001,X144_mean,gbif,True,GBM,20240819_103918,-0.016695,0.285496,0.287025,0.724787,0.206590,0.525317,0.544385,0.409335,log
196,Shrub_Tree_Grass,001,X144_mean,splot_gbif,True,GBM,20240819_103918,0.049794,0.229760,0.231499,56.850330,0.198381,3231.959985,37.949664,28.156664,none


## Feature importance

### Create table structure

Load sample table:

In [20]:
from src.utils.dataset_utils import get_feature_importance


# Fetch the feature importance table for the same example trait / trait set.
# NOTE(review): this rebinds `sample`, which previously held the model
# performance table.
sample = get_feature_importance("X11_mean", "splot_gbif")
sample

Unnamed: 0_level_0,importance,importance,stddev,stddev,p_value,p_value,n,n,p99_high,p99_high,p99_low,p99_low
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
ETH_GlobalCanopyHeightSD_2020_v1,0.002810,0.009601,0.003374,0.000782,2.900449e-01,4.217149e-01,10.0,0.0,0.006277,0.009421,-0.000657,0.009843
ETH_GlobalCanopyHeight_2020_v1,0.016084,0.004352,0.005136,0.001464,3.163021e-06,2.343513e-06,10.0,0.0,0.021362,0.005680,0.010806,0.003184
bdod_0-5cm_mean,0.033002,0.040830,0.007024,0.004979,1.113295e-01,2.489308e-01,10.0,0.0,0.040221,0.045927,0.025783,0.035739
bdod_100-200cm_mean,0.007192,0.005874,0.003563,0.001358,2.374703e-02,5.257190e-02,10.0,0.0,0.010854,0.006873,0.003530,0.005067
bdod_15-30cm_mean,0.017631,0.008337,0.004394,0.001110,9.335536e-05,2.054296e-04,10.0,0.0,0.022147,0.008726,0.013115,0.008091
...,...,...,...,...,...,...,...,...,...,...,...,...
wc2.1_30s_bio_12,0.088691,0.071282,0.009117,0.004976,4.211640e-07,9.383140e-07,10.0,0.0,0.098061,0.075759,0.079322,0.066898
wc2.1_30s_bio_13-14,0.023974,0.016801,0.006007,0.001610,2.102005e-03,4.698506e-03,10.0,0.0,0.030147,0.018153,0.017800,0.015508
wc2.1_30s_bio_15,0.014512,0.016222,0.006704,0.002839,2.148576e-01,4.107055e-01,10.0,0.0,0.021401,0.014258,0.007623,0.018441
wc2.1_30s_bio_4,0.073713,0.092049,0.014308,0.002409,4.358978e-04,5.935139e-04,10.0,0.0,0.088418,0.093590,0.059009,0.090549


This will require a bit more finagling due to the use of multi-headers.

Create a new dataframe with our trait and configuration columns.

In [22]:
# Empty frame that keeps only the first seven columns of `all_results`
# (pft, resolution, trait_id, trait_set, automl, model_arch, run_id) together
# with their dtypes.
all_importances = all_results.iloc[:0, :7].copy()

Unnamed: 0,pft,resolution,trait_id,trait_set,automl,model_arch,run_id


Melt the sample feature importance dataframe to flatten the multi-headers. Also reset the index as we don't really need the features to serve as indices.

In [32]:
# Move the feature names out of the index into a regular column. The two-level
# column header is left as-is here.
# NOTE(review): flattening the header via .stack(future_stack=True) was tried
# and is currently disabled.
sample.reset_index()

Unnamed: 0_level_0,index,importance,importance,stddev,stddev,p_value,p_value,n,n,p99_high,p99_high,p99_low,p99_low
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
0,ETH_GlobalCanopyHeightSD_2020_v1,0.002810,0.009601,0.003374,0.000782,2.900449e-01,4.217149e-01,10.0,0.0,0.006277,0.009421,-0.000657,0.009843
1,ETH_GlobalCanopyHeight_2020_v1,0.016084,0.004352,0.005136,0.001464,3.163021e-06,2.343513e-06,10.0,0.0,0.021362,0.005680,0.010806,0.003184
2,bdod_0-5cm_mean,0.033002,0.040830,0.007024,0.004979,1.113295e-01,2.489308e-01,10.0,0.0,0.040221,0.045927,0.025783,0.035739
3,bdod_100-200cm_mean,0.007192,0.005874,0.003563,0.001358,2.374703e-02,5.257190e-02,10.0,0.0,0.010854,0.006873,0.003530,0.005067
4,bdod_15-30cm_mean,0.017631,0.008337,0.004394,0.001110,9.335536e-05,2.054296e-04,10.0,0.0,0.022147,0.008726,0.013115,0.008091
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,wc2.1_30s_bio_12,0.088691,0.071282,0.009117,0.004976,4.211640e-07,9.383140e-07,10.0,0.0,0.098061,0.075759,0.079322,0.066898
146,wc2.1_30s_bio_13-14,0.023974,0.016801,0.006007,0.001610,2.102005e-03,4.698506e-03,10.0,0.0,0.030147,0.018153,0.017800,0.015508
147,wc2.1_30s_bio_15,0.014512,0.016222,0.006704,0.002839,2.148576e-01,4.107055e-01,10.0,0.0,0.021401,0.014258,0.007623,0.018441
148,wc2.1_30s_bio_4,0.073713,0.092049,0.014308,0.002409,4.358978e-04,5.935139e-04,10.0,0.0,0.088418,0.093590,0.059009,0.090549
