# 1.1.0: Train trait models with Autogluon

## Imports and config

In [1]:
import logging
from pathlib import Path
import pickle

from autogluon.tabular import TabularDataset, TabularPredictor
import dask.dataframe as dd
from dask.distributed import Client
import numpy as np
import pandas as pd


from src.conf.conf import get_config
from src.conf.environment import log
from src.utils.log_utils import get_loggers_starting_with

pd.set_option("display.max_columns", None)

cfg = get_config()

train_dir = Path(cfg.train.dir) / cfg.PFT / cfg.model_res / cfg.datasets.Y.use

## Load the data

In [2]:
with Client(dashboard_address=cfg.dask_dashboard, n_workers=80):
    dask_loggers = get_loggers_starting_with("distributed")
    for logger_name in dask_loggers:
        logging.getLogger(logger_name).setLevel("WARNING")

    feats = (
        dd.read_parquet(train_dir / cfg.train.features)
        .drop(columns=["x", "y"])
    )
    Y_cols = feats.columns[feats.columns.str.startswith("X")].to_list()
    X_cols = feats.columns[~feats.columns.str.startswith("X")].to_list()

    # Select all X_cols and first entry of Y_cols from feats
    Xy = feats[X_cols + Y_cols[:1]].compute().reset_index(drop=True)

## Add CV fold IDs to data

In [3]:
# Load the CV splits
with open(train_dir / cfg.train.cv_splits.dir / f"{Y_cols[0]}.pkl", "rb") as f:
    cv_splits = pickle.load(f)

# Each split is a tuple of (train_idx, valid_idx). Assign the split number to each set
# of valid_idx in Xy
for i, (_, valid_idx) in enumerate(cv_splits):
    Xy.loc[valid_idx, "split"] = i

## Train with Autogluon

In [4]:
# Randomly sample 90% of the data for training
train_idx = Xy.sample(frac=0.9, random_state=42).index
test_idx = Xy.index.difference(train_idx)

In [13]:
train_data = TabularDataset(Xy.loc[train_idx]).sample(
    frac=0.01, random_state=cfg.random_seed
)
test_data = TabularDataset(Xy.loc[test_idx].drop(columns=["split"])).sample(
    frac=0.01, random_state=cfg.random_seed
)

In [4]:
hyperparameters = {
    "NN_TORCH": {},
    "GBM": [
        {"extra_trees": True, "ag_args": {"name_suffix": "XT"}},
        {},
        "GBMLarge",
    ],
    "CAT": {},
    "XGB": {},
    "FASTAI": {},
    "RF": [
        {
            "criterion": "gini",
            "ag_args": {
                "name_suffix": "Gini",
                "problem_types": ["binary", "multiclass"],
            },
        },
        {
            "criterion": "entropy",
            "ag_args": {
                "name_suffix": "Entr",
                "problem_types": ["binary", "multiclass"],
            },
        },
        {
            "criterion": "squared_error",
            "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]},
        },
    ],
    "XT": [
        {
            "criterion": "gini",
            "ag_args": {
                "name_suffix": "Gini",
                "problem_types": ["binary", "multiclass"],
            },
        },
        {
            "criterion": "entropy",
            "ag_args": {
                "name_suffix": "Entr",
                "problem_types": ["binary", "multiclass"],
            },
        },
        {
            "criterion": "squared_error",
            "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]},
        },
    ],
    "KNN": [
        {"weights": "uniform", "ag_args": {"name_suffix": "Unif"}, "n_jobs": 16},
        {"weights": "distance", "ag_args": {"name_suffix": "Dist"}, "n_jobs": 16},
    ],
}

In [7]:
predictor = TabularPredictor(label=Y_cols[0], groups="split").fit(
    train_data, num_bag_folds=10, excluded_model_types=["KNN"], num_gpus=4
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240521_141509"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Values in column 'split' used as split folds instead of being automatically set. Bagged models will have 10 splits.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240521_141509"
AutoGluon Version:  1.1.0
Python Version:     3.11.9
Operating System:   Linux
Platform Machine:   x86_64
Platform V

	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) :   6 | ['wc2.1_30s_bio_1', 'wc2.1_30s_bio_12', 'wc2.1_30s_bio_13-14', 'wc2.1_30s_bio_15', 'wc2.1_30s_bio_4', ...]
		('int', [])   : 144 | ['ETH_GlobalCanopyHeightSD_2020_v1', 'ETH_GlobalCanopyHeight_2020_v1', 'sur_refl_b01_2001-2024_m10_mean', 'sur_refl_b01_2001-2024_m11_mean', 'sur_refl_b01_2001-2024_m12_mean', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', []) :   6 | ['wc2.1_30s_bio_1', 'wc2.1_30s_bio_12', 'wc2.1_30s_bio_13-14', 'wc2.1_30s_bio_15', 'wc2.1_30s_bio_4', ...]
		('int', [])   : 144 | ['ETH_GlobalCanopyHeightSD_2020_v1', 'ETH_GlobalCanopyHeight_2020_v1', 'sur_ref

In [10]:
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.1079,root_mean_squared_error,18.821038,373.204322,0.000403,0.025472,2,True,10
1,NeuralNetFastAI_BAG_L1,-0.108403,root_mean_squared_error,0.393623,55.885975,0.393623,55.885975,1,True,6
2,LightGBMXT_BAG_L1,-0.108462,root_mean_squared_error,0.044333,3.201824,0.044333,3.201824,1,True,1
3,CatBoost_BAG_L1,-0.108561,root_mean_squared_error,0.07498,11.999588,0.07498,11.999588,1,True,4
4,LightGBM_BAG_L1,-0.108596,root_mean_squared_error,0.041386,3.025867,0.041386,3.025867,1,True,2
5,NeuralNetTorch_BAG_L1,-0.108671,root_mean_squared_error,1.016577,83.932865,1.016577,83.932865,1,True,8
6,LightGBMLarge_BAG_L1,-0.108709,root_mean_squared_error,0.057727,5.448931,0.057727,5.448931,1,True,9
7,XGBoost_BAG_L1,-0.108857,root_mean_squared_error,0.251911,3.766204,0.251911,3.766204,1,True,7
8,ExtraTreesMSE_BAG_L1,-0.108963,root_mean_squared_error,1.472878,37.564896,1.472878,37.564896,1,True,5
9,RandomForestMSE_BAG_L1,-0.109122,root_mean_squared_error,15.763465,175.320727,15.763465,175.320727,1,True,3


In [14]:
y_true = test_data[Y_cols[0]]
y_pred = predictor.predict(test_data.drop(columns=[Y_cols[0]]))

In [16]:
performance = predictor.evaluate_predictions(y_true, y_pred)
performance

{'root_mean_squared_error': -0.10883326581256995,
 'mean_squared_error': -0.01184467974742951,
 'mean_absolute_error': -0.08415142601027989,
 'r2': 0.23889501824464554,
 'pearsonr': 0.49034440832120874,
 'median_absolute_error': -0.06702506434064356}