# 1.1.0: Train trait models with AutoGluon

## Imports and config

In [57]:
import logging
from pathlib import Path
import pickle

import dask.dataframe as dd
import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from dask.distributed import Client

from src.conf.conf import get_config
from src.conf.environment import log
from src.models.autogluon import evaluate_model
from src.utils.dataset_utils import get_train_dir, get_cv_splits
from src.utils.log_utils import get_loggers_starting_with

# Never truncate columns when a DataFrame is displayed in this notebook.
pd.options.display.max_columns = None

# Project configuration object; every `cfg.*` lookup below reads from it.
cfg = get_config()

# Directory holding the training data, resolved from the configuration.
train_dir = get_train_dir(cfg)

## Load the data

In [58]:
# Everything dask-related runs while a distributed Client is alive; the client
# is shut down when the `with` block exits.
with Client(dashboard_address=cfg.dask_dashboard):
    # Raise every "distributed*" logger to WARNING so dask does not flood the
    # cell output.
    for name in get_loggers_starting_with("distributed"):
        logging.getLogger(name).setLevel("WARNING")

    # Lazily open the feature table and drop the "x" / "y" columns.
    feats = dd.read_parquet(train_dir / cfg.train.features).drop(columns=["x", "y"])

    # Columns whose name starts with "X" are the targets (Y); all remaining
    # columns are the predictors (X).
    is_target = feats.columns.str.startswith("X")
    Y_cols = feats.columns[is_target].to_list()
    X_cols = feats.columns[~is_target].to_list()

    # Materialise all predictors plus only the first target column into an
    # in-memory pandas frame with a fresh 0..n-1 index.
    Xy = feats[X_cols + Y_cols[:1]].compute().reset_index(drop=True)

## Add CV fold IDs to data

In [59]:
# Load the CV splits for the first target column. Each split is a tuple of
# (train_idx, valid_idx).
cv_splits = get_cv_splits(cfg, Y_cols[0])

# Build the fold ids in a standalone integer Series instead of writing into Xy
# row-block by row-block: assigning into a not-yet-existing column creates a
# float column and silently leaves NaN for every row that no fold covers.
# -1 is the "not assigned yet" sentinel.
split_ids = pd.Series(-1, index=Xy.index, dtype="int64")
for i, (_, valid_idx) in enumerate(cv_splits):
    # Every row in this split's validation set belongs to fold `i`.
    split_ids.loc[valid_idx] = i

# Fail fast if any row was left without a fold: such rows would otherwise pass
# a `split != <held-out id>` filter and end up in the training set unnoticed.
n_unassigned = int((split_ids < 0).sum())
if n_unassigned:
    raise ValueError(
        f"{n_unassigned} rows of Xy are not part of any CV validation fold"
    )

Xy["split"] = split_ids

## Train with AutoGluon

In [60]:
# Hold out a random 10% of rows: draw 90% of the row labels for training and
# treat whatever is left as the test labels.
# NOTE(review): no later visible cell reads `train_idx` / `test_idx` - confirm
# whether this random split is still needed alongside the fold-based split.
TRAIN_FRACTION = 0.9
SAMPLE_SEED = 42

train_idx = Xy.sample(frac=TRAIN_FRACTION, random_state=SAMPLE_SEED).index
test_idx = Xy.index.difference(train_idx)

In [65]:
# Seed NumPy's global RNG from the config so the same fold is picked on every
# run, then choose one of the fold ids present in Xy as the hold-out fold.
np.random.seed(cfg.random_seed)
candidate_split_ids = Xy["split"].unique()
random_split_id = np.random.choice(candidate_split_ids)

# Rows of the chosen fold become the validation set; all other rows are used
# for training.
is_holdout = Xy["split"] == random_split_id
train = TabularDataset(Xy[~is_holdout])
val = TabularDataset(Xy[is_holdout])

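These are the available models and their default hyperparameters. Note that AutoGluon does not perform hyperparameter tuning during training by default (though this can be controlled using either presets or by providing other kwargs such as `hyperparameter_tune_kwargs`). The dictionary below is listed for reference; the `fit` call in this notebook does not pass it.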

In [4]:
def _forest_variants():
    """Build the per-criterion configs used by both the "RF" and "XT" entries.

    A fresh list (with fresh nested dicts and lists) is returned on every call
    so the two entries never share mutable state.
    """
    criteria = (
        ("gini", "Gini", ("binary", "multiclass")),
        ("entropy", "Entr", ("binary", "multiclass")),
        ("squared_error", "MSE", ("regression",)),
    )
    return [
        {
            "criterion": criterion,
            "ag_args": {
                "name_suffix": suffix,
                "problem_types": list(problem_types),
            },
        }
        for criterion, suffix, problem_types in criteria
    ]


def _knn_variant(weights, suffix):
    """Build one KNN config for the given weighting scheme and name suffix."""
    return {"weights": weights, "ag_args": {"name_suffix": suffix}, "n_jobs": 16}


# Model-type key -> config (or list of configs) for that model family.
hyperparameters = dict(
    NN_TORCH={},
    GBM=[
        {"extra_trees": True, "ag_args": {"name_suffix": "XT"}},
        {},
        "GBMLarge",
    ],
    CAT={},
    XGB={},
    FASTAI={},
    RF=_forest_variants(),
    XT=_forest_variants(),
    KNN=[
        _knn_variant("uniform", "Unif"),
        _knn_variant("distance", "Dist"),
    ],
)

In [68]:
# Keyword arguments for `fit`, collected up front so the training call itself
# stays short.
fit_kwargs = dict(
    # Model families to skip, as listed in the project config.
    excluded_model_types=cfg.autogluon.exclude_models,
    presets="medium",
    num_gpus=cfg.autogluon.num_gpus,
    # Per the AutoGluon docs, passing a dict here (even an empty one) turns on
    # feature pruning with its default settings.
    feature_prune_kwargs={},
)
# Deliberately not set: `num_bag_folds` (the "split" groups column supplies the
# folds) and `num_cpus` (left at AutoGluon's default).

# Predict the first target column; `groups="split"` makes AutoGluon use the
# values of the "split" column as the bagging folds.
predictor = TabularPredictor(label=Y_cols[0], groups="split").fit(
    train, **fit_kwargs
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240625_115926"
Preset alias specified: 'medium' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.11.9
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #117-Ubuntu SMP Fri Apr 26 12:26:49 UTC 2024
CPU Count:          128
Memory Avail:       724.39 GB / 755.20 GB (95.9%)
Disk Space Avail:   12076.37 GB / 13100.23 GB (92.2%)
Presets specified: ['medium']
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Values in column 'split' used as split folds instead of being automatically set. Bagged models will have 9 splits.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240625_115926"
Train Data Rows:    4976511
Train Data Columns: 151
Label Column:       X4_mea

KeyboardInterrupt: 

In [35]:
# Leaderboard of every trained model, ranked by its internal validation score.
val_leaderboard = predictor.leaderboard()
val_leaderboard

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.107163,root_mean_squared_error,2.714521,220.201701,0.000433,0.025853,2,True,9
1,LightGBMXT_BAG_L1,-0.107625,root_mean_squared_error,0.058827,3.316332,0.058827,3.316332,1,True,1
2,CatBoost_BAG_L1,-0.10766,root_mean_squared_error,0.071197,9.443282,0.071197,9.443282,1,True,3
3,LightGBM_BAG_L1,-0.107767,root_mean_squared_error,0.04506,3.2577,0.04506,3.2577,1,True,2
4,NeuralNetFastAI_BAG_L1,-0.107785,root_mean_squared_error,0.483887,60.706058,0.483887,60.706058,1,True,5
5,LightGBMLarge_BAG_L1_Prune,-0.107801,root_mean_squared_error,0.066785,5.625227,0.066785,5.625227,1,True,8
6,XGBoost_BAG_L1,-0.107895,root_mean_squared_error,0.337813,4.818901,0.337813,4.818901,1,True,6
7,NeuralNetTorch_BAG_L1,-0.107905,root_mean_squared_error,0.983095,92.108356,0.983095,92.108356,1,True,7
8,ExtraTreesMSE_BAG_L1,-0.108023,root_mean_squared_error,0.712485,44.157691,0.712485,44.157691,1,True,4


In [36]:
# Score every model on the held-out fold. The original cell passed `test_data`,
# a name no cell in this notebook defines (NameError on Restart & Run All);
# `val` is the held-out fold built from the "split" column above.
# NOTE(review): assumes `test_data` was an earlier name for `val` - confirm.
predictor.leaderboard(val)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesMSE_BAG_L1,-0.108098,-0.108023,root_mean_squared_error,3.179904,0.712485,44.157691,3.179904,0.712485,44.157691,1,True,4
1,WeightedEnsemble_L2,-0.10819,-0.107163,root_mean_squared_error,4.893372,2.714521,220.201701,0.001884,0.000433,0.025853,2,True,9
2,LightGBMLarge_BAG_L1_Prune,-0.108258,-0.107801,root_mean_squared_error,0.061005,0.066785,5.625227,0.061005,0.066785,5.625227,1,True,8
3,LightGBMXT_BAG_L1,-0.108488,-0.107625,root_mean_squared_error,0.048053,0.058827,3.316332,0.048053,0.058827,3.316332,1,True,1
4,NeuralNetFastAI_BAG_L1,-0.108526,-0.107785,root_mean_squared_error,0.483938,0.483887,60.706058,0.483938,0.483887,60.706058,1,True,5
5,XGBoost_BAG_L1,-0.108608,-0.107895,root_mean_squared_error,0.197354,0.337813,4.818901,0.197354,0.337813,4.818901,1,True,6
6,LightGBM_BAG_L1,-0.108621,-0.107767,root_mean_squared_error,0.040969,0.04506,3.2577,0.040969,0.04506,3.2577,1,True,2
7,NeuralNetTorch_BAG_L1,-0.108678,-0.107905,root_mean_squared_error,0.872125,0.983095,92.108356,0.872125,0.983095,92.108356,1,True,7
8,CatBoost_BAG_L1,-0.108816,-0.10766,root_mean_squared_error,0.049111,0.071197,9.443282,0.049111,0.071197,9.443282,1,True,3


In [40]:
# Refit the best model (and the models it depends on) on all available data,
# adding the held-out fold as extra training rows. The original cell passed
# `test_data`, which no cell in this notebook defines; `val` is the held-out
# fold that was excluded from `fit`.
# NOTE(review): assumes `test_data` was an earlier name for `val` - confirm.
predictor.refit_full("best", train_data_extra=val)

Refitting models via `predictor.refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix "_FULL" and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `predictor.fit` call.
	To learn more, refer to the `.refit_full` method docstring which explains how "_FULL" models differ from normal models.


Fitting 1 L1 models ...
Fitting model: LightGBMXT_BAG_L1_FULL ...
	1.22s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: CatBoost_BAG_L1_FULL ...
	1.8s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: ExtraTreesMSE_BAG_L1_FULL ...
	4.68s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: NeuralNetFastAI_BAG_L1_FULL ...
	Stopping at the best epoch learned earlier - 12.
	12.95s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: XGBoost_BAG_L1_FULL ...
	0.67s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: NeuralNetTorch_BAG_L1_FULL ...
	13.11s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: LightGBMLarge_BAG_L1_Prune_FULL ...
	1.97s	 = Training   runtime
Fitting model: WeightedEnsemble_L2_FULL | Skipping fit via cloning parent ...
	Ensemble Weights: {'NeuralNetFastAI_BAG_L1': 0.263, 'NeuralNetTorch_BAG_L1': 0.263, 'CatBoost_BAG_L1': 0.158, 'ExtraTreesMSE_BAG_L1': 0.105, 'LightGBMLarge_BAG_L1_Prune': 0.105, 'LightGBMXT

{'LightGBMXT_BAG_L1': 'LightGBMXT_BAG_L1_FULL',
 'CatBoost_BAG_L1': 'CatBoost_BAG_L1_FULL',
 'ExtraTreesMSE_BAG_L1': 'ExtraTreesMSE_BAG_L1_FULL',
 'NeuralNetFastAI_BAG_L1': 'NeuralNetFastAI_BAG_L1_FULL',
 'XGBoost_BAG_L1': 'XGBoost_BAG_L1_FULL',
 'NeuralNetTorch_BAG_L1': 'NeuralNetTorch_BAG_L1_FULL',
 'LightGBMLarge_BAG_L1_Prune': 'LightGBMLarge_BAG_L1_Prune_FULL',
 'WeightedEnsemble_L2': 'WeightedEnsemble_L2_FULL'}

In [29]:
# Evaluate out-of-fold predictions against the true labels, passing the fold id
# of every row along. The original cell read `train_data`, which no cell in
# this notebook defines; `train` is the frame that was passed to `fit`, so its
# rows are the ones the out-of-fold predictions belong to.
performance = evaluate_model(
    predictor,
    train[Y_cols[0]],
    predictor.predict_oof(train_data=train),
    train["split"],
)

performance

Unnamed: 0,root_mean_squared_error,mean_squared_error,mean_absolute_error,r2,pearsonr,median_absolute_error,norm_root_mean_squared_error
mean,-0.107447,-0.011552,-0.082995,0.209409,0.45676,-0.065802,-0.138098
std,0.002832,0.000618,0.00229,0.043137,0.048282,0.00241,0.00364


In [33]:
# Permutation feature importance measured on the held-out fold. The original
# cell passed `test_data`, which no cell in this notebook defines; `val` is the
# held-out fold built from the "split" column above.
# NOTE(review): assumes `test_data` was an earlier name for `val` - confirm.
# The recorded run estimated ~6500 s and was interrupted; `feature_importance`
# accepts `time_limit` / `subsample_size` if the runtime needs to be bounded.
predictor.feature_importance(data=val)

Computing feature importance via permutation shuffling for 150 features using 5000 rows with 5 shuffle sets...
	6515.61s	= Expected runtime (1303.12s per shuffle set)


KeyboardInterrupt: 