# Benchmarking Tabular ML Datasets
Thom, Jakob and Marit

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

## Load in Data

In [2]:
def load_df(filename, foldername='aml-2025-benchmarking-tabular-ml-datasets'):
    return pd.read_csv(f'{foldername}/{filename}', header=0)

In [3]:
covtype_test = load_df('covtype_test.csv')
covtype_train = load_df('covtype_train.csv')
heloc_test = load_df('heloc_test.csv')
heloc_train = load_df('heloc_train.csv')
higgs_test = load_df('higgs_test.csv')
higgs_train = load_df('higgs_train.csv')

# Make all target columns have the name 'target'
covtype_train.rename(columns={'Cover_Type' : 'label'}, inplace=True)
heloc_train.rename(columns={'RiskPerformance' : 'label'}, inplace=True)
higgs_train.rename(columns={'Label' : 'label'}, inplace=True)

In [4]:
tables_test = [covtype_test, heloc_test, higgs_test]
tables_train = [covtype_train, heloc_train, higgs_train]
names = ['CoverType', 'HELOC', 'Higgs']

## Combine and clean datasets


In [37]:
# Labels to convert
binary_labels = {
    'Bad': 1,
    'Good': 0,
    's': 1,
    'b': 0
    }

def clean_and_combine(tables, names, binary_labels = None):
    
    cleaned_tables = []

    for table, name in zip(tables, names):
        t = table.copy()

        # Get numerical columns for this specific table
        numerical_cols = t.select_dtypes(include=np.number).columns.tolist()

        # Clean missing values based on domain and remove id and weight column
        if name == 'HELOC':
            for col in numerical_cols:
                t.loc[t[col] < 0, col] = np.nan

        elif name == 'HIGGS':
            t.replace(-999.0, np.nan, inplace=True)

        # Add domain name
        t['Domain'] = name
        cleaned_tables.append(t)
        
    unified_df = pd.concat(cleaned_tables, ignore_index=True).drop(['EventId', 'Weight'], axis=1)

    # Handle target labels if provided (Training Data)
    if binary_labels:
        unified_df['label'] = unified_df['label'].astype(str).replace(binary_labels)       # As string first to prevent downcasting warning
        unified_df['label'] = unified_df['label'].astype(int)
    return unified_df

## Parameter search on lightgbm

In [None]:
def lightgbm_params(df, random_state=42, test_size=0.2, target_col='label'):
    
    # Separate features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Get numerical columns
    numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
    
    categorical_cols = ['Domain']

    # Define Preprocessor
    unified_preprocessor = ColumnTransformer(
        transformers=[
            ('num_processing', Pipeline([
                ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
                ('scaler', StandardScaler())
            ]), numerical_cols),
            
            ('cat_processing', Pipeline([
                ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype=int))
            ]), categorical_cols)
        ],
        remainder='drop' 
    )

    # Define base pipeline 
    unified_pipeline = Pipeline([
        ("preprocessor", unified_preprocessor),
        ("clf", LGBMClassifier(
            class_weight='balanced',
            random_state=random_state,
            n_jobs=1,
            verbose=-1 
        ))
    ])

    # Define Search Space
    param_dist = {
        'clf__n_estimators': randint(100, 1000),      # Number of trees
        'clf__learning_rate': uniform(0.01, 0.2),     # Learning speed
        'clf__num_leaves': randint(20, 150),          # Tree complexity
        'clf__max_depth': randint(5, 20),             # Depth limit
        'clf__min_child_samples': randint(10, 100),   # Regularization
        'clf__subsample': uniform(0.6, 0.4),          # Bagging fraction
        'clf__colsample_bytree': uniform(0.6, 0.4)    # Feature fraction
    }

    # Train / Test Split
    X_train, X_test, y_train_enc, y_test_enc = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    # Configure Randomized Search
    search = RandomizedSearchCV(
        estimator=unified_pipeline,
        param_distributions=param_dist,
        n_iter=15,            # Try 15 different random combinations
        cv=3,                 # 3-Fold Cross Validation
        scoring='accuracy',
        n_jobs=-1,            # Use all CPU cores
        verbose=1,
        random_state=random_state
    )

    print("Starting Hyperparameter Tuning (RandomizedSearchCV)...")
    search.fit(X_train, y_train_enc)

    # Get best model and results
    best_model = search.best_estimator_
    print(f"\nBest Validation Accuracy (CV): {search.best_score_:.4f}")
    print(f"Best Parameters: {search.best_params_}")

    # Evaluate on local test set
    y_pred_enc = best_model.predict(X_test)
    acc = accuracy_score(y_test_enc, y_pred_enc)

    print(f"Local Test Set Accuracy: {acc:.4f}")

    os.system('say "Your unified model training is finished."')

    return best_model

###
"""
Best Validation Accuracy (CV): 0.8430
Best Parameters: {'clf__colsample_bytree': np.float64(0.8918424713352255), 'clf__learning_rate': np.float64(0.13751149427104264), 'clf__max_depth': 15, 'clf__min_child_samples': 44, 'clf__n_estimators': 838, 'clf__num_leaves': 120, 'clf__subsample': np.float64(0.9886848381556415)}
Local Test Set Accuracy: 0.8489
"""
###

## Parameter search on Catboost

In [None]:
def catboost_params(df, random_state=42, test_size=0.2, target_col='label'):
    
    # Separate features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Get numerical columns
    numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
        
    categorical_cols = ['Domain']

    # Define preprocessor 
    unified_preprocessor = ColumnTransformer(
        transformers=[
            ('num_processing', Pipeline([
                ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
                ('scaler', StandardScaler())
            ]), numerical_cols),
            
            ('cat_processing', Pipeline([
                ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype=int))
            ]), categorical_cols)
        ],
        remainder='drop' 
    )

    # Pipeline with catboost
    unified_pipeline = Pipeline([
        ("preprocessor", unified_preprocessor),
        ("clf", CatBoostClassifier(
            loss_function='MultiClass',   # Required for your 11 classes
            auto_class_weights='Balanced',# Handles the CoverType imbalance automatically
            verbose=0,                    # Silence the training output
            random_state=random_state,
            thread_count=-1,              # Use all cores for the model training
            allow_writing_files=False
        ))
    ])

    # Parameters to search
    param_dist = {
        'clf__iterations': randint(500, 1200),
        'clf__depth': randint(4, 9),
        'clf__learning_rate': uniform(0.01, 0.2),
        'clf__l2_leaf_reg': randint(1, 10),
        'clf__bagging_temperature': uniform(0, 1),
        'clf__random_strength': uniform(1, 10)
    }

    # Train / Test Split
    X_train, X_test, y_train_enc, y_test_enc = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    # Configure Randomized Search
    search = RandomizedSearchCV(
        estimator=unified_pipeline,
        param_distributions=param_dist,
        n_iter=10,            # Lower than LGBM due to speed
        cv=3,                 
        scoring='accuracy',
        n_jobs=1,             # Sequential search
        verbose=1,
        random_state=random_state
    )

    print("Starting CatBoost Hyperparameter Tuning...")
    search.fit(X_train, y_train_enc)

    # Get best model and results
    best_model = search.best_estimator_
    print(f"\nBest Validation Accuracy (CV): {search.best_score_:.4f}")
    print(f"Best Parameters: {search.best_params_}")

    # Evaluate on local test set
    y_pred_enc = best_model.predict(X_test)

    # CatBoost predict returns an array of shape (N, 1), flatten it
    y_pred_enc = y_pred_enc.ravel() 
    
    acc = accuracy_score(y_test_enc, y_pred_enc)
    print(f"Local Test Set Accuracy: {acc:.4f}")

    os.system('say "Your unified model training is finished."')

    return best_model

"""
Best Params: {'clf__bagging_temperature': np.float64(0.4667628932479799), 'clf__depth': 8, 'clf__iterations': 1146, 'clf__l2_leaf_reg': 5, 'clf__learning_rate': np.float64(0.1000998503939086), 'clf__random_strength': np.float64(1.1326496115986653)}
"""

## Predict

In [41]:
# Load data
df_train = clean_and_combine(tables_train, names, binary_labels)
df_test = clean_and_combine(tables_test, names)

In [44]:
# Train model
lightmodel = lightgbm_params(df_train)
# catmodel = catboost_params(df_train)

Starting Hyperparameter Tuning (RandomizedSearchCV)...
Fitting 3 folds for each of 15 candidates, totalling 45 fits


Exception ignored in: <function ResourceTracker.__del__ at 0x106279bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x104021bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x10695dbc0>
Traceback (most recent call last


Best Validation Accuracy (CV): 0.8430
Best Parameters: {'clf__colsample_bytree': np.float64(0.8918424713352255), 'clf__learning_rate': np.float64(0.13751149427104264), 'clf__max_depth': 15, 'clf__min_child_samples': 44, 'clf__n_estimators': 838, 'clf__num_leaves': 120, 'clf__subsample': np.float64(0.9886848381556415)}




Local Test Set Accuracy: 0.8489


In [None]:
predictions = lightgbm_params(df_train)
### Save submission file for lightxgb ###
submission = pd.DataFrame({"Prediction": predictions}, index=range(1, len(predictions) + 1))

### Save submission for catboost, output data is different shape ###
# submission = pd.DataFrame({"Prediction": predictions.ravel()}, index=range(1, len(predictions) + 1))

pd.DataFrame(submission).to_csv('combined_test_sample_submission.csv', index=True, index_label="ID")

Training unified model with tuned parameters...





Global Test Accuracy: 0.8473

--- Accuracy per Domain ---
[Higgs]: 0.8349 (N=34975)
[CoverType]: 0.9087 (N=11616)
[HELOC]: 0.6998 (N=1912)

Exporting local test results to CSV...
Saved 'local_test_results.csv'.
