# Benchmarking Tabular ML Datasets
Thom, Jakob and Marit

In [84]:
from tabpfn_client import TabPFNClassifier
from tabpfn.constants import ModelVersion

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from lightgbm import LGBMClassifier

import time
import math


#import seaborn as sns

In [None]:
# To use TabPFN external servers (TabPFN_client), create an account on Prior
PRIOR_TOKEN = '<copy token from prior website>'
tabpfn_client.set_access_token(PRIOR_TOKEN)

## Load in Data, will not change data cleaning logic compared to other models

In [85]:
def load_df(filename, foldername='aml-2025-benchmarking-tabular-ml-datasets'):
    return pd.read_csv(f'{foldername}/{filename}', header=0)

In [86]:
covtype_test = load_df('covtype_test.csv')
covtype_train = load_df('covtype_train.csv')
heloc_test = load_df('heloc_test.csv')
heloc_train = load_df('heloc_train.csv')
higgs_test = load_df('higgs_test.csv')
higgs_train = load_df('higgs_train.csv')

# Make all target columns have the name 'target'
covtype_train.rename(columns={'Cover_Type' : 'label'}, inplace=True)
heloc_train.rename(columns={'RiskPerformance' : 'label'}, inplace=True)
higgs_train.rename(columns={'Label' : 'label'}, inplace=True)

In [87]:
tables_test = [covtype_test, heloc_test, higgs_test]
tables_train = [covtype_train, heloc_train, higgs_train]
names = ['CoverType', 'HELOC', 'Higgs']

In [88]:
# Labels to convert
binary_labels = {
    'Bad': 1,
    'Good': 0,
    's': 1,
    'b': 0
    }

def clean_and_combine(tables, names, binary_labels = None):
    
    cleaned_tables = []

    for table, name in zip(tables, names):
        t = table.copy()

        # Get numerical columns for this specific table
        numerical_cols = t.select_dtypes(include=np.number).columns.tolist()

        # Clean missing values based on domain and remove id column
        if name == 'HELOC':
            for col in numerical_cols:
                t.loc[t[col] < 0, col] = np.nan
        elif name == 'HIGGS':
            t.replace(-999.0, np.nan, inplace=True)
            t = t.drop('EventId')

        # Add domain name
        t['Domain'] = name
        cleaned_tables.append(t)
        
    unified_df = pd.concat(cleaned_tables, ignore_index=True)

    # Handle target labels if provided (Training Data)
    if binary_labels:
        unified_df['label'] = unified_df['label'].astype(str).replace(binary_labels)       # As string first to prevent downcasting warning
        unified_df['label'] = unified_df['label'].astype(int)
    return unified_df

In [89]:
df_train = clean_and_combine(tables_train, names, binary_labels)
df_test = clean_and_combine(tables_test, names)

In [90]:
def evaluate_dataset(df, target_col = "label", id_col=None, train_size=1000, random_state=42, dataset_name=""):

    # Drop ID column
    if id_col is not None and id_col in df.columns:
        df = df.drop(columns=[id_col])

    # Separate features and target
    X = df.drop(columns=[target_col])
    
    unified_columns_list = X.columns.tolist() # This list is needed for making predictions later (column order and nr of columns needs to be the same)

    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        train_size=train_size,
        stratify=y,
        random_state=random_state
    )

    print(f"\n Fitting TabPFN model on {dataset_name or target_col}, shape: {X_train.shape}")
    
    clf = TabPFNClassifier()
    clf.fit(X_train, y_train)

    print("Making Predictions with TabPFN-client")
    
    # Validating results only on 10000 rows, to get an accuracy measure, but not waste too many TabPFN credits - not ideal but local (train set validation) accuracy not so important.
    X_test = X_test.iloc[-10000:]
    y_test = y_test.iloc[-10000:]

    # Predictions in Batches, to see completion time
    n_samples = X_test.shape[0]
    print(n_samples)
    batch_size = 10000
    n_batches = math.ceil(n_samples / batch_size)
    print(f"batch size / batches: {batch_size}/{n_batches}")
    predictions = []

    for i, start in enumerate(range(0, n_samples, batch_size), 1):
        end = min(start + batch_size, n_samples)
        X_batch = X_test.iloc[start:end]
        t0 = time.time()
        pred_batch = clf.predict(X_batch)
        t1 = time.time()
        batches_left = n_batches - i
        print(f"Batch {i}-{n_batches} ({start}-{end}) done in {t1-t0:.2f}s, {batches_left} batches left.")
        
        predictions.append(pred_batch)

    return (X_train, X_test, y_train, y_test), clf, unified_columns_list, predictions

In [96]:
tabpfn_splits, tabpfn_clf, unified_columns_list, preds = evaluate_dataset(
    df_train,
    id_col='EventId',           # leave out id column (removed it before actually, but kept the option in the function)
    train_size=12500,           
    dataset_name="Unified"
)


 Fitting TabPFN model on Unified, shape: (12500, 109)
Making Predictions with TabPFN-client
10000
batch size / batches: 10000/1
(10000, 109)


Processing: 100%|██████████| [00:14<00:00]


Batch 1-1 (0-10000) done in 16.70s, 0 batches left.


In [98]:
accuracy_score(tabpfn_splits[3],preds[0]) #Score on a subset of the Training set

0.9327

# Make predictions

**TabPFN-Client (PRIOR labs) credit calculation:**

- 100,000,000 Credits per day

- Cost of run: api_cost = max((num_train_rows + num_test_rows) * num_cols * n_estimators, 5000) ---> **Can't have more than 50,000 rows per call (model restriction)**

Therefore to conduct predictions on test set, will choose batch_size = 40,000, with num_train_rows being 12,500, each batch of predictions cost a bit less than 50,000,000 credits.

In [100]:
def make_predictions(data: pd.DataFrame, clf, feature_cols: list, batch_size: int , id_col: str = 'EventId') -> pd.DataFrame:
    data_original = data.copy()
    
    if id_col is not None and id_col in data.columns:
        data = data.drop(columns=[id_col])


    data = data[feature_cols]

    # Predictions in Batches, to see completion time
    n_samples = data.shape[0]
    print(n_samples)
    n_batches = math.ceil(n_samples / batch_size)
    print(f"batch size / batches: {batch_size}/{n_batches}")
    predictions = []

    for i, start in enumerate(range(0, n_samples, batch_size), 1):
        end = min(start + batch_size, n_samples)
        X_batch = data.iloc[start:end]
        print(X_batch.shape)
        t0 = time.time()
        pred_batch = clf.predict(X_batch)
        t1 = time.time()
        batches_left = n_batches - i
        print(f"Batch {i}-{n_batches} ({start}-{end}) done in {t1-t0:.2f}s, {batches_left} batches left.")
        
        predictions.append(pred_batch)

    results_df = data_original

    return results_df, predictions

In [101]:
df_results, preds_test = make_predictions(df_test, tabpfn_clf, unified_columns_list, 40000)

79546
batch size / batches: 40000/2
(40000, 109)


Processing: 100%|██████████| [00:18<00:00]


Batch 1-2 (0-40000) done in 30.69s, 1 batches left.
(39546, 109)


Processing: 100%|██████████| [00:18<00:00]


Batch 2-2 (40000-79546) done in 26.51s, 0 batches left.


In [102]:
# Creating output df
np_preds_test = np.concatenate(preds_test)
df_preds_test = pd.DataFrame(np_preds_test, columns=['Prediction'])
df_preds_test['ID'] = df_preds_test.index + 1
df_preds_test[['ID', 'Prediction']].head()

In [104]:
# Creating output .csv
output_filename = "base_combined_test_submission.csv"
df_preds_test[['ID', 'Prediction']].to_csv(output_filename, index=False)
print(f"Successfully created submission file: {output_filename}, DON'T FORGET TO DELETE THE EMPTY LINE FROM END!")

Successfully created submission file: base_combined_test_submission.csv, DON'T FORGET TO DELETE THE EMPTY LINE FROM END!
