# Benchmarking Tabular ML Datasets
Thom, Jakob and Marit

In [1]:
import tabpfn_client
from tabpfn_client import TabPFNClassifier
from tabpfn.constants import ModelVersion

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from lightgbm import LGBMClassifier

import time
import math


#import seaborn as sns

In [28]:
# To use TabPFN external servers (TabPFN_client), create an account on Prior
PRIOR_TOKEN = '<copy paste your token from PRIOR LABS>'
tabpfn_client.set_access_token(PRIOR_TOKEN)

## Load in Data, will not change data cleaning logic compared to other models

In [3]:
def load_df(filename, foldername='aml-2025-benchmarking-tabular-ml-datasets'):
    return pd.read_csv(f'{foldername}/{filename}', header=0)

In [4]:
covtype_test = load_df('covtype_test.csv')
covtype_train = load_df('covtype_train.csv')
heloc_test = load_df('heloc_test.csv')
heloc_train = load_df('heloc_train.csv')
higgs_test = load_df('higgs_test.csv')
higgs_train = load_df('higgs_train.csv')

# Make all target columns have the name 'target'
covtype_train.rename(columns={'Cover_Type' : 'label'}, inplace=True)
heloc_train.rename(columns={'RiskPerformance' : 'label'}, inplace=True)
higgs_train.rename(columns={'Label' : 'label'}, inplace=True)

In [5]:
tables_test = [covtype_test, heloc_test, higgs_test]
tables_train = [covtype_train, heloc_train, higgs_train]
names = ['CoverType', 'HELOC', 'Higgs']

In [6]:
# Labels to convert
binary_labels = {
    'Bad': 1,
    'Good': 0,
    's': 1,
    'b': 0
    }

def clean_and_combine(tables, names, binary_labels = None):
    
    cleaned_tables = []

    for table, name in zip(tables, names):
        t = table.copy()

        # Get numerical columns for this specific table
        numerical_cols = t.select_dtypes(include=np.number).columns.tolist()

        # Clean missing values based on domain and remove id column
        if name == 'HELOC':
            for col in numerical_cols:
                t.loc[t[col] < 0, col] = np.nan
        elif name == 'HIGGS':
            t.replace(-999.0, np.nan, inplace=True)
            t = t.drop('EventId')

        # Add domain name
        t['Domain'] = name
        cleaned_tables.append(t)
        
    unified_df = pd.concat(cleaned_tables, ignore_index=True)

    # Handle target labels if provided (Training Data)
    if binary_labels:
        unified_df['label'] = unified_df['label'].astype(str).replace(binary_labels)       # As string first to prevent downcasting warning
        unified_df['label'] = unified_df['label'].astype(int)
    return unified_df

In [7]:
df_train = clean_and_combine(tables_train, names, binary_labels)
df_train = df_train.drop(columns=['Weight'])
df_test = clean_and_combine(tables_test, names)
df_test = df_test.drop(columns=['Weight'])

In [8]:
df_train['Domain'].value_counts()

Domain
Higgs        175000
CoverType     58101
HELOC          9413
Name: count, dtype: int64

In [10]:
def fit_tabpfn(df, target_col = "label", id_col=None, train_size=1000, random_state=42, dataset_name=""):

    # Drop ID column
    if id_col is not None and id_col in df.columns:
        df = df.drop(columns=[id_col])

    # Separate features and target
    X = df.drop(columns=[target_col])
    
    unified_columns_list = X.columns.tolist() # This list is needed for making predictions later (column order and nr of columns needs to be the same)

    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        train_size=train_size,
        stratify=y,
        random_state=random_state
    )

    print(f"\n Fitting TabPFN model on {dataset_name or target_col}, shape: {X_train.shape}")
    
    clf = TabPFNClassifier()
    clf.fit(X_train, y_train)

    print("Making Predictions with TabPFN-client")
    
    # Validating results only on 10000 rows, to get an accuracy measure, but not waste too many TabPFN credits - not ideal but local (train set validation) accuracy not so important.
    X_test_small = X_test.iloc[-10000:]
    y_test_small = y_test.iloc[-10000:]

    

    # Predictions in Batches, to see completion time
    n_samples = X_test_small.shape[0]
    print(n_samples)
    batch_size = 10000
    n_batches = math.ceil(n_samples / batch_size)
    print(f"batch size / batches: {batch_size}/{n_batches}")
    predictions = []

    for i, start in enumerate(range(0, n_samples, batch_size), 1):
        end = min(start + batch_size, n_samples)
        X_batch = X_test_small.iloc[start:end]
        t0 = time.time()
        pred_batch = clf.predict(X_batch)
        t1 = time.time()
        batches_left = n_batches - i
        print(f"Batch {i}-{n_batches} ({start}-{end}) done in {t1-t0:.2f}s, {batches_left} batches left.")
        
        predictions.append(pred_batch)

    return (X_train, X_test, y_train, y_test, X_test_small, y_test_small), clf, unified_columns_list, predictions

In [11]:
tabpfn_splits, tabpfn_clf, unified_columns_list, preds = fit_tabpfn(
    df_train,
    id_col='EventId',           # leave out id column (removed it before actually, but kept the option in the function)
    train_size=12500,           
    dataset_name="Unified"
)


 Fitting TabPFN model on Unified, shape: (12500, 108)
Making Predictions with TabPFN-client
10000
batch size / batches: 10000/1


Processing: 100%|██████████| [00:08<00:00]

Batch 1-1 (0-10000) done in 10.16s, 0 batches left.





In [35]:
print(f"Overall Accuracy on Validation Set: {accuracy_score(tabpfn_splits[5],preds[0])}") #Score on a subset of the Training set

Overall Accuracy on Validation Set: 0.8088


# Assessing Model Performance Across Datasets

In [13]:
# Combine X_test and y_test (split from df_train)
df_validation = tabpfn_splits[1].copy()
df_validation['label'] = tabpfn_splits[3]

In [14]:
# Combining df_validation enables us to get X, y for each individual dataset
df_validation_cov = df_validation[df_validation['Domain'] == "CoverType"]
df_validation_heloc = df_validation[df_validation['Domain'] == "HELOC"]
df_validation_higgs = df_validation[df_validation['Domain'] == "Higgs"]

In [20]:
def evaluate_dataset(tabpfn_fit, df, feature_cols, target_col = "label", id_col=None, batch_size=1000, random_state=42, dataset_name=""):
    
    # Drop ID column
    if id_col is not None and id_col in df.columns:
        df = df.drop(columns=[id_col])

    # Separate features and target
    X = df.drop(columns=[target_col])
    X = X[feature_cols]

    y = df[target_col]

    X_val_discarded, X_val_sample, y_val_discarded, y_val_sample = train_test_split(
    X, y,
    test_size=batch_size,
    stratify=y,
    random_state=random_state
    )

    print(f"\n Making predictions on {dataset_name or target_col}, shape: {X_val_sample.shape}")
    # Predictions in Batches, to see completion time
    n_samples = X_val_sample.shape[0]
    n_batches = math.ceil(n_samples / batch_size)
    print(f"batch size / batches: {batch_size}/{n_batches}")
    
    predictions = []

    for i, start in enumerate(range(0, n_samples, batch_size), 1):
        end = min(start + batch_size, n_samples)
        X_batch = X_val_sample.iloc[start:end]
        t0 = time.time()
        pred_batch = tabpfn_fit.predict(X_batch)
        t1 = time.time()
        batches_left = n_batches - i
        print(f"Batch {i}-{n_batches} ({start}-{end}) done in {t1-t0:.2f}s, {batches_left} batches left.")
        
        predictions.append(pred_batch)

    return predictions, y_val_sample

In [31]:
preds_cov, y_preds_cov = evaluate_dataset(tabpfn_clf, df_validation_cov, unified_columns_list, id_col='EventId', batch_size=10000, random_state=42, dataset_name="CoverType")
preds_heloc, y_preds_heloc = evaluate_dataset(tabpfn_clf, df_validation_heloc, unified_columns_list, id_col='EventId', batch_size=5000, random_state=42, dataset_name="HELOC")
preds_higgs, y_preds_higgs = evaluate_dataset(tabpfn_clf, df_validation_higgs, unified_columns_list, id_col='EventId', batch_size=10000, random_state=42, dataset_name="Higgs")


 Making predictions on CoverType, shape: (10000, 108)
batch size / batches: 10000/1


Processing: 100%|██████████| [00:08<00:00]


Batch 1-1 (0-10000) done in 14.05s, 0 batches left.

 Making predictions on HELOC, shape: (5000, 108)
batch size / batches: 5000/1


Processing: 100%|██████████| [00:07<00:00]


Batch 1-1 (0-5000) done in 9.01s, 0 batches left.

 Making predictions on Higgs, shape: (10000, 108)
batch size / batches: 10000/1


Processing: 100%|██████████| [00:08<00:00]

Batch 1-1 (0-10000) done in 11.37s, 0 batches left.





### Results across Datasets

In [36]:
print(f"Overall Accuracy on Validation Set: {accuracy_score(tabpfn_splits[5],preds[0])}") #Score on a subset of the Training set

Overall Accuracy on Validation Set: 0.8088


In [34]:
print(f"Accuracy on CoverType: {accuracy_score(preds_cov[0], y_preds_cov)}") #Score on a subset of the Training set

Accuracy on CoverType: 0.7566


In [32]:
print(f"Accuracy on Heloc: {accuracy_score(preds_heloc[0], y_preds_heloc)}") #Score on a subset of the Training set

Accuracy on Heloc: 0.705


In [33]:
print(f"Accuracy on Higgs: {accuracy_score(preds_higgs[0], y_preds_higgs)}") #Score on a subset of the Training set

Accuracy on Higgs: 0.836


# Combine into CSVs

### CSVs used for results validation for poster

In [48]:
# Combined dataset
csv_combined = tabpfn_splits[4][['Domain']].copy()
csv_combined['label'] = preds[0]
csv_combined['prediction'] = tabpfn_splits[5]

# Creating output .csv
output_filename = "combined_validation.csv"
csv_combined.to_csv(output_filename, index=False)
print(f"Successfully created submission file: {output_filename}")

Successfully created submission file: combined_validation.csv


In [49]:
# Combined dataset
csv_cov = pd.DataFrame({
    'label': y_preds_cov,
    'Prediction': preds_cov[0],
    'Domain': 'CoverType'
})

# Creating output .csv
output_filename = "covertype_validation.csv"
csv_cov.to_csv(output_filename, index=False)
print(f"Successfully created submission file: {output_filename}")

Successfully created submission file: covertype_validation.csv


In [50]:
# Combined dataset
csv_heloc = pd.DataFrame({
    'label': y_preds_heloc,
    'Prediction': preds_heloc[0],
    'Domain': 'HELOC'
})

# Creating output .csv
output_filename = "heloc_validation.csv"
csv_heloc.to_csv(output_filename, index=False)
print(f"Successfully created submission file: {output_filename}")

Successfully created submission file: heloc_validation.csv


In [51]:
# Combined dataset
csv_higgs = pd.DataFrame({
    'label': y_preds_higgs,
    'Prediction': preds_higgs[0],
    'Domain': 'Higgs'
})

# Creating output .csv
output_filename = "higgs_validation.csv"
csv_higgs.to_csv(output_filename, index=False)
print(f"Successfully created submission file: {output_filename}")

Successfully created submission file: higgs_validation.csv


# Make predictions

**TabPFN-Client (PRIOR labs) credit calculation:**

- 100,000,000 Credits per day

- Cost of run: api_cost = max((num_train_rows + num_test_rows) * num_cols * n_estimators, 5000) ---> **Can't have more than 50,000 rows per call (model restriction)**

Therefore to conduct predictions on test set, will choose batch_size = 40,000, with num_train_rows being 12,500, each batch of predictions cost a bit less than 50,000,000 credits.

In [None]:
def make_predictions(data: pd.DataFrame, clf, feature_cols: list, batch_size: int , id_col: str = 'EventId') -> pd.DataFrame:
    data_original = data.copy()
    
    if id_col is not None and id_col in data.columns:
        data = data.drop(columns=[id_col])


    data = data[feature_cols]

    # Predictions in Batches, to see completion time
    n_samples = data.shape[0]
    print(n_samples)
    n_batches = math.ceil(n_samples / batch_size)
    print(f"batch size / batches: {batch_size}/{n_batches}")
    predictions = []

    for i, start in enumerate(range(0, n_samples, batch_size), 1):
        end = min(start + batch_size, n_samples)
        X_batch = data.iloc[start:end]
        print(X_batch.shape)
        t0 = time.time()
        pred_batch = clf.predict(X_batch)
        t1 = time.time()
        batches_left = n_batches - i
        print(f"Batch {i}-{n_batches} ({start}-{end}) done in {t1-t0:.2f}s, {batches_left} batches left.")
        
        predictions.append(pred_batch)

    results_df = data_original

    return results_df, predictions

In [None]:
df_results, preds_test = make_predictions(df_test, tabpfn_clf, unified_columns_list, 40000)

In [None]:
# Creating output df
np_preds_test = np.concatenate(preds_test)
df_preds_test = pd.DataFrame(np_preds_test, columns=['Prediction'])
df_preds_test['ID'] = df_preds_test.index + 1
df_preds_test[['ID', 'Prediction']].head()

In [None]:
# Creating output .csv
output_filename = "base_combined_test_submission.csv"
df_preds_test[['ID', 'Prediction']].to_csv(output_filename, index=False)
print(f"Successfully created submission file: {output_filename}, DON'T FORGET TO DELETE THE EMPTY LINE FROM END!")