In [10]:


#!pip install GRANDE
#!pip install xgboost catboost scikit-learn openml matplotlib seaborn



In [3]:

import os
import warnings
warnings.filterwarnings('ignore')

# GPU Configuration
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import openml


from GRANDE import GRANDE
import xgboost as xgb
from catboost import CatBoostClassifier

# Set random seed for reproducibility
np.random.seed(42)


### Understanding GRANDE

GRANDE extends gradient-based decision trees to ensembles. The key innovation is **instance-wise weighting**, where each tree's contribution depends on which leaf node the sample falls into.

The ensemble prediction is computed as:

$$G(x|W, \mathcal{L}, T, I) = \sigma(w(x|W, \mathcal{L}, T, I)) \cdot p(x|\mathcal{L}, T, I)$$

where:
- $\sigma$ is the softmax function
- $w(x|W, \mathcal{L}, T, I)$ computes instance-wise weights for each tree
- $p(x|\mathcal{L}, T, I)$ are the individual tree predictions

This allows GRANDE to learn both simple rules (for easy instances) and complex patterns (for difficult instances) within a single model.

In [4]:
# Optimized parameter using Optuna with 250 trials from the paper

def get_optimized_params_grande(dataset_name):
    """
    Returns optimized GRANDE hyperparameters from Table 25 of the paper.
    
    Parameters:
    -----------
    dataset_name : str
        Name of the dataset ('wdbc', 'churn', or 'phishing')
    """
    
    params_dict = {
        'wdbc': {
            'depth': 4,
            'n_estimators': 1024,
            'learning_rate_weights': 0.0151,
            'learning_rate_index': 0.0140,
            'learning_rate_values': 0.1127,
            'learning_rate_leaf': 0.1758,
            'optimizer': 'adam',
            'cosine_decay_steps': 0,
            'focal_loss': False,
            'temperature': 0.0,
            'from_logits': True,
            'use_class_weights': True,
            'dropout': 0.5,
            'selected_variables': 0.8941,
            'data_subset_fraction': 0.8480,
        },
        'churn': {
            'depth': 6,
            'n_estimators': 2048,
            'learning_rate_weights': 0.0293,
            'learning_rate_index': 0.0716,
            'learning_rate_values': 0.0179,
            'learning_rate_leaf': 0.0225,
            'optimizer': 'adam',
            'cosine_decay_steps': 1000,
            'focal_loss': False,
            'temperature': 0.0,
            'from_logits': True,
            'use_class_weights': True,
            'dropout': 0.0,
            'selected_variables': 0.6920,
            'data_subset_fraction': 0.8174,
        },
        'phishing': {
            'depth': 6,
            'n_estimators': 2048,
            'learning_rate_weights': 0.0040,
            'learning_rate_index': 0.0118,
            'learning_rate_values': 0.0104,
            'learning_rate_leaf': 0.1850,
            'optimizer': 'adam',
            'cosine_decay_steps': 0.1,
            'focal_loss': False,
            'temperature': 0.0,
            'from_logits': True,
            'use_class_weights': True,
            'dropout': 0.0,
            'selected_variables': 0.9792,
            'data_subset_fraction': 0.9588,
        }
    }
    
    return params_dict[dataset_name]


def get_optimized_params_xgboost(dataset_name):
    """
    Returns optimized XGBoost hyperparameters from Table 27 of the paper.
    
    Parameters:
    -----------
    dataset_name : str
        Name of the dataset ('wdbc', 'churn', or 'phishing')
    """
    
    params_dict = {
        'wdbc': {
            'learning_rate': 0.2640,
            'max_depth': 2,
            'reg_alpha': 0.0007,
            'reg_lambda': 0.0000,
        },
        'churn': {
            'learning_rate': 0.0473,
            'max_depth': 6,
            'reg_alpha': 0.0000,
            'reg_lambda': 0.3132,
        },
        'phishing': {
            'learning_rate': 0.1243,
            'max_depth': 11,
            'reg_alpha': 0.0017,
            'reg_lambda': 0.3710,
        }
    }
    
    return params_dict[dataset_name]


def get_optimized_params_catboost(dataset_name):
    """
    Returns optimized CatBoost hyperparameters from Table 28 of the paper.
    
    Parameters:
    -----------
    dataset_name : str
        Name of the dataset ('wdbc', 'churn', or 'phishing')
    """
    
    params_dict = {
        'wdbc': {
            'learning_rate': 0.1339,
            'depth': 3,
            'l2_leaf_reg': 0.7173,
        },
        'churn': {
            'learning_rate': 0.0248,
            'depth': 9,
            'l2_leaf_reg': 7.0362,
        },
        'phishing': {
            'learning_rate': 0.0239,
            'depth': 8,
            'l2_leaf_reg': 1.6860,
        }
    }
    
    return params_dict[dataset_name]


print("OK")

OK


In [5]:
#Loading dataset
def load_dataset(dataset_id, dataset_name, split_data=True):

    print(f"Loading dataset: {dataset_name} (ID: {dataset_id})")
    
    # Load dataset from OpenML
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )
    
    # Get categorical feature indices
    categorical_feature_indices = [
        idx for idx, is_cat in enumerate(categorical_indicator) if is_cat
    ]
    
    # Encode labels to numeric if they are strings
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    print(f"  - Original labels: {label_encoder.classes_}")
    print(f"  - Encoded to: {np.unique(y_encoded)}")
    print(f"  - Total samples: {len(X)}")
    print(f"  - Features: {X.shape[1]}")
    print(f"  - Categorical features: {len(categorical_feature_indices)}")
    
    if not split_data:
        # Return full dataset for CV
        return {
            'X': X,
            'y': y_encoded,
            'cat_idx': categorical_feature_indices,
            'feature_names': attribute_names,
            'label_encoder': label_encoder
        }
    
    # Original splitting logic
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
    )
    
    print(f"  - Train samples: {len(X_train)}")
    print(f"  - Validation samples: {len(X_valid)}")
    print(f"  - Test samples: {len(X_test)}")
    
    return {
        'name': dataset_name,
        'X_train': X_train,
        'X_valid': X_valid,
        'X_test': X_test,
        'y_train': y_train,
        'y_valid': y_valid,
        'y_test': y_test,
        'cat_idx': categorical_feature_indices,
        'feature_names': attribute_names,
        'label_encoder': label_encoder
    }

In [6]:

# Training functions

def train_grande(data, dataset_name):

    
    # Load optimized parameters for this dataset
    params = get_optimized_params_grande(dataset_name)
    
    args = {
        'epochs': 1000,
        'early_stopping_epochs': 25,
        'batch_size': 64,
        'cat_idx': data['cat_idx'],
        'objective': 'binary',
        'random_seed': 42,
        'verbose': 0,
    }
    

    
    # Ensure labels are integers
    y_train = np.array(data['y_train']).astype(int)
    y_valid = np.array(data['y_valid']).astype(int)
    y_test = np.array(data['y_test']).astype(int)
    
    model = GRANDE(params=params, args=args)
    model.fit(
        X_train=data['X_train'],
        y_train=y_train,
        X_val=data['X_valid'],
        y_val=y_valid
    )
    
    preds = model.predict(data['X_test'])
    
    #  Metrics
    y_pred = np.round(preds[:, 1]).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    roc_auc = roc_auc_score(y_test, preds[:, 1])
    

    
    return model, {
        'accuracy': accuracy,
        'f1_score': f1,
        'roc_auc': roc_auc,
        'predictions': preds
    }


def train_xgboost(data, dataset_name):

    
    
    # Load optimized parameters for this dataset
    opt_params = get_optimized_params_xgboost(dataset_name)
    
    xgb_params = {
        'n_estimators': 1000,
        'learning_rate': opt_params['learning_rate'],
        'max_depth': opt_params['max_depth'],
        'reg_alpha': opt_params['reg_alpha'],
        'reg_lambda': opt_params['reg_lambda'],
        'random_state': 42,
        'eval_metric': 'logloss',
        'early_stopping_rounds': 25
    }
    
    # Convert categorical features
    X_train_enc = data['X_train'].copy()
    X_test_enc = data['X_test'].copy()
    
    for idx in data['cat_idx']:
        le = LabelEncoder()
        train_col = X_train_enc.iloc[:, idx]
        test_col = X_test_enc.iloc[:, idx]
        
        if hasattr(train_col, 'cat'):
            train_values = train_col.astype(str)
            test_values = test_col.astype(str)
        else:
            train_values = train_col.fillna('missing').astype(str)
            test_values = test_col.fillna('missing').astype(str)
        
        X_train_enc.iloc[:, idx] = le.fit_transform(train_values)
        
        test_encoded = []
        for val in test_values:
            if val in le.classes_:
                test_encoded.append(le.transform([val])[0])
            else:
                test_encoded.append(-1)
        X_test_enc.iloc[:, idx] = test_encoded
    
    model = xgb.XGBClassifier(**xgb_params)
    
    model.fit(
        X_train_enc, data['y_train'],
        eval_set=[(X_test_enc, data['y_test'])],
        verbose=False
    )
    
    preds_proba = model.predict_proba(X_test_enc)
    y_pred = model.predict(X_test_enc)
    
    y_test = np.array(data['y_test']).astype(int)
    y_pred = np.array(y_pred).astype(int)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    roc_auc = roc_auc_score(y_test, preds_proba[:, 1])
    
    return model, {
        'accuracy': accuracy,
        'f1_score': f1,
        'roc_auc': roc_auc,
        'predictions': preds_proba
    }


def train_catboost(data, dataset_name):

    
    # Load optimized parameters for this dataset
    opt_params = get_optimized_params_catboost(dataset_name)
    
    cat_params = {
        'iterations': 1000,
        'learning_rate': opt_params['learning_rate'],
        'depth': opt_params['depth'],
        'l2_leaf_reg': opt_params['l2_leaf_reg'],
        'random_state': 42,
        'verbose': False,
        'early_stopping_rounds': 25,
        'cat_features': data['cat_idx'] if len(data['cat_idx']) > 0 else None
    }
    
    y_train = np.array(data['y_train']).astype(int)
    y_test = np.array(data['y_test']).astype(int)
    
    model = CatBoostClassifier(**cat_params)
    
    model.fit(
        data['X_train'], y_train,
        eval_set=(data['X_test'], y_test)
    )
    
    preds_proba = model.predict_proba(data['X_test'])
    y_pred = model.predict(data['X_test']).astype(int)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    roc_auc = roc_auc_score(y_test, preds_proba[:, 1])
    

    return model, {
        'accuracy': accuracy,
        'f1_score': f1,
        'roc_auc': roc_auc,
        'predictions': preds_proba
    }



print("OK")

OK


In [14]:
# Cross validation

def evaluate_cv_5fold(dataset_id, dataset_name, dataset_key):

    from sklearn.model_selection import StratifiedKFold
    
    # Load full dataset without splitting
    full_data = load_dataset(dataset_id, dataset_name, split_data=False)
    
    X = full_data['X']
    y = full_data['y']
    cat_idx = full_data['cat_idx']
    
    # Storage for results
    results = {
        'GRANDE': {'f1': [], 'accuracy': [], 'roc_auc': []},
        'XGBoost': {'f1': [], 'accuracy': [], 'roc_auc': []},
        'CatBoost': {'f1': [], 'accuracy': [], 'roc_auc': []}
    }
    
    # 5-fold CV
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    print(f"\nStarting 5-fold cross-validation...")
    
    for fold, (train_val_idx, test_idx) in enumerate(skf.split(X, y), 1):
        print(f"\nFold {fold}/5") 
        
        # Split data
        X_train_val = X.iloc[train_val_idx]
        X_test = X.iloc[test_idx]
        y_train_val = y[train_val_idx]
        y_test = y[test_idx]
        
        # Split train_val into train and validation
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val
        )
        
        # Create fold data dictionary
        fold_data = {
            'X_train': X_train,
            'X_valid': X_valid,
            'X_test': X_test,
            'y_train': y_train,
            'y_valid': y_valid,
            'y_test': y_test,
            'cat_idx': cat_idx
        }
        
        # Train models
        _, grande_res = train_grande(fold_data, dataset_key)
        results['GRANDE']['f1'].append(grande_res['f1_score'])
        results['GRANDE']['accuracy'].append(grande_res['accuracy'])
        results['GRANDE']['roc_auc'].append(grande_res['roc_auc'])
        
        _, xgb_res = train_xgboost(fold_data, dataset_key)
        results['XGBoost']['f1'].append(xgb_res['f1_score'])
        results['XGBoost']['accuracy'].append(xgb_res['accuracy'])
        results['XGBoost']['roc_auc'].append(xgb_res['roc_auc'])
        
        _, cat_res = train_catboost(fold_data, dataset_key)
        results['CatBoost']['f1'].append(cat_res['f1_score'])
        results['CatBoost']['accuracy'].append(cat_res['accuracy'])
        results['CatBoost']['roc_auc'].append(cat_res['roc_auc'])
    
    # Create results DataFrame
    results_list = []
    for model in ['GRANDE', 'XGBoost', 'CatBoost']:
        results_list.append({
            'Model': model,
            'F1-Score': f"{np.mean(results[model]['f1']):.4f} ± {np.std(results[model]['f1']):.4f}",
            'Accuracy': f"{np.mean(results[model]['accuracy']):.4f} ± {np.std(results[model]['accuracy']):.4f}",
            'ROC-AUC': f"{np.mean(results[model]['roc_auc']):.4f} ± {np.std(results[model]['roc_auc']):.4f}"
        })
    
    return pd.DataFrame(results_list), results

print("OK")

OK



### Experiment 1: Small Dataset - WDBC (Breast Cancer)

Starting with the Wisconsin Diagnostic Breast Cancer (WDBC) dataset, which is a small dataset with 569 samples and 30 numerical features. This demonstrates GRANDE's performance on datasets with limited training data.

In [8]:

results_wdbc, details_wdbc = evaluate_cv_5fold(1510, "WDBC", 'wdbc')

print("WDBC RESULTS (5-Fold CV)")
print(results_wdbc.to_string(index=False))
print("\nPaper: GRANDE 0.975±0.010, XGBoost 0.953±0.030, CatBoost 0.963±0.023")


Loading dataset: WDBC (ID: 1510)
  - Original labels: ['1' '2']
  - Encoded to: [0 1]
  - Total samples: 569
  - Features: 30
  - Categorical features: 0

Starting 5-fold cross-validation...

Fold 1/5

Fold 2/5

Fold 3/5

Fold 4/5

Fold 5/5
WDBC RESULTS (5-Fold CV)
   Model        F1-Score        Accuracy         ROC-AUC
  GRANDE 0.9643 ± 0.0091 0.9666 ± 0.0085 0.9930 ± 0.0074
 XGBoost 0.9601 ± 0.0213 0.9631 ± 0.0195 0.9909 ± 0.0064
CatBoost 0.9605 ± 0.0197 0.9631 ± 0.0187 0.9948 ± 0.0047

Paper: GRANDE 0.975±0.010, XGBoost 0.953±0.030, CatBoost 0.963±0.023


### Experiment 2: Medium Dataset - Churn Prediction

Testing on the Churn dataset (5,000 samples, 20 features) which includes both numerical and categorical features. 

In [9]:

results_churn, details_churn = evaluate_cv_5fold(40701, "Churn", 'churn')
print("CHURN RESULTS (5-Fold CV)")
print(results_churn.to_string(index=False))
print("\nPaper: GRANDE 0.914±0.017, XGBoost 0.900±0.017, CatBoost 0.869±0.021")


Loading dataset: Churn (ID: 40701)
  - Original labels: ['0' '1']
  - Encoded to: [0 1]
  - Total samples: 5000
  - Features: 20
  - Categorical features: 4

Starting 5-fold cross-validation...

Fold 1/5

Fold 2/5

Fold 3/5

Fold 4/5

Fold 5/5
CHURN RESULTS (5-Fold CV)
   Model        F1-Score        Accuracy         ROC-AUC
  GRANDE 0.9063 ± 0.0041 0.9576 ± 0.0022 0.9213 ± 0.0174
 XGBoost 0.9095 ± 0.0098 0.9594 ± 0.0038 0.9219 ± 0.0177
CatBoost 0.9100 ± 0.0096 0.9598 ± 0.0037 0.9264 ± 0.0164

Paper: GRANDE 0.914±0.017, XGBoost 0.900±0.017, CatBoost 0.869±0.021



### Experiment 3: Understanding Instance-Wise Weighting

The PhishingWebsites dataset is perfect for demonstrating GRANDE's instance-wise weighting capability. Some phishing websites can be identified using simple rules (e.g., "has prefix/suffix in domain"), while others require complex patterns.


In [10]:

results_phishing, details_phishing = evaluate_cv_5fold(4534, "PhishingWebsites", 'phishing')
print("PHISHINGWEBSITES RESULTS (5-Fold CV)")
print(results_phishing.to_string(index=False))
print("\nPaper: GRANDE 0.969±0.006, XGBoost 0.968±0.006, CatBoost 0.965±0.003")

Loading dataset: PhishingWebsites (ID: 4534)
  - Original labels: ['-1' '1']
  - Encoded to: [0 1]
  - Total samples: 11055
  - Features: 30
  - Categorical features: 30

Starting 5-fold cross-validation...

Fold 1/5

Fold 2/5

Fold 3/5

Fold 4/5

Fold 5/5
PHISHINGWEBSITES RESULTS (5-Fold CV)
   Model        F1-Score        Accuracy         ROC-AUC
  GRANDE 0.9680 ± 0.0028 0.9684 ± 0.0028 0.9958 ± 0.0007
 XGBoost 0.9677 ± 0.0022 0.9682 ± 0.0022 0.9959 ± 0.0005
CatBoost 0.9652 ± 0.0016 0.9657 ± 0.0015 0.9955 ± 0.0006

Paper: GRANDE 0.969±0.006, XGBoost 0.968±0.006, CatBoost 0.965±0.003


#### REPRODUCTION 
**Experimental Reproduction Methodology**

This notebook reproduces the key experiments from "GRANDE: Gradient-Based Decision Tree Ensembles for Tabular Data" (Marton et al., ICLR 2024). The reproduction focused on three representative datasets from the paper's benchmark suite: WDBC (569 samples, 30 features), Churn (5,000 samples, 20 features with 4 categorical), and PhishingWebsites (11,055 samples, 30 categorical features).

**Reproduction Approach**
The experiments were conducted using the following methodology aligned with the paper's evaluation protocol:
- Model Implementation: GRANDE v0.1.6 from PyPI, XGBoost 3.0.5, CatBoost 1.2.8.
- Optimized Hyperparameters extracted directly from the paper's appendix (Tables 25, 27, 28).
- Stratified 5-fold cross-validation matching the paper's methodology, with each fold using an 80/20 train-validation split for early stopping.
- Computational Environment: Python 3.11, TensorFlow 2.16.1, scikit-learn 1.3.2.
- Datasets loaded directly from OpenML using the same dataset IDs as specified in the paper.
- Relied on GRANDE's built-in preprocessing pipeline, which handles categorical encoding and feature transformation automatically.
  
**Experimental Conditions and Limitations**
- The paper's experiments were likely conducted in 2023 with earlier library versions. This reproduction uses current releases (2024-2025), which may have different optimization behaviors, numerical precision, or default parameter handling.
- While the paper specifies leave-one-out encoding for high-cardinality categorical features and quantile transformation, the exact implementation details were not replicated. Instead, GRANDE's internal preprocessing was used, which may differ from the authors' original code.
- All experiments used a fixed random seed (42) for reproducibility.
- Experiments ran on Windows CPU.
- 
**Results Comparison with Original Paper**
  
WDBC Dataset
\begin{array}{lccc}
\textbf{Model} & \textbf{Paper F1-Score} & \textbf{Reproduction F1-Score}\\
\hline
\text{GRANDE} & $0.975 \pm 0.010$ & $0.964 \pm 0.009$ \\
\text{XGBoost} & $0.953 \pm 0.030$ & $0.960 \pm 0.021$ \\
\text{CatBoost} & $0.963 \pm 0.023$ & $0.961 \pm 0.020$ \\
\end{array}
Assessment: All methods achieve comparable performance with reported values. The paper showed GRANDE with clear superiority, while reproduction shows near-parity among all methods.

Churn Dataset

\begin{array}{lccc}
\textbf{Model} & \textbf{Paper F1-Score} & \textbf{Reproduction F1-Score}\\
\hline
\text{GRANDE} & $0.914 \pm 0.017$ & $0.906 \pm 0.004$ \\
\text{XGBoost} & $0.900 \pm 0.017$ & $0.910 \pm 0.010$ \\
\text{CatBoost} & $0.869 \pm 0.021$ & $0.910 \pm 0.010$ \\
\end{array}
Assessment: CatBoost substantially outperforms the paper's reported results, reversing the ranking from GRANDE-first to CatBoost-first. The reason could be different fold variability or potential methodological differences.

PhishingWebsites Dataset

\begin{array}{lccc}
\textbf{Model} & \textbf{Paper F1-Score} & \textbf{Reproduction F1-Score} \\
\hline
\text{GRANDE} & $0.969 \pm 0.006$ & $0.968 \pm 0.003$ \\
\text{XGBoost} & $0.968 \pm 0.006$ & $0.968 \pm 0.002$ \\
\text{CatBoost} & $0.965 \pm 0.003$ & $0.965 \pm 0.002$ \\
\end{array}
Assessment: All methods match the paper's results with very small diferences.

While PhishingWebsites shows the expected competitive performance, WDBC shows reduced GRANDE advantage, and Churn shows unexpected CatBoost superiority. This suggests performance rankings may be sensitive to implementation details. Different results between the reproduction and the paper may indicate differences in data splitting, preprocessing, or inherent properties of library implementations. Two of three datasets show non-trivial deviations, suggesting that exact numerical reproduction requires matching the complete software stack and implementation data.

The reproduction uses an explicit 80/20 train-validation split within each fold for GRANDE's early stopping. The paper's description is ambiguous about whether a separate validation set was used or whether early stopping relied on other criteria. Different validation strategies affect the effective training data size and model convergence.

### References

https://arxiv.org/pdf/2309.17130

https://github.com/s-marton/GRANDE
