# NODE

## Setting

In [None]:
# pytorch_tabular: NODE
!pip install -U pytorch_tabular[extra]
!pip install -U pytorch_tabular

In [None]:
import pandas as pd
import numpy as np
import os
import random
import torch

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid

from copy import deepcopy
from rich.progress import Progress

from pytorch_tabular import TabularModel
from pytorch_tabular.models import NodeConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
def seed_everything(seed = 21):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [4]:
def read_split_data():
    df = pd.read_csv('/content/drive/MyDrive/Data/adult.csv')
    df.income = np.where(df.income=='>50K', 1, 0)
    X = {}
    y = {}
    X['train'], X['test'] = train_test_split(df, test_size = 0.10, random_state=21)
    X['train'], X['val'] = train_test_split(X['train'], test_size = 0.10, random_state=21)

    return X

## Run

In [None]:
seed_everything()
X = read_split_data()

cat_index = X['train'].select_dtypes(['object']).columns.to_list()
num_index = X['train'].select_dtypes(['int64']).columns.to_list()[:-1]
target = ['income']

In [None]:
data_config = DataConfig(target=target,
                         continuous_cols=num_index,
                         categorical_cols=cat_index)

trainer_config = TrainerConfig(batch_size=64,
                               max_epochs=10,
                               accelerator='gpu',
                               early_stopping='valid_loss',
                               early_stopping_mode ='min',
                               early_stopping_patience=8,
                               checkpoints='valid_loss',
                               checkpoints_mode='min',
                               checkpoints_path='Node_Best',
                               load_best=True,
                               seed=21)

# 논문에서는 Quasi-Hyperbolic Adam 사용
optimizer_config = OptimizerConfig(optimizer='Adam',
                                   optimizer_params={'weight_decay': 1e-5},
                                   lr_scheduler=None)

head_config = LinearHeadConfig(layers="", # No additional layer in head, just a mapping layer to output_dim
                               dropout=0.1,
                               initialization="kaiming").__dict__ # Convert to dict to pass to the model config

model_config = NodeConfig(task='classification',
                          head='LinearHead',
                          head_config=head_config,
                          learning_rate=1e-3,
                          num_layers=2,
                          num_trees=1024,
                          depth=6,
                          additional_tree_output_dim=3,
                          choice_function='entmax15',
                          bin_function='entmoid15',
                          input_dropout=0.0,
                          embed_categorical=True,
                          embedding_dropout=0.1)

tabular_model = TabularModel(data_config=data_config,
                             model_config=model_config,
                             optimizer_config=optimizer_config,
                             trainer_config=trainer_config)

tabular_model.fit(train=X['train'],
                  validation=X['val'])

2023-09-04 15:10:03,276 - {pytorch_tabular.tabular_model:105} - INFO - Experiment Tracking is turned off
INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off
INFO:lightning_fabric.utilities.seed:Global seed set to 42
2023-09-04 15:10:03,304 - {pytorch_tabular.tabular_model:473} - INFO - Preparing the DataLoaders
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders
2023-09-04 15:10:03,310 - {pytorch_tabular.tabular_datamodule:290} - INFO - Setting up the datamodule for classification task
INFO:pytorch_tabular.tabular_datamodule:Setting up the datamodule for classification task
2023-09-04 15:10:03,468 - {pytorch_tabular.tabular_model:521} - INFO - Preparing the Model: NODEModel
INFO:pytorch_tabular.tabular_model:Preparing the Model: NODEModel
2023-09-04 15:10:03,786 - {pytorch_tabular.models.node.node_model:83} - INFO - Data Aware Initialization of NODE using a forward pass with 2000 batch size....
INFO:pytorch_tabular.models.node.node_model:Data Aware Initializat

Output()

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


2023-09-04 15:26:13,787 - {pytorch_tabular.tabular_model:584} - INFO - Training the model completed
INFO:pytorch_tabular.tabular_model:Training the model completed
2023-09-04 15:26:13,792 - {pytorch_tabular.tabular_model:1258} - INFO - Loading the best model
INFO:pytorch_tabular.tabular_model:Loading the best model


<pytorch_lightning.trainer.trainer.Trainer at 0x7b5b55aaae30>

In [None]:
tabular_model.evaluate(X['test'])

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

[{'test_loss': 0.36711040139198303, 'test_accuracy': 0.845854640007019}]

In [None]:
pred_df = tabular_model.predict(X['test'])
pred_df.head()

Output()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,0_probability,1_probability,prediction
37086,27,Private,89813,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Female,0,0,40,United-States,0,0.89702,0.10298,0
34719,31,Private,217962,Some-college,10,Never-married,Protective-serv,Other-relative,Black,Male,0,0,40,?,0,0.894579,0.105421,0
44913,18,Private,62972,11th,7,Never-married,Other-service,Own-child,White,Male,0,0,16,United-States,0,0.897167,0.102833,0
37168,24,Private,122166,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,40,Iran,0,0.89512,0.10488,0
27778,21,Private,312017,Some-college,10,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,0,0.89584,0.104161,0


## Tuning

In [5]:
seed_everything()
X = read_split_data()

cat_index = X['train'].select_dtypes(['object']).columns.to_list()
num_index = X['train'].select_dtypes(['int64']).columns.to_list()[:-1]
target = ['income']

results = []

In [6]:
data_config = DataConfig(target=target,
                         continuous_cols=num_index,
                         categorical_cols=cat_index)

trainer_config = TrainerConfig(batch_size=64,
                               max_epochs=5,
                               accelerator='gpu',
                               early_stopping='valid_loss',
                               early_stopping_mode ='min',
                               early_stopping_patience=8,
                               checkpoints='valid_loss',
                               checkpoints_mode='min',
                               checkpoints_path='Node_Best',
                               load_best=True,
                               seed=21)

optimizer_config = OptimizerConfig(optimizer='Adam',
                                   optimizer_params={'weight_decay': 1e-5},
                                   lr_scheduler=None)

head_config = LinearHeadConfig(layers="",
                               dropout=0.1,
                               initialization="kaiming").__dict__

model_config = NodeConfig(task='classification',
                          head='LinearHead',
                          head_config=head_config,
                          learning_rate=1e-3,
                          num_layers=1,
                          num_trees=512,
                          depth=6,
                          additional_tree_output_dim=3,
                          choice_function='entmax15',
                          bin_function='entmoid15',
                          input_dropout=0.0,
                          embed_categorical=True,
                          embedding_dropout=0.1)

tabular_model = TabularModel(data_config=data_config,
                             model_config=model_config,
                             optimizer_config=optimizer_config,
                             trainer_config=trainer_config)

datamodule = tabular_model.prepare_dataloader(train=X['train'], validation=X['val'], seed=21)
model = tabular_model.prepare_model(datamodule)
tabular_model.train(model, datamodule)

2023-09-05 05:55:25,988 - {pytorch_tabular.tabular_model:105} - INFO - Experiment Tracking is turned off
INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off
2023-09-05 05:55:26,036 - {pytorch_tabular.tabular_model:473} - INFO - Preparing the DataLoaders
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders
2023-09-05 05:55:26,044 - {pytorch_tabular.tabular_datamodule:290} - INFO - Setting up the datamodule for classification task
INFO:pytorch_tabular.tabular_datamodule:Setting up the datamodule for classification task
2023-09-05 05:55:26,277 - {pytorch_tabular.tabular_model:521} - INFO - Preparing the Model: NODEModel
INFO:pytorch_tabular.tabular_model:Preparing the Model: NODEModel
2023-09-05 05:55:26,367 - {pytorch_tabular.models.node.node_model:83} - INFO - Data Aware Initialization of NODE using a forward pass with 2000 batch size....
INFO:pytorch_tabular.models.node.node_model:Data Aware Initialization of NODE using a forward pass with 2000 batch size....
2

Output()

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


2023-09-05 05:57:31,690 - {pytorch_tabular.tabular_model:584} - INFO - Training the model completed
INFO:pytorch_tabular.tabular_model:Training the model completed
2023-09-05 05:57:31,693 - {pytorch_tabular.tabular_model:1258} - INFO - Loading the best model
INFO:pytorch_tabular.tabular_model:Loading the best model


<pytorch_lightning.trainer.trainer.Trainer at 0x7843b16639d0>

In [7]:
result = tabular_model.evaluate(X['test'])

result = result[0]
result["Type"] = "UnTuned"
results.append(result)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

In [None]:
# Define the Grid
param_grid = {"model_config__num_layers": [2, 4],
              "model_config__num_trees": [256],
              "model_config__depth": [6],
              "model_config__additional_tree_output_dim": [2, 3]}

trials = []
with Progress() as progress:
    task = progress.add_task("[green]GridSearch...", total=sum(1 for _ in ParameterGrid(param_grid)))
    for params in ParameterGrid(param_grid):
        trainer_config_t = deepcopy(trainer_config)
        optimizer_config_t = deepcopy(optimizer_config)
        head_config_t = deepcopy(head_config)
        model_config_t = deepcopy(model_config)

        for name, param in params.items():
            root, p = name.split("__")
            if root == "model_config":
                setattr(model_config_t, p, param)
            elif root == "trainer_config":
                setattr(trainer_config_t, p, param)
            elif root == "optimizer_config":
                setattr(optimizer_config_t, p, param)
            elif root == "model_config~head_config":
                model_config_t.head_config[p] = param
            else:
                warnings.warn(f"Unknown parameter defined. Ignoring {name}")

        tabular_model_t = TabularModel(data_config=data_config,
                                       model_config=model_config_t,
                                       optimizer_config=optimizer_config_t,
                                       trainer_config=trainer_config_t,)

        model_t = tabular_model_t.prepare_model(datamodule)
        tabular_model_t.train(model_t, datamodule)
        result_t = tabular_model_t.evaluate(X['val'])[0]
        params.update(result_t)
        trials.append(params)
        progress.update(task, advance=1)

In [13]:
trials_df = pd.DataFrame(trials)
trials_df.head()

Unnamed: 0,model_config__additional_tree_output_dim,model_config__depth,model_config__num_layers,model_config__num_trees,test_loss,test_accuracy
0,2,6,2,256,0.340476,0.845086
1,2,6,4,256,0.346658,0.846679
2,3,6,2,256,0.337868,0.846679
3,3,6,4,256,0.345672,0.845769


In [14]:
# Params with lowest loss
trials_df.loc[trials_df.test_loss.idxmin()]

model_config__additional_tree_output_dim      3.000000
model_config__depth                           6.000000
model_config__num_layers                      2.000000
model_config__num_trees                     256.000000
test_loss                                     0.337868
test_accuracy                                 0.846679
Name: 2, dtype: float64

In [15]:
# Params with highest accuracy
trials_df.loc[trials_df.test_accuracy.idxmax()]

model_config__additional_tree_output_dim      2.000000
model_config__depth                           6.000000
model_config__num_layers                      4.000000
model_config__num_trees                     256.000000
test_loss                                     0.346658
test_accuracy                                 0.846679
Name: 1, dtype: float64