In [None]:
# `!export ...` runs in a throwaway subshell, so the variable never reached the
# kernel process. Set it on the kernel's own environment instead.
# NOTE(review): presumably this must run before PyTorch performs its first CUDA
# allocation to have any effect -- confirm against the PyTorch CUDA notes.
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [None]:
import torch

# Release cached CUDA memory left over from earlier work in this kernel.
# Guarded so the cell is a no-op on CPU-only machines; the rest of the notebook
# already falls back to CPU when CUDA is unavailable.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()


<a target="_blank" href="https://colab.research.google.com/github/yandex-research/tabm/blob/main/example.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# TabM

This notebook provides a usage example of the `tabm` package from the
[TabM](https://github.com/yandex-research/tabm) project.

In [None]:
!pip install rtdl_num_embeddings
!pip install tabm



In [None]:
import math
import random
from copy import deepcopy
from typing import Any, Literal, NamedTuple, Optional

import numpy as np
import rtdl_num_embeddings  # https://github.com/yandex-research/rtdl-num-embeddings
import scipy.special
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
import tabm
import torch
import torch.nn as nn
import torch.optim
from torch import Tensor

from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score

In [None]:
# Seed the Python, NumPy and PyTorch RNGs, each with its own offset of the base seed.
seed = 0
random.seed(seed)
np.random.seed(seed + 1)
torch.manual_seed(seed + 2)
# No-op final statement; presumably here so the cell does not echo the value
# returned by torch.manual_seed.
pass

# Dataset example

In [None]:
# >>> Dataset.
TaskType = Literal['regression', 'binclass', 'multiclass']

# Default setup: regression on the California housing data (numerical features only).
task_type: TaskType = 'regression'
n_classes = None
dataset = sklearn.datasets.fetch_california_housing()
X_num: np.ndarray = dataset['data']
Y: np.ndarray = dataset['target']

# Alternative setup: synthetic classification. Uncomment to replace the data above.
# n_classes = 2
# assert n_classes >= 2
# task_type: TaskType = 'binclass' if n_classes == 2 else 'multiclass'
# X_num, Y = sklearn.datasets.make_classification(
#     n_samples=20000,
#     n_features=8,
#     n_classes=n_classes,
#     n_informative=3,
#     n_redundant=2,
# )

task_is_regression = task_type == 'regression'

# >>> Numerical (continuous) features, stored as float32.
X_num: np.ndarray = X_num.astype(np.float32)
n_num_features = X_num.shape[1]

# >>> Categorical features.
# Neither dataset above has categorical features; listing cardinalities here
# generates random ones for demonstration purposes.
cat_cardinalities = [
    # NOTE: uncomment the two lines below to add two categorical features.
    # 4,  # Allowed values: [0, 1, 2, 3].
    # 7,  # Allowed values: [0, 1, 2, 3, 4, 5, 6].
]
if cat_cardinalities:
    X_cat = np.column_stack(
        [np.random.randint(0, c, (len(X_num),)) for c in cat_cardinalities]
    )
else:
    X_cat = None

# >>> Labels: float32 for regression, int64 class ids 0..n_classes-1 otherwise.
if task_is_regression:
    Y = Y.astype(np.float32)
else:
    assert n_classes is not None
    Y = Y.astype(np.int64)
    assert set(Y.tolist()) == set(range(n_classes)), (
        'Classification labels must form the range [0, 1, ..., n_classes - 1]'
    )

# >>> Split the dataset: 80/20 into train+val / test, then 80/20 into train / val.
all_idx = np.arange(len(Y))
trainval_idx, test_idx = sklearn.model_selection.train_test_split(
    all_idx, train_size=0.8
)
train_idx, val_idx = sklearn.model_selection.train_test_split(
    trainval_idx, train_size=0.8
)
data_numpy = {
    part: {'x_num': X_num[idx], 'y': Y[idx]}
    for part, idx in (('train', train_idx), ('val', val_idx), ('test', test_idx))
}
if X_cat is not None:
    data_numpy['train']['x_cat'] = X_cat[train_idx]
    data_numpy['val']['x_cat'] = X_cat[val_idx]
    data_numpy['test']['x_cat'] = X_cat[test_idx]

# One line per array: part, key, shape, dtype.
print(
    '\n'.join(
        f'{part:<5}    {key:<5}    {value.shape!r:<10}    {value.dtype}'
        for part, part_data in data_numpy.items()
        for key, value in part_data.items()
    )
)

train    x_num    (13209, 8)    float32
train    y        (13209,)      float32
val      x_num    (3303, 8)     float32
val      y        (3303,)       float32
test     x_num    (4128, 8)     float32
test     y        (4128,)       float32


# tasks

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

## Abalone

In [None]:
# Load the Abalone table: every column except the last is a feature, "Rings" is the target.
abalone = pd.read_csv("abalone.csv")
X = abalone.iloc[:, :-1]
y = abalone["Rings"]

# The target stays on its original scale; the log1p transform (intended for an
# RMSLE-style score) is disabled. If re-enabled, invert it with: y = np.exp(y_log) - 1
y_log = y  # np.log(1 + y)

# Fixed, order-based split: the first ABALONE_N_TRAIN rows train, the rest test.
ABALONE_N_TRAIN = 2784
y_train = y_log.iloc[:ABALONE_N_TRAIN]
y_test = y_log.iloc[ABALONE_N_TRAIN:]

# Keep the real 'Sex' values. This cell used to overwrite the column with random
# categories (left over from a demo snippet), which turned a genuine feature into
# noise. `.copy()` makes the two frames independent of `X`.
train = X.iloc[:ABALONE_N_TRAIN, :].copy()
test = X.iloc[ABALONE_N_TRAIN:, :].copy()

# One-hot encode 'Sex': the encoder is fitted on the training rows only and then
# reused for the test rows; categories unseen in training encode to all zeros.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

train_encoded = pd.concat(
    [
        train.drop(columns=['Sex']).reset_index(drop=True),
        pd.DataFrame(
            encoder.fit_transform(train[['Sex']]).astype('int'),
            columns=encoder.categories_[0],
        ),
    ],
    axis=1,
)

test_encoded = pd.concat(
    [
        test.drop(columns=['Sex']).reset_index(drop=True),
        pd.DataFrame(
            encoder.transform(test[['Sex']]).astype('int'),
            columns=encoder.categories_[0],
        ),
    ],
    axis=1,
)

# `train_encoded` and `test_encoded` now hold the numerical columns followed by
# the one-hot 'Sex' columns.

# Now, 'train_encoded' and 'test_encoded' contain the encoded 'Sex' column.

def log_transformation(data, columns):
    """Append a ``<column>_log`` feature for each requested column.

    Each column is shifted so that its minimum becomes 1 before taking the
    natural logarithm, which keeps the argument of the log strictly positive.

    Args:
        data: DataFrame to extend. It is modified in place.
        columns: Names of the columns to transform.

    Returns:
        The same ``data`` object, with the new columns added.
    """
    for name in columns:
        series = data[name]
        data[f'{name}_log'] = np.log(series - series.min() + 1)
    return data


# Disabled experiment (the `if False` guard keeps it from running): add
# `<column>_log` features for the listed numerical columns via log_transformation.
# If enabled, the results are bound to `train` / `test`; log_transformation also
# modifies the frames it receives in place.
if False:
    train = log_transformation(train_encoded, ['Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight','Viscera_weight', 'Shell_weight'])
    test  = log_transformation(test_encoded, ['Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight','Viscera_weight', 'Shell_weight'])

In [None]:
# Feature scaling: fit the min-max scaler on the training rows only, then reuse
# it unchanged for the test rows.
scaler = MinMaxScaler(feature_range=(0, 1))
traindata = scaler.fit_transform(np.asarray(train_encoded))
testdata = scaler.transform(np.asarray(test_encoded))

# Target scaling follows the same protocol. The scaler needs 2-D input, hence
# the reshape of the targets into a single column.
scaler_y = MinMaxScaler(feature_range=(0, 1))
trainlabel = scaler_y.fit_transform(np.asarray(y_train).reshape(-1, 1))
testlabel = scaler_y.transform(np.asarray(y_test).reshape(-1, 1))

# Names consumed by the following cells.
train_x, train_y = np.asarray(traindata), np.asarray(trainlabel)
test_x, test_y = np.asarray(testdata), np.asarray(testlabel)

In [None]:
# Hold out 20% of the training rows for validation (fixed random_state).
# NOTE: train_x / train_y are reassigned, so re-running this cell shrinks the
# training set again.
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [None]:
# Pack the three splits into the {part: {'x_num', 'y'}} layout used by the rest
# of the notebook; features and targets are both cast to float32.
data_numpy = {
    part: {'x_num': features.astype(np.float32), 'y': targets.astype(np.float32)}
    for part, (features, targets) in {
        'train': (train_x, train_y),
        'val': (val_x, val_y),
        'test': (test_x, test_y),
    }.items()
}

In [None]:
# Mean of the test targets as they currently stand (min-max scaled by the cells
# above), shown as the cell output.
np.mean(test_y)

np.float64(0.3149676956209619)

## housing

In [None]:
from keras.datasets import boston_housing

(traindata_o, trainlabel_o), (testdata_o, testlabel_o) = boston_housing.load_data()

# Re-split: keep the first 337 training rows for training and append the
# remaining training rows to the test set. The right-hand sides are evaluated
# before either name is rebound, so both use the original arrays.
testdata_o, traindata_o = (
    np.concatenate((testdata_o, traindata_o[337:])),
    traindata_o[:337],
)
testlabel_o, trainlabel_o = (
    np.concatenate((testlabel_o, trainlabel_o[337:])),
    trainlabel_o[:337],
)

# Min-max scale the features; the scaler is fitted on the training rows only.
scaler = MinMaxScaler(feature_range=(0, 1))
traindata, testdata = scaler.fit_transform(traindata_o), scaler.transform(testdata_o)

# Same for the targets, reshaped to a single column as the scaler requires 2-D input.
scaler_y = MinMaxScaler(feature_range=(0, 1))
trainlabel = scaler_y.fit_transform(trainlabel_o.reshape(-1, 1))
testlabel = scaler_y.transform(testlabel_o.reshape(-1, 1))

# Names consumed by the following cells.
train_x, train_y = np.asarray(traindata), np.asarray(trainlabel)
test_x, test_y = np.asarray(testdata), np.asarray(testlabel)

In [None]:
# Hold out 20% of the training rows for validation (fixed random_state).
# NOTE: train_x / train_y are reassigned, so re-running this cell shrinks the
# training set again.
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [None]:
# Pack the three splits into the {part: {'x_num', 'y'}} layout used by the rest
# of the notebook; features and targets are both cast to float32.
data_numpy = {
    part: {'x_num': features.astype(np.float32), 'y': targets.astype(np.float32)}
    for part, (features, targets) in {
        'train': (train_x, train_y),
        'val': (val_x, val_y),
        'test': (test_x, test_y),
    }.items()
}

In [None]:
# Mean of the test targets as they currently stand (min-max scaled by the cells
# above), shown as the cell output.
np.mean(test_y)

np.float64(0.4086522024983563)

## IZMIR

In [None]:
# Load the Izmir table from CSV and preview its first rows.
df= pd.read_csv("izmir_df.csv")
df.head()

Unnamed: 0,Max_temperature,Min_temperature,Dewpoint,Precipitation,Sea_level_pressure,Standard_pressure,Visibility,Wind_speed,Max_wind_speed,Mean_temperature
0,88.2,57.2,53.6,0.0,29.96,7.3,9.09,16.1,34.28,74.3
1,88.0,58.6,54.9,0.0,29.84,7.3,10.7,18.3,34.28,75.2
2,91.6,62.1,60.4,0.0,29.76,7.2,8.29,18.3,34.28,76.1
3,64.4,42.8,37.4,0.2,30.15,7.8,21.1,27.5,34.28,47.1
4,94.1,72.3,46.8,0.0,29.86,7.2,17.2,25.3,34.28,83.9


In [None]:
from sklearn.model_selection import train_test_split

# Fixed split sizes for the table loaded into `df` by the previous cell.
train_size = 974
test_size = 487

# Features are all columns but the last; the last column is the target.
# train_test_split returns pandas objects here, so each part is unwrapped to
# its NumPy values.
train_x, test_x, train_y, test_y = (
    part.values
    for part in train_test_split(
        df.iloc[:, :-1],
        df.iloc[:, -1],
        train_size=train_size,
        test_size=test_size,
        random_state=42,
    )
)
# The scaler below needs 2-D input, so the targets become single columns.
train_y, test_y = train_y.reshape(-1, 1), test_y.reshape(-1, 1)

# Min-max scale features and targets; both scalers are fitted on the training
# part only (fit_transform is evaluated before transform).
scaler = MinMaxScaler(feature_range=(0, 1))
train_x, test_x = scaler.fit_transform(train_x), scaler.transform(test_x)

scaler_y = MinMaxScaler(feature_range=(0, 1))
train_y, test_y = scaler_y.fit_transform(train_y), scaler_y.transform(test_y)

print(*(split.shape for split in (train_x, train_y, test_x, test_y)))

(974, 9) (974, 1) (487, 9) (487, 1)


In [None]:

# Hold out 20% of the training rows for validation (fixed random_state).
# NOTE: train_x / train_y are reassigned, so re-running this cell shrinks the
# training set again.
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [None]:
# Pack the three splits into the {part: {'x_num', 'y'}} layout used by the rest
# of the notebook; features and targets are both cast to float32.
data_numpy = {
    part: {'x_num': features.astype(np.float32), 'y': targets.astype(np.float32)}
    for part, (features, targets) in {
        'train': (train_x, train_y),
        'val': (val_x, val_y),
        'test': (test_x, test_y),
    }.items()
}

## MNIST

In [None]:
# Load MNIST through the Keras dataset loader.
import tensorflow.keras as keras

(traindata_o, trainlabel), (testdata_o, testlabel) = keras.datasets.mnist.load_data()

# Flatten every 28x28 image into one row of 784 values and divide by 255.
traindata_o = traindata_o.reshape(-1, 28 * 28).astype('float64') / 255
testdata_o = testdata_o.reshape(-1, 28 * 28).astype('float64') / 255

# One-hot encode the 10 digit classes.
trainlabel = keras.utils.to_categorical(trainlabel, 10)
testlabel = keras.utils.to_categorical(testlabel, 10)

print(traindata_o.shape, trainlabel.shape, testdata_o.shape, testlabel.shape)

# No feature standardisation is applied in this cell (an earlier StandardScaler
# variant was left disabled), so the arrays are passed on as they are.
train_x, train_y = np.asarray(traindata_o), np.asarray(trainlabel)
test_x, test_y = np.asarray(testdata_o), np.asarray(testlabel)

(60000, 784) (60000, 10) (10000, 784) (10000, 10)


In [None]:
# Turn the per-row label vectors into integer class ids (index of the largest entry).
train_y=np.argmax(train_y, axis=1)
test_y= np.argmax(test_y, axis=1)

In [None]:
# Hold out 20% of the training rows for validation (fixed random_state).
# NOTE: train_x / train_y are reassigned, so re-running this cell shrinks the
# training set again.
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

## Coil100

In [None]:
# Load the COIL-100 images (opened memory-mapped, read-only) and divide by 255;
# load the matching labels the same way. Shapes are printed for reference.
X=np.load("coil100.npy", mmap_mode='r')/255
print(X.shape)
y=np.load("coil100_y.npy", mmap_mode='r')
print(y.shape)

(7200, 32, 32)
(7200,)


In [None]:
# Shift the labels down by one and show the new maximum (presumably the raw
# labels start at 1 -- confirm).
# NOTE: `y` is reassigned, so re-running this cell shifts the labels again.
y=y-1
np.max(y)

np.int64(99)

In [None]:
from sklearn.model_selection import train_test_split
# Flatten each 32x32 image into one row of 1024 values (sizes are hard-coded
# for the 7200-image array loaded above).
X= X.reshape(7200, 32*32)

# Split the dataset into training and test sets
# (5000 training rows, 2200 test rows, shuffled with a fixed random_state).
train_x, test_x,train_y,test_y = train_test_split(X,y, test_size=2200, train_size=5000, shuffle=True, random_state=42)

In [None]:
# Hold out 20% of the training rows for validation (fixed random_state).
# NOTE: train_x / train_y are reassigned, so re-running this cell shrinks the
# training set again.
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

## Isolet

In [None]:
# Load the Isolet labels and features, shift the labels down by one and show
# the new maximum.
# NOTE(review): the displayed maximum is an np.uint8, so the labels are unsigned
# 8-bit; a raw label of 0 would wrap around on the subtraction -- confirm the
# raw labels start at 1.
labels = np.load('isolet_labels.npy')
features = np.load('isolet_features.npy')
labels=labels-1
np.max(labels)

np.uint8(25)

In [None]:
# Split into 1092 training rows and the remainder as test rows (fixed random_state).
train_x, test_x,train_y,test_y = train_test_split(features,labels ,train_size=1092, random_state=42)


In [None]:
# Hold out 20% of the training rows for validation (fixed random_state).
# NOTE: train_x / train_y are reassigned, so re-running this cell shrinks the
# training set again.
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

# Data preprocessing

In [None]:
# Pack the current splits into the {part: {'x_num', 'y'}} layout used below.
# Only the features are cast to float32; the labels keep whatever dtype they have.
data_numpy = {
    part: {'x_num': features.astype(np.float32), 'y': targets}
    for part, features, targets in (
        ('train', train_x, train_y),
        ('val', val_x, val_y),
        ('test', test_x, test_y),
    )
}

In [None]:
# Task configuration for the classification datasets. Run this cell OR the
# regression one below, not both: whichever runs last wins.
# n_classes must match the loaded dataset; the trailing values are the
# alternatives used for the other classification datasets.
task_type="classification"
n_classes = 26 #100 #10
cat_cardinalities = []

In [None]:
# Task configuration for the regression datasets. Run this cell OR the
# classification one above, not both: whichever runs last wins.
task_type="regression"
n_classes = None
cat_cardinalities = []

In [None]:
# Variant without additional preprocessing: use the training labels as they
# are and record that no label statistics exist.
# NOTE: the next cell assigns both names again, so this setup only holds if
# that cell is skipped.
Y_train = data_numpy['train']['y'].copy()
regression_label_stats = None

In [None]:
# Feature preprocessing.
# NOTE
# Which preprocessing strategy works best depends on the task and the model.

# Simple strategy (disabled):
# preprocessing = sklearn.preprocessing.StandardScaler().fit(
#     data_numpy['train']['x_num']
# )

# Advanced strategy: quantile transform to a normal distribution.
# A tiny amount of noise is added to the training features before fitting,
# which improves the output of QuantileTransformer in some cases.
x_num_train_numpy = data_numpy['train']['x_num']
noise = np.random.default_rng(0).normal(0.0, 1e-5, x_num_train_numpy.shape)
noise = noise.astype(x_num_train_numpy.dtype)

# One quantile per 30 training rows, kept within [10, 1000].
preprocessing = sklearn.preprocessing.QuantileTransformer(
    n_quantiles=min(max(x_num_train_numpy.shape[0] // 30, 10), 1000),
    output_distribution='normal',
    subsample=10**9,
)
preprocessing.fit(x_num_train_numpy + noise)
del x_num_train_numpy

# Apply the fitted transform to every part. The arrays in `data_numpy` are
# replaced, so re-running this cell transforms already-transformed features.
for part in data_numpy:
    data_numpy[part].update(x_num=preprocessing.transform(data_numpy[part]['x_num']))


# Label preprocessing.
# Mean and standard deviation of the training labels, kept so that predictions
# can be mapped back to the original label scale.
RegressionLabelStats = NamedTuple(
    'RegressionLabelStats',
    [
        ('mean', float),  # mean of the training labels
        ('std', float),  # standard deviation of the training labels
    ],
)


# Work on a copy so the labels stored in `data_numpy` stay untouched.
Y_train = data_numpy['train']['y'].copy()

# For regression tasks it is highly recommended to standardize the training
# labels; the statistics are kept for mapping predictions back later.
# For any other task there is nothing to record.
regression_label_stats = (
    RegressionLabelStats(mean=Y_train.mean().item(), std=Y_train.std().item())
    if task_type == 'regression'
    else None
)
if regression_label_stats is not None:
    Y_train = (Y_train - regression_label_stats.mean) / regression_label_stats.std

#  PyTorch settings

In [None]:
# Device: first CUDA GPU when available, CPU otherwise.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Convert every array of every part to a tensor on the chosen device.
data = {
    part: {k: torch.as_tensor(v, device=device) for k, v in data_numpy[part].items()}
    for part in data_numpy
}
Y_train = torch.as_tensor(Y_train, device=device)
if task_type == 'regression':
    # Regression targets are cast to float32.
    for part in data:
        data[part]['y'] = data[part]['y'].float()
    Y_train = Y_train.float()

# Automatic mixed precision (AMP).
# torch.float16 is implemented for completeness, but it was not tested in the
# project, so torch.bfloat16 is preferred whenever the GPU supports it.
if not torch.cuda.is_available():
    amp_dtype = None
elif torch.cuda.is_bf16_supported():
    amp_dtype = torch.bfloat16
else:
    amp_dtype = torch.float16

# Changing False to True can speed up training
# of large enough models on compatible hardware.
amp_enabled = False and amp_dtype is not None

# A gradient scaler is only needed for float16 autocast. It used to be created
# whenever the candidate dtype was float16, even with AMP switched off, which
# made plain float32 training go through loss scaling for no reason.
grad_scaler = (
    torch.cuda.amp.GradScaler()  # type: ignore
    if amp_enabled and amp_dtype is torch.float16
    else None
)

# torch.compile
compile_model = False

# fmt: off
print(f'Device:        {device.type.upper()}')
print(f'AMP:           {amp_enabled}{f" ({amp_dtype})"if amp_enabled else ""}')
print(f'torch.compile: {compile_model}')
# fmt: on

Device:        CUDA
AMP:           False
torch.compile: False


# Model and optimizer

The best performance is usually achieved with `num_embeddings`
from the `rtdl_num_embeddings` package. Typically, `PiecewiseLinearEmbeddings`
and `PeriodicEmbeddings` perform best.

In [None]:
# Number of numerical features = number of columns of the training matrix.
n_num_features=train_x.shape[1]
# No embeddings.
# NOTE: the next cell assigns num_embeddings again; skip it to keep this setting.
num_embeddings = None

In [None]:


# Choose the numerical-feature embeddings; exactly one variant should be active.
# Simple embeddings. classification
num_embeddings = rtdl_num_embeddings.LinearReLUEmbeddings(n_num_features)

# Periodic embeddings.
#num_embeddings = rtdl_num_embeddings.PeriodicEmbeddings(n_num_features, lite=False)

# Piecewise-linear embeddings. regression
# (the bins are computed from the training features)
#num_embeddings = rtdl_num_embeddings.PiecewiseLinearEmbeddings(
#    rtdl_num_embeddings.compute_bins(data['train']['x_num'], n_bins=48),
#    d_embedding=16,
#    activation=False,
#    version='B',
#)

In [None]:


# Build TabM: one output for regression (n_classes is None), otherwise one
# output per class.
model = tabm.TabM.make(
    n_num_features=n_num_features,
    cat_cardinalities=cat_cardinalities,
    d_out=n_classes if n_classes is not None else 1,
    num_embeddings=num_embeddings,
)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3, weight_decay=3e-4)
gradient_clipping_norm: Optional[float] = 1.0

# Evaluation context: a compiled model is evaluated under torch.no_grad,
# an uncompiled one under torch.inference_mode.
evaluation_mode = torch.no_grad if compile_model else torch.inference_mode
if compile_model:
    # NOTE
    # `torch.compile(model, mode="reduce-overhead")` caused issues during training,
    # so the `mode` argument is not used.
    model = torch.compile(model)

In [None]:
# Show the architecture and the number of trainable parameters.
print(model)
print(f'Total parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}')


#'k': 32,

# MNIST without preprocessing

TabM(
  (ensemble_view): EnsembleView()
  (backbone): MLPBackboneBatchEnsemble(
    (blocks): ModuleList(
      (0-2): 3 x Sequential(
        (0): LinearBatchEnsemble()
        (1): ReLU()
        (2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (output): LinearEnsemble()
)
Total parameters: 1,246,016


In [None]:
# Show the architecture and the number of trainable parameters.
print(model)
print(f'Total parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}')


#'k': 32,

# MNIST with preprocessing

TabM(
  (num_module): LinearReLUEmbeddings(
    (linear): LinearEmbeddings()
    (activation): ReLU()
  )
  (ensemble_view): EnsembleView()
  (backbone): MLPBackboneBatchEnsemble(
    (blocks): ModuleList(
      (0-1): 2 x Sequential(
        (0): LinearBatchEnsemble()
        (1): ReLU()
        (2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (output): LinearEnsemble()
)
Total parameters: 14,206,272


# Training

In [None]:
# A quick reminder: TabM represents an ensemble of k MLPs.
#
# The option below determines if the MLPs are trained
# on the same batches (share_training_batches=True) or
# on different batches. Technically, this option determines:
# - How the loss function is implemented.
# - How the training batches are constructed.
#
# `True` is recommended by default because of better training efficiency.
# On some tasks, `False` may provide better performance.
share_training_batches = True
# Recomputed here because task_type may have been changed by the configuration
# cells above.
task_is_regression = task_type == 'regression'

In [None]:
@torch.autocast(device.type, enabled=amp_enabled, dtype=amp_dtype)  # type: ignore[code]
def apply_model(part: str, idx: Tensor) -> Tensor:
    """Run the model on the rows ``idx`` of the given data part.

    Args:
        part: Key into ``data`` ('train', 'val' or 'test').
        idx: Indices of the rows to feed to the model.

    Returns:
        The model output with its last dimension squeezed (a no-op unless that
        dimension has size 1, as for regression) and cast to float32.
    """
    part_data = data[part]
    # Categorical features are optional; the model receives None without them.
    x_cat = part_data['x_cat'][idx] if 'x_cat' in part_data else None
    output = model(part_data['x_num'][idx], x_cat)
    return output.squeeze(-1).float()


# Per-prediction loss: mean squared error for regression, cross-entropy otherwise.
base_loss_fn = (
    nn.functional.mse_loss if task_is_regression else nn.functional.cross_entropy
)


def loss_fn(y_pred: Tensor, y_true: Tensor) -> Tensor:
    """Training loss over all k predictions of TabM, each trained separately.

    Args:
        y_pred: Model output; its first two dimensions (batch, k) are merged,
            giving (batch_size * k,) for regression and
            (batch_size * k, n_classes) for classification.
        y_true: Targets. With shared training batches every target is repeated
            k times; otherwise the targets are expected to already carry one
            entry per ensemble member, shape (batch_size, k), and are flattened.

    Returns:
        The value of ``base_loss_fn`` on the flattened predictions and targets.
    """
    flat_pred = y_pred.flatten(0, 1)
    if share_training_batches:
        # (batch_size,) -> (batch_size * k,)
        return base_loss_fn(flat_pred, y_true.repeat_interleave(model.backbone.k))
    # (batch_size, k) -> (batch_size * k,)
    return base_loss_fn(flat_pred, y_true.flatten(0, 1))

@evaluation_mode()
def evaluate(part: str) -> float:
    """Score the model on one data part; higher is always better.

    The k predictions of TabM are averaged: directly for regression, in
    probability space for classification.

    Args:
        part: Key into ``data`` ('train', 'val' or 'test').

    Returns:
        Regression: the negative RMSE, computed after mapping the predictions
        back to the original label scale when ``regression_label_stats`` is set.
        Classification: the weighted F1 score of the arg-max class.
    """
    model.eval()

    # When using torch.compile, you may need to reduce the evaluation batch size.
    eval_batch_size = 8096
    y_pred: np.ndarray = (
        torch.cat(
            [
                apply_model(part, idx)
                for idx in torch.arange(len(data[part]['y']), device=device).split(
                    eval_batch_size
                )
            ]
        )
        .cpu()
        .numpy()
    )
    y_true = data[part]['y'].cpu().numpy()

    if task_is_regression:
        # Transform the predictions back to the original label space.
        if regression_label_stats is not None:
            y_pred = y_pred * regression_label_stats.std + regression_label_stats.mean
        # Mean of the k predictions: (n, k) -> (n,).
        # The arg-max that used to follow unconditionally only makes sense for
        # class probabilities and fails on this 1-D array, so it now lives in
        # the classification branch only.
        y_pred = y_pred.mean(1)
        score = -(sklearn.metrics.mean_squared_error(y_true, y_pred) ** 0.5)
    else:
        # For classification, the mean must be computed in the probability
        # space: (n, k, n_classes) -> (n, n_classes).
        y_pred = scipy.special.softmax(y_pred, axis=-1).mean(1)
        score = f1_score(y_true, y_pred.argmax(1), average="weighted")
    return float(score)  # The higher -- the better.


# Sanity check: test score of the model before any training step.
print(f'Test score before training: {evaluate("test"):.4f}')

Test score before training: 0.0044


In [None]:
# Repeated training runs with early stopping. `l_mlp` collects one summary
# value per run: the test RMSE for regression, the test F1 for classification.
#
# NOTE(review): `model` and `optimizer` are NOT re-created between runs, so
# every run after the first continues from the best checkpoint of the previous
# one. The runs are therefore not independent -- confirm whether the reported
# mean +- std is meant to be over independently initialised models.
n_runs = 10
n_epochs = 100
batch_size = 128  # Reduced batch size

# Early stopping: the training stops if the validation score
# does not improve for more than `patience` consecutive epochs.
patience = 16


def make_checkpoint() -> dict[str, Any]:
    """Deep-copy the current model/optimizer state together with epoch and metrics."""
    return deepcopy(
        {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch,
            'metrics': metrics,
        }
    )


l_mlp = []
for _ in range(n_runs):
    # Size of the tensors that are actually indexed by the batches below.
    train_size = len(Y_train)
    epoch_size = math.ceil(train_size / batch_size)

    epoch = -1
    metrics = {'val': -math.inf, 'test': -math.inf}
    best_checkpoint = make_checkpoint()
    remaining_patience = patience

    for epoch in range(n_epochs):
        batches = (
            # Create one standard batch sequence.
            torch.randperm(train_size, device=device).split(batch_size)
            if share_training_batches
            # Create k independent batch sequences.
            else (
                torch.rand((train_size, model.backbone.k), device=device)
                .argsort(dim=0)
                .split(batch_size, dim=0)
            )
        )
        for batch_idx in batches:
            model.train()
            optimizer.zero_grad()
            loss = loss_fn(apply_model('train', batch_idx), Y_train[batch_idx])

            # Backward pass first: the gradients must exist before they can be
            # clipped. Clipping used to run before backward(), right after
            # zero_grad(), where it had nothing to clip.
            if grad_scaler is None:
                loss.backward()
            else:
                grad_scaler.scale(loss).backward()  # type: ignore

            if gradient_clipping_norm is not None:
                if grad_scaler is not None:
                    # Clip the true gradients, not the loss-scaled ones.
                    grad_scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad.clip_grad_norm_(
                    model.parameters(), gradient_clipping_norm
                )

            if grad_scaler is None:
                optimizer.step()
            else:
                grad_scaler.step(optimizer)
                grad_scaler.update()

        metrics = {part: evaluate(part) for part in ['val', 'test']}
        val_score_improved = metrics['val'] > best_checkpoint['metrics']['val']

        print(
            f'{"*" if val_score_improved else " "}'
            f' [epoch] {epoch:<3}'
            f' [val] {metrics["val"]:.3f}'
            f' [test] {metrics["test"]:.3f}'
        )

        if val_score_improved:
            best_checkpoint = make_checkpoint()
            remaining_patience = patience
        else:
            remaining_patience -= 1

        if remaining_patience < 0:
            break

    # To make final predictions, load the best checkpoint.
    model.load_state_dict(best_checkpoint['model'])

    print('\n[Summary]')
    print(f'best epoch:  {best_checkpoint["epoch"]}')
    print(f'val score:  {best_checkpoint["metrics"]["val"]}')
    print(f'test score: {best_checkpoint["metrics"]["test"]}')

    # evaluate() returns the negative RMSE for regression, so negating it
    # recovers the RMSE. The classification score (F1) is already
    # "higher is better" and is stored as is instead of being negated.
    best_test_score = best_checkpoint['metrics']['test']
    l_mlp.append(-best_test_score if task_is_regression else best_test_score)

* [epoch] 0   [val] 0.186 [test] 0.163
* [epoch] 1   [val] 0.511 [test] 0.493
* [epoch] 2   [val] 0.597 [test] 0.601
* [epoch] 3   [val] 0.687 [test] 0.718
* [epoch] 4   [val] 0.798 [test] 0.859
* [epoch] 5   [val] 0.827 [test] 0.892
* [epoch] 6   [val] 0.877 [test] 0.886
  [epoch] 7   [val] 0.871 [test] 0.893
* [epoch] 8   [val] 0.935 [test] 0.919
  [epoch] 9   [val] 0.913 [test] 0.927
  [epoch] 10  [val] 0.931 [test] 0.930
  [epoch] 11  [val] 0.912 [test] 0.936
* [epoch] 12  [val] 0.936 [test] 0.929
* [epoch] 13  [val] 0.950 [test] 0.934
  [epoch] 14  [val] 0.945 [test] 0.938
* [epoch] 15  [val] 0.964 [test] 0.942
  [epoch] 16  [val] 0.945 [test] 0.949
  [epoch] 17  [val] 0.950 [test] 0.940
  [epoch] 18  [val] 0.932 [test] 0.938
  [epoch] 19  [val] 0.960 [test] 0.940
  [epoch] 20  [val] 0.955 [test] 0.945
  [epoch] 21  [val] 0.950 [test] 0.945
  [epoch] 22  [val] 0.954 [test] 0.941
  [epoch] 23  [val] 0.959 [test] 0.943
  [epoch] 24  [val] 0.950 [test] 0.945
  [epoch] 25  [val] 0.959

In [None]:
import subprocess
import re

def get_gpu_info():
    """Print model, memory, driver version and compute capability of each GPU.

    The data comes from one ``nvidia-smi`` CSV query (one line per GPU).
    Failures are reported on stdout instead of being raised: a missing
    ``nvidia-smi`` executable gets its own message, any other error is printed
    together with the exception.
    """
    query = [
        'nvidia-smi',
        '--query-gpu=name,memory.total,driver_version,compute_cap',
        '--format=csv,noheader,nounits',
    ]
    try:
        report = subprocess.check_output(query, encoding='utf-8')
        for index, row in enumerate(report.strip().split('\n')):
            # Each row carries exactly the four queried fields, comma separated.
            name, memory, driver, compute = [field.strip() for field in row.split(',')]
            print(
                f"GPU {index}:\n"
                f"  Model: {name}\n"
                f"  Memory: {memory} MB\n"
                f"  Driver Version: {driver}\n"
                f"  Compute Capability: {compute}\n"
            )
    except FileNotFoundError:
        print("nvidia-smi no está disponible. ¿Tienes drivers NVIDIA instalados?")
    except Exception as e:
        print("Error obteniendo información de la GPU:", e)


get_gpu_info()


GPU 0:
  Model: Tesla T4
  Memory: 15360 MB
  Driver Version: 550.54.15
  Compute Capability: 7.5



MNIST: TabM fair comparison: 24 min

MNIST with preprocessing: CUDA out of memory (tried to allocate 24.21 GiB).

In [None]:
# Housing, embeddings and preprocessing:
# min, mean +- std of the per-run values collected in l_mlp.
print("TabM: ", np.round(np.min(l_mlp),5), np.round(np.mean(l_mlp),4), " +- ", np.round(np.std(l_mlp),4) )

TabM:  0.06595 0.0705  +-  0.0049


In [None]:
# Housing, fair comparison:
# min, mean +- std of the per-run values collected in l_mlp.
print("TabM: ", np.round(np.min(l_mlp),5), np.round(np.mean(l_mlp),4), " +- ", np.round(np.std(l_mlp),4) )

TabM:  0.09182 0.0953  +-  0.0022


In [None]:
# Abalone, fair comparison:
# min, mean +- std of the per-run values collected in l_mlp.
print("TabM: ", np.round(np.min(l_mlp),5), np.round(np.mean(l_mlp),4), " +- ", np.round(np.std(l_mlp),4) )

TabM:  0.06784 0.0686  +-  0.0007


In [None]:
# Abalone, embeddings and preprocessing:
# min, mean +- std of the per-run values collected in l_mlp.
print("TabM: ", np.round(np.min(l_mlp),5), np.round(np.mean(l_mlp),4), " +- ", np.round(np.std(l_mlp),4) )

TabM:  0.06887 0.0694  +-  0.0003


In [None]:
# Izmir, fair comparison:
# min, mean +- std of the per-run values collected in l_mlp.
print("TabM: ", np.round(np.min(l_mlp),5), np.round(np.mean(l_mlp),4), " +- ", np.round(np.std(l_mlp),4) )

TabM:  0.01838 0.0187  +-  0.0002


In [None]:
# Izmir, embeddings and preprocessing:
# min, mean +- std of the per-run values collected in l_mlp.
print("TabM: ", np.round(np.min(l_mlp),5), np.round(np.mean(l_mlp),4), " +- ", np.round(np.std(l_mlp),4) )

TabM:  0.01853 0.0192  +-  0.0006


In [None]:
# MNIST, fair comparison: mean +- std of the per-run values in l_mlp, in percent.
print(np.round(100* np.mean(l_mlp),2), " +- ", np.round(100*np.std(l_mlp),2))
# With embeddings, this requires more than 24 GB of VRAM.

-98.45  +-  0.05


In [None]:
# COIL-100, fair comparison: mean +- std of the per-run values in l_mlp, in percent.
print(np.round(100* np.mean(l_mlp),2), " +- ", np.round(100*np.std(l_mlp),2))
# With embeddings, this requires more than 24 GB of VRAM.

-98.81  +-  0.11


In [None]:
# Isolet, fair comparison: mean +- std of the per-run values in l_mlp, in percent.
print(np.round(100* np.mean(l_mlp),2), " +- ", np.round(100*np.std(l_mlp),2))
# With embeddings, this requires more than 24 GB of VRAM.

-94.6  +-  0.35
