In [30]:
from typing import Any, Dict, List

from torch import Tensor
from torch.nn import Linear, Module, ModuleList

import torch_frame
from torch_frame import TensorFrame, stype
from torch_frame.data.stats import StatType
from torch_frame.nn.conv import TabTransformerConv
from torch_frame.nn.encoder import (
    EmbeddingEncoder,
    LinearEncoder,
    StypeWiseFeatureEncoder,
)


class ExampleTransformer(Module):
    def __init__(
        self,
        channels: int,
        out_channels: int,
        num_layers: int,
        num_heads: int,
        col_stats: Dict[str, Dict[StatType, Any]],
        col_names_dict: Dict[torch_frame.stype, List[str]],
    ):
        super().__init__()
        self.encoder = StypeWiseFeatureEncoder(
            out_channels=channels,
            col_stats=col_stats,
            col_names_dict=col_names_dict,
            stype_encoder_dict={
                stype.categorical: EmbeddingEncoder(),
                stype.numerical: LinearEncoder()
            },
        )
        self.tab_transformer_convs = ModuleList([
            TabTransformerConv(
                channels=channels,
                num_heads=num_heads,
            ) for _ in range(num_layers)
        ])
        self.decoder = Linear(channels, out_channels)

    def forward(self, tf: TensorFrame) -> Tensor:
        x, _ = self.encoder(tf)
        for tab_transformer_conv in self.tab_transformer_convs:
            x = tab_transformer_conv(x)
        out = self.decoder(x.mean(dim=1))
        return out

In [31]:
from torch_frame.datasets import Yandex
from torch_frame.data import DataLoader

dataset = Yandex(root='/tmp/adult', name='adult')
dataset.materialize()
dataset.shuffle()
train_dataset, test_dataset = dataset[:0.8], dataset[0.80:]
train_loader = DataLoader(train_dataset.tensor_frame, batch_size=128,
                        shuffle=True)
test_loader = DataLoader(test_dataset.tensor_frame, batch_size=128)

In [33]:
import torch
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ExampleTransformer(
    channels=32,
    out_channels=dataset.num_classes,
    num_layers=2,
    num_heads=8,
    col_stats=train_dataset.col_stats,
    col_names_dict=train_dataset.tensor_frame.col_names_dict,
).to(device)

optimizer = torch.optim.Adam(model.parameters())

for epoch in range(50):
    print(epoch)
    for tf in train_loader:
        tf = tf.to(device)
        pred = model(tf)
        loss = F.cross_entropy(pred, tf.y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [34]:
tf

TensorFrame(
  num_cols=14,
  num_rows=128,
  categorical (8): ['C_feature_0', 'C_feature_1', 'C_feature_2', 'C_feature_3', 'C_feature_4', 'C_feature_5', 'C_feature_6', 'C_feature_7'],
  numerical (6): ['N_feature_0', 'N_feature_1', 'N_feature_2', 'N_feature_3', 'N_feature_4', 'N_feature_5'],
  has_target=True,
  device='cuda:0',
)

In [5]:
model.eval()
correct = 0
for tf in test_loader:
    tf = tf.to(device)
    pred = model(tf)
    pred_class = pred.argmax(dim=-1)
    correct += (tf.y == pred_class).sum()
acc = int(correct) / len(test_dataset)
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.8394


In [25]:
import random
import torch_frame
from torch_frame.data import Dataset
from torch_frame import TensorFrame, stype
import numpy as np
import pandas as pd

# Numerical column
numerical = np.random.randint(0, 100, size=100)

# Categorical column
simple_categories = ['Type 1', 'Type 2', 'Type 3']
categorical = np.random.choice(simple_categories, size=100)

# Timestamp column
time = pd.date_range(start='2023-01-01', periods=100, freq='D')

# Multicategorical column
categories = ['Category A', 'Category B', 'Category C', 'Category D']
multicategorical = [
    random.sample(categories, k=random.randint(0, len(categories)))
    for _ in range(100)
]

# Embedding column (assuming an embedding size of 5 for simplicity)
embedding_size = 5
embedding = np.random.rand(100, embedding_size)

# Create the DataFrame
df = pd.DataFrame({
    'Numerical': numerical,
    'Categorical': categorical,
    # 'Time': time,
    'Multicategorical': multicategorical,
    # 'Embedding': list(embedding)
})

df["Multicategorical"] = [list(['Slab']), list(['Slab']), list(['Slab']), list(['Slab']),
       list(['Slab']), list(['Slab']), list(['Slab']), list(['Slab']),
       list(['Slab']), list(['Slab']), list(['Slab']), list(['Slab']),
       list([]), list(['Slab']), list(['Slab on Builders Pier']),
       list(['Slab']), list(['Block & Beam']), list(['Slab']),
       list(['Pier & Beam']), list(['Slab']), list(['Slab']),
       list(['Slab']), list([]), list(['Block & Beam']), list(['Slab']),
       list(['Slab']), list(['Slab']), list(['Slab']), list(['Slab']),
       list(['Slab']), list(['Slab']), list(['Slab']), list(['Slab']),
       list(['Slab']), list(['Slab']), list(['Slab']), list(['Slab']),
       list(['Slab']), list(['Slab']), list(['Pier & Beam']),
       list(['Slab']), list(['Pier & Beam']), list(['Slab']),
       list(['Block & Beam']), list(['Slab']), list(['Slab']),
       list(['Slab']), list(['Slab']), list(['Slab']), list(['Slab']),
       list(['Slab']), list(['Slab']), list(['Slab']), list(['Slab']),
       list(['Slab']), list(['Slab']), list(['Slab']),
       list(['Pier & Beam']), list(['Slab']), list(['Slab']),
       list(['Slab']), list(['Slab']), list(['Slab']), list(['Slab']),
       list(['Block & Beam']), list(['Slab']), list(['Slab']),
       list(['Pier & Beam']), list(['Slab']), list(['Block & Beam']),
       list(['Slab']), list(['Slab on Builders Pier']), list([]),
       list(['Slab']), list(['Slab']), list(['Slab on Builders Pier']),
       list(['Slab']), list(['Slab']), list(['Pier & Beam']),
       list(['Slab']), list(['Pier & Beam']), list(['Pier & Beam']),
       list(['Slab']), list(['Slab']), list(['Slab']), list([]),
       list(['Slab']), list(['Slab']), list(['Slab']), list(['Slab']),
       list(['Slab']), list(['Slab']), list(['Slab']), list([]),
       list(['Slab']), list([]), list(['Slab']), list(['Slab']),
       list(['Slab']), list(['Slab'])]

df.head()

Unnamed: 0,Numerical,Categorical,Multicategorical
0,59,Type 3,[Slab]
1,38,Type 3,[Slab]
2,80,Type 3,[Slab]
3,43,Type 1,[Slab]
4,97,Type 1,[Slab]


In [26]:
dataset = Dataset(
    df, col_to_stype={
        'Numerical': stype.numerical,
        'Categorical': stype.categorical,
        # 'Time': stype.timestamp,
        'Multicategorical': stype.multicategorical,
        # 'Embedding': stype.embedding
    })
dataset.materialize()

Dataset()

In [19]:
dataset.tensor_frame

TensorFrame(
  num_cols=5,
  num_rows=100,
  numerical (1): ['Numerical'],
  categorical (1): ['Categorical'],
  timestamp (1): ['Time'],
  multicategorical (1): ['Multicategorical'],
  embedding (1): ['Embedding'],
  has_target=False,
  device='cpu',
)

In [28]:
df

Unnamed: 0,Numerical,Categorical,Multicategorical
0,59,Type 3,[Slab]
1,38,Type 3,[Slab]
2,80,Type 3,[Slab]
3,43,Type 1,[Slab]
4,97,Type 1,[Slab]
...,...,...,...
95,77,Type 1,[]
96,67,Type 2,[Slab]
97,20,Type 1,[Slab]
98,65,Type 2,[Slab]
