### Imports

In [53]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.init as init
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

import os

import arff
import pandas as pd
from pathlib import Path

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from typing import Optional, List, Dict

In [54]:
import sys
import os

# Add current folder to sys.path so Python sees "tabicl"
sys.path.insert(0, os.path.abspath("."))

from tabicl.src.tabicl.model.tabicl import TabICL
from tabicl.src.tabicl.sklearn.classifier import TabICLClassifier
from tabicl.src.tabicl.sklearn.preprocessing import (
    TransformToNumerical,
    EnsembleGenerator,
)

### Load and process dataset

In [55]:
with open("data.arff") as f:
    dataset = arff.load(f)

# dataset['data'] is a generator, convert it to a list
data_list = list(dataset["data"])
# Convert to DataFrame
df = pd.DataFrame(dataset["data"], columns=[attr[0] for attr in dataset["attributes"]])
# df = pd.DataFrame(dataset["data"], columns=[attr[0] for attr in dataset["attributes"]])
print(df.head())

        V2       V3   V4 V5      V6      V7         V8       V9  \
0     Sexy      Low  4.6  M  Summer  o-neck  sleevless   empire   
1   Casual      Low  0.0  L  Summer  o-neck      Petal  natural   
2  vintage     High  0.0  L  Automn  o-neck       full  natural   
3    Brief  Average  4.6  L  Spring  o-neck       full  natural   
4     cute      Low  4.5  M  Summer  o-neck  butterfly  natural   

             V10      V11         V12     V13 Class  
0           None  chiffon     ruffles  animal     2  
1     microfiber     None     ruffles  animal     1  
2       polyster     None        None   print     1  
3           silk  chiffon  embroidary   print     2  
4  chiffonfabric  chiffon         bow     dot     1  


In [56]:
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

Number of rows: 500
Number of columns: 13


In [57]:
# Separate features and target
X = df.drop(columns=["Class"])
y = df["Class"]

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Train:", X_train.shape, y_train.shape)
print("Val:  ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

print("Classes: ", y.unique())

Train: (400, 12) (400,)
Val:   (50, 12) (50,)
Test:  (50, 12) (50,)
Classes:  ['2' '1']


### Data processing for finetuning

In [58]:
y_encoder_ = LabelEncoder()
y_train_pr = y_encoder_.fit_transform(y_train)
y_val_pr = y_encoder_.fit_transform(y_val)
classes_ = y_encoder_.classes_
n_classes_ = len(y_encoder_.classes_)

#  Transform input features
X_encoder_ = TransformToNumerical(verbose=False)
X_train_pr = X_encoder_.fit_transform(X_train)
X_val_pr = X_encoder_.fit_transform(X_val)

n_estimators: int = (32,)
norm_methods: Optional[str | List[str]] = (None,)
feat_shuffle_method: str = ("latin",)
class_shift: bool = (True,)
outlier_threshold: float = (4.0,)
softmax_temperature: float = (0.9,)
average_logits: bool = (True,)
use_hierarchical: bool = True
random_state: int | None = (42,)

seed = random_state if isinstance(random_state, (int, type(None))) else None

# Fit ensemble generator to create multiple dataset views
ensemble_generator_ = EnsembleGenerator(
    n_estimators=32,
    norm_methods=["none", "power"],
    feat_shuffle_method="latin",
    class_shift=class_shift,
    outlier_threshold=outlier_threshold,
    random_state=seed,
)
ensemble_generator_.fit(X_train_pr, y_train_pr)
ensemble_generator_.fit(X_val_pr, y_val_pr)


In [59]:
X_train_tensor = torch.tensor(X_train_pr, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_pr, dtype=torch.long)

# X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
# y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# X_train_tensor = X_train_tensor.unsqueeze(0)
# y_train_tensor = y_train_tensor.unsqueeze(0)

print("X train tensor shape: ", X_train_tensor.shape)
print("y train tensor shape: ", y_train_tensor.shape)

X_val_tensor = torch.tensor(X_val_pr, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_pr, dtype=torch.long)

# X_val_tensor = X_val_tensor.unsqueeze(0)
# y_val_tensor = y_val_tensor.unsqueeze(0)

print("X train tensor shape: ", X_val_tensor.shape)
print("y train tensor shape: ", y_val_tensor.shape)

X train tensor shape:  torch.Size([400, 12])
y train tensor shape:  torch.Size([400])
X train tensor shape:  torch.Size([50, 12])
y train tensor shape:  torch.Size([50])


### Load model and weights

In [20]:
model = TabICL()


def cnt_params(model):
    return sum(param.numel() for param in model.parameters())


print("Number of parameters: ", cnt_params(model))

Number of parameters:  27051666


In [10]:
model.col_embedder

ColEmbedding(
  (in_linear): SkippableLinear(in_features=1, out_features=128, bias=True)
  (tf_col): SetTransformer(
    (blocks): ModuleList(
      (0-2): 3 x InducedSelfAttentionBlock(
        (multihead_attn1): MultiheadAttentionBlock(
          (linear1): Linear(in_features=128, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (linear2): Linear(in_features=256, out_features=128, bias=True)
          (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.0, inplace=False)
          (dropout2): Dropout(p=0.0, inplace=False)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
          )
        )
        (multihead_attn2): MultiheadAttentionBlock(
          (linear1): Linear(in_features=128, out_features=256, bias=True)
          (dropout): Dro

In [11]:
for name, module in model.named_modules():
    params = sum(p.numel() for p in module.parameters() if p.requires_grad)
    if params > 0:
        print(f"Module: {name} | Params: {params}")

Module:  | Params: 27051658
Module: col_embedder | Params: 877824
Module: col_embedder.in_linear | Params: 256
Module: col_embedder.tf_col | Params: 844032
Module: col_embedder.tf_col.blocks | Params: 844032
Module: col_embedder.tf_col.blocks.0 | Params: 281344
Module: col_embedder.tf_col.blocks.0.multihead_attn1 | Params: 132480
Module: col_embedder.tf_col.blocks.0.multihead_attn1.linear1 | Params: 33024
Module: col_embedder.tf_col.blocks.0.multihead_attn1.linear2 | Params: 32896
Module: col_embedder.tf_col.blocks.0.multihead_attn1.norm1 | Params: 256
Module: col_embedder.tf_col.blocks.0.multihead_attn1.norm2 | Params: 256
Module: col_embedder.tf_col.blocks.0.multihead_attn1.attn | Params: 66048
Module: col_embedder.tf_col.blocks.0.multihead_attn1.attn.out_proj | Params: 16512
Module: col_embedder.tf_col.blocks.0.multihead_attn2 | Params: 132480
Module: col_embedder.tf_col.blocks.0.multihead_attn2.linear1 | Params: 33024
Module: col_embedder.tf_col.blocks.0.multihead_attn2.linear2 | P

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BASE_DIR = Path().resolve()
checkpoint_path = BASE_DIR / "tabicl-classifier-v1.1-0506.ckpt"
checkpoint = torch.load(checkpoint_path, map_location="cpu")
model.load_state_dict(checkpoint["state_dict"])
model.to(device)

  checkpoint = torch.load(checkpoint_path, map_location="cpu")


TabICL(
  (col_embedder): ColEmbedding(
    (in_linear): SkippableLinear(in_features=1, out_features=128, bias=True)
    (tf_col): SetTransformer(
      (blocks): ModuleList(
        (0-2): 3 x InducedSelfAttentionBlock(
          (multihead_attn1): MultiheadAttentionBlock(
            (linear1): Linear(in_features=128, out_features=256, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (linear2): Linear(in_features=256, out_features=128, bias=True)
            (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
            (dropout1): Dropout(p=0.0, inplace=False)
            (dropout2): Dropout(p=0.0, inplace=False)
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
            )
          )
          (multihead_attn2): MultiheadAttentionBlock(
            (linear1): Linear(in_featu

### Fine-tuning

In [60]:
def evaluate(model, X_val, y_val, train_len, micro_d_val, device):
    model.eval()
    with torch.no_grad():
        y_train = y_val[:, :16].to(device)
        y_test = y_val[:, 16:].to(device)
        pred = model(X_val, y_train, micro_d_val)
        pred = pred.flatten(end_dim=-2)
        true = y_test.long().flatten()
        _, predicted = pred.max(1)
        acc = (predicted == true).float().mean().item()
    return acc

In [64]:
def train(
    model,
    X_train,
    y,
    train_len,
    micro_d,
    X_val,
    y_val,
    micro_d_val,
    train_len_val,
    device,
    config,
    learning_rate=5e-5,
    epochs=40,
    save_path="checkpoints",
):
    os.makedirs(save_path, exist_ok=True)

    model.to(device)
    criterion = torch.nn.CrossEntropyLoss()
    # optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    scheduler = ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=5, verbose=True
    )

    train_losses = []
    train_accuracies = []
    best_loss = float("inf")

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        y_train = y[:, :train_len].to(device)
        y_test = y[:, train_len:].to(device)
        X_train = X_train.to(device)
        micro_d = micro_d.to(device)

        pred = model(X_train, y_train, micro_d)  # (B, test_size, max_classes)
        pred = pred.flatten(end_dim=-2)
        true = y_test.long().flatten()

        loss = criterion(pred, true)
        loss.backward()
        optimizer.step()
        scheduler.step(loss)

        _, predicted = pred.max(1)
        accuracy = (predicted == true).sum().item() / true.size(0)

        train_losses.append(loss.item())
        train_accuracies.append(accuracy)

        print(
            f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}, "
            f"lr: {optimizer.param_groups[0]['lr']:.6f}, Train Accuracy: {accuracy:.6f}"
        )

        # ---- VALIDATION evaluation ----
        model.eval()
        with torch.no_grad():
            val_acc = evaluate(model, X_val, y_val, micro_d_val, train_len_val, device)

        print(
            f"Epoch {epoch + 1}/{epochs}, "
            f"Loss: {loss.item():.4f}, "
            f"Val Acc: {val_acc:.6f}, "
            f"lr: {optimizer.param_groups[0]['lr']:.6f}"
        )

        # ---- Save best checkpoint ----
        if loss.item() < best_loss:
            best_loss = loss.item()
            checkpoint = {"config": config, "state_dict": model.state_dict()}
            torch.save(checkpoint, os.path.join(save_path, "model_best.pth"))
            print(f"Best model saved (loss={best_loss:.4f})")

    # ---- Save final model ----
    final_ckpt = {"config": config, "state_dict": model.state_dict()}
    torch.save(final_ckpt, os.path.join(save_path, "model_final.pth"))
    print("Final model saved.")

    return train_losses, train_accuracies


In [37]:
d = torch.full((batch_size,), H, dtype=torch.long)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = {
    "max_classes": 10,
    "embed_dim": 128,
    "col_num_blocks": 3,
    "col_nhead": 4,
    "col_num_inds": 128,
    "row_num_blocks": 3,
    "row_nhead": 8,
    "row_num_cls": 4,
    "row_rope_base": 100000,
    "icl_num_blocks": 12,
    "icl_nhead": 4,
    "ff_factor": 2,
    "dropout": 0.0,
    "activation": "gelu",
    "norm_first": True,
}
### ------------------- Run the full finetuning function ---------------------- ###
train_losses, train_accuracies = train(
    model=model,
    X_train=X_train_tensor,
    y=y_train_tensor,
    train_len=300,
    micro_d=d,
    X_val=X_val_tensor,
    y_val=y_val_tensor,
    micro_d_val=d,
    train_len_val=20,
    device=device,
    config=config,
    learning_rate=5e-5,
    epochs=30,
)



Epoch 1/30, Loss: 0.0109, lr: 0.000050, Train Accuracy: 1.000000
Epoch 1/30, Loss: 0.0109, Val Acc: 0.210526, lr: 0.000050
Best model saved (loss=0.0109)
Epoch 2/30, Loss: 0.5759, lr: 0.000050, Train Accuracy: 0.900000
Epoch 2/30, Loss: 0.5759, Val Acc: 0.236842, lr: 0.000050
Epoch 3/30, Loss: 0.4007, lr: 0.000050, Train Accuracy: 0.890000
Epoch 3/30, Loss: 0.4007, Val Acc: 0.263158, lr: 0.000050
Epoch 4/30, Loss: 0.1890, lr: 0.000050, Train Accuracy: 0.940000
Epoch 4/30, Loss: 0.1890, Val Acc: 0.263158, lr: 0.000050
Epoch 5/30, Loss: 0.2018, lr: 0.000050, Train Accuracy: 0.900000
Epoch 5/30, Loss: 0.2018, Val Acc: 0.368421, lr: 0.000050
Epoch 6/30, Loss: 0.1978, lr: 0.000050, Train Accuracy: 0.910000
Epoch 6/30, Loss: 0.1978, Val Acc: 0.368421, lr: 0.000050
Epoch 7/30, Loss: 0.1578, lr: 0.000025, Train Accuracy: 0.940000
Epoch 7/30, Loss: 0.1578, Val Acc: 0.421053, lr: 0.000025
Epoch 8/30, Loss: 0.1085, lr: 0.000025, Train Accuracy: 0.960000
Epoch 8/30, Loss: 0.1085, Val Acc: 0.421053

Clearly, we can observe overfitting. Which is not surprising due to the big size of the model and the small amount of data. 
Another approach would be freezing a major part of the model and fine-tuning only a small part, this could help reduce overfitting.

In [None]:
## Version with mini-batches ##
# def train(
#     model,
#     X_train,
#     y,
#     train_len,
#     micro_d,
#     X_val,
#     y_val,
#     micro_d_val,
#     train_len_val,
#     device,
#     config,
#     learning_rate=5e-5,
#     epochs=40,
#     batch_size=32,
#     save_path="checkpoints",
# ):
#     os.makedirs(save_path, exist_ok=True)

#     model.to(device)
#     criterion = torch.nn.CrossEntropyLoss()
#     optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
#     scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=5, verbose=True)

#     train_dataset = TensorDataset(X_train, y, micro_d)
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

#     train_losses = []
#     train_accuracies = []
#     best_loss = float("inf")

#     for epoch in range(epochs):
#         model.train()
#         epoch_loss = 0.0
#         correct = 0
#         total = 0

#         for X_batch, y_batch, micro_batch in train_loader:
#             X_batch, y_batch, micro_batch = X_batch.to(device), y_batch.to(device), micro_batch.to(device)
#             X_batch = X_batch.unsqueeze(0)
#             y_batch = y_batch.unsqueeze(0)
#             optimizer.zero_grad()
#             # print(y_batch.shape)
#             # print(X_batch.shape)
#             train_len = 16
#             y_train = y_batch[:,:train_len]
#             y_test = y_batch[:,train_len:]

#             pred = model(X_batch, y_train, micro_batch)
#             pred = pred.flatten(end_dim=-2)
#             true = y_test.long().flatten()

#             loss = criterion(pred, true)
#             loss.backward()
#             optimizer.step()

#             epoch_loss += loss.item() * X_batch.size(0)
#             _, predicted = pred.max(1)
#             correct += (predicted == true).sum().item()
#             total += true.size(0)

#             X_batch = X_batch.squeeze(0)
#             y_batch = y_batch.squeeze(0)

#         epoch_loss /= len(train_dataset)
#         accuracy = correct / total
#         train_losses.append(epoch_loss)
#         train_accuracies.append(accuracy)

#         scheduler.step(epoch_loss)

#         print(
#             f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}, "
#             f"lr: {optimizer.param_groups[0]['lr']:.6f}, Train Accuracy: {accuracy:.6f}"
#         )

#         # ---- VALIDATION evaluation ----
#         model.eval()
#         with torch.no_grad():
#             X_val = X_val.unsqueeze(0)
#             y_val = y_val.unsqueeze(0)
#             print(y_val.shape)
#             val_acc = evaluate(model, X_val, y_val, micro_d_val, train_len_val, device)
#             X_val = X_val.squeeze(0)
#             y_val = y_val.squeeze(0)

#         print(
#             f"Epoch {epoch + 1}/{epochs}, "
#             f"Loss: {epoch_loss:.4f}, "
#             f"Val Acc: {val_acc:.6f}, "
#             f"lr: {optimizer.param_groups[0]['lr']:.6f}"
#         )

#         # ---- Save best checkpoint ----
#         if epoch_loss < best_loss:
#             best_loss = epoch_loss
#             checkpoint = {"config": config, "state_dict": model.state_dict()}
#             torch.save(checkpoint, os.path.join(save_path, "model_best.pth"))
#             print(f"Best model saved (loss={best_loss:.4f})")

#     # ---- Save final model ----
#     final_ckpt = {"config": config, "state_dict": model.state_dict()}
#     torch.save(final_ckpt, os.path.join(save_path, "model_final.pth"))
#     print("Final model saved.")

#     return train_losses, train_accuracies


### Inference
The classifier expects Data Frames as input.

In [None]:
clf_original = TabICLClassifier(checkpoint_version="original")
clf_original.fit(X_val, y_val)  
y_pred_original = clf_original.predict(X_test)  # in-context learning happens here
# Compute accuracy
acc = accuracy_score(y_test, y_pred_original)
print(f"Baseline Accuracy: {acc:.4f}")

Load checkpoints custom !
Original checkpoint !


  checkpoint = torch.load(checkpoint_path, map_location="cpu")


Baseline Accuracy: 0.5400


In [None]:
clf_original = TabICLClassifier(checkpoint_version="original")
clf_original.fit(X_train, y_train)  
y_pred_original = clf_original.predict(X_test)  # in-context learning happens here
# Compute accuracy
acc = accuracy_score(y_test, y_pred_original)
print(f"Baseline Accuracy: {acc:.4f}")

Load checkpoints custom !
Original checkpoint !


  checkpoint = torch.load(checkpoint_path, map_location="cpu")


Baseline Accuracy: 0.6800


In [None]:
clf_finetuned = TabICLClassifier()
clf_finetuned.fit(X_val, y_val) 
y_pred_finetuned = clf_finetuned.predict(X_test)  # in-context learning happens here
# Compute accuracy
acc = accuracy_score(y_test, y_pred_finetuned)
print(f"Finetuned Accuracy: {acc:.4f}")

Load checkpoints custom !
Fine-tuned checkpoint !


  checkpoint = torch.load(checkpoint_path, map_location="cpu")


Finetuned Accuracy: 0.5800


In [None]:
clf_finetuned = TabICLClassifier()
clf_finetuned.fit(X_train, y_train) 
y_pred_finetuned = clf_finetuned.predict(X_test)  # in-context learning happens here
# Compute accuracy
acc = accuracy_score(y_test, y_pred_finetuned)
print(f"Finetuned Accuracy: {acc:.4f}")

Load checkpoints custom !
Fine-tuned checkpoint !


  checkpoint = torch.load(checkpoint_path, map_location="cpu")


Finetuned Accuracy: 0.6200


### LoRA finetuning

In [1]:
import peft

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from peft import PeftModel
from peft import LoraConfig, get_peft_model

I want to apply LoRA to linear layers inside of the attention blocks.

In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["linear1", "linear2", "attn.out_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
)

model.col_embedder = get_peft_model(model.col_embedder, lora_config)

for name, module in model.col_embedder.named_modules():
    if any(target in name for target in lora_config.target_modules):
        print("LoRA will be applied to in column embedder:", name)

LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.base_layer
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_dropout
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_dropout.default
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_A
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_A.default
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_B
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_B.default
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_embedding_A
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_embedding_B
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_magnitude_vector
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear2
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear2.base_layer

