### Imports

In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.init as init
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

import os

import arff
import pandas as pd

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from typing import Optional, List, Dict

In [3]:
import sys
import os

# Add current folder to sys.path so Python sees "tabicl"
sys.path.insert(0, os.path.abspath("."))

from tabicl.src.tabicl.model.tabicl import TabICL
from tabicl.src.tabicl.sklearn.classifier import TabICLClassifier
from tabicl.src.tabicl.sklearn.preprocessing import (
    TransformToNumerical,
    EnsembleGenerator,
)

  from .autonotebook import tqdm as notebook_tqdm


### Load and process dataset

In [2]:
with open("data.arff") as f:
    dataset = arff.load(f)
df = pd.DataFrame(dataset["data"], columns=[attr[0] for attr in dataset["attributes"]])
print(df.head())

TypeError: 'generator' object is not subscriptable

In [4]:
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

Number of rows: 500
Number of columns: 13


In [5]:
# Separate features and target
X = df.drop(columns=["Class"])
y = df["Class"]

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Train:", X_train.shape, y_train.shape)
print("Val:  ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

print("Classes: ", y.unique())

Train: (400, 12) (400,)
Val:   (50, 12) (50,)
Test:  (50, 12) (50,)
Classes:  ['2' '1']


In [6]:
y_encoder_ = LabelEncoder()
y_train = y_encoder_.fit_transform(y_train)
classes_ = y_encoder_.classes_
n_classes_ = len(y_encoder_.classes_)


#  Transform input features
X_encoder_ = TransformToNumerical(verbose=False)
X_train = X_encoder_.fit_transform(X_train)

n_estimators: int = (32,)
norm_methods: Optional[str | List[str]] = (None,)
feat_shuffle_method: str = ("latin",)
class_shift: bool = (True,)
outlier_threshold: float = (4.0,)
softmax_temperature: float = (0.9,)
average_logits: bool = (True,)
use_hierarchical: bool = True
random_state: int | None = (42,)

seed = random_state if isinstance(random_state, (int, type(None))) else None

# Fit ensemble generator to create multiple dataset views
ensemble_generator_ = EnsembleGenerator(
    n_estimators=32,
    norm_methods=["none", "power"],
    feat_shuffle_method="latin",
    class_shift=class_shift,
    outlier_threshold=outlier_threshold,
    random_state=seed,
)
ensemble_generator_.fit(X_train, y_train)


### Split data into Train and Test

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=seed, stratify=y
# )

In [7]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

# X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
# y_test_tensor = torch.tensor(y_test, dtype=torch.long)

X_train_tensor = X_train_tensor.unsqueeze(0)
y_train_tensor = y_train_tensor.unsqueeze(0)

print("X train tensor shape: ", X_train_tensor.shape)
print("y train tensor shape: ", y_train_tensor.shape)

X train tensor shape:  torch.Size([1, 400, 12])
y train tensor shape:  torch.Size([1, 400])


### Load model and weights

In [4]:
model = TabICL()


def cnt_params(model):
    return sum(param.numel() for param in model.parameters())


print("Number of parameters: ", cnt_params(model))

Number of parameters:  27051666


In [7]:
model.col_embedder

ColEmbedding(
  (in_linear): SkippableLinear(in_features=1, out_features=128, bias=True)
  (tf_col): SetTransformer(
    (blocks): ModuleList(
      (0-2): 3 x InducedSelfAttentionBlock(
        (multihead_attn1): MultiheadAttentionBlock(
          (linear1): Linear(in_features=128, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (linear2): Linear(in_features=256, out_features=128, bias=True)
          (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.0, inplace=False)
          (dropout2): Dropout(p=0.0, inplace=False)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
          )
        )
        (multihead_attn2): MultiheadAttentionBlock(
          (linear1): Linear(in_features=128, out_features=256, bias=True)
          (dropout): Dro

In [9]:
for name, module in model.named_modules():
    params = sum(p.numel() for p in module.parameters() if p.requires_grad)
    if params > 0:
        print(f"Module: {name} | Params: {params}")

Module:  | Params: 27051658
Module: col_embedder | Params: 877824
Module: col_embedder.in_linear | Params: 256
Module: col_embedder.tf_col | Params: 844032
Module: col_embedder.tf_col.blocks | Params: 844032
Module: col_embedder.tf_col.blocks.0 | Params: 281344
Module: col_embedder.tf_col.blocks.0.multihead_attn1 | Params: 132480
Module: col_embedder.tf_col.blocks.0.multihead_attn1.linear1 | Params: 33024
Module: col_embedder.tf_col.blocks.0.multihead_attn1.linear2 | Params: 32896
Module: col_embedder.tf_col.blocks.0.multihead_attn1.norm1 | Params: 256
Module: col_embedder.tf_col.blocks.0.multihead_attn1.norm2 | Params: 256
Module: col_embedder.tf_col.blocks.0.multihead_attn1.attn | Params: 66048
Module: col_embedder.tf_col.blocks.0.multihead_attn1.attn.out_proj | Params: 16512
Module: col_embedder.tf_col.blocks.0.multihead_attn2 | Params: 132480
Module: col_embedder.tf_col.blocks.0.multihead_attn2.linear1 | Params: 33024
Module: col_embedder.tf_col.blocks.0.multihead_attn2.linear2 | P

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load(
    "/home/D32485/exercice/tabicl-classifier-v1.1-0506.ckpt", map_location="cpu"
)
model.load_state_dict(checkpoint["state_dict"])
model.to(device)

  checkpoint = torch.load(


TabICL(
  (col_embedder): ColEmbedding(
    (in_linear): SkippableLinear(in_features=1, out_features=128, bias=True)
    (tf_col): SetTransformer(
      (blocks): ModuleList(
        (0-2): 3 x InducedSelfAttentionBlock(
          (multihead_attn1): MultiheadAttentionBlock(
            (linear1): Linear(in_features=128, out_features=256, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (linear2): Linear(in_features=256, out_features=128, bias=True)
            (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
            (dropout1): Dropout(p=0.0, inplace=False)
            (dropout2): Dropout(p=0.0, inplace=False)
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
            )
          )
          (multihead_attn2): MultiheadAttentionBlock(
            (linear1): Linear(in_featu

### Fine-tuning

In [26]:
batch_size, T, H = X_train_tensor.shape
d = torch.full((batch_size,), H, dtype=torch.long)
seq_len = T
train_size = 300

print("Batch size: ", batch_size)
print("seq_len : ", seq_len)
print("train_size : ", train_size)

seq_len_tensor = torch.full((batch_size,), seq_len, dtype=torch.long)  # or float
train_size_tensor = torch.full((batch_size,), train_size, dtype=torch.long)

batch = X_train_tensor, y_train_tensor, d, seq_len_tensor, train_size_tensor

config = {
    "max_classes": 10,
    "embed_dim": 128,
    "col_num_blocks": 3,
    "col_nhead": 4,
    "col_num_inds": 128,
    "row_num_blocks": 3,
    "row_nhead": 8,
    "row_num_cls": 4,
    "row_rope_base": 100000,
    "icl_num_blocks": 12,
    "icl_nhead": 4,
    "ff_factor": 2,
    "dropout": 0.0,
    "activation": "gelu",
    "norm_first": True,
}
# results = trainer.run_batch(batch)

Batch size:  1
seq_len :  400
train_size :  300


In [27]:
def train(
    model,
    micro_X,
    y,
    train_len,
    micro_d,
    device,
    config,
    learning_rate=5e-5,
    epochs=40,
    save_path="checkpoints",
):
    os.makedirs(save_path, exist_ok=True)

    model.to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=5, verbose=True
    )

    train_losses = []
    train_accuracies = []
    best_loss = float("inf")

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        y_train = y[:, :train_len].to(device)
        y_test = y[:, train_len:].to(device)
        micro_X = micro_X.to(device)
        micro_d = micro_d.to(device)

        pred = model(micro_X, y_train, micro_d)  # (B, test_size, max_classes)
        pred = pred.flatten(end_dim=-2)
        true = y_test.long().flatten()

        loss = criterion(pred, true)
        loss.backward()
        optimizer.step()
        scheduler.step(loss)

        _, predicted = pred.max(1)
        accuracy = (predicted == true).sum().item() / true.size(0)

        train_losses.append(loss.item())
        train_accuracies.append(accuracy)

        print(
            f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}, "
            f"lr: {optimizer.param_groups[0]['lr']:.6f}, Accuracy: {accuracy:.6f}"
        )

        # ---- Save best checkpoint ----
        if loss.item() < best_loss:
            best_loss = loss.item()
            checkpoint = {"config": config, "state_dict": model.state_dict()}
            torch.save(checkpoint, os.path.join(save_path, "model_best.pth"))
            print(f"Best model saved (loss={best_loss:.4f})")

    # ---- Save final model ----
    final_ckpt = {"config": config, "state_dict": model.state_dict()}
    torch.save(final_ckpt, os.path.join(save_path, "model_final.pth"))
    print("Final model saved.")

    return train_losses, train_accuracies


In [28]:
train_losses, train_accuracies = train(
    model=model,
    micro_X=X_train_tensor,
    y=y_train_tensor,
    train_len=300,
    micro_d=d,
    device=device,
    config=config,
    learning_rate=5e-5,
    epochs=30,
)



Train !
Out shape :  torch.Size([1, 100, 10])
Epoch 1/30, Loss: 0.6672, lr: 0.000050, Accuracy: 0.600000
Best model saved (loss=0.6672)
Train !
Out shape :  torch.Size([1, 100, 10])
Epoch 2/30, Loss: 0.6355, lr: 0.000050, Accuracy: 0.620000
Best model saved (loss=0.6355)
Train !
Out shape :  torch.Size([1, 100, 10])
Epoch 3/30, Loss: 0.7548, lr: 0.000050, Accuracy: 0.680000
Train !
Out shape :  torch.Size([1, 100, 10])
Epoch 4/30, Loss: 0.5786, lr: 0.000050, Accuracy: 0.700000
Best model saved (loss=0.5786)
Train !
Out shape :  torch.Size([1, 100, 10])
Epoch 5/30, Loss: 0.5339, lr: 0.000050, Accuracy: 0.730000
Best model saved (loss=0.5339)
Train !
Out shape :  torch.Size([1, 100, 10])
Epoch 6/30, Loss: 0.4680, lr: 0.000050, Accuracy: 0.790000
Best model saved (loss=0.4680)
Train !
Out shape :  torch.Size([1, 100, 10])
Epoch 7/30, Loss: 0.4068, lr: 0.000050, Accuracy: 0.810000
Best model saved (loss=0.4068)
Train !
Out shape :  torch.Size([1, 100, 10])
Epoch 8/30, Loss: 0.3372, lr: 0.0

### Inference

In [6]:
clf_original = TabICLClassifier(checkpoint_version="original")
clf_original.fit(X_val, y_val)  # this is cheap
y_pred_original = clf_original.predict(X_test)  # in-context learning happens here
# Compute accuracy
acc = accuracy_score(y_test, y_pred_original)
print(f"Baseline Accuracy: {acc:.4f}")

Load checkpoints custom !
Original checkpoint !


  checkpoint = torch.load(


Inference !
Inference !
Inference !
Inference !
Baseline Accuracy: 0.5400


In [9]:
clf_original = TabICLClassifier(checkpoint_version="original")
clf_original.fit(X_train, y_train)  # this is cheap
y_pred_original = clf_original.predict(X_test)  # in-context learning happens here
# Compute accuracy
acc = accuracy_score(y_test, y_pred_original)
print(f"Baseline Accuracy: {acc:.4f}")

Load checkpoints custom !
Original checkpoint !


  checkpoint = torch.load(


Inference !
Inference !
Inference !
Inference !
Baseline Accuracy: 0.6800


In [8]:
clf_finetuned = TabICLClassifier()
clf_finetuned.fit(X_val, y_val)  # this is cheap
y_pred_finetuned = clf_finetuned.predict(X_test)  # in-context learning happens here
# Compute accuracy
acc = accuracy_score(y_test, y_pred_finetuned)
print(f"Finetuned Accuracy: {acc:.4f}")

Load checkpoints custom !
Fine-tuned checkpoint !


  checkpoint = torch.load(


Inference !
Inference !
Inference !
Inference !
Finetuned Accuracy: 0.6200


In [10]:
clf_finetuned = TabICLClassifier()
clf_finetuned.fit(X_train, y_train)  # this is cheap
y_pred_finetuned = clf_finetuned.predict(X_test)  # in-context learning happens here
# Compute accuracy
acc = accuracy_score(y_test, y_pred_finetuned)
print(f"Finetuned Accuracy: {acc:.4f}")

Load checkpoints custom !
Fine-tuned checkpoint !


  checkpoint = torch.load(


Inference !
Inference !
Inference !
Inference !
Finetuned Accuracy: 0.6000


### LoRA finetuning

In [1]:
import peft

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from peft import PeftModel
from peft import LoraConfig, get_peft_model

Apply LoRA to linear layers inside of the attention mechanism.

In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["linear1", "linear2", "attn.out_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
)

model.col_embedder = get_peft_model(model.col_embedder, lora_config)

for name, module in model.col_embedder.named_modules():
    if any(target in name for target in lora_config.target_modules):
        print("LoRA will be applied to in column embedder:", name)

LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.base_layer
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_dropout
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_dropout.default
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_A
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_A.default
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_B
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_B.default
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_embedding_A
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_embedding_B
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear1.lora_magnitude_vector
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear2
LoRA will be applied to: tf_col.blocks.0.multihead_attn1.linear2.base_layer

