In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
import joblib

In [13]:
# 1) Load the cleaned transactions
df = pd.read_csv("../data/transactions_clean.csv", parse_dates=["InvoiceDate"])
print("Loaded transactions:", df.shape)

# 2) Feature Engineering
# ──────────────────────
# Extract time features
df["hour"] = df["InvoiceDate"].dt.hour
df["day_of_week"] = df["InvoiceDate"].dt.dayofweek
df["month"] = df["InvoiceDate"].dt.month

Loaded transactions: (619215, 10)


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 619215 entries, 0 to 619214
Data columns (total 13 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      619215 non-null  int64         
 1   StockCode    619215 non-null  object        
 2   Description  619215 non-null  object        
 3   Quantity     619215 non-null  int64         
 4   InvoiceDate  619215 non-null  datetime64[ns]
 5   Price        619215 non-null  float64       
 6   Customer ID  619215 non-null  float64       
 7   Country      619215 non-null  object        
 8   TotalPrice   619215 non-null  float64       
 9   Month        619215 non-null  object        
 10  hour         619215 non-null  int32         
 11  day_of_week  619215 non-null  int32         
 12  month        619215 non-null  int32         
dtypes: datetime64[ns](1), float64(3), int32(3), int64(2), object(4)
memory usage: 54.3+ MB


In [15]:
# Define features and target
target = "TotalPrice"
categorical_cols = ["StockCode", "Country"]
numerical_cols   = ["Quantity", "Price", "hour", "day_of_week", "month"]
features = categorical_cols + numerical_cols

# 3) Encode categoricals
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    joblib.dump(le, f"../models/enc_{col}.pkl")  # save encoders

In [16]:
# 4) Prepare data arrays
X = df[features].values
y = df[target].values.reshape(-1, 1)

In [17]:
# 5) Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Train size:", X_train.shape, "Test size:", X_test.shape)

Train size: (495372, 7) Test size: (123843, 7)


In [18]:
# 6) Scale numerical features
#    We scale only the numerical part of X
num_start = len(categorical_cols)
scaler = StandardScaler()
X_train[:, num_start:] = scaler.fit_transform(X_train[:, num_start:])
X_test[:, num_start:]  = scaler.transform(X_test[:, num_start:])
joblib.dump(scaler, "../models/scaler_num.pkl")

['../models/scaler_num.pkl']

In [19]:
# 7) Self‑Supervised Pretraining
# ────────────────────────────────
from pytorch_tabnet.callbacks import EarlyStopping

pretrainer = TabNetPretrainer(
    input_dim=X_train.shape[1],
    mask_type='entmax'  # sparsity in masks
)
# define callbacks
es = EarlyStopping(
    patience=20,
    early_stopping_metric="loss",  # Metric to monitor
    is_maximize=False              # Set to False for minimizing loss
)

# fit with callbacks and verbose logging
pretrainer.fit(
    X_train,
    max_epochs=100,
    batch_size=1024,
    virtual_batch_size=128,
    pretraining_ratio=0.8,
    num_workers=3,
    drop_last=False,
    callbacks=[es]
    
)

pretrainer.save_model("../models/tabnet_pretrainer")
print("✅ Pretraining complete.")



epoch 0  | loss: 1.60579 |  0:00:52s
epoch 1  | loss: 1.10068 |  0:01:50s
epoch 2  | loss: 1.00544 |  0:02:49s
epoch 3  | loss: 0.99817 |  0:03:51s
epoch 4  | loss: 0.99776 |  0:04:49s
epoch 5  | loss: 0.99624 |  0:05:43s
epoch 6  | loss: 0.99429 |  0:06:40s
epoch 7  | loss: 0.99361 |  0:07:48s
epoch 8  | loss: 0.99011 |  0:08:57s
epoch 9  | loss: 0.99171 |  0:10:05s
epoch 10 | loss: 0.98976 |  0:11:11s
epoch 11 | loss: 0.98924 |  0:12:16s
epoch 12 | loss: 0.98908 |  0:13:20s
epoch 13 | loss: 0.98722 |  0:14:21s
epoch 14 | loss: 0.98578 |  0:15:22s
epoch 15 | loss: 0.98541 |  0:16:21s
epoch 16 | loss: 0.9841  |  0:17:21s
epoch 17 | loss: 0.9824  |  0:18:16s
epoch 18 | loss: 0.98352 |  0:19:19s
epoch 19 | loss: 0.98351 |  0:20:21s
epoch 20 | loss: 0.98254 |  0:21:21s
epoch 21 | loss: 0.98181 |  0:22:13s
epoch 22 | loss: 0.98183 |  0:23:02s
epoch 23 | loss: 0.98205 |  0:23:56s
epoch 24 | loss: 0.9815  |  0:24:57s
epoch 25 | loss: 0.9806  |  0:25:52s
epoch 26 | loss: 0.98009 |  0:26:53s
e

KeyboardInterrupt: 

In [None]:
# 8) Fine‑Tuning as Regressor
# ────────────────────────────
regressor = TabNetRegressor(
    input_dim=X_train.shape[1],
    output_dim=1,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr":1e-3, "weight_decay":1e-4},
    scheduler_params={"step_size":50, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR
)

# load pretrained encoder weights
# Instead of regressor.load_weights_from_unsupervised(pretrainer)
# use the following:
regressor._set_network() # Initialize the network attribute
regressor.load_weights_from_unsupervised(pretrainer) # Load weights



In [None]:

# fit on train, evaluate on test
regressor.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_name=['test'],
    max_epochs=100,
    patience=30,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=3,
    drop_last=False
)

epoch 0  | loss: 53.24846| test_mse: 4.43097 |  0:00:49s
epoch 1  | loss: 3.36528 | test_mse: 0.8433  |  0:01:44s
epoch 2  | loss: 1.70072 | test_mse: 0.47542 |  0:02:39s
epoch 3  | loss: 1.19859 | test_mse: 0.27193 |  0:03:35s
epoch 4  | loss: 0.99804 | test_mse: 0.28417 |  0:04:30s
epoch 5  | loss: 0.85287 | test_mse: 0.15566 |  0:05:25s
epoch 6  | loss: 0.73666 | test_mse: 0.19648 |  0:06:20s
epoch 7  | loss: 0.6465  | test_mse: 0.20585 |  0:07:08s
epoch 8  | loss: 0.57168 | test_mse: 0.45007 |  0:08:03s
epoch 9  | loss: 0.552   | test_mse: 0.1112  |  0:08:57s
epoch 10 | loss: 0.48629 | test_mse: 0.2328  |  0:09:46s
epoch 11 | loss: 0.43564 | test_mse: 0.12064 |  0:10:32s
epoch 12 | loss: 0.38783 | test_mse: 0.13389 |  0:11:20s
epoch 13 | loss: 0.39943 | test_mse: 0.2317  |  0:12:26s
epoch 14 | loss: 0.3571  | test_mse: 0.0922  |  0:14:19s
epoch 15 | loss: 0.34976 | test_mse: 0.09242 |  0:16:23s
epoch 16 | loss: 0.32643 | test_mse: 0.13798 |  0:18:16s
epoch 17 | loss: 0.32111 | test



In [None]:
# Save the fine‑tuned model
regressor.save_model("../models/tabnet_regressor")
print("✅ Fine‑tuning complete. Model saved to models/tabnet_regressor.zip")

Successfully saved model at ../models/tabnet_regressor.zip
✅ Fine‑tuning complete. Model saved to models/tabnet_regressor.zip
