In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
import joblib

In [3]:
# 1) Load the cleaned transactions
df = pd.read_csv("../data/transactions_clean.csv", parse_dates=["InvoiceDate"])
print("Loaded transactions:", df.shape)

# 2) Feature Engineering
# ──────────────────────
# Extract time features
df["hour"] = df["InvoiceDate"].dt.hour
df["day_of_week"] = df["InvoiceDate"].dt.dayofweek
df["month"] = df["InvoiceDate"].dt.month

Loaded transactions: (779495, 10)


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 779495 entries, 0 to 779494
Data columns (total 13 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      779495 non-null  int64         
 1   StockCode    779495 non-null  int64         
 2   Description  779495 non-null  object        
 3   Quantity     779495 non-null  int64         
 4   InvoiceDate  779495 non-null  datetime64[ns]
 5   Price        779495 non-null  float64       
 6   Customer ID  779495 non-null  float64       
 7   Country      779495 non-null  int64         
 8   TotalPrice   779495 non-null  float64       
 9   Month        779495 non-null  object        
 10  hour         779495 non-null  int32         
 11  day_of_week  779495 non-null  int32         
 12  month        779495 non-null  int32         
dtypes: datetime64[ns](1), float64(3), int32(3), int64(4), object(2)
memory usage: 68.4+ MB


In [10]:
# Define features and target
target = "TotalPrice"
categorical_cols = ["StockCode", "Country"]
numerical_cols   = ["Quantity", "Price", "hour", "day_of_week", "month"]
features = categorical_cols + numerical_cols

# 3) Encode categoricals
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    joblib.dump(le, f"../models/enc_{col}.pkl")  # save encoders

In [11]:
# 4) Prepare data arrays
X = df[features].values
y = df[target].values.reshape(-1, 1)

In [12]:
# 5) Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Train size:", X_train.shape, "Test size:", X_test.shape)

Train size: (623596, 7) Test size: (155899, 7)


In [14]:
# 6) Scale numerical features
#    We scale only the numerical part of X
num_start = len(categorical_cols)
scaler = StandardScaler()
X_train[:, num_start:] = scaler.fit_transform(X_train[:, num_start:])
X_test[:, num_start:]  = scaler.transform(X_test[:, num_start:])
joblib.dump(scaler, "../models/scaler_num.pkl")

['../models/scaler_num.pkl']

In [22]:
# 7) Self‑Supervised Pretraining
# ────────────────────────────────
from pytorch_tabnet.callbacks import EarlyStopping

pretrainer = TabNetPretrainer(
    input_dim=X_train.shape[1],
    mask_type='entmax'  # sparsity in masks
)
# define callbacks
es = EarlyStopping(
    patience=20,
    early_stopping_metric="loss",  # Metric to monitor
    is_maximize=False              # Set to False for minimizing loss
)

# fit with callbacks and verbose logging
pretrainer.fit(
    X_train,
    max_epochs=10,
    batch_size=1024,
    virtual_batch_size=128,
    pretraining_ratio=0.8,
    num_workers=3,
    drop_last=False,
    callbacks=[es]
    
)

pretrainer.save_model("models/tabnet_pretrainer.zip")
print("✅ Pretraining complete.")



epoch 0  | loss: 2.61876 |  0:00:40s
epoch 1  | loss: 1.57686 |  0:01:20s
epoch 1  | loss: 1.57686 |  0:01:20s
epoch 2  | loss: 1.51272 |  0:02:01s
epoch 2  | loss: 1.51272 |  0:02:01s
epoch 3  | loss: 1.48458 |  0:02:43s
epoch 3  | loss: 1.48458 |  0:02:43s
epoch 4  | loss: 1.38391 |  0:03:24s
epoch 4  | loss: 1.38391 |  0:03:24s
epoch 5  | loss: 1.3732  |  0:04:06s
epoch 5  | loss: 1.3732  |  0:04:06s
epoch 6  | loss: 1.34664 |  0:04:48s
epoch 6  | loss: 1.34664 |  0:04:48s
epoch 7  | loss: 1.24544 |  0:05:27s
epoch 7  | loss: 1.24544 |  0:05:27s
epoch 8  | loss: 1.23272 |  0:06:08s
epoch 8  | loss: 1.23272 |  0:06:08s
epoch 9  | loss: 1.23958 |  0:06:48s
Stop training because you reached max_epochs = 10 with best_epoch = 8 and best_loss = 1.23272
Successfully saved model at models/tabnet_pretrainer.zip.zip
✅ Pretraining complete.
epoch 9  | loss: 1.23958 |  0:06:48s
Stop training because you reached max_epochs = 10 with best_epoch = 8 and best_loss = 1.23272
Successfully saved model



In [29]:
# 8) Fine‑Tuning as Regressor
# ────────────────────────────
regressor = TabNetRegressor(
    input_dim=X_train.shape[1],
    output_dim=1,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr":1e-3, "weight_decay":1e-4},
    scheduler_params={"step_size":50, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR
)

# load pretrained encoder weights
# Instead of regressor.load_weights_from_unsupervised(pretrainer)
# use the following:
regressor._set_network() # Initialize the network attribute
regressor.load_weights_from_unsupervised(pretrainer) # Load weights

In [30]:

# fit on train, evaluate on test
regressor.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_name=['test'],
    max_epochs=10,
    patience=30,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

epoch 0  | loss: 53515.18241| test_mse: 39750.26887|  0:00:30s
epoch 1  | loss: 52325.77821| test_mse: 33417.85921|  0:01:00s
epoch 1  | loss: 52325.77821| test_mse: 33417.85921|  0:01:00s
epoch 2  | loss: 51797.19492| test_mse: 36350.73718|  0:01:30s
epoch 2  | loss: 51797.19492| test_mse: 36350.73718|  0:01:30s
epoch 3  | loss: 51438.10442| test_mse: 17216.75714|  0:02:00s
epoch 3  | loss: 51438.10442| test_mse: 17216.75714|  0:02:00s
epoch 4  | loss: 51019.51336| test_mse: 5369.28622|  0:02:30s
epoch 4  | loss: 51019.51336| test_mse: 5369.28622|  0:02:30s
epoch 5  | loss: 50719.10631| test_mse: 8554.86958|  0:03:00s
epoch 5  | loss: 50719.10631| test_mse: 8554.86958|  0:03:00s
epoch 6  | loss: 50473.8514| test_mse: 5597.94953|  0:03:31s
epoch 6  | loss: 50473.8514| test_mse: 5597.94953|  0:03:31s
epoch 7  | loss: 50068.50359| test_mse: 37024.6668|  0:04:00s
epoch 7  | loss: 50068.50359| test_mse: 37024.6668|  0:04:00s
epoch 8  | loss: 49824.88877| test_mse: 22331.45469|  0:04:32s
ep



In [31]:
# Save the fine‑tuned model
regressor.save_model("models/tabnet_regressor.zip")
print("✅ Fine‑tuning complete. Model saved to models/tabnet_regressor.zip")

Successfully saved model at models/tabnet_regressor.zip.zip
✅ Fine‑tuning complete. Model saved to models/tabnet_regressor.zip
