In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
import joblib

In [14]:
# 1) Load the cleaned transactions
df = pd.read_csv("../data/transactions_clean.csv", parse_dates=["InvoiceDate"])
print("Loaded transactions:", df.shape)

# 2) Feature Engineering
# ──────────────────────
# Extract time features
df["hour"] = df["InvoiceDate"].dt.hour
df["day_of_week"] = df["InvoiceDate"].dt.dayofweek
df["month"] = df["InvoiceDate"].dt.month

Loaded transactions: (779495, 10)


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 779495 entries, 0 to 779494
Data columns (total 13 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      779495 non-null  int64         
 1   StockCode    779495 non-null  object        
 2   Description  779495 non-null  object        
 3   Quantity     779495 non-null  int64         
 4   InvoiceDate  779495 non-null  datetime64[ns]
 5   Price        779495 non-null  float64       
 6   Customer ID  779495 non-null  float64       
 7   Country      779495 non-null  object        
 8   TotalPrice   779495 non-null  float64       
 9   Month        779495 non-null  object        
 10  hour         779495 non-null  int32         
 11  day_of_week  779495 non-null  int32         
 12  month        779495 non-null  int32         
dtypes: datetime64[ns](1), float64(3), int32(3), int64(2), object(4)
memory usage: 68.4+ MB


In [16]:
# Define features and target
target = "TotalPrice"
categorical_cols = ["StockCode", "Country"]
numerical_cols   = ["Quantity", "Price", "hour", "day_of_week", "month"]
features = categorical_cols + numerical_cols

# 3) Encode categoricals
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    joblib.dump(le, f"../models/enc_{col}.pkl")  # save encoders

In [17]:
# 4) Prepare data arrays
X = df[features].values
y = df[target].values.reshape(-1, 1)

In [18]:
# 5) Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Train size:", X_train.shape, "Test size:", X_test.shape)

Train size: (623596, 7) Test size: (155899, 7)


In [19]:
# 6) Scale numerical features
#    We scale only the numerical part of X
num_start = len(categorical_cols)
scaler = StandardScaler()
X_train[:, num_start:] = scaler.fit_transform(X_train[:, num_start:])
X_test[:, num_start:]  = scaler.transform(X_test[:, num_start:])
joblib.dump(scaler, "../models/scaler_num.pkl")

['../models/scaler_num.pkl']

In [9]:
# 7) Self‑Supervised Pretraining
# ────────────────────────────────
from pytorch_tabnet.callbacks import EarlyStopping

pretrainer = TabNetPretrainer(
    input_dim=X_train.shape[1],
    mask_type='entmax'  # sparsity in masks
)
# define callbacks
es = EarlyStopping(
    patience=20,
    early_stopping_metric="loss",  # Metric to monitor
    is_maximize=False              # Set to False for minimizing loss
)

# fit with callbacks and verbose logging
pretrainer.fit(
    X_train,
    max_epochs=100,
    batch_size=1024,
    virtual_batch_size=128,
    pretraining_ratio=0.8,
    num_workers=3,
    drop_last=False,
    callbacks=[es]
    
)

pretrainer.save_model("../models/tabnet_pretrainer")
print("✅ Pretraining complete.")



epoch 0  | loss: 2.41352 |  0:00:48s
epoch 1  | loss: 1.38137 |  0:01:31s
epoch 2  | loss: 1.3646  |  0:02:14s
epoch 3  | loss: 1.32862 |  0:02:58s
epoch 4  | loss: 1.28311 |  0:03:38s
epoch 5  | loss: 1.22697 |  0:04:23s
epoch 6  | loss: 1.22381 |  0:05:05s
epoch 7  | loss: 1.17268 |  0:05:46s
epoch 8  | loss: 1.14406 |  0:06:26s
epoch 9  | loss: 1.10131 |  0:07:06s
epoch 10 | loss: 1.0797  |  0:07:48s
epoch 11 | loss: 1.11537 |  0:08:29s
epoch 12 | loss: 1.06347 |  0:09:09s
epoch 13 | loss: 1.07211 |  0:09:50s
epoch 14 | loss: 1.08134 |  0:10:29s
epoch 15 | loss: 1.05163 |  0:11:09s
epoch 16 | loss: 1.04063 |  0:11:49s
epoch 17 | loss: 1.05251 |  0:12:30s
epoch 18 | loss: 1.03585 |  0:13:12s
epoch 19 | loss: 1.02945 |  0:13:57s
epoch 20 | loss: 1.02196 |  0:14:42s
epoch 21 | loss: 1.02766 |  0:15:24s
epoch 22 | loss: 1.01633 |  0:16:05s
epoch 23 | loss: 1.0159  |  0:16:46s
epoch 24 | loss: 1.01559 |  0:17:29s
epoch 25 | loss: 1.0008  |  0:18:13s
epoch 26 | loss: 1.00599 |  0:18:57s
e



In [10]:
# 8) Fine‑Tuning as Regressor
# ────────────────────────────
regressor = TabNetRegressor(
    input_dim=X_train.shape[1],
    output_dim=1,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr":1e-3, "weight_decay":1e-4},
    scheduler_params={"step_size":50, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR
)

# load pretrained encoder weights
# Instead of regressor.load_weights_from_unsupervised(pretrainer)
# use the following:
regressor._set_network() # Initialize the network attribute
regressor.load_weights_from_unsupervised(pretrainer) # Load weights



In [11]:

# fit on train, evaluate on test
regressor.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_name=['test'],
    max_epochs=100,
    patience=30,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=3,
    drop_last=False
)

epoch 0  | loss: 53433.81957| test_mse: 42200.99097|  0:00:50s
epoch 1  | loss: 52150.70518| test_mse: 21908.58382|  0:01:40s
epoch 2  | loss: 51396.70429| test_mse: 41916.81119|  0:02:31s
epoch 3  | loss: 50986.27797| test_mse: 22043.13526|  0:03:22s
epoch 4  | loss: 50586.85062| test_mse: 12192.49054|  0:04:12s
epoch 5  | loss: 50238.15494| test_mse: 19971.6419|  0:05:03s
epoch 6  | loss: 49989.97393| test_mse: 12521.28089|  0:05:53s
epoch 7  | loss: 49674.9654| test_mse: 33704.25411|  0:06:43s
epoch 8  | loss: 49562.03424| test_mse: 31834.03896|  0:07:34s
epoch 9  | loss: 49282.25062| test_mse: 21310.37602|  0:08:25s
epoch 10 | loss: 49140.07537| test_mse: 20963.26524|  0:09:16s
epoch 11 | loss: 48923.82932| test_mse: 24895.83432|  0:10:07s
epoch 12 | loss: 48784.84036| test_mse: 23444.30536|  0:10:58s
epoch 13 | loss: 48642.10622| test_mse: 6855.4336|  0:11:48s
epoch 14 | loss: 48509.47177| test_mse: 20266.00081|  0:12:39s
epoch 15 | loss: 48460.18469| test_mse: 10635.60727|  0:13:



In [12]:
# Save the fine‑tuned model
regressor.save_model("../models/tabnet_regressor")
print("✅ Fine‑tuning complete. Model saved to models/tabnet_regressor.zip")

Successfully saved model at ../models/tabnet_regressor.zip
✅ Fine‑tuning complete. Model saved to models/tabnet_regressor.zip
