In [1]:
# evaluate_tabnet.py
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from pytorch_tabnet.tab_model import TabNetRegressor

In [2]:
# 1) Read the cleaned transactions CSV; InvoiceDate is parsed as a datetime column
df = pd.read_csv(
    "../data/transactions_clean.csv",
    parse_dates=["InvoiceDate"],
)
print("✅ Loaded cleaned data:", df.shape)

✅ Loaded cleaned data: (779495, 10)


In [3]:

# 2) Derive calendar features from InvoiceDate (keep in sync with train_tabnet.py)
invoice_dt = df["InvoiceDate"].dt  # datetime accessor shared by the three features
df["hour"] = invoice_dt.hour
df["day_of_week"] = invoice_dt.dayofweek
df["month"] = invoice_dt.month

In [4]:
# 3) Model inputs and target.
# Column order matters: categorical columns come first, numerical columns after them.
categorical_cols = ["StockCode", "Country"]
numerical_cols = ["Quantity", "Price", "hour", "day_of_week", "month"]
features = [*categorical_cols, *numerical_cols]
target = "TotalPrice"

In [5]:
# 4) Assemble the raw (un-encoded) feature frame and a column-vector target
X = df[features]
y = df[target].to_numpy().reshape(-1, 1)

# 5) Random 80/20 hold-out split with a fixed seed.
# A random split does not guarantee that every test category also appears in
# the train split, so any encoder fitted on the train split has to cope with
# unseen categories.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"✅ Split data → train: {X_train.shape}, test: {X_test.shape}")




✅ Split data → train: (623596, 7), test: (155899, 7)


In [6]:
# 6) Ordinal-encode the categorical columns (encoder fitted on the train split only)
#
# X_train / X_test are selections taken from another DataFrame; assigning
# columns on them directly is the pattern pandas flags with
# SettingWithCopyWarning. Take explicit copies first so the assignments below
# operate on independent frames.
#
# NOTE: this cell is not idempotent. Re-running it without re-running the split
# would stringify the already-encoded codes and fit a different mapping.
X_train = X_train.copy()
X_test = X_test.copy()

# Cast to str so every category value has a single, uniform type.
X_train[categorical_cols] = X_train[categorical_cols].astype(str)
X_test[categorical_cols] = X_test[categorical_cols].astype(str)

# Categories missing from the train split are encoded as -1 instead of raising.
# NOTE(review): this encoder is re-fitted here rather than loaded from the
# training run. Confirm it reproduces the category -> code mapping the saved
# model was trained with (same data, same split, same seed).
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(X_train[categorical_cols])

# Replace the string columns with their ordinal codes in both splits.
X_train[categorical_cols] = encoder.transform(X_train[categorical_cols])
X_test[categorical_cols] = encoder.transform(X_test[categorical_cols])
print("✅ Applied OrdinalEncoder with handle_unknown for:", categorical_cols)


✅ Applied OrdinalEncoder with handle_unknown for: ['StockCode', 'Country']


In [7]:

# 7) Drop the DataFrame wrappers: the scaling and prediction steps work on plain arrays
X_train, X_test = X_train.to_numpy(), X_test.to_numpy()

In [9]:
# 8) Scale the numerical features with the scaler persisted at training time
# (no new scaler is fitted here).
# joblib.load unpickles the file, so only load artifacts from a trusted source.
scaler = joblib.load("../models/scaler_num.pkl")

# Numerical columns sit immediately after the categorical ones.
num_start = len(categorical_cols)

# X_test is overwritten in place: run this cell exactly once per fresh X_test,
# otherwise the numerical columns get scaled twice.
numeric_block = X_test[:, num_start:]
X_test[:, num_start:] = scaler.transform(numeric_block)
print("✅ Scaled numerical features with saved scaler")

✅ Scaled numerical features with saved scaler


In [11]:
# 9) Restore the trained TabNet regressor from its saved archive
model_archive = "../models/tabnet_regressor.zip"
reg = TabNetRegressor()
reg.load_model(model_archive)
print("✅ TabNet model loaded")

✅ TabNet model loaded




In [12]:
# 10) Score the hold-out split and report RMSE
y_pred = reg.predict(X_test)

# Collapse possible (n, 1) column vectors to 1-D before computing the metric
y_test_flat, y_pred_flat = y_test.flatten(), y_pred.flatten()

mse = mean_squared_error(y_test_flat, y_pred_flat)
rmse = np.sqrt(mse)
print(f"\n🎯 Test RMSE: {rmse:.4f}")


🎯 Test RMSE: 144.8661
