In [29]:
from utils.download_kaggle_dataset import download_kaggle_competition
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [4]:
data = download_kaggle_competition("store-sales-time-series-forecasting")
print(data)

Downloading competition data: store-sales-time-series-forecasting
Unzipping files...
Competition data downloaded to: datasets
datasets


In [20]:
train_df = pd.read_csv(f"{data}/train.csv", parse_dates=['date'])
test_df = pd.read_csv(f"{data}/test.csv", parse_dates=['date'])
stores_df = pd.read_csv(f"{data}/stores.csv")
oil_df = pd.read_csv(f"{data}/oil.csv", parse_dates=['date'])

train_df = train_df.merge(stores_df, on='store_nbr', how='left')
test_df = test_df.merge(stores_df, on='store_nbr', how='left')

oil_df = oil_df.sort_values('date').fillna(method='ffill')
train_df = train_df.merge(oil_df, on='date', how='left')
test_df = test_df.merge(oil_df, on='date', how='left')


  oil_df = oil_df.sort_values('date').fillna(method='ffill')


In [21]:
for df in [train_df, test_df]:
    df['day'] = df['date'].dt.day
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dow'] = df['date'].dt.dayofweek

In [22]:
train_df['sales'] = train_df['sales'].clip(lower=0)
train_df['sales_log'] = np.log1p(train_df['sales'])

In [23]:
features = ['store_nbr', 'family', 'city', 'state', 'type', 'cluster',
            'onpromotion', 'dcoilwtico', 'day', 'month', 'year', 'dow']
target = 'sales_log'


In [24]:
categorical_cols = ['store_nbr', 'family', 'city', 'state', 'type', 'cluster']
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train_df[categorical_cols] = encoder.fit_transform(train_df[categorical_cols])
test_df[categorical_cols] = encoder.transform(test_df[categorical_cols])

In [25]:
train_df[features] = train_df[features].fillna(-1)
test_df[features] = test_df[features].fillna(-1)

In [26]:
X_train, X_val, y_train, y_val = train_test_split(train_df[features], train_df[target], test_size=0.2, random_state=42)


In [27]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [30]:
tabnet = TabNetRegressor(
    n_d=16, n_a=16, n_steps=5,
    gamma=1.5, lambda_sparse=1e-4,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    verbose=0
)

tabnet.fit(
    X_train_scaled, y_train.values.reshape(-1, 1),
    eval_set=[(X_val_scaled, y_val.values.reshape(-1, 1))],
    eval_metric=['rmse'],
    max_epochs=100,
    patience=10,
    batch_size=2048,
    virtual_batch_size=256
)

y_pred_tabnet = tabnet.predict(X_val_scaled).flatten()
rmsle_tabnet = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(y_pred_tabnet)))
print(f"✅ TabNet RMSLE: {rmsle_tabnet:.4f}")


Early stopping occurred at epoch 19 with best_epoch = 9 and best_val_0_rmse = 0.87058




✅ TabNet RMSLE: 0.8706
