# LSTM Forecasting Pipeline

Mục tiêu: sử dụng bộ dữ liệu đã được feature engineering để huấn luyện một mô hình LSTM dự báo doanh số cho từng cửa hàng. Notebook này bao gồm các bước chuẩn hóa, tạo chuỗi thời gian, định nghĩa mô hình, huấn luyện, đánh giá và suy luận trên tập test.

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from pathlib import Path

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

DATA_DIR = Path("/content/drive/MyDrive/ParquetFile")

In [7]:
train_data = pd.read_parquet(DATA_DIR / "train_fe.parquet").copy()
test_data = pd.read_parquet(DATA_DIR / "test_fe.parquet").copy()

print(f"Train raw shape: {train_data.shape}")
print(f"Test raw shape : {test_data.shape}")

Train raw shape: (830972, 28)
Test raw shape : (1113, 26)


In [8]:
train_data.head()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,...,IsWeekend,IsMonthStart,IsMonthEnd,CompetitionMonthsActive,Promo2WeeksActive,PromoIntervalActive,Lag_1,Lag_7,Rolling_Mean_7,Rolling_Std_7
0,1,3,5530.0,668,1,0,0,1,c,a,...,0,0,0,52.8,,0,0.0,,,
1,1,4,4327.0,578,1,0,0,1,c,a,...,0,0,0,52.833333,,0,5530.0,,,
2,1,5,4486.0,619,1,0,0,1,c,a,...,0,0,0,52.866667,,0,4327.0,,,
3,1,6,4997.0,635,1,0,0,1,c,a,...,1,0,0,52.9,,0,4486.0,,,
4,1,1,7176.0,785,1,1,0,1,c,a,...,0,0,0,52.966667,,0,0.0,,,


In [9]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 830972 entries, 0 to 830971
Data columns (total 28 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Store                    830972 non-null  int64  
 1   DayOfWeek                830972 non-null  int32  
 2   Sales                    830972 non-null  float64
 3   Customers                830972 non-null  int64  
 4   Open                     830972 non-null  int64  
 5   Promo                    830972 non-null  int64  
 6   StateHoliday             830972 non-null  object 
 7   SchoolHoliday            830972 non-null  int64  
 8   StoreType                830972 non-null  object 
 9   Assortment               830972 non-null  object 
 10  CompetitionDistance      830972 non-null  float64
 11  Promo2                   830972 non-null  int64  
 12  CompetitionMissingFlag   830972 non-null  int64  
 13  LogSales                 830972 non-null  float64
 14  Year

In [10]:
# Harmonize schema after column pruning
TARGET_COL = "Sales"
ID_COL = "Store"
DATE_PARTS = ["Year", "Month", "Day"]
CAT_COLS = ["StateHoliday", "StoreType", "Assortment"]
LEAK_COLS = {TARGET_COL, "Customers", "LogSales", "Date"}
NA_ZERO_COLS = [
    "CompetitionMonthsActive",
    "Promo2WeeksActive",
    "Lag_1",
    "Lag_7",
    "Rolling_Mean_7",
    "Rolling_Std_7",
]

# Ensure optional columns exist for alignment
for col in [TARGET_COL, "Customers", "LogSales"]:
    if col not in test_data.columns:
        test_data[col] = np.nan

# Build timestamp from Year/Month/Day for ordering
for df in [train_data, test_data]:
    date_frame = df[DATE_PARTS].rename(columns={"Year": "year", "Month": "month", "Day": "day"})
    df["Date"] = pd.to_datetime(date_frame)

train_data["dataset"] = "train"
test_data["dataset"] = "test"
combined = pd.concat([train_data, test_data], ignore_index=True)

for col in CAT_COLS:
    combined[col] = combined[col].astype(str)

for col in NA_ZERO_COLS:
    if col in combined.columns:
        combined[col] = combined[col].fillna(0)

combined = pd.get_dummies(combined, columns=CAT_COLS, drop_first=True)
combined = combined.sort_values([ID_COL, "Date"]).reset_index(drop=True)

train_data = combined[combined["dataset"] == "train"].drop(columns=["dataset"]).reset_index(drop=True)
test_data = combined[combined["dataset"] == "test"].drop(columns=["dataset"]).reset_index(drop=True)

FEATURE_COLS = [col for col in train_data.columns if col not in LEAK_COLS and col != ID_COL]

print(f"Train shape (post-encoding): {train_data.shape}")
print(f"Test shape  (post-encoding): {test_data.shape}")
print(f"Feature count: {len(FEATURE_COLS)}")

Train shape (post-encoding): (830972, 34)
Test shape  (post-encoding): (1113, 34)
Feature count: 29


## 1. Load & Inspect Feature-Engineered Data

Đọc dữ liệu parquet đã qua feature engineering, chuẩn hóa định dạng thời gian và liệt kê danh sách đặc trưng sẽ đưa vào LSTM.

## 2. Chuẩn hóa & chia tập thời gian

Giữ nguyên thứ tự thời gian, cắt 6 tuần cuối làm validation và fit scaler trên phần train để tránh leakage.

In [11]:
VAL_WEEKS = 6
seq_horizon = pd.Timedelta(weeks=VAL_WEEKS)
split_date = train_data["Date"].max() - seq_horizon

train_main = train_data[train_data["Date"] < split_date].copy()
val_main = train_data[train_data["Date"] >= split_date].copy()

feature_scaler = StandardScaler()
target_scaler = StandardScaler()

train_main[FEATURE_COLS] = feature_scaler.fit_transform(train_main[FEATURE_COLS])
val_main[FEATURE_COLS] = feature_scaler.transform(val_main[FEATURE_COLS])
test_scaled = test_data.copy()
test_scaled[FEATURE_COLS] = feature_scaler.transform(test_scaled[FEATURE_COLS])

train_main[[TARGET_COL]] = target_scaler.fit_transform(train_main[[TARGET_COL]])
val_main[[TARGET_COL]] = target_scaler.transform(val_main[[TARGET_COL]])

print(f"Split date: {split_date.date()}")
print(f"Train rows: {len(train_main):,} | Val rows: {len(val_main):,}")

Split date: 2015-06-05
Train rows: 789,557 | Val rows: 41,415


## 3. Tạo chuỗi thời gian cho LSTM

Xây dựng sliding window theo từng cửa hàng để dự báo doanh số ngày kế tiếp. Mỗi sequence gồm `SEQ_LEN` ngày gần nhất (đã chuẩn hóa).

In [12]:
SEQ_LEN = 30  # days
BATCH_SIZE = 256

def build_sequences(df: pd.DataFrame, feature_cols, target_col):
    sequences, targets = [], []
    for _, group in df.groupby(ID_COL):
        group = group.sort_values("Date")
        values = group[feature_cols + [target_col]].to_numpy()
        if len(values) <= SEQ_LEN:
            continue
        for start in range(len(values) - SEQ_LEN):
            seq_x = values[start:start + SEQ_LEN, :-1]
            seq_y = values[start + SEQ_LEN, -1]
            sequences.append(seq_x)
            targets.append(seq_y)
    return np.array(sequences, dtype=np.float32), np.array(targets, dtype=np.float32)

X_train, y_train = build_sequences(train_main, FEATURE_COLS, TARGET_COL)
X_val, y_val = build_sequences(val_main, FEATURE_COLS, TARGET_COL)

print(f"Train sequences: {X_train.shape}")
print(f"Val sequences  : {X_val.shape}")

Train sequences: (756107, 30, 29)
Val sequences  : (7977, 30, 29)


## 4. Dataset & DataLoader

Đóng gói sequence thành `torch.utils.data.Dataset` để dễ dàng tạo mini-batch và shuffle trong quá trình huấn luyện.

In [13]:
class SequenceDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32) if y is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

train_ds = SequenceDataset(X_train, y_train)
val_ds = SequenceDataset(X_val, y_val)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

print(f"Batches -> train: {len(train_loader)}, val: {len(val_loader)}")

Batches -> train: 2954, val: 32


## 5. Kiến trúc LSTM

Mô hình nhiều tầng LSTM + fully-connected layer, dropout nhẹ để tránh overfitting. Output một bước dự báo cho doanh số đã chuẩn hóa.

In [14]:
INPUT_DIM = len(FEATURE_COLS)
HIDDEN_DIM = 128
NUM_LAYERS = 2
DROPOUT = 0.2
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class SalesLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        last_hidden = lstm_out[:, -1, :]
        out = self.fc(self.dropout(last_hidden))
        return out.squeeze(-1)

model = SalesLSTM(INPUT_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT).to(DEVICE)
print(model)

SalesLSTM(
  (lstm): LSTM(29, 128, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)


## 6. Huấn luyện & đánh giá

Thiết lập optimizer AdamW, hàm mất mát MSE (MAE để theo dõi), kèm early stopping đơn giản dựa trên validation loss.

In [15]:
EPOCHS = 30
LR = 1e-3
PATIENCE = 5

criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)


def run_epoch(loader, train_mode=True):
    epoch_loss, epoch_mae = 0.0, 0.0
    steps = 0
    if train_mode:
        model.train()
    else:
        model.eval()
    for batch in loader:
        features, targets = [b.to(DEVICE) for b in batch]
        if train_mode:
            optimizer.zero_grad()
        preds = model(features)
        loss = criterion(preds, targets)
        mae = torch.mean(torch.abs(preds - targets))
        if train_mode:
            loss.backward()
            optimizer.step()
        epoch_loss += loss.item()
        epoch_mae += mae.item()
        steps += 1
    return epoch_loss / steps, epoch_mae / steps

best_loss = float("inf")
patience_counter = 0
history = []

for epoch in range(1, EPOCHS + 1):
    train_loss, train_mae = run_epoch(train_loader, train_mode=True)
    val_loss, val_mae = run_epoch(val_loader, train_mode=False)
    history.append({"epoch": epoch, "train_loss": train_loss, "val_loss": val_loss})
    print(f"Epoch {epoch:02d} | Train MSE {train_loss:.4f} | Val MSE {val_loss:.4f} | Val MAE {val_mae:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), DATA_DIR / "lstm_best.pt")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print("Early stopping triggered.")
            break

Epoch 01 | Train MSE 0.1308 | Val MSE 0.0776 | Val MAE 0.2058
Epoch 02 | Train MSE 0.0799 | Val MSE 0.0803 | Val MAE 0.2111
Epoch 03 | Train MSE 0.0705 | Val MSE 0.0804 | Val MAE 0.2094
Epoch 04 | Train MSE 0.0656 | Val MSE 0.0868 | Val MAE 0.2191
Epoch 05 | Train MSE 0.0627 | Val MSE 0.0764 | Val MAE 0.2077
Epoch 06 | Train MSE 0.0606 | Val MSE 0.0775 | Val MAE 0.2070
Epoch 07 | Train MSE 0.0588 | Val MSE 0.0824 | Val MAE 0.2120
Epoch 08 | Train MSE 0.0574 | Val MSE 0.0773 | Val MAE 0.1984
Epoch 09 | Train MSE 0.0560 | Val MSE 0.0807 | Val MAE 0.2062
Epoch 10 | Train MSE 0.0550 | Val MSE 0.0822 | Val MAE 0.2101
Early stopping triggered.


## 7. Phục hồi giá trị thật & đánh giá cuối cùng

Nạp trọng số tốt nhất, suy luận trên tập validation rồi đảo chuẩn hóa để báo cáo MAE/RMSE theo thang đo doanh số thực tế.

In [22]:
best_path = DATA_DIR / "lstm_best.pt"
if best_path.exists():
    model.load_state_dict(torch.load(best_path, map_location=DEVICE))
model.eval()

all_preds, all_targets = [], []
with torch.no_grad():
    for features, targets in val_loader:
        features = features.to(DEVICE)
        preds = model(features).cpu().numpy()
        all_preds.append(preds)
        all_targets.append(targets.numpy())

val_preds_scaled = np.concatenate(all_preds)
val_targets_scaled = np.concatenate(all_targets)

val_preds = target_scaler.inverse_transform(val_preds_scaled.reshape(-1, 1)).ravel()
val_targets = target_scaler.inverse_transform(val_targets_scaled.reshape(-1, 1)).ravel()

mae = mean_absolute_error(val_targets, val_preds)
# Calculate RMSE manually as the 'squared' parameter might not be supported in older scikit-learn versions.
rmse = np.sqrt(mean_squared_error(val_targets, val_preds))

def calculate_mape(y_true, y_pred):
    non_zero_mask = y_true != 0
    if np.sum(non_zero_mask) == 0:
        return np.nan
    y_true_filtered = y_true[non_zero_mask]
    y_pred_filtered = y_pred[non_zero_mask]
    return np.mean(np.abs((y_true_filtered - y_pred_filtered) / y_true_filtered)) * 100

def calculate_rmspe(y_true, y_pred):
    non_zero_mask = y_true != 0
    if np.sum(non_zero_mask) == 0:
        return np.nan
    y_true_filtered = y_true[non_zero_mask]
    y_pred_filtered = y_pred[non_zero_mask]
    return np.sqrt(np.mean(((y_true_filtered - y_pred_filtered) / y_true_filtered)**2)) * 100

def calculate_smape(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Handle cases where both y_true and y_pred are zero to avoid 0/0
    ratio = np.where(denominator == 0, 0, np.abs(y_pred - y_true) / denominator)
    return np.mean(ratio) * 100

# For MASE calculation, get the original (unscaled) sales for the training period
original_train_sales_for_mase = train_data[train_data["Date"] < split_date][TARGET_COL].values

# Calculate the mean absolute difference of a naive forecast on the training data
# This is the scaling factor for MASE
if len(original_train_sales_for_mase) > 1:
    naive_forecast_error_train = np.mean(np.abs(original_train_sales_for_mase[1:] - original_train_sales_for_mase[:-1]))
else:
    naive_forecast_error_train = np.nan # Cannot calculate naive error for very short series

def calculate_mase(y_true, y_pred, naive_forecast_error_train_mean_abs_diff):
    if np.isnan(naive_forecast_error_train_mean_abs_diff) or naive_forecast_error_train_mean_abs_diff == 0:
        return np.nan # Cannot calculate MASE if naive error is zero or undefined
    mae_val = mean_absolute_error(y_true, y_pred)
    return mae_val / naive_forecast_error_train_mean_abs_diff

mape = calculate_mape(val_targets, val_preds)
rmspe = calculate_rmspe(val_targets, val_preds)
smape = calculate_smape(val_targets, val_preds)
mase = calculate_mase(val_targets, val_preds, naive_forecast_error_train)

print(f"Validation MAE : {mae:,.2f}")
print(f"Validation RMSE: {rmse:,.2f}")
print(f"Validation MAPE: {mape:,.2f}%")
print(f"Validation RMSPE: {rmspe:,.2f}%")
print(f"Validation sMAPE: {smape:,.2f}%")
print(f"Validation MASE: {mase:,.2f}")

Validation MAE : 640.18
Validation RMSE: 849.97
Validation MAPE: 9.09%
Validation RMSPE: 13.31%
Validation sMAPE: 9.17%
Validation MASE: 0.46


## 8. Suy luận trên tập test & lưu kết quả

Lấy chuỗi cuối cùng của từng cửa hàng trong test set, dự báo bước kế tiếp và trả lại thang đo gốc để nộp Kaggle.

In [20]:
def build_test_sequences(test_scaled_df: pd.DataFrame, historical_df: pd.DataFrame, feature_cols: list, seq_len: int):
    sequences, store_ids, forecast_dates = [], [], []

    # Iterate through each row in test_scaled_df, which contains the dates for prediction
    for index, row in test_scaled_df.iterrows():
        store_id = row[ID_COL]
        # The 'Date' column in test_scaled_df represents the date for which we want to predict Sales.
        # So, the sequence must end on the day *before* this date.
        forecast_date = row["Date"]
        end_date_for_sequence = forecast_date - pd.Timedelta(days=1)

        # Filter historical_df for the specific store and dates leading up to the forecast_date
        store_history = historical_df[
            (historical_df[ID_COL] == store_id) &
            (historical_df["Date"] <= end_date_for_sequence)
        ].sort_values("Date")

        if len(store_history) < seq_len:
            # Not enough history to form a sequence of SEQ_LEN, skip this store for this date
            # This might mean some stores in the test set won't have predictions if their history is too short.
            # A more robust solution might involve padding or using a shorter sequence, but for now, skip.
            # print(f"Skipping Store {store_id} for {forecast_date} due to insufficient history ({len(store_history)} < {seq_len})")
            continue

        # Get the last SEQ_LEN feature rows for the sequence
        seq_features = store_history[feature_cols].tail(seq_len).to_numpy()

        sequences.append(seq_features.astype(np.float32))
        store_ids.append(store_id)
        forecast_dates.append(forecast_date) # This is the actual date for which we are predicting

    return np.array(sequences), store_ids, forecast_dates

# Create the full scaled historical dataframe (all training data, with features scaled by feature_scaler)
# `train_data` here refers to the dataframe after `combined` split, which contains all original train rows.
full_scaled_historical_df = train_data.copy()
full_scaled_historical_df[FEATURE_COLS] = feature_scaler.transform(train_data[FEATURE_COLS])
full_scaled_historical_df = full_scaled_historical_df.sort_values([ID_COL, "Date"]).reset_index(drop=True)

# Call the corrected function
X_test_seq, test_store_ids, forecast_dates = build_test_sequences(
    test_scaled, # This contains the Store and Date for which we need to make predictions
    full_scaled_historical_df, # This contains the full historical data, scaled
    FEATURE_COLS,
    SEQ_LEN
)
print(f"Test sequences: {X_test_seq.shape}")

test_ds = SequenceDataset(X_test_seq)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

model.eval()
test_preds_scaled = []
with torch.no_grad():
    for batch in test_loader:
        preds = model(batch.to(DEVICE)).cpu().numpy()
        test_preds_scaled.append(preds)

test_preds = np.concatenate(test_preds_scaled)
test_preds = target_scaler.inverse_transform(test_preds.reshape(-1, 1)).ravel()

submission = pd.DataFrame({
    "Store": test_store_ids,
    "ForecastDate": forecast_dates,
    "PredictedSales": test_preds
})

output_path = DATA_DIR / "lstm_predictions.csv"
submission.to_csv(output_path, index=False)
print(f"Saved predictions to {output_path}")

Test sequences: (1113, 30, 29)
Saved predictions to /content/drive/MyDrive/ParquetFile/lstm_predictions.csv
