# Rossmann Sales — LSTM Modeling

This notebook builds an LSTM model for sales forecasting using the processed dataset saved from the EDA notebook. It creates lag/rolling features, windowed sequences, performs time-based splits, trains with early stopping, evaluates, plots results, and saves artifacts.


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Project directories. Output folders are created up front so the later
# save steps cannot fail on a missing directory.
RAW_DIR = Path('data/raw')
PROC_DIR = Path('data/processed')
MODEL_DIR = Path('models')
MODEL_DIR.mkdir(parents=True, exist_ok=True)
REPORT_DIR = Path('reports/figures')
REPORT_DIR.mkdir(parents=True, exist_ok=True)

proc_path = PROC_DIR / 'rossmann_processed.csv'
# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would turn a missing file into a less helpful
# error further down.
if not proc_path.exists():
    raise FileNotFoundError(
        f"Processed dataset not found at {proc_path}. Run 01_data_loading_eda.ipynb first."
    )

df = pd.read_csv(proc_path, parse_dates=['Date'])
# Order rows by store, then chronologically within each store.
df = df.sort_values(['Store', 'Date']).reset_index(drop=True)
print(df.shape)
df.head()


In [None]:
# Feature engineering: create lags and rolling means per store

def add_lags_and_rolls(frame: pd.DataFrame, target_col: str, lags=(1,7,14,28), roll_windows=(7,28), group_col: str = 'Store') -> pd.DataFrame:
    """Return a copy of *frame* with per-group lag and rolling-mean columns.

    For every ``L`` in *lags* a column ``lag_{L}`` holds *target_col* shifted
    by ``L`` rows within each group. For every ``w`` in *roll_windows* a column
    ``roll{w}`` holds the mean of the previous ``w`` rows of *target_col*
    within each group (the current row is excluded via a one-row shift).

    Rows are used in the order they appear in *frame*; sort by date beforehand
    if the lags are meant to be chronological.

    Args:
        frame: Input data; it is not modified.
        target_col: Column to derive the features from.
        lags: Row offsets for the lag columns.
        roll_windows: Window lengths for the rolling-mean columns.
        group_col: Column identifying the independent series.

    Returns:
        A new DataFrame with the extra feature columns. Rows without enough
        history inside their own group contain NaN in those columns.
    """
    out = frame.copy()
    group_keys = out[group_col]
    per_group = out[target_col].groupby(group_keys)

    for lag in lags:
        out[f"lag_{lag}"] = per_group.shift(lag)

    # Shift by one row so the window never contains the value being predicted.
    previous = per_group.shift(1)
    # Re-group the shifted series before rolling. A plain `.rolling()` on the
    # ungrouped result slides over the whole frame by position and mixes rows
    # from different groups whenever groups are not stored contiguously.
    previous_by_group = previous.groupby(group_keys)
    for w in roll_windows:
        out[f"roll{w}"] = previous_by_group.transform(lambda s, w=w: s.rolling(w).mean())
    return out

# Derive lag/rolling features from the log target, then drop every row that
# still contains a missing value in any column.
feat_df = (
    add_lags_and_rolls(df, 'Sales_log')
    .dropna()
    .reset_index(drop=True)
)

# Calendar / flag columns first, followed by the engineered lag and rolling columns.
feature_cols = ['dow','month','year','week','Open','Promo','SchoolHoliday']
feature_cols = feature_cols + [f'lag_{L}' for L in (1,7,14,28)]
feature_cols = feature_cols + [f'roll{w}' for w in (7,28)]

X_num = feat_df[feature_cols].values.astype('float32')
y = feat_df['Sales_log'].values.astype('float32')

# NOTE(review): the scaler is fit on every row of feat_df, i.e. before any
# train/validation/test split, so held-out rows contribute to the scaling
# statistics.
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)
X_num_scaled[:2], y[:2]


In [None]:
# Create sequences per store (lookback -> next day)
from typing import Tuple

def make_sequences_by_store(frame: pd.DataFrame, X_scaled: np.ndarray, target: np.ndarray, feature_cols, lookback: int = 28, horizon: int = 1) -> Tuple[np.ndarray, np.ndarray]:
    """Build sliding-window samples independently for every store.

    For each store, rows are ordered by ``Date``. Every run of *lookback*
    consecutive rows becomes one input window, and the *horizon* target values
    that immediately follow it become the label. Windows never span two stores.

    Args:
        frame: Must contain ``Store`` and ``Date`` columns; row ``k`` of
            *frame* corresponds to row ``k`` of *X_scaled* and *target*.
        X_scaled: 2-D feature matrix, one row per row of *frame*.
        target: 1-D target vector, one value per row of *frame*.
        feature_cols: Unused; kept for call-site compatibility.
        lookback: Number of past rows in each input window.
        horizon: Number of future target values per sample.

    Returns:
        ``(X, y)`` as float32 arrays shaped ``(n_samples, lookback, n_features)``
        and ``(n_samples, horizon)``. Samples are ordered by store (ascending
        key), then by date.

    Raises:
        ValueError: If *frame*, *X_scaled* and *target* differ in length.
    """
    X_scaled = np.asarray(X_scaled)
    target = np.asarray(target)
    if not (len(frame) == len(X_scaled) == len(target)):
        raise ValueError(
            f"Length mismatch: frame={len(frame)}, X_scaled={len(X_scaled)}, target={len(target)}"
        )

    # Carry each row's position through the group/sort so the feature and
    # target rows are picked by position. Slicing with a running offset is only
    # correct when the frame is already sorted by (Store, Date).
    keys = frame[['Store', 'Date']].assign(_pos=np.arange(len(frame)))

    X_list, y_list = [], []
    for _, g in keys.groupby('Store'):
        idx = g.sort_values('Date', kind='stable')['_pos'].to_numpy()
        Xg, yg = X_scaled[idx], target[idx]
        for i in range(lookback, len(idx) - horizon + 1):
            X_list.append(Xg[i - lookback:i, :])
            y_list.append(yg[i:i + horizon])

    if not X_list:
        # Keep the documented rank even when no store has enough history.
        return (
            np.empty((0, lookback) + X_scaled.shape[1:], dtype=np.float32),
            np.empty((0, horizon), dtype=np.float32),
        )
    return np.array(X_list, dtype=np.float32), np.array(y_list, dtype=np.float32)

# Window length (past rows fed to the model) and number of steps predicted.
lookback = 28
horizon = 1

X_seq, y_seq = make_sequences_by_store(
    frame=feat_df,
    X_scaled=X_num_scaled,
    target=y,
    feature_cols=feature_cols,
    lookback=lookback,
    horizon=horizon,
)
X_seq.shape, y_seq.shape


In [None]:
# Time-based split: order the sequences chronologically by target date, hold out
# the most recent 10% as test, then the most recent 10% of the remainder as validation.
n = len(X_seq)
print('Total sequences:', n)

# X_seq is laid out store by store, so its tail is "the last stores", not
# "the latest dates". Rebuild the target date of every sequence in the order
# the sequences were generated (stores in ascending key order, rows in date
# order, first target at position `lookback`).
seq_dates = [
    g['Date'].sort_values().to_numpy()[lookback:max(lookback, len(g) - horizon + 1)]
    for _, g in feat_df.groupby('Store')
]
seq_dates = np.concatenate(seq_dates) if seq_dates else np.empty(0, dtype='datetime64[ns]')
if len(seq_dates) != n:
    raise ValueError(
        f"Reconstructed {len(seq_dates)} sequence dates for {n} sequences; "
        "feat_df, lookback and horizon must match the ones used to build X_seq."
    )

# Stable sort keeps the store order among sequences that share a target date.
chrono = np.argsort(seq_dates, kind='stable')

test_size = int(0.1 * n)
val_size = int(0.1 * (n - test_size))
train_size = n - test_size - val_size

# Fancy indexing copies the data once; train and val are then cheap slices of
# the chronologically ordered training pool.
X_train_full, y_train_full = X_seq[chrono[: n - test_size]], y_seq[chrono[: n - test_size]]
X_test, y_test = X_seq[chrono[n - test_size :]], y_seq[chrono[n - test_size :]]

X_train, y_train = X_train_full[:train_size], y_train_full[:train_size]
X_val, y_val = X_train_full[train_size:], y_train_full[train_size:]

[a.shape for a in (X_train, y_train, X_val, y_val, X_test, y_test)]


In [None]:
# Build the LSTM model
# Number of input features per timestep = size of the last axis of X_seq.
n_features = X_seq.shape[-1]

def build_model(units=64, dropout=0.2, lr=1e-3, lookback=28, n_features=None, horizon=1):
    """Create and compile a single-layer LSTM regressor.

    Architecture: Input(lookback, n_features) -> LSTM(units) -> Dropout(dropout)
    -> Dense(horizon). The model is compiled with Adam at learning rate *lr*,
    mean-squared-error loss, and mean-absolute-error as a metric.

    Args:
        units: Size of the LSTM hidden state.
        dropout: Dropout rate applied to the LSTM output.
        lr: Adam learning rate.
        lookback: Number of timesteps per input window.
        n_features: Number of features per timestep.
        horizon: Number of output values.

    Returns:
        The compiled Keras model.
    """
    sequence_in = keras.Input(shape=(lookback, n_features))
    # Only the final hidden state is kept (return_sequences=False).
    encoded = layers.LSTM(units, return_sequences=False)(sequence_in)
    regularized = layers.Dropout(dropout)(encoded)
    forecast = layers.Dense(horizon)(regularized)

    net = keras.Model(sequence_in, forecast)
    optimizer = keras.optimizers.Adam(learning_rate=lr)
    net.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return net

# Instantiate with the window geometry used to build the sequences.
model = build_model(
    units=64,
    dropout=0.2,
    lr=1e-3,
    lookback=lookback,
    n_features=n_features,
    horizon=horizon,
)
model.summary()


In [None]:
# Train with early stopping
# Stop once validation MAE has gone 5 epochs without improving, and restore
# the weights from the best epoch.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_mae',
    patience=5,
    restore_best_weights=True,
)
callbacks = [early_stop]

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=64,
    verbose=1,
    callbacks=callbacks,
)

# Plot training curves
curves = history.history
plt.figure(figsize=(10,3))
plt.plot(curves['mae'], label='train mae')
plt.plot(curves['val_mae'], label='val mae')
plt.legend()
plt.title('Training curves (MAE)')
plt.show()


In [None]:
# Evaluate on test
# Flatten with reshape(-1) instead of squeeze(): squeeze() turns a one-sample
# test set into a 0-d array, which the metric functions reject.
pred_test = model.predict(X_test, verbose=0).reshape(-1)

# Undo the log transform to report errors in sales units.
# NOTE(review): assumes Sales_log was built with log1p — confirm against the EDA notebook.
y_true = np.expm1(y_test.reshape(-1))
y_pred = np.expm1(pred_test)

mae = float(mean_absolute_error(y_true, y_pred))
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
print({'test_mae': mae, 'test_rmse': rmse})

# Plot
plt.figure(figsize=(12,4))
plt.plot(y_true[:500], label='Actual')
plt.plot(y_pred[:500], label='Predicted')
plt.legend(); plt.title('Test set (first 500)'); plt.tight_layout()
plot_path = REPORT_DIR / 'lstm_test_plot.png'
plt.savefig(plot_path, dpi=150)
print('Saved plot to', plot_path)


In [None]:
# Persist everything needed to reuse or inspect this run.

# 1) Trained network
model_path = MODEL_DIR / 'rossmann_lstm.keras'
model.save(model_path)
print('Saved model to', model_path)

# 2) Fitted feature scaler, so inference can apply the same scaling
import joblib
scaler_path = MODEL_DIR / 'feature_scaler.pkl'
joblib.dump(value=scaler, filename=scaler_path)
print('Saved scaler to', scaler_path)

# 3) Test-set actuals next to the model's predictions
preds_path = MODEL_DIR / 'test_predictions.csv'
predictions = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
predictions.to_csv(preds_path, index=False)
print('Saved predictions to', preds_path)
