# M5 Walmart Sales Forecasting - Model Training

This notebook implements and trains multiple time series forecasting models on the M5 dataset.

## Models Implemented

1. **SARIMA (Seasonal ARIMA)**: Statistical time series model with seasonal components
2. **LSTM (Long Short-Term Memory)**: Deep learning model for sequence prediction
3. **Prophet**: Facebook's time series forecasting tool

## Training Strategy

- Use a time-based train/validation/test split
- Focus on the highest-selling product for detailed modeling
- Implement proper evaluation metrics
- Save trained models for future use

In [None]:
# Import necessary libraries
import sys
import os
import warnings

# Add src to path for imports
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import joblib
from tqdm import tqdm

# Model imports
from src.models.sarima_model import SarimaModel
from src.models.lstm_model import LSTMModel
from src.models.prophet_model import ProphetModel

# Utility imports
from src.data.data_loader import M5DataLoader
from src.data.preprocessing import M5DataPreprocessor
from src.visualization.plots import M5Visualizer
from src.utils.config import get_config
from src.utils.logger import setup_logger
from src.utils.metrics import calculate_metrics

# Setup
# NOTE(review): this suppresses every warning category for the rest of the
# session, including any emitted by the model libraries during fitting.
warnings.filterwarnings('ignore')
logger = setup_logger('model_training')
config = get_config()

# Set random seeds for reproducibility.
# Both the stdlib and NumPy generators are seeded so any code drawing from
# either one is repeatable between runs.
# NOTE(review): the LSTM backend presumably keeps its own RNG, which is not
# seeded here — confirm inside LSTMModel if bit-for-bit repeatability matters.
import random

random.seed(42)
np.random.seed(42)

print("Libraries imported successfully!")
print(f"Configuration loaded: {type(config)}")

## 1. Load Processed Data

In [None]:
# Load processed data
#
# Produces three names used by the rest of the notebook:
#   sales_processed          - full processed sales table (parquet)
#   product_ts               - time series of the product being modeled
#   highest_selling_product  - identifier of that product
import json

processed_data_path = config.get('data.processed_data_path', 'data/processed/')

print("Loading processed data...")

# os.path.join avoids a doubled separator when the configured directory
# already ends with "/" (as the default value does).
sales_path = os.path.join(processed_data_path, "sales_processed.parquet")
product_ts_path = os.path.join(processed_data_path, "highest_selling_product_ts.csv")
metadata_path = os.path.join(processed_data_path, "feature_metadata.json")

# Load full processed dataset; nothing below can run without it, so re-raise.
try:
    sales_processed = pd.read_parquet(sales_path)
    print(f"✅ Loaded processed sales data: {sales_processed.shape}")
except FileNotFoundError:
    print("❌ Processed data not found. Please run 02_feature_engineering.ipynb first.")
    raise

# Load time series data for highest selling product
fallback_product = None  # set only when the product had to be derived here
try:
    product_ts = pd.read_csv(product_ts_path, index_col=0, parse_dates=True)
    print(f"✅ Loaded product time series: {product_ts.shape}")
except FileNotFoundError:
    print("❌ Product time series not found. Using fallback method...")
    # Fallback: prepare data manually
    preprocessor = M5DataPreprocessor()
    fallback_product = preprocessor.get_highest_selling_product(sales_processed)
    highest_selling_product = fallback_product
    product_ts = preprocessor.prepare_time_series_data(sales_processed, fallback_product)
    print(f"✅ Prepared product time series: {product_ts.shape}")

# Load feature metadata
metadata_product = None
try:
    with open(metadata_path, 'r', encoding='utf-8') as f:
        feature_metadata = json.load(f)
    metadata_product = feature_metadata['highest_selling_product']
except FileNotFoundError:
    print("❌ Feature metadata not found. Using default highest selling product.")
except KeyError:
    # The file exists but lacks the expected entry; treat it like a missing
    # file instead of aborting the notebook.
    print("❌ Feature metadata has no 'highest_selling_product' entry. Using default highest selling product.")

if metadata_product is not None:
    # A product recorded in the metadata takes precedence over a derived one.
    highest_selling_product = metadata_product
    print(f"✅ Loaded metadata. Product: {highest_selling_product}")
else:
    if fallback_product is None:
        # Derive the product only if the time-series fallback has not
        # already done so.
        preprocessor = M5DataPreprocessor()
        fallback_product = preprocessor.get_highest_selling_product(sales_processed)
    highest_selling_product = fallback_product

print(f"\nTraining on product: {highest_selling_product}")
print(f"Time series date range: {product_ts.index.min()} to {product_ts.index.max()}")

## 2. Data Preparation and Train/Test Split

In [None]:
# Prepare time series data
#
# Splits the cleaned product series chronologically into train / validation /
# test frames and plots them. Raises ValueError when any split is empty.
print("Preparing data for modeling...")

# Remove any rows with missing sales values
product_ts_clean = product_ts.dropna(subset=['sales'])
print(f"Clean time series shape: {product_ts_clean.shape}")

# Define train/validation/test split dates (both cutoffs are inclusive).
# NOTE(review): an earlier comment described the training cutoff as "roughly
# 80%" of the data; verify that against the split sizes printed below.
train_end_date = '2016-03-27'
val_end_date = '2016-04-24'    # Next 28 days for validation
# Test period: remaining dates

print(f"Data split strategy:")
print(f"  Training period: {product_ts_clean.index.min()} to {train_end_date}")
print(f"  Validation period: {train_end_date} to {val_end_date}")
print(f"  Test period: {val_end_date} to {product_ts_clean.index.max()}")

# Create splits
train_data = product_ts_clean[product_ts_clean.index <= train_end_date]
val_data = product_ts_clean[(product_ts_clean.index > train_end_date) &
                          (product_ts_clean.index <= val_end_date)]
test_data = product_ts_clean[product_ts_clean.index > val_end_date]

print(f"\nData split sizes:")
print(f"  Training: {len(train_data)} days")
print(f"  Validation: {len(val_data)} days")
print(f"  Test: {len(test_data)} days")

# Fail early with a clear message: later cells forecast len(split) steps and
# score against each split, which is meaningless for an empty frame (for
# example when the series ends on or before val_end_date).
empty_splits = [
    split_name
    for split_name, split_frame in (
        ('training', train_data),
        ('validation', val_data),
        ('test', test_data),
    )
    if split_frame.empty
]
if empty_splits:
    raise ValueError(
        f"Empty data split(s): {', '.join(empty_splits)}. "
        f"Series covers {product_ts_clean.index.min()} to {product_ts_clean.index.max()}; "
        f"adjust train_end_date / val_end_date."
    )

# Visualize the splits
plt.figure(figsize=(15, 6))
plt.plot(train_data.index, train_data['sales'], label='Training', alpha=0.8)
plt.plot(val_data.index, val_data['sales'], label='Validation', alpha=0.8)
plt.plot(test_data.index, test_data['sales'], label='Test', alpha=0.8)
plt.axvline(x=pd.to_datetime(train_end_date), color='red', linestyle='--', alpha=0.7, label='Train/Val Split')
plt.axvline(x=pd.to_datetime(val_end_date), color='orange', linestyle='--', alpha=0.7, label='Val/Test Split')
plt.title(f'Time Series Data Splits - {highest_selling_product}')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 3. Model 1: SARIMA (Seasonal ARIMA)

In [None]:
# Fit the SARIMA model on the training split.
print("\n🎯 TRAINING SARIMA MODEL")
print("=" * 30)

sarima_model = SarimaModel()

# Only the raw sales values are passed to SARIMA; pull them out of each
# split once so later cells can reuse y_train / y_val / y_test.
y_train, y_val, y_test = (
    split_frame['sales'].values
    for split_frame in (train_data, val_data, test_data)
)

print(f"Training SARIMA on {len(y_train)} observations...")

try:
    # First attempt: default model configuration with a 7-step season.
    sarima_model.fit(y_train, seasonal_periods=7)
    print("✅ SARIMA model trained successfully!")

    # The summary stays inside the try block: a failure while producing it
    # also triggers the fallback below.
    print("\nModel Summary:")
    print(sarima_model.get_model_summary())

except Exception as err:
    print(f"❌ SARIMA training failed: {err}")
    print("Trying with simpler parameters...")

    # Second attempt: explicit low-order specification.
    sarima_model = SarimaModel(order=(1,1,1), seasonal_order=(1,1,1,7))
    sarima_model.fit(y_train)
    print("✅ SARIMA model trained with fallback parameters!")

In [None]:
# Make predictions with SARIMA
#
# Validation forecasts come from the model fitted on the training split.
# Test forecasts come from a second model refitted on train + validation.
print("Making SARIMA predictions...")

# Predict on validation set
sarima_val_pred = sarima_model.predict(len(y_val))
print(f"Validation predictions shape: {sarima_val_pred.shape}")

# Predict on test set
# For SARIMA, we need to retrain on train+val for test predictions
y_train_val = np.concatenate([y_train, y_val])
try:
    sarima_model_full = SarimaModel()
    sarima_model_full.fit(y_train_val, seasonal_periods=7)
except Exception as e:
    # Same fallback as the training cell, so a failure of the default
    # configuration does not abort the notebook at the refit step.
    print(f"❌ SARIMA refit failed: {e}")
    print("Trying with simpler parameters...")
    sarima_model_full = SarimaModel(order=(1,1,1), seasonal_order=(1,1,1,7))
    sarima_model_full.fit(y_train_val)
sarima_test_pred = sarima_model_full.predict(len(y_test))
print(f"Test predictions shape: {sarima_test_pred.shape}")

# Calculate metrics
sarima_val_metrics = calculate_metrics(y_val, sarima_val_pred)
sarima_test_metrics = calculate_metrics(y_test, sarima_test_pred)

print("\n📊 SARIMA Performance:")
print(f"Validation - RMSE: {sarima_val_metrics['rmse']:.3f}, MAE: {sarima_val_metrics['mae']:.3f}, MAPE: {sarima_val_metrics['mape']:.3f}%")
print(f"Test - RMSE: {sarima_test_metrics['rmse']:.3f}, MAE: {sarima_test_metrics['mae']:.3f}, MAPE: {sarima_test_metrics['mape']:.3f}%")

In [None]:
# Plot SARIMA forecasts against the actual series for every split.
plt.figure(figsize=(15, 8))

# (x, y, line options) for each curve, in drawing/legend order.
sarima_curves = [
    (train_data.index, train_data['sales'],
     {'label': 'Training Data', 'alpha': 0.7}),
    (val_data.index, val_data['sales'],
     {'label': 'Validation Actual', 'color': 'green'}),
    (val_data.index, sarima_val_pred,
     {'label': 'SARIMA Validation Pred', 'color': 'red', 'linestyle': '--'}),
    (test_data.index, test_data['sales'],
     {'label': 'Test Actual', 'color': 'blue'}),
    (test_data.index, sarima_test_pred,
     {'label': 'SARIMA Test Pred', 'color': 'orange', 'linestyle': '--'}),
]
for curve_x, curve_y, curve_options in sarima_curves:
    plt.plot(curve_x, curve_y, **curve_options)

# Dotted vertical markers at the two split boundaries.
for boundary_date, boundary_color in ((train_end_date, 'red'), (val_end_date, 'orange')):
    plt.axvline(x=pd.to_datetime(boundary_date), color=boundary_color, linestyle=':', alpha=0.7)

plt.title('SARIMA Model Predictions')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 4. Model 2: LSTM (Long Short-Term Memory)

In [None]:
# Build the LSTM feature matrices and fit the model on the training split.
print("\n🤖 TRAINING LSTM MODEL")
print("=" * 25)

# Candidate features: 'sales' always, plus two optional groups. Each group is
# added as a whole when its first column is present in the training frame.
# NOTE(review): 'sales' is both an input feature and the training target —
# confirm how LSTMModel windows its inputs to rule out target leakage.
feature_columns = ['sales']
optional_feature_groups = (
    ('lag_1', ['lag_1', 'lag_7', 'rolling_mean_7']),
    ('day_of_week', ['day_of_week', 'month']),
)
for gate_column, group_columns in optional_feature_groups:
    if gate_column in train_data.columns:
        feature_columns += group_columns

# Keep only the candidates that actually exist as columns.
available_features = [col for col in feature_columns if col in train_data.columns]
print(f"Using features for LSTM: {available_features}")


def _feature_matrix(split_frame):
    """Return the selected feature columns as an array, with NaN filled by 0."""
    return split_frame[available_features].fillna(0).values


X_train = _feature_matrix(train_data)
X_val = _feature_matrix(val_data)
X_test = _feature_matrix(test_data)

print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")
print(f"Test data shape: {X_test.shape}")

# Model configuration; sequence_length=28 means 28 days of history.
lstm_settings = {
    'sequence_length': 28,
    'n_features': len(available_features),
    'lstm_units': 50,
    'epochs': 50,
    'batch_size': 32,
}
lstm_model = LSTMModel(**lstm_settings)

print("Training LSTM model...")
try:
    lstm_model.fit(X_train, y_train)
    print("✅ LSTM model trained successfully!")

except Exception as err:
    # Any failure marks the LSTM as unavailable; later cells check for None.
    print(f"❌ LSTM training failed: {err}")
    print("Note: LSTM requires TensorFlow/Keras. Install with: pip install tensorflow")
    lstm_model = None

In [None]:
# LSTM forecasts and metrics; placeholders are used when training failed.
if lstm_model is None:
    print("❌ LSTM model not available. Skipping predictions.")
    # Zero forecasts and infinite errors keep the comparison cells runnable.
    lstm_val_pred = np.zeros_like(y_val)
    lstm_test_pred = np.zeros_like(y_test)
    lstm_val_metrics = {'rmse': np.inf, 'mae': np.inf, 'mape': np.inf}
    lstm_test_metrics = {'rmse': np.inf, 'mae': np.inf, 'mape': np.inf}
else:
    print("Making LSTM predictions...")

    # Validation forecasts from the model fitted on the training split.
    # NOTE(review): the prediction length is assumed to match len(y_val) —
    # confirm against LSTMModel.predict given sequence_length=28.
    lstm_val_pred = lstm_model.predict(X_val)
    print(f"Validation predictions shape: {lstm_val_pred.shape}")

    # Test forecasts from a second model refitted on train + validation.
    X_train_val = np.concatenate([X_train, X_val])
    y_train_val = np.concatenate([y_train, y_val])

    refit_settings = {
        'sequence_length': 28,
        'n_features': len(available_features),
        'lstm_units': 50,
        'epochs': 30,  # fewer epochs than the initial fit
        'batch_size': 32,
    }
    lstm_model_full = LSTMModel(**refit_settings)
    lstm_model_full.fit(X_train_val, y_train_val)
    lstm_test_pred = lstm_model_full.predict(X_test)
    print(f"Test predictions shape: {lstm_test_pred.shape}")

    # Score both forecasts.
    lstm_val_metrics = calculate_metrics(y_val, lstm_val_pred)
    lstm_test_metrics = calculate_metrics(y_test, lstm_test_pred)

    print("\n📊 LSTM Performance:")
    print(f"Validation - RMSE: {lstm_val_metrics['rmse']:.3f}, MAE: {lstm_val_metrics['mae']:.3f}, MAPE: {lstm_val_metrics['mape']:.3f}%")
    print(f"Test - RMSE: {lstm_test_metrics['rmse']:.3f}, MAE: {lstm_test_metrics['mae']:.3f}, MAPE: {lstm_test_metrics['mape']:.3f}%")

In [None]:
# Plot LSTM forecasts against the actual series (skipped if training failed).
if lstm_model is None:
    print("LSTM visualization skipped - model not available")
else:
    plt.figure(figsize=(15, 8))

    # (x, y, line options) for each curve, in drawing/legend order.
    lstm_curves = [
        (train_data.index, train_data['sales'],
         {'label': 'Training Data', 'alpha': 0.7}),
        (val_data.index, val_data['sales'],
         {'label': 'Validation Actual', 'color': 'green'}),
        (val_data.index, lstm_val_pred,
         {'label': 'LSTM Validation Pred', 'color': 'red', 'linestyle': '--'}),
        (test_data.index, test_data['sales'],
         {'label': 'Test Actual', 'color': 'blue'}),
        (test_data.index, lstm_test_pred,
         {'label': 'LSTM Test Pred', 'color': 'orange', 'linestyle': '--'}),
    ]
    for curve_x, curve_y, curve_options in lstm_curves:
        plt.plot(curve_x, curve_y, **curve_options)

    # Dotted vertical markers at the two split boundaries.
    for boundary_date, boundary_color in ((train_end_date, 'red'), (val_end_date, 'orange')):
        plt.axvline(x=pd.to_datetime(boundary_date), color=boundary_color, linestyle=':', alpha=0.7)

    plt.title('LSTM Model Predictions')
    plt.xlabel('Date')
    plt.ylabel('Sales')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 5. Model 3: Prophet

In [None]:
# Build Prophet-style frames and fit the model on the training split.
print("\n📈 TRAINING PROPHET MODEL")
print("=" * 28)


def _prophet_frame(split_frame):
    """Return a two-column frame: 'ds' from the index, 'y' from sales."""
    return pd.DataFrame({
        'ds': split_frame.index,
        'y': split_frame['sales'].values
    })


prophet_train = _prophet_frame(train_data)
prophet_val = _prophet_frame(val_data)
prophet_test = _prophet_frame(test_data)

print(f"Prophet training data shape: {prophet_train.shape}")

# All three seasonal components are switched on.
# NOTE(review): confirm that daily_seasonality is meaningful for this series'
# sampling frequency and how ProphetModel forwards these flags.
seasonality_flags = {
    'daily_seasonality': True,
    'weekly_seasonality': True,
    'yearly_seasonality': True,
}
prophet_model = ProphetModel(**seasonality_flags)

print("Training Prophet model...")
try:
    prophet_model.fit(prophet_train)
    print("✅ Prophet model trained successfully!")

except Exception as err:
    # Any failure marks Prophet as unavailable; later cells check for None.
    print(f"❌ Prophet training failed: {err}")
    print("Note: Prophet requires specific installation. Install with: pip install prophet")
    prophet_model = None

In [None]:
# Prophet forecasts and metrics; placeholders are used when training failed.
if prophet_model is None:
    print("❌ Prophet model not available. Skipping predictions.")
    # Zero forecasts and infinite errors keep the comparison cells runnable.
    prophet_val_pred = np.zeros_like(y_val)
    prophet_test_pred = np.zeros_like(y_test)
    prophet_val_metrics = {'rmse': np.inf, 'mae': np.inf, 'mape': np.inf}
    prophet_test_metrics = {'rmse': np.inf, 'mae': np.inf, 'mape': np.inf}
else:
    print("Making Prophet predictions...")

    # Validation forecasts: the 'yhat' column of the forecast frame.
    val_forecast = prophet_model.predict(len(y_val))
    prophet_val_pred = val_forecast['yhat'].values
    print(f"Validation predictions shape: {prophet_val_pred.shape}")

    # Test forecasts from a second model refitted on train + validation.
    train_val_frame = pd.concat([train_data, val_data])
    prophet_train_val = pd.DataFrame({
        'ds': train_val_frame.index,
        'y': np.concatenate([y_train, y_val])
    })

    refit_seasonality = {
        'daily_seasonality': True,
        'weekly_seasonality': True,
        'yearly_seasonality': True,
    }
    prophet_model_full = ProphetModel(**refit_seasonality)
    prophet_model_full.fit(prophet_train_val)
    test_forecast = prophet_model_full.predict(len(y_test))
    prophet_test_pred = test_forecast['yhat'].values
    print(f"Test predictions shape: {prophet_test_pred.shape}")

    # Score both forecasts.
    prophet_val_metrics = calculate_metrics(y_val, prophet_val_pred)
    prophet_test_metrics = calculate_metrics(y_test, prophet_test_pred)

    print("\n📊 Prophet Performance:")
    print(f"Validation - RMSE: {prophet_val_metrics['rmse']:.3f}, MAE: {prophet_val_metrics['mae']:.3f}, MAPE: {prophet_val_metrics['mape']:.3f}%")
    print(f"Test - RMSE: {prophet_test_metrics['rmse']:.3f}, MAE: {prophet_test_metrics['mae']:.3f}, MAPE: {prophet_test_metrics['mape']:.3f}%")

In [None]:
# Plot Prophet forecasts against the actual series (skipped if training failed).
if prophet_model is None:
    print("Prophet visualization skipped - model not available")
else:
    plt.figure(figsize=(15, 8))

    # (x, y, line options) for each curve, in drawing/legend order.
    prophet_curves = [
        (train_data.index, train_data['sales'],
         {'label': 'Training Data', 'alpha': 0.7}),
        (val_data.index, val_data['sales'],
         {'label': 'Validation Actual', 'color': 'green'}),
        (val_data.index, prophet_val_pred,
         {'label': 'Prophet Validation Pred', 'color': 'red', 'linestyle': '--'}),
        (test_data.index, test_data['sales'],
         {'label': 'Test Actual', 'color': 'blue'}),
        (test_data.index, prophet_test_pred,
         {'label': 'Prophet Test Pred', 'color': 'orange', 'linestyle': '--'}),
    ]
    for curve_x, curve_y, curve_options in prophet_curves:
        plt.plot(curve_x, curve_y, **curve_options)

    # Dotted vertical markers at the two split boundaries.
    for boundary_date, boundary_color in ((train_end_date, 'red'), (val_end_date, 'orange')):
        plt.axvline(x=pd.to_datetime(boundary_date), color=boundary_color, linestyle=':', alpha=0.7)

    plt.title('Prophet Model Predictions')
    plt.xlabel('Date')
    plt.ylabel('Sales')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 6. Model Comparison and Evaluation

In [None]:
# Collect every model's metrics into one table and pick the best by test RMSE.
print("\n🏆 MODEL COMPARISON")
print("=" * 40)

# Metric dicts per split, ordered to match the 'Model' column.
metrics_by_split = {
    'Validation': (sarima_val_metrics, lstm_val_metrics, prophet_val_metrics),
    'Test': (sarima_test_metrics, lstm_test_metrics, prophet_test_metrics),
}

# Columns come out as Validation_RMSE/MAE/MAPE followed by Test_RMSE/MAE/MAPE.
results_data = {'Model': ['SARIMA', 'LSTM', 'Prophet']}
for split_label, per_model_metrics in metrics_by_split.items():
    for metric_key in ('rmse', 'mae', 'mape'):
        column_name = f"{split_label}_{metric_key.upper()}"
        results_data[column_name] = [scores[metric_key] for scores in per_model_metrics]

results_df = pd.DataFrame(results_data)
print(results_df.to_string(index=False, float_format='%.3f'))

# Lowest test RMSE wins.
# NOTE(review): selecting the winner on the test split makes the reported
# test error optimistic; consider selecting on validation RMSE instead.
best_model_idx = results_df['Test_RMSE'].idxmin()
best_model_name = results_df.loc[best_model_idx, 'Model']
print(f"\n🥇 Best performing model: {best_model_name}")
for metric_label, unit_suffix in (('RMSE', ''), ('MAE', ''), ('MAPE', '%')):
    best_value = results_df.loc[best_model_idx, f'Test_{metric_label}']
    print(f"   Test {metric_label}: {best_value:.3f}{unit_suffix}")

In [None]:
# Visualize model comparison
#
# Draws a 2x2 grid of bar charts: validation/test RMSE on the top row and
# validation/test MAPE on the bottom row.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Models that failed to train carry np.inf placeholder metrics. Mask those
# values to NaN so the unavailable model is simply left without a bar
# instead of handing a non-finite height to the plotting call.
plot_df = results_df.replace([np.inf, -np.inf], np.nan)

# (column, title, y-label) per panel; axes.flat visits [0,0], [0,1], [1,0],
# [1,1] in that order.
panels = [
    ('Validation_RMSE', 'Validation RMSE', 'RMSE'),
    ('Test_RMSE', 'Test RMSE', 'RMSE'),
    ('Validation_MAPE', 'Validation MAPE', 'MAPE (%)'),
    ('Test_MAPE', 'Test MAPE', 'MAPE (%)'),
]
for ax, (column, title, ylabel) in zip(axes.flat, panels):
    ax.bar(plot_df['Model'], plot_df[column])
    ax.set_title(title)
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Overlay every available model's test forecast on the actual test series.
plt.figure(figsize=(15, 8))


def _plot_forecast(model_label, forecast_values):
    """Draw one model's test forecast as a dashed line."""
    plt.plot(test_data.index, forecast_values, label=model_label, alpha=0.8, linestyle='--')


# Ground truth first, as a thick black line.
plt.plot(test_data.index, test_data['sales'], label='Actual', color='black', linewidth=2)

# SARIMA is always drawn; LSTM and Prophet only when they were trained.
_plot_forecast('SARIMA', sarima_test_pred)
if lstm_model is not None:
    _plot_forecast('LSTM', lstm_test_pred)
if prophet_model is not None:
    _plot_forecast('Prophet', prophet_test_pred)

plt.title('Model Predictions Comparison - Test Set')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 7. Save Trained Models

In [None]:
# Persist the trained models and the comparison table.
models_dir = config.get('models.model_path', 'models/')
os.makedirs(models_dir, exist_ok=True)

print("Saving trained models...")


def _persist_model(model, model_label, file_name):
    """Save one model under models_dir and report success or failure."""
    target_path = f"{models_dir}/{file_name}"
    try:
        model.save_model(target_path)
        print(f"✅ {model_label} model saved to {target_path}")
    except Exception as err:
        print(f"❌ Failed to save {model_label} model: {err}")


# NOTE(review): the objects saved here are the ones fitted on the training
# split only, not the train+validation refits used for the test forecasts.
_persist_model(sarima_model, 'SARIMA', 'sarima_model.pkl')

if lstm_model is None:
    print("⚠️  LSTM model not available for saving")
else:
    _persist_model(lstm_model, 'LSTM', 'lstm_model.h5')

if prophet_model is None:
    print("⚠️  Prophet model not available for saving")
else:
    _persist_model(prophet_model, 'Prophet', 'prophet_model.pkl')

# Comparison table next to the models.
results_csv_path = f"{models_dir}/model_comparison_results.csv"
results_df.to_csv(results_csv_path, index=False)
print(f"✅ Results summary saved to {results_csv_path}")

In [None]:
# Save detailed training metadata
#
# Writes a JSON file describing the product, the three split periods, every
# model's metrics, the selected best model and the LSTM feature list.
import json
import math


def _json_safe(value):
    """Return *value* with non-finite floats replaced by None, recursively.

    Models that failed to train carry np.inf placeholder metrics. The json
    module would emit those as ``Infinity``/``NaN``, which strict JSON
    parsers reject, so they are written as ``null`` instead.
    """
    if isinstance(value, dict):
        return {key: _json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    if isinstance(value, float) and not math.isfinite(value):
        return None
    return value


training_metadata = {
    'product_id': highest_selling_product,
    'training_period': {
        'start': str(train_data.index.min()),
        'end': str(train_data.index.max()),
        'days': len(train_data)
    },
    'validation_period': {
        'start': str(val_data.index.min()),
        'end': str(val_data.index.max()),
        'days': len(val_data)
    },
    'test_period': {
        'start': str(test_data.index.min()),
        'end': str(test_data.index.max()),
        'days': len(test_data)
    },
    'model_performance': results_df.to_dict('records'),
    'best_model': best_model_name,
    'features_used': available_features,
    'data_splits': {
        'train_end_date': train_end_date,
        'val_end_date': val_end_date
    }
}

with open(f"{models_dir}/training_metadata.json", 'w', encoding='utf-8') as f:
    # default=str stringifies any remaining non-JSON type; allow_nan=False
    # makes the writer refuse non-finite floats that slipped past _json_safe.
    json.dump(_json_safe(training_metadata), f, indent=2, default=str, allow_nan=False)

print(f"✅ Training metadata saved to {models_dir}/training_metadata.json")

## 8. Training Summary

In [None]:
# Final recap: product, split sizes, which models trained, the best model,
# the artifact paths and install hints for anything that was unavailable.
print("\n🎯 MODEL TRAINING COMPLETE!")
print("=" * 40)

print(f"\n📊 TRAINED ON PRODUCT: {highest_selling_product}")
print(f"   Training period: {len(train_data)} days")
print(f"   Validation period: {len(val_data)} days")
print(f"   Test period: {len(test_data)} days")

# A model counts as trained when its variable is not None. Explicit
# `is not None` checks are used throughout so the result never depends on
# how a model class defines truthiness.
print("\n🤖 MODELS TRAINED:")
models_trained = []
if sarima_model is not None:
    models_trained.append("✅ SARIMA")
if lstm_model is not None:
    models_trained.append("✅ LSTM")
else:
    models_trained.append("❌ LSTM (TensorFlow required)")
if prophet_model is not None:
    models_trained.append("✅ Prophet")
else:
    models_trained.append("❌ Prophet (Prophet package required)")

for model in models_trained:
    print(f"   {model}")

print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"   Test RMSE: {results_df.loc[best_model_idx, 'Test_RMSE']:.3f}")
print(f"   Test MAPE: {results_df.loc[best_model_idx, 'Test_MAPE']:.3f}%")

# NOTE(review): these paths are listed without checking that the save step
# succeeded.
print("\n💾 FILES SAVED:")
print(f"   • {models_dir}/sarima_model.pkl")
if lstm_model is not None:
    print(f"   • {models_dir}/lstm_model.h5")
if prophet_model is not None:
    print(f"   • {models_dir}/prophet_model.pkl")
print(f"   • {models_dir}/model_comparison_results.csv")
print(f"   • {models_dir}/training_metadata.json")

print("\n🔧 TO INSTALL MISSING DEPENDENCIES:")
if lstm_model is None:
    print("   pip install tensorflow")
if prophet_model is None:
    print("   pip install prophet")

print("\n➡️  Next step: Run 04_model_evaluation.ipynb for detailed analysis")