# M5 Walmart Sales Forecasting - Feature Engineering

This notebook performs comprehensive feature engineering on the M5 dataset to prepare data for time series modeling.

## Feature Engineering Strategy

1. **Lag Features**: Previous sales values
2. **Rolling Statistics**: Moving averages and standard deviations
3. **Temporal Features**: Date-based features
4. **Price Features**: Price changes and trends
5. **Event Features**: Holiday and event indicators
6. **Hierarchical Features**: Category and store aggregations

In [None]:
# Import necessary libraries
import sys
import os

# Add src to path for imports
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm import tqdm

from src.data.data_loader import M5DataLoader
from src.data.preprocessing import M5DataPreprocessor
from src.visualization.plots import M5Visualizer
from src.utils.config import get_config
from src.utils.logger import setup_logger

# Setup
# NOTE(review): this filter silences every warning for the rest of the
# session, including library deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')
logger = setup_logger('feature_engineering')
config = get_config()

# Initialize preprocessor and visualizer
# Both are constructed with no arguments. `preprocessor` performs every
# feature-engineering step in the cells that follow.
preprocessor = M5DataPreprocessor()
visualizer = M5Visualizer()

print("Libraries imported successfully!")

## 1. Load and Prepare Data

In [None]:
# Locate the raw data directory via config ('data/raw/' is handed to
# config.get as the second argument — presumably the fallback) and load it.
data_path = config.get('data.raw_data_path', 'data/raw/')
loader = M5DataLoader(data_path)

# load_all_data() is unpacked in (calendar, sales, prices) order
calendar, sales, prices = loader.load_all_data()
print("Data loaded successfully!")

# Report the shape of each loaded table
for table_label, table in (('Sales', sales), ('Calendar', calendar), ('Prices', prices)):
    print(f"{table_label} data shape: {table.shape}")

In [None]:
# Turn the loaded sales table into long format with the preprocessor,
# passing the calendar table alongside it.
print("Reshaping sales data to long format...")
sales_long = preprocessor.reshape_sales_data(sales, calendar)

# Show the resulting shape and the first five rows
long_shape = sales_long.shape
print(f"Reshaped data shape: {long_shape}")
print("\nSample of reshaped data:")
print(sales_long.head(5))

## 2. Create Lag Features

In [None]:
# Which lags to build is read from config; the list literal is the value
# handed to config.get as its second argument.
lag_features = config.get('features.lag_features', [1, 2, 3, 7, 14, 28])
print(f"Creating lag features for lags: {lag_features}")

# Delegate lag construction on the 'sales' column to the preprocessor
lag_kwargs = {'target_col': 'sales', 'lags': lag_features}
sales_with_lags = preprocessor.create_lag_features(sales_long, **lag_kwargs)

print(f"Data shape after adding lag features: {sales_with_lags.shape}")
print("\nLag features created:")
# Collect every column whose name carries the 'lag_' prefix
lag_cols = [name for name in sales_with_lags.columns if name.startswith('lag_')]
print(lag_cols)

In [None]:
# Take the id found in the first row and preview that item's first ten rows
sample_item = sales_with_lags['id'].iloc[0]
is_sample_item = sales_with_lags['id'] == sample_item
sample_data = sales_with_lags.loc[is_sample_item].head(10)

print(f"Lag features for sample item {sample_item}:")
# Show date and target next to every lag column
display_cols = ['date', 'sales', *lag_cols]
print(sample_data[display_cols])

## 3. Create Rolling Features

In [None]:
# Window sizes are read from config; the list literal is the value handed
# to config.get as its second argument.
rolling_windows = config.get('features.rolling_windows', [7, 14, 28])
print(f"Creating rolling features for windows: {rolling_windows}")

# Delegate rolling-statistic construction on 'sales' to the preprocessor
rolling_kwargs = {'target_col': 'sales', 'windows': rolling_windows}
sales_with_rolling = preprocessor.create_rolling_features(sales_with_lags, **rolling_kwargs)

print(f"Data shape after adding rolling features: {sales_with_rolling.shape}")
print("\nRolling features created:")
# Collect every column whose name carries the 'rolling_' prefix
rolling_cols = [name for name in sales_with_rolling.columns if name.startswith('rolling_')]
print(rolling_cols)

In [None]:
# Plot a 100-row slice (positions 50-149) of the sample item's series
# together with its 7/14/28-day rolling means.
sample_item_rows = sales_with_rolling[sales_with_rolling['id'] == sample_item]
sample_data_rolling = sample_item_rows.iloc[50:150]

plt.figure(figsize=(15, 8))
plt.plot(sample_data_rolling['date'], sample_data_rolling['sales'], label='Actual Sales', alpha=0.7)

# One line per rolling-mean column, drawn in increasing window order
moving_average_lines = (
    ('rolling_mean_7', '7-day MA'),
    ('rolling_mean_14', '14-day MA'),
    ('rolling_mean_28', '28-day MA'),
)
for ma_column, ma_label in moving_average_lines:
    plt.plot(sample_data_rolling['date'], sample_data_rolling[ma_column], label=ma_label)

plt.title(f'Rolling Averages for {sample_item}')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 4. Create Temporal Features

In [None]:
# Let the preprocessor append date-derived columns to the rolling-feature frame
print("Creating temporal features...")
sales_with_temporal = preprocessor.create_temporal_features(sales_with_rolling)

print(f"Data shape after adding temporal features: {sales_with_temporal.shape}")
print("\nTemporal features created:")
# Names of the temporal columns this notebook looks for; only those actually
# present in the frame are printed.
temporal_cols = [
    'day_of_week', 'day_of_month', 'day_of_year', 'week_of_year',
    'month', 'quarter', 'year', 'is_weekend', 'is_month_start', 'is_month_end',
]
present_temporal_cols = [name for name in temporal_cols if name in sales_with_temporal.columns]
print(present_temporal_cols)

In [None]:
# Print mean sales per weekday and per calendar month
print("Temporal Pattern Analysis:")
print("=" * 30)

# Weekday breakdown: the group key is used directly as an index into
# dow_names, so key 0 is printed as Monday.
dow_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_sales = sales_with_temporal.groupby('day_of_week')['sales'].mean()
print("\nAverage sales by day of week:")
for dow, sales_avg in dow_sales.items():
    weekday_label = dow_names[dow]
    print(f"{weekday_label}: {sales_avg:.2f}")

# Month breakdown: month keys are shifted by one to index month_names,
# so key 1 is printed as Jan.
month_names = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]
month_sales = sales_with_temporal.groupby('month')['sales'].mean()
print("\nAverage sales by month:")
for month, sales_avg in month_sales.items():
    month_label = month_names[month-1]
    print(f"{month_label}: {sales_avg:.2f}")

In [None]:
# Draw a 2x2 grid of bar charts: weekday, month, weekend flag and quarter
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# The two aggregates not computed in an earlier cell
weekend_sales = sales_with_temporal.groupby('is_weekend')['sales'].mean()
quarter_sales = sales_with_temporal.groupby('quarter')['sales'].mean()

# (axis, series, title, x-label) for each panel, in drawing order
panels = [
    (axes[0,0], dow_sales, 'Average Sales by Day of Week', 'Day of Week (0=Monday)'),
    (axes[0,1], month_sales, 'Average Sales by Month', 'Month'),
    (axes[1,0], weekend_sales, 'Average Sales: Weekday vs Weekend', 'Is Weekend (0=No, 1=Yes)'),
    (axes[1,1], quarter_sales, 'Average Sales by Quarter', 'Quarter'),
]
for panel_ax, panel_series, panel_title, panel_xlabel in panels:
    panel_series.plot(kind='bar', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel('Average Sales')

plt.tight_layout()
plt.show()

## 5. Create Price Features

In [None]:
# Let the preprocessor combine the prices table with the feature frame
print("Creating price features...")
sales_with_prices = preprocessor.create_price_features(sales_with_temporal, prices)

print(f"Data shape after adding price features: {sales_with_prices.shape}")
print("\nPrice features created:")
# Any column whose lower-cased name contains 'price' counts as a price feature
price_cols = [name for name in sales_with_prices.columns if 'price' in name.lower()]
print(price_cols)

# Share of rows that have a non-null sell_price
has_sell_price = sales_with_prices['sell_price'].notna()
price_coverage = has_sell_price.mean()
print(f"\nPrice coverage: {price_coverage:.2%}")

In [None]:
# Analyze price-sales relationship: print two correlations and draw two
# scatter plots from a random subsample of rows.
print("Price-Sales Relationship Analysis:")
print("=" * 35)

# Correlation between price and sales (off-diagonal entry of the 2x2 matrix)
price_sales_corr = sales_with_prices[['sales', 'sell_price']].corr().iloc[0,1]
print(f"Correlation between price and sales: {price_sales_corr:.3f}")

# Correlation between sales and percentage price change. This is a plain
# correlation, not an elasticity estimate.
price_change_sales_corr = sales_with_prices[['sales', 'price_change_pct']].corr().iloc[0,1]
print(f"Correlation between sales and price change %: {price_change_sales_corr:.3f}")

# Scatter plots use at most this many points. The cap is clamped to the
# number of available rows because DataFrame.sample raises ValueError when
# asked for more rows than exist (without replacement). A fixed seed keeps
# the figure reproducible between runs.
max_scatter_points = 10000
scatter_seed = 42

# Visualize price-sales relationship
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
priced_rows = sales_with_prices[sales_with_prices['sell_price'].notna()]
sample_price_data = priced_rows.sample(
    n=min(max_scatter_points, len(priced_rows)),
    random_state=scatter_seed,
)
plt.scatter(sample_price_data['sell_price'], sample_price_data['sales'], alpha=0.1)
plt.xlabel('Price')
plt.ylabel('Sales')
plt.title('Price vs Sales Relationship')

plt.subplot(1, 2, 2)
price_change_rows = sales_with_prices[sales_with_prices['price_change_pct'].notna()]
price_change_data = price_change_rows.sample(
    n=min(max_scatter_points, len(price_change_rows)),
    random_state=scatter_seed,
)
plt.scatter(price_change_data['price_change_pct'], price_change_data['sales'], alpha=0.1)
plt.xlabel('Price Change %')
plt.ylabel('Sales')
plt.title('Price Change vs Sales')

plt.tight_layout()
plt.show()

## 6. Create Event Features

In [None]:
# Let the preprocessor append event-related columns to the priced frame
print("Creating event features...")
sales_with_events = preprocessor.create_event_features(sales_with_prices)

print(f"Data shape after adding event features: {sales_with_events.shape}")
print("\nEvent features created:")
# A column counts as an event feature when its lower-cased name contains
# either 'event' or 'snap'.
event_cols = [
    name for name in sales_with_events.columns
    if any(token in name.lower() for token in ('event', 'snap'))
]
print(event_cols)

In [None]:
# Compare mean sales between flag values for the any-event flag, the SNAP
# flag and each event-type column, printing the relative uplift in percent.
print("Event Impact Analysis:")
print("=" * 25)

# Any-event flag: group key 0 is reported as "without", key 1 as "with"
event_sales = sales_with_events.groupby('has_any_event')['sales'].mean()
print("Average sales:")
print(f"  Without events: {event_sales[0]:.2f}")
print(f"  With events: {event_sales[1]:.2f}")
event_uplift_pct = (event_sales[1]/event_sales[0] - 1)*100
print(f"  Event uplift: {event_uplift_pct:.1f}%")

# SNAP flag, reported the same way
snap_sales = sales_with_events.groupby('snap_any')['sales'].mean()
print("\nSNAP benefits impact:")
print(f"  Without SNAP: {snap_sales[0]:.2f}")
print(f"  With SNAP: {snap_sales[1]:.2f}")
snap_uplift_pct = (snap_sales[1]/snap_sales[0] - 1)*100
print(f"  SNAP uplift: {snap_uplift_pct:.1f}%")

# Per event type: columns with a single observed value are skipped because
# there is nothing to compare against.
print("\nEvent type impact:")
event_type_cols = [name for name in sales_with_events.columns if name.startswith('event_type_')]
for col in event_type_cols:
    event_type_sales = sales_with_events.groupby(col)['sales'].mean()
    if len(event_type_sales) <= 1:
        continue
    uplift = (event_type_sales[1]/event_type_sales[0] - 1)*100
    event_type_label = col.replace('event_type_', '').title()
    print(f"  {event_type_label}: {uplift:.1f}% uplift")

## 7. Handle Missing Values and Data Quality

In [None]:
# Tabulate, per column, how many values are null and what share of rows that is
print("Missing Values Analysis:")
print("=" * 25)

missing_values = sales_with_events.isna().sum()
missing_pct = missing_values.div(len(sales_with_events)).mul(100)

# Build the summary sorted by share (largest first), then keep only the
# columns that actually contain nulls.
missing_summary = (
    pd.DataFrame({
        'Missing_Count': missing_values,
        'Missing_Percentage': missing_pct,
    })
    .sort_values('Missing_Percentage', ascending=False)
    .loc[lambda summary: summary['Missing_Count'] > 0]
)
print(missing_summary)

In [None]:
# Handle missing values in lag, rolling and price features (in place on
# sales_with_events), then report how many nulls remain in the whole frame.
print("Handling missing values...")

lag_cols = [col for col in sales_with_events.columns if col.startswith('lag_')]
rolling_cols = [col for col in sales_with_events.columns if col.startswith('rolling_')]

# Lag/rolling features: BACK-fill within each item id, then set whatever is
# still null to 0. The groupby .bfill() method replaces the deprecated
# fillna(method='bfill') spelling and yields the same values; the final
# constant fill does not need a groupby.
# NOTE(review): back-filling copies a later row's value into earlier rows of
# the same series. If these nulls are the warm-up rows at the start of each
# series, consider dropping those rows instead to avoid look-ahead leakage.
for col in lag_cols + rolling_cols:
    sales_with_events[col] = sales_with_events.groupby('id')[col].bfill().fillna(0)

# Price features: forward-fill, then back-fill within each (store, item)
# pair; anything still null falls back to the column-wide median.
price_feature_cols = ['sell_price', 'price_change', 'price_change_pct', 'price_momentum_7']
price_group_keys = ['store_id', 'item_id']
for col in price_feature_cols:
    if col not in sales_with_events.columns:
        continue
    sales_with_events[col] = sales_with_events.groupby(price_group_keys)[col].ffill()
    sales_with_events[col] = sales_with_events.groupby(price_group_keys)[col].bfill()
    sales_with_events[col] = sales_with_events[col].fillna(sales_with_events[col].median())

print("Missing values handled.")

# Check remaining missing values across every column
remaining_missing = sales_with_events.isnull().sum().sum()
print(f"Remaining missing values: {remaining_missing}")

## 8. Feature Selection and Engineering Summary

In [None]:
# Summarize engineered features: print dataset shapes, the number of columns
# added by feature engineering, and a per-category feature listing.
print("FEATURE ENGINEERING SUMMARY")
print("=" * 40)

print(f"Final dataset shape: {sales_with_events.shape}")
print(f"Original sales dataset shape: {sales.shape}")
print(f"Long-format sales dataset shape: {sales_long.shape}")
# `sales` is the table as loaded, before the reshape to long format, so its
# column count is not a meaningful baseline for "features added". Compare
# against the long-format frame the feature pipeline started from.
# NOTE(review): assumes the preprocessor did not add columns to sales_long
# in place — confirm.
print(f"Features added: {sales_with_events.shape[1] - sales_long.shape[1]}")

# Categorize features. 'Original' is a fixed list; every other category is
# derived from the columns actually present. A column can appear in more
# than one category (e.g. price/event columns that also come from calendar).
feature_categories = {
    'Original': ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'sales'],
    'Temporal': [col for col in sales_with_events.columns if col in temporal_cols],
    'Lag': [col for col in sales_with_events.columns if col.startswith('lag_')],
    'Rolling': [col for col in sales_with_events.columns if col.startswith('rolling_')],
    'Price': [col for col in sales_with_events.columns if 'price' in col.lower()],
    'Event': [col for col in sales_with_events.columns if 'event' in col.lower() or 'snap' in col.lower()],
    'Calendar': [col for col in sales_with_events.columns if col in calendar.columns and col not in ['d', 'date']]
}

print("\nFeature categories:")
for category, features in feature_categories.items():
    print(f"  {category}: {len(features)} features")
    # Long lists are abbreviated to the first five and last two names
    if len(features) <= 10:
        print(f"    {features}")
    else:
        print(f"    {features[:5]} ... {features[-2:]}")

In [None]:
# Feature importance analysis: rank numeric features by absolute Pearson
# correlation with the sales target and plot the top 15.
print("\nFeature Correlation with Sales Target:")
print("=" * 40)

# Select numeric features for correlation analysis
numeric_features = sales_with_events.select_dtypes(include=[np.number]).columns

# Only the correlations against 'sales' are needed, so correlate each column
# with the target directly instead of building the full feature-by-feature
# matrix and reading one column out of it. 'sales' itself stays in this
# series (correlation 1.0) so later consumers see the same entries as before.
feature_correlations = (
    sales_with_events[numeric_features]
    .corrwith(sales_with_events['sales'])
    .abs()
    .sort_values(ascending=False)
)

# Exclude the target by label rather than by assuming it sorts first
top_features = feature_correlations.drop('sales', errors='ignore').head(15)

print("Top 15 features by correlation with sales:")
print(top_features)

# Visualize feature correlations as a horizontal bar chart, strongest on top
plt.figure(figsize=(10, 6))
plt.barh(range(len(top_features)), top_features.values)
plt.yticks(range(len(top_features)), top_features.index)
plt.xlabel('Absolute Correlation with Sales')
plt.title('Top 15 Features by Correlation with Sales')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 9. Prepare Data for Modeling

In [None]:
# Candidate modeling columns, organised by role. The groups are flattened in
# declaration order to form the final candidate list.
feature_groups = {
    'identifier': ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'date'],
    'target': ['sales'],
    'lag': ['lag_1', 'lag_2', 'lag_3', 'lag_7', 'lag_14', 'lag_28'],
    'rolling': [
        'rolling_mean_7', 'rolling_mean_14', 'rolling_mean_28',
        'rolling_std_7', 'rolling_std_14', 'rolling_std_28',
    ],
    'temporal': ['day_of_week', 'day_of_month', 'month', 'quarter', 'is_weekend'],
    'price': ['sell_price', 'price_change_pct'],
    'event': ['has_any_event', 'snap_any'],
    'calendar': ['wm_yr_wk', 'wday'],
}
modeling_features = [name for group in feature_groups.values() for name in group]

# Keep only the candidates that are present in the engineered frame, and
# take an independent copy restricted to those columns.
existing_columns = set(sales_with_events.columns)
available_features = [name for name in modeling_features if name in existing_columns]
modeling_data = sales_with_events[available_features].copy()

print(f"Selected {len(available_features)} features for modeling:")
print(available_features)
print(f"\nModeling dataset shape: {modeling_data.shape}")

In [None]:
# Ask the preprocessor which product sells most, based on the loaded sales table
highest_selling_product = preprocessor.get_highest_selling_product(sales)
print(f"Highest selling product: {highest_selling_product}")

# Build that product's time series from the modeling frame
product_time_series = preprocessor.prepare_time_series_data(modeling_data, highest_selling_product)
print(f"Time series data shape: {product_time_series.shape}")
# The range is read from the result's index and labelled as a date range
ts_index = product_time_series.index
print(f"Date range: {ts_index.min()} to {ts_index.max()}")

# Preview the first ten rows
print("\nSample time series data:")
print(product_time_series.head(10))

## 10. Save Processed Data

In [None]:
# Save processed data: the full modeling frame (parquet), the top product's
# time series (CSV) and a JSON file describing the engineered features.
import json

processed_data_path = config.get('data.processed_data_path', 'data/processed/')
os.makedirs(processed_data_path, exist_ok=True)

# Build output paths with os.path.join so a directory that already ends in
# a separator (as the default value here does) does not yield "dir//file".
parquet_path = os.path.join(processed_data_path, 'sales_processed.parquet')
time_series_path = os.path.join(processed_data_path, 'highest_selling_product_ts.csv')
metadata_path = os.path.join(processed_data_path, 'feature_metadata.json')

print("Saving processed data...")

# Save full processed dataset (row index is not written)
modeling_data.to_parquet(parquet_path, index=False)
print(f"Full processed dataset saved to {parquet_path}")

# Save time series data for highest selling product (index is written)
product_time_series.to_csv(time_series_path)
print(f"Product time series saved to {time_series_path}")

# Save feature engineering metadata
feature_metadata = {
    'highest_selling_product': highest_selling_product,
    'feature_categories': feature_categories,
    'modeling_features': available_features,
    'dataset_shape': modeling_data.shape,
    'feature_correlations': feature_correlations.head(20).to_dict(),
    'processing_config': {
        'lag_features': lag_features,
        'rolling_windows': rolling_windows,
        'temporal_features': temporal_cols
    }
}

# default=str stringifies any value json cannot serialise natively.
# An explicit encoding keeps the file identical across platforms.
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(feature_metadata, f, indent=2, default=str)

print(f"Feature metadata saved to {metadata_path}")

## 11. Feature Engineering Summary

In [None]:
# Closing report: feature counts, headline insights, saved files and
# modeling readiness. Values are derived from the objects built above
# rather than hard-coded, so the report matches the configuration used.
print("FEATURE ENGINEERING COMPLETE!")
print("=" * 40)

# Describe the lags/windows that were actually configured
lag_description = ', '.join(str(lag) for lag in lag_features)
window_description = ', '.join(str(window) for window in rolling_windows)

print("\n✅ FEATURES CREATED:")
print(f"   • {len(lag_cols)} lag features ({lag_description} days)")
print(f"   • {len(rolling_cols)} rolling statistics ({window_description} day windows)")
print(f"   • {len([col for col in sales_with_events.columns if col in temporal_cols])} temporal features")
print(f"   • {len([col for col in sales_with_events.columns if 'price' in col.lower()])} price features")
print(f"   • {len([col for col in sales_with_events.columns if 'event' in col.lower() or 'snap' in col.lower()])} event features")

print("\n📊 DATA INSIGHTS:")
print(f"   • Event uplift: {(event_sales[1]/event_sales[0] - 1)*100:.1f}%")
print(f"   • SNAP benefits uplift: {(snap_sales[1]/snap_sales[0] - 1)*100:.1f}%")
print(f"   • Weekend vs weekday sales: {(weekend_sales[1]/weekend_sales[0] - 1)*100:.1f}%")
print(f"   • Price-sales correlation: {price_sales_corr:.3f}")

# os.path.join avoids a doubled separator when the directory already ends in one
print("\n💾 FILES SAVED:")
for saved_name in ('sales_processed.parquet', 'highest_selling_product_ts.csv', 'feature_metadata.json'):
    print(f"   • {os.path.join(processed_data_path, saved_name)}")

print("\n🎯 READY FOR MODELING:")
print(f"   • Dataset shape: {modeling_data.shape}")
print(f"   • Features selected: {len(available_features)}")
print(f"   • Highest selling product: {highest_selling_product}")
# Verify the "no missing values" claim against the selected modeling columns
# instead of asserting it unconditionally.
modeling_missing = int(modeling_data.isnull().sum().sum())
if modeling_missing == 0:
    print("   • No missing values in key features")
else:
    print(f"   • WARNING: {modeling_missing} missing values remain in selected features")

print("\n➡️  Next step: Run 03_model_training.ipynb")