# M5 Walmart Sales Forecasting - Data Exploration

This notebook performs comprehensive exploratory data analysis (EDA) on the M5 Walmart Sales dataset.

## Dataset Overview

The M5 dataset contains:
- **calendar.csv**: Date information with events and holidays
- **sales_train_validation.csv**: Historical daily unit sales data
- **sell_prices.csv**: Weekly sell price of each item in each store
- **sample_submission.csv**: Sample submission format

In [None]:
# Import necessary libraries
import sys
import os

# Add src to path for imports
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from src.data.data_loader import M5DataLoader, DataValidator
from src.visualization.plots import M5Visualizer
from src.utils.config import get_config
from src.utils.logger import setup_logger

# Setup
# Silence every warning category for the rest of the session so cell output
# stays readable.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — confirm that is intended.
warnings.filterwarnings('ignore')
# Logger dedicated to this notebook (created by src.utils.logger.setup_logger).
logger = setup_logger('eda_notebook')
# Project configuration; later cells read directory paths from it via
# config.get(key, default).
config = get_config()

# Initialize visualizer
# Created once here and reused by the plotting cells in section 5.
visualizer = M5Visualizer()

print("Libraries imported successfully!")

## 1. Data Loading and Basic Information

In [None]:
# Load the three raw tables (calendar, sales, prices) through the project
# data loader. The directory comes from the config, falling back to
# 'data/raw/' when the key is not set.
data_path = config.get('data.raw_data_path', 'data/raw/')
loader = M5DataLoader(data_path)

try:
    calendar, sales, prices = loader.load_all_data()
except Exception as e:
    print(f"Error loading data: {e}")
    print("Please ensure data files are in the correct directory")
    # Every later cell needs calendar/sales/prices. Stop here with the real
    # cause instead of letting a later cell fail with a misleading NameError.
    raise
else:
    print("Data loaded successfully!")

In [None]:
# Summarise every dataset reported by the loader
data_info = loader.get_data_info()
print("Dataset Information:")
print("=" * 50)

for name, details in data_info.items():
    print(f"\n{name.upper()}:")
    print(f"  Shape: {details['shape']}")

    # The remaining fields are optional: each group is printed only when the
    # loader supplied the corresponding key for this dataset.
    has_dates = 'date_range' in details
    if has_dates:
        print(f"  Date Range: {details['date_range'][0]} to {details['date_range'][1]}")

    has_item_counts = 'num_items' in details
    if has_item_counts:
        print(f"  Number of Items: {details['num_items']}")
        print(f"  Number of Stores: {details['num_stores']}")
        print(f"  Number of Categories: {details['num_categories']}")

    has_prices = 'price_range' in details
    if has_prices:
        print(f"  Price Range: ${details['price_range'][0]:.2f} - ${details['price_range'][1]:.2f}")

In [None]:
# Run the project validator against each loaded table and report the result
validator = DataValidator()

print("Data Validation Results:")
print("=" * 30)

# Each check is evaluated immediately before its line is printed, in the
# order sales -> calendar -> prices.
sales_ok = validator.validate_sales_data(sales)
print(f"Sales data valid: {sales_ok}")

calendar_ok = validator.validate_calendar_data(calendar)
print(f"Calendar data valid: {calendar_ok}")

prices_ok = validator.validate_prices_data(prices)
print(f"Prices data valid: {prices_ok}")

## 2. Calendar Data Analysis

In [None]:
# Peek at the calendar table: first rows, dimensions, and the span of dates
print("Calendar Data Sample:")
print(calendar.head())
print(f"\nCalendar Data Shape: {calendar.shape}")

calendar_dates = calendar['date']
print(f"Date Range: {calendar_dates.min()} to {calendar_dates.max()}")

In [None]:
# Frequency of named events and of event types in the calendar
print("Event Analysis:")
print("=" * 20)

# Ten most frequent names in each of the two event-name columns
primary_event_counts = calendar['event_name_1'].value_counts()
secondary_event_counts = calendar['event_name_2'].value_counts()
events_1 = primary_event_counts.head(10)
events_2 = secondary_event_counts.head(10)

print("Top 10 Primary Events:")
print(events_1)

print("\nTop 10 Secondary Events:")
print(events_2)

# Distribution of the primary event type (reused by the summary cell)
print("\nEvent Types:")
event_types = calendar['event_type_1'].value_counts()
print(event_types)

In [None]:
# Count SNAP-flagged days per state and express them as a share of all days
print("SNAP Benefits Analysis:")
print("=" * 25)

snap_columns = ['snap_CA', 'snap_TX', 'snap_WI']
snap_summary = calendar[snap_columns].sum()
print("Days with SNAP benefits by state:")
print(snap_summary)

# Share of calendar rows flagged for SNAP, in percent
n_calendar_days = len(calendar)
snap_pct = snap_summary / n_calendar_days * 100
print("\nPercentage of days with SNAP benefits:")
print(snap_pct.round(2))

## 3. Sales Data Analysis

In [None]:
# Peek at the sales table and identify its per-day columns
print("Sales Data Sample:")
print(sales.head())
print(f"\nSales Data Shape: {sales.shape}")

# Daily unit-sales columns are the ones whose name starts with 'd_';
# the list is reused by most of the cells below.
day_columns = [name for name in sales.columns if name.startswith('d_')]
n_days = len(day_columns)
print(f"Number of days: {n_days}")
print(f"First day: {day_columns[0]}")
print(f"Last day: {day_columns[-1]}")

In [None]:
# Headline statistics for unit sales, plus the share of zero-sale cells
print("Sales Statistics:")
print("=" * 20)

# One total per row of the sales table (sum across all day columns)
daily_units = sales[day_columns]
total_sales = daily_units.sum(axis=1)
print(f"Total sales across all items and days: {total_sales.sum():,}")
print(f"Average total sales per item: {total_sales.mean():.2f}")
print(f"Median total sales per item: {total_sales.median():.2f}")
print(f"Max total sales for a single item: {total_sales.max():,}")
print(f"Min total sales for a single item: {total_sales.min():,}")

# Zero-sales share: zero cells divided by rows x day columns
is_zero = sales[day_columns] == 0
zero_sales = is_zero.sum().sum()
n_rows = len(sales)
n_day_columns = len(day_columns)
total_observations = n_rows * n_day_columns
zero_pct = zero_sales / total_observations * 100
print(f"\nZero sales observations: {zero_sales:,} ({zero_pct:.2f}%)")

In [None]:
# Total units sold per category and per department, largest first
print("Sales by Category:")
print("=" * 20)

units_by_category_day = sales.groupby('cat_id')[day_columns].sum()
category_sales = units_by_category_day.sum(axis=1).sort_values(ascending=False)
print(category_sales)

# Departments: same aggregation, limited to the ten largest
print("\nTop 10 Departments by Sales:")
units_by_dept_day = sales.groupby('dept_id')[day_columns].sum()
dept_totals = units_by_dept_day.sum(axis=1).sort_values(ascending=False)
dept_sales = dept_totals.head(10)
print(dept_sales)

In [None]:
# Total units sold per store and per state, largest first
print("Sales by Store:")
print("=" * 15)

units_by_store_day = sales.groupby('store_id')[day_columns].sum()
store_sales = units_by_store_day.sum(axis=1).sort_values(ascending=False)
print(store_sales)

print("\nSales by State:")
units_by_state_day = sales.groupby('state_id')[day_columns].sum()
state_sales = units_by_state_day.sum(axis=1).sort_values(ascending=False)
print(state_sales)

## 4. Price Data Analysis

In [None]:
# Peek at the price table and print summary statistics of 'sell_price'
print("Price Data Sample:")
print(prices.head())
print(f"\nPrice Data Shape: {prices.shape}")

# Summary statistics over every row of the price table
sell_price = prices['sell_price']
print("\nPrice Statistics:")
print("=" * 20)
print(f"Average price: ${sell_price.mean():.2f}")
print(f"Median price: ${sell_price.median():.2f}")
print(f"Min price: ${sell_price.min():.2f}")
print(f"Max price: ${sell_price.max():.2f}")
print(f"Standard deviation: ${sell_price.std():.2f}")

In [None]:
# Compare the number of distinct items in the price table with the sales table
print("Price Coverage Analysis:")
print("=" * 25)

# Distinct item ids on each side (both counts are reused by later cells)
unique_items_with_prices = prices['item_id'].nunique()
total_unique_items = sales['item_id'].nunique()
coverage_pct = unique_items_with_prices / total_unique_items * 100

print(f"Items with price data: {unique_items_with_prices}")
print(f"Total items in sales data: {total_unique_items}")
print(f"Price coverage: {coverage_pct:.2f}%")

# Number of price rows per store, ordered by store id
print("\nPrice observations by store:")
rows_per_store = prices['store_id'].value_counts()
store_price_counts = rows_per_store.sort_index()
print(store_price_counts)

## 5. Data Visualization

In [None]:
# Sales trend plot for three sample items; a plotting failure is reported
# as a message rather than stopping the notebook.
try:
    visualizer.plot_sales_trends(sales, num_samples=3)
except Exception as exc:
    print(f"Error creating sales trends plot: {exc}")

In [None]:
# Category performance plot (top_n=10); a plotting failure is reported as a
# message rather than stopping the notebook.
try:
    visualizer.plot_category_performance(sales, top_n=10)
except Exception as exc:
    print(f"Error creating category performance plot: {exc}")

In [None]:
# Store performance plot; a plotting failure is reported as a message rather
# than stopping the notebook.
try:
    visualizer.plot_store_performance(sales)
except Exception as exc:
    print(f"Error creating store performance plot: {exc}")

In [None]:
# Price analysis plot; a plotting failure is reported as a message rather
# than stopping the notebook.
try:
    visualizer.plot_price_analysis(prices)
except Exception as exc:
    print(f"Error creating price analysis plot: {exc}")

In [None]:
# Rolling-average sales plot with window=90; a plotting failure is reported
# as a message rather than stopping the notebook.
try:
    visualizer.plot_rolling_average(sales, calendar, prices, window=90)
except Exception as exc:
    print(f"Error creating rolling average plot: {exc}")

## 6. Data Quality Assessment

In [None]:
# Per-column null counts for each table; only columns with at least one
# missing value are printed.
print("Missing Data Analysis:")
print("=" * 25)

print("Calendar missing values:")
calendar_missing = calendar.isna().sum()
calendar_has_gaps = calendar_missing > 0
print(calendar_missing[calendar_has_gaps])

print("\nSales missing values:")
sales_missing = sales.isna().sum()
sales_has_gaps = sales_missing > 0
print(sales_missing[sales_has_gaps])

print("\nPrices missing values:")
prices_missing = prices.isna().sum()
prices_has_gaps = prices_missing > 0
print(prices_missing[prices_has_gaps])

In [None]:
# Cross-table consistency: day labels (calendar vs. sales columns) and
# item ids (sales vs. prices)
print("Data Consistency Checks:")
print("=" * 25)

# Day labels: values of calendar['d'] against the sales day-column names
calendar_days = set(calendar['d'])
sales_days = set(day_columns)

missing_in_sales = calendar_days - sales_days
missing_in_calendar = sales_days - calendar_days

n_missing_in_sales = len(missing_in_sales)
n_missing_in_calendar = len(missing_in_calendar)
print(f"Days in calendar but not in sales: {n_missing_in_sales}")
print(f"Days in sales but not in calendar: {n_missing_in_calendar}")

# Item ids: set difference in both directions
sales_items = set(sales['item_id'])
price_items = set(prices['item_id'])

items_without_prices = sales_items - price_items
price_items_not_in_sales = price_items - sales_items

n_items_without_prices = len(items_without_prices)
n_price_only_items = len(price_items_not_in_sales)
print(f"\nItems in sales but not in prices: {n_items_without_prices}")
print(f"Items in prices but not in sales: {n_price_only_items}")

## 7. Summary and Key Insights

In [None]:
# Print a consolidated summary built from the values computed in the cells
# above (day_columns, total_sales, zero_pct, category_sales, state_sales,
# event_types, the price coverage counts and the consistency-check sets).
print("KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS")
print("=" * 50)

print("\n1. DATASET OVERVIEW:")
print(f"   • Sales data covers {len(day_columns)} days")
print(f"   • {sales['item_id'].nunique()} unique items across {sales['store_id'].nunique()} stores")
print(f"   • {sales['cat_id'].nunique()} product categories and {sales['dept_id'].nunique()} departments")
print(f"   • {sales['state_id'].nunique()} states: {', '.join(sales['state_id'].unique())}")

print("\n2. SALES PATTERNS:")
print(f"   • Total sales volume: {total_sales.sum():,} units")
print(f"   • {zero_pct:.1f}% of observations are zero sales (stockouts or no demand)")
print(f"   • Top category: {category_sales.index[0]} ({category_sales.iloc[0]:,} units)")
print(f"   • Top state: {state_sales.index[0]} ({state_sales.iloc[0]:,} units)")

print("\n3. PRICING INFORMATION:")
print(f"   • Price range: ${prices['sell_price'].min():.2f} - ${prices['sell_price'].max():.2f}")
print(f"   • Average price: ${prices['sell_price'].mean():.2f}")
print(f"   • Price coverage: {unique_items_with_prices/total_unique_items*100:.1f}% of items have price data")

print("\n4. EVENTS AND SEASONALITY:")
print(f"   • {calendar['event_name_1'].notna().sum()} days with primary events")
print(f"   • {calendar['event_name_2'].notna().sum()} days with secondary events")
print(f"   • Main event types: {', '.join(event_types.head(3).index)}")

print("\n5. DATA QUALITY:")
# Evaluate each criterion exactly once so the score and the ✓/✗ marks below
# can never disagree. Each failed criterion costs 10 points.
# NOTE(review): a calendar that extends past the last sales day (for example
# to cover a forecast horizon) also counts as misalignment here — confirm
# that this is the intended rule.
calendar_sales_aligned = not missing_in_sales and not missing_in_calendar
# Coverage is adequate when at most 10% of the sales items lack a price.
price_coverage_ok = len(items_without_prices) <= len(sales_items) * 0.1
mostly_zero_sales = zero_pct > 50

quality_score = 100
if not calendar_sales_aligned:
    quality_score -= 10
if not price_coverage_ok:
    quality_score -= 10
if mostly_zero_sales:
    quality_score -= 10

print(f"   • Overall data quality score: {quality_score}/100")
print(f"   • Calendar-sales alignment: {'✓' if calendar_sales_aligned else '✗'}")
print(f"   • Price data coverage: {'✓' if price_coverage_ok else '✗'}")

print("\n6. RECOMMENDATIONS FOR MODELING:")
print("   • Handle zero sales appropriately (intermittent demand)")
print("   • Consider hierarchical forecasting (item/store/category levels)")
print("   • Incorporate price and event information as features")
print("   • Account for seasonality and trends in the data")
print("   • Use last 28-56 days for validation/testing")

## 8. Save Processed Data

Save key insights and summary statistics for use in subsequent notebooks.

In [None]:
# Persist the headline EDA figures as JSON so later notebooks can load them
# without recomputing. All values are converted to built-in Python types
# (int/float/bool/str/list) before serialisation.
import json

# Quality flags use the same criteria the quality score in section 7 deducts
# for: day labels must match in both directions, and at most 10% of the
# sales items may lack a price.
calendar_sales_alignment = not missing_in_sales and not missing_in_calendar
price_coverage_adequate = len(items_without_prices) <= len(sales_items) * 0.1

# Create summary statistics dictionary
eda_summary = {
    'dataset_info': {
        'num_items': sales['item_id'].nunique(),
        'num_stores': sales['store_id'].nunique(),
        'num_categories': sales['cat_id'].nunique(),
        'num_departments': sales['dept_id'].nunique(),
        'num_days': len(day_columns),
        'states': sales['state_id'].unique().tolist(),
    },
    'sales_stats': {
        'total_sales': int(total_sales.sum()),
        'avg_sales_per_item': float(total_sales.mean()),
        'zero_sales_percentage': float(zero_pct),
        'top_category': category_sales.index[0],
        'top_state': state_sales.index[0],
    },
    'price_stats': {
        'avg_price': float(prices['sell_price'].mean()),
        'min_price': float(prices['sell_price'].min()),
        'max_price': float(prices['sell_price'].max()),
        # Stored as a fraction (0-1), not a percentage.
        'price_coverage': float(unique_items_with_prices/total_unique_items),
    },
    'data_quality': {
        'calendar_sales_alignment': calendar_sales_alignment,
        'price_coverage_adequate': price_coverage_adequate,
        'quality_score': quality_score,
    },
}

# Save to JSON for easy loading
output_dir = config.get('output.reports_path', 'output/reports/')
os.makedirs(output_dir, exist_ok=True)

# os.path.join avoids a doubled separator when output_dir already ends in '/'.
summary_path = os.path.join(output_dir, 'eda_summary.json')
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(eda_summary, f, indent=2)

# Report the path actually written, which may differ from the default when
# 'output.reports_path' is configured.
print(f"EDA summary saved to {summary_path}")
print("\nExploratory Data Analysis Complete!")