# AWS Retail Sales Forecasting - Model Training Notebook

This notebook demonstrates training machine learning models for sales forecasting using XGBoost and Prophet.

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from prophet import Prophet
import matplotlib.pyplot as plt
import boto3
import joblib

## Load and Preprocess Data

In [None]:
# Read the raw sales records (a local CSV stands in for the real source in this demo)
raw = pd.read_csv('../sales_data.csv')

# Parse dates, order each product/store series chronologically, then discard
# every row that has a missing value in any column.
df = (
    raw.assign(date=pd.to_datetime(raw['date']))
    .sort_values(['product_id', 'store_id', 'date'])
    .dropna()
)
print(df.head())

## Feature Engineering

In [None]:
# Build per-series history features for every (product_id, store_id) pair.
# df was sorted by product, store and date when it was loaded, so shifting by
# k rows inside a group looks k observations back in time.
grouped_sales = df.groupby(['product_id', 'store_id'])['sales_quantity']

# Lag features: the sales value observed `lag` rows earlier in the same series
for lag in [1, 7, 14]:
    df[f'sales_lag{lag}'] = grouped_sales.shift(lag)

# Rolling statistics.
# - transform() returns a result aligned to df's own index; the previous
#   groupby(...).rolling(...).reset_index(0, drop=True) left a MultiIndex behind
#   (only one of the two group levels was dropped), which cannot be assigned
#   back to df correctly.
# - shift(1) keeps the current row's sales_quantity (the prediction target)
#   out of its own feature, so the window only covers strictly earlier rows.
# NOTE: windows count rows, not calendar days; they equal 7/30 days only if
# every series has exactly one row per day.
for window in (7, 30):
    df[f'sales_rolling_{window}d'] = grouped_sales.transform(
        lambda s, w=window: s.shift(1).rolling(w).mean()
    )

# One-hot encode categorical (this replaces the product_id / store_id columns)
df = pd.get_dummies(df, columns=['product_id', 'store_id'])

# Drop the warm-up rows whose lag / rolling features are still NaN
df = df.dropna()
print(df.columns)

## Train XGBoost Model

In [None]:
# Prepare features and target: everything except the timestamp and the label
features = [col for col in df.columns if col not in ['date', 'sales_quantity']]
X = df[features]
y = df['sales_quantity']

# Chronological hold-out: train on the earliest ~80% of dates and test on the
# most recent ~20%. A random shuffle would put future observations in the
# training set, and the lag/rolling features would then leak them into the
# test score.
ordered_dates = df['date'].drop_duplicates().sort_values()
cutoff_date = ordered_dates.iloc[int(len(ordered_dates) * 0.8)]
train_mask = df['date'] < cutoff_date

X_train, X_test = X[train_mask], X[~train_mask]
y_train, y_test = y[train_mask], y[~train_mask]
if X_train.empty or X_test.empty:
    raise ValueError(
        'Not enough distinct dates to build a chronological train/test split'
    )

# Train XGBoost
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5)
model.fit(X_train, y_train)

# Predict on the held-out (most recent) period and evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'XGBoost MAE: {mae:.2f}, RMSE: {rmse:.2f}')

## Train Prophet Model

In [None]:
# Pick a single product/store series as the Prophet example, using the one-hot
# indicator columns to select it.
series_mask = (df['product_id_A'] == 1) & (df['store_id_Store1'] == 1)

# Prophet expects the timestamp column to be named 'ds' and the value column 'y'
sample_df = df.loc[series_mask, ['date', 'sales_quantity']].rename(
    columns={'date': 'ds', 'sales_quantity': 'y'}
)

# Fit Prophet with weekly seasonality switched on
prophet_model = Prophet(weekly_seasonality=True)
prophet_model.fit(sample_df)

# Extend the training timeline by the forecast horizon and predict over all of it
horizon_periods = 30
future = prophet_model.make_future_dataframe(periods=horizon_periods)
forecast = prophet_model.predict(future)
print(forecast.loc[:, ['ds', 'yhat']].tail())

## Model Evaluation and Comparison

In [None]:
# Compare models (simplified)
# For XGBoost: metrics computed in the training cell
print(f'XGBoost Performance: MAE={mae:.2f}, RMSE={rmse:.2f}')

# For Prophet (on sample)
# `forecast` covers the training dates plus 30 appended future periods, so its
# last 30 rows have no matching actuals. Join on the timestamp so every
# prediction is compared with the observation for the same date, then score
# the most recent 30 matched rows.
# NOTE: these rows were part of Prophet's training data, so this is an
# in-sample fit error, not a true out-of-sample forecast error.
aligned = (
    sample_df.merge(forecast[['ds', 'yhat']], on='ds', how='inner')
    .sort_values('ds')
    .tail(30)
)
actual = aligned['y']
predicted = aligned['yhat']
prophet_mae = mean_absolute_error(actual, predicted)
print(f'Prophet Performance: MAE={prophet_mae:.2f}')

# Plot the observed series against the fitted values and the future forecast
plt.figure(figsize=(10, 6))
plt.plot(sample_df['ds'], sample_df['y'], label='Actual')
plt.plot(forecast['ds'], forecast['yhat'], label='Predicted')
plt.legend()
plt.show()

## Save Models

In [None]:
from pathlib import Path

from prophet.serialize import model_to_json

# Make sure the output directory exists before writing into it
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Save XGBoost model
joblib.dump(model, models_dir / 'xgboost_model.joblib')

# Save Prophet model.
# Prophet objects have no .save() method; the supported route is to serialize
# the fitted model to JSON with prophet.serialize.model_to_json and write that
# string to disk (load it back with prophet.serialize.model_from_json).
with open(models_dir / 'prophet_model.json', 'w', encoding='utf-8') as fout:
    fout.write(model_to_json(prophet_model))

print("Models saved successfully!")