In [1]:
# Step3B.1 - Import Libraries

import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides deprecation warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Step3B.2 - Load the data

# Read the monthly SKU table (zero-sales months included) and convert the
# YearMonth column to datetime in the same expression.
all_sku_monthly_w0 = pd.read_csv("all_sku_monthly_w0.csv").assign(
    YearMonth=lambda frame: pd.to_datetime(frame['YearMonth'])
)

In [3]:
# Step3B.3 - Define Training and Test Sets

# Both windows are closed intervals on YearMonth (Series.between is inclusive
# on both ends by default).
year_month = all_sku_monthly_w0['YearMonth']

# Training window: 2023-01-31 through 2024-08-31
in_training_window = year_month.between('2023-01-31', '2024-08-31')
training_data = all_sku_monthly_w0[in_training_window]

# Test window (forecast period): 2024-09-30 through 2025-03-31
in_test_window = year_month.between('2024-09-30', '2025-03-31')
test_data = all_sku_monthly_w0[in_test_window]


In [4]:
# Step3B.4 -Install and Import ARIMA

!pip install pmdarima

from pmdarima import auto_arima




In [5]:
# Step3B.5 - Set up forecasting loop
#
# Fits a non-seasonal auto-ARIMA per SKU on the training window and forecasts
# the next FORECAST_PERIODS month-ends. SKUs whose training sales sum to zero
# are skipped (no rows are produced for them). If model fitting or prediction
# raises, the mean of the last three training months is used for every
# forecast month instead.

# Forecast horizon: 7 month-end periods starting 2024-09-30 (through 2025-03-31).
FORECAST_START = '2024-09-30'
FORECAST_PERIODS = 7

# Month-end dates shared by every SKU. An explicit MonthEnd offset is used
# instead of the 'M' alias, which newer pandas releases deprecate.
forecast_months = pd.date_range(
    start=FORECAST_START,
    periods=FORECAST_PERIODS,
    freq=pd.offsets.MonthEnd()
)

# Prepare list to store forecasts
arima_forecasts = []

# Get unique SKUs
sku_list = training_data['Item_ID'].unique()

# Forecasting loop
for sku in tqdm(sku_list):
    # Filter the training data for the current SKU (.loc avoids chained indexing)
    sku_train_df = training_data.loc[
        training_data['Item_ID'] == sku,
        ['YearMonth', 'Monthly_Quantity']
    ]

    # Skip SKUs with all zero historical sales
    if sku_train_df['Monthly_Quantity'].sum() == 0:
        continue

    # Build the target Series on a datetime index without mutating the slice,
    # and sort it chronologically: ARIMA treats the observations as an ordered
    # sequence, so the row order of the CSV must not be relied upon.
    y = (
        sku_train_df
        .assign(YearMonth=pd.to_datetime(sku_train_df['YearMonth']))
        .set_index('YearMonth')['Monthly_Quantity']
        .sort_index(kind='mergesort')
    )

    # Sanity check
    if y.isnull().any():
        print(f"SKU {sku} error: NaNs detected even after cleaning — skipping")
        continue

    try:
        # Fit ARIMA without seasonality
        model = auto_arima(
            y,
            seasonal=False,
            stepwise=True,
            error_action='ignore',
            suppress_warnings=True
        )

        # Forecast the whole horizon
        forecast = model.predict(n_periods=FORECAST_PERIODS)

    except Exception as e:
        # Deliberate best-effort: one failing SKU must not abort the whole run.
        print(f"SKU {sku} error: {e} — using fallback average")

        # Fallback: use 3-month average if ARIMA fails
        fallback_value = y.tail(3).mean()
        forecast = [fallback_value] * FORECAST_PERIODS

    # Store forecast results. The forecast is converted to a plain array so the
    # values are placed positionally against forecast_months, in case predict()
    # returns a pandas Series carrying its own index.
    forecast_df = pd.DataFrame({
        'ForecastMonth': forecast_months,
        'Forecasted_Quantity': np.asarray(forecast, dtype=float),
        'Item_ID': sku
    })

    arima_forecasts.append(forecast_df)

# Combine forecasts into single DataFrame
if arima_forecasts:
    arima_forecast_df = pd.concat(arima_forecasts, ignore_index=True)
else:
    # pd.concat raises on an empty list (e.g. every SKU was skipped); fall back
    # to an empty, correctly typed frame so the following steps still run.
    arima_forecast_df = pd.DataFrame({
        'ForecastMonth': pd.Series(dtype='datetime64[ns]'),
        'Forecasted_Quantity': pd.Series(dtype=float),
        'Item_ID': pd.Series(dtype=training_data['Item_ID'].dtype)
    })

# Fix negative and scientific notation values: anything below 1e-5 (which
# covers all negatives) becomes 0, everything else is rounded to 2 decimals.
forecasted_quantity = arima_forecast_df['Forecasted_Quantity']
arima_forecast_df['Forecasted_Quantity'] = (
    forecasted_quantity.mask(forecasted_quantity < 1e-5, 0.0).round(2)
)

# Save forecasts to CSV
arima_forecast_df.to_csv("arima_full_forecast.csv", index=False)

# Preview
arima_forecast_df.head()

100%|██████████| 2935/2935 [09:43<00:00,  5.03it/s] 


Unnamed: 0,ForecastMonth,Forecasted_Quantity,Item_ID
0,2024-09-30,5.5,610009
1,2024-10-31,5.5,610009
2,2024-11-30,5.5,610009
3,2024-12-31,5.5,610009
4,2025-01-31,5.5,610009


In [6]:
## Step3B.6 - Compare Forecast with Actuals
#
# Left-joins the ARIMA forecasts onto the actual sales for the forecast window,
# so every actual (SKU, month) row is kept. Rows without a forecast get a
# forecast of 0.

# Single source of truth for the output path, so the file written and the
# file named in the confirmation message cannot drift apart.
COMPARISON_CSV = "actual_vs_arima_forecasts.csv"

# Actual sales over the forecast window (inclusive on both ends), with columns
# renamed to line up with the forecast frame. rename() returns a new frame, so
# the source table is left untouched.
actual_df = all_sku_monthly_w0[
    (all_sku_monthly_w0['YearMonth'] >= '2024-09-30') &
    (all_sku_monthly_w0['YearMonth'] <= '2025-03-31')
].rename(columns={'YearMonth': 'ForecastMonth', 'Monthly_Quantity': 'Actual'})

forecast_df = arima_forecast_df.copy()

# Merge Actuals LEFT JOIN Forecasts
comparison_df = pd.merge(
    actual_df[['ForecastMonth', 'Item_ID', 'Actual']],
    forecast_df[['ForecastMonth', 'Item_ID', 'Forecasted_Quantity']],
    on=['ForecastMonth', 'Item_ID'],
    how='left'  # Keep all actuals
)

# Fill missing Forecasted_Quantity with 0
comparison_df['Forecasted_Quantity'] = comparison_df['Forecasted_Quantity'].fillna(0)

# (Optional) Calculate Forecast_Error (positive = over-forecast)
comparison_df['Forecast_Error'] = comparison_df['Forecasted_Quantity'] - comparison_df['Actual']

# Save the full comparison
comparison_df.to_csv(COMPARISON_CSV, index=False)

print(f"✅ Full Forecast vs Actual (Full Lineup) Saved: '{COMPARISON_CSV}'")


✅ Full Forecast vs Actual (Full Lineup) Saved: 'Actual_vs_arima_forecasts.csv'


In [7]:
## Step3B.7 - Evaluate Forecast Accuracy (RMSE, MAE, MAPE)
import pandas as pd
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error
)

# Re-read the saved comparison file and restore the datetime dtype of ForecastMonth
comparison_df = pd.read_csv("actual_vs_arima_forecasts.csv")
comparison_df = comparison_df.assign(
    ForecastMonth=pd.to_datetime(comparison_df['ForecastMonth'])
)

# Keep only rows inside the forecast horizon
eval_df = comparison_df.loc[comparison_df['ForecastMonth'] >= '2024-09-30']

# Error metrics over every row of the horizon
actual_all = eval_df['Actual']
predicted_all = eval_df['Forecasted_Quantity']
rmse = root_mean_squared_error(actual_all, predicted_all)
mae = mean_absolute_error(actual_all, predicted_all)

# MAPE divides by the actual value, so rows with a zero actual are excluded first
nonzero_eval_df = eval_df.loc[eval_df['Actual'].ne(0)]
mape = 100 * mean_absolute_percentage_error(
    nonzero_eval_df['Actual'],
    nonzero_eval_df['Forecasted_Quantity']
)

# Report the three metrics
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MAPE: {mape:.2f}%")

RMSE: 15.25
MAE: 5.92
MAPE: 118.21%
