In [2]:
# M4 Forecasting: Baseline Models (Naive, Seasonal Naive, Moving Average)

# %% [markdown]
# ## 1. Setup & Imports

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm.auto import tqdm
import warnings

# NOTE(review): this silences *every* warning for the rest of the session,
# including NumPy RuntimeWarnings (division by zero, mean of an empty array)
# that can point at data problems in the metric code below. Consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Matplotlib style sheet; the 'seaborn-v0_8-' prefixed names presumably
# require matplotlib >= 3.6 -- TODO confirm against the pinned environment.
plt.style.use('seaborn-v0_8-darkgrid')

print("Libraries imported!")

# %% [markdown]
# ## 2. Evaluation Metrics

# %%
def smape(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (M4 official metric).

    Computes ``100 * mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2))``.
    Points where both the actual and the forecast are zero (zero
    denominator) contribute 0 to the mean instead of producing NaN.

    Parameters
    ----------
    y_true : array-like or scalar
        Observed values.
    y_pred : array-like or scalar
        Forecast values, broadcastable against ``y_true``.

    Returns
    -------
    float
        sMAPE in percent. NaN when the inputs are empty.
    """
    # Cast to float so object-dtype inputs (e.g. DataFrame row slices) use
    # vectorised arithmetic; atleast_1d lets plain scalars through.
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0

    if denominator.size == 0:
        return np.nan

    # Divide only where the denominator is non-zero; the remaining slots keep
    # their initial 0.0, so no divide-by-zero warning is ever raised.
    diff = np.zeros_like(denominator)
    np.divide(numerator, denominator, out=diff, where=denominator != 0)

    return 100 * np.mean(diff)

def mase(y_true, y_pred, y_train, seasonality=1):
    """
    Mean Absolute Scaled Error.

    The forecast MAE is divided by the in-sample MAE of a naive forecast
    that repeats the value observed ``seasonality`` steps earlier.

    Parameters
    ----------
    y_true : array-like
        Observed values over the forecast horizon.
    y_pred : array-like
        Forecast values, same length as ``y_true``.
    y_train : array-like
        Training history used to compute the scaling term.
    seasonality : int, default 1
        Lag of the in-sample naive forecast (1 = plain naive).

    Returns
    -------
    float
        MASE, or NaN when it is undefined: empty ``y_true``, a training
        series with no more than ``seasonality`` points, or a zero scaling
        term (e.g. a constant training series).

    Raises
    ------
    ValueError
        If ``seasonality`` is smaller than 1.
    """
    if seasonality < 1:
        raise ValueError("seasonality must be a positive integer")

    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))
    y_train = np.atleast_1d(np.asarray(y_train, dtype=float))

    # Without at least one lagged pair the scaling term cannot be computed;
    # return NaN explicitly instead of relying on mean-of-empty.
    if y_true.size == 0 or y_train.size <= seasonality:
        return np.nan

    mae = np.mean(np.abs(y_true - y_pred))

    # Naive forecast MAE on training set
    naive_mae = np.mean(np.abs(y_train[seasonality:] - y_train[:-seasonality]))

    if naive_mae == 0:
        return np.nan

    return mae / naive_mae

print(" Metrics defined: sMAPE, MASE")

# %% [markdown]
# ## 3. Select Frequency for Analysis

# %%
# Frequency-specific settings; change these together when switching dataset.
FREQUENCY = 'Daily'
SEASONALITY = 7   # lag (in observations) used by the seasonal-naive baseline
HORIZON = 14      # expected number of forecast steps per series

data_path = Path('../data/M4')

train_file = data_path / f'{FREQUENCY}-train.csv'
test_file = data_path / f'{FREQUENCY}-test.csv'

print("="*80)
print(f" LOADING {FREQUENCY.upper()} DATA")
print("="*80)

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

# The baseline cells pair train_df.iloc[idx] with test_df.iloc[idx] by
# position and treat column 0 as the series ID, so fail fast if the two
# files do not line up or the horizon constant does not match the data.
assert len(train_df) == len(test_df), (
    f"train has {len(train_df)} series but test has {len(test_df)}"
)
assert (train_df.iloc[:, 0].to_numpy() == test_df.iloc[:, 0].to_numpy()).all(), (
    "series IDs in train and test files are not in the same order"
)
assert test_df.shape[1] - 1 == HORIZON, (
    f"HORIZON={HORIZON} but test file has {test_df.shape[1] - 1} value columns"
)

print(f"\n Train: {train_df.shape}")
print(f" Test: {test_df.shape}")
print(f" Series: {len(train_df)}")
print(f" Forecast horizon: {HORIZON} days")
print(f" Seasonality: {SEASONALITY} days")

# %% [markdown]
# ## 4. Baseline Model 1: Naive Forecast

# %%
print("\n" + "="*80)
print(" NAIVE FORECAST")
print("="*80)

naive_predictions = []
naive_smape_scores = []
naive_mase_scores = []

for idx in tqdm(range(len(train_df)), desc="Naive Forecast"):
    # Column 0 is the series ID; the remaining columns are observations,
    # NaN-padded because the series have different lengths.
    series_id = train_df.iloc[idx, 0]
    # Cast to float: a row slice taken across the ID column typically comes
    # back as object dtype, which makes the metric arithmetic run
    # element-by-element in Python.
    train_series = train_df.iloc[idx, 1:].dropna().to_numpy(dtype=float)
    test_series = test_df.iloc[idx, 1:].dropna().to_numpy(dtype=float)

    # Nothing to forecast from, or nothing to score against: skip so a NaN
    # score cannot poison the averages below.
    if len(train_series) == 0 or len(test_series) == 0:
        continue

    # Naive: repeat last value
    last_value = train_series[-1]
    forecast = np.full(len(test_series), last_value)

    # Metrics
    smape_score = smape(test_series, forecast)
    mase_score = mase(test_series, forecast, train_series, seasonality=1)

    naive_smape_scores.append(smape_score)
    # mase() returns NaN when the scaling term is undefined; leave those
    # series out of the MASE average.
    if not np.isnan(mase_score):
        naive_mase_scores.append(mase_score)

    naive_predictions.append({
        'series_id': series_id,
        'forecast': forecast
    })

print(f"\n Naive Forecast Results:")
print(f"   sMAPE: {np.mean(naive_smape_scores):.4f}")
print(f"   MASE:  {np.mean(naive_mase_scores):.4f}")

# %% [markdown]
# ## 5. Baseline Model 2: Seasonal Naive

# %%
print("\n" + "="*80)
print(f" SEASONAL NAIVE (Lag {SEASONALITY} days)")
print("="*80)

seasonal_predictions = []
seasonal_smape_scores = []
seasonal_mase_scores = []

for idx in tqdm(range(len(train_df)), desc="Seasonal Naive"):
    # Column 0 is the series ID; the rest are NaN-padded observations.
    series_id = train_df.iloc[idx, 0]
    # Cast to float so the metric arithmetic is vectorised rather than
    # running on an object-dtype row slice.
    train_series = train_df.iloc[idx, 1:].dropna().to_numpy(dtype=float)
    test_series = test_df.iloc[idx, 1:].dropna().to_numpy(dtype=float)

    # Same guard as the naive baseline: without it train_series[-1] below
    # raises IndexError on an empty series.
    if len(train_series) == 0 or len(test_series) == 0:
        continue

    if len(train_series) < SEASONALITY:
        # Fallback to naive if series too short
        forecast = np.full(len(test_series), train_series[-1])
    else:
        # Seasonal naive: repeat the last full seasonal cycle. Step h of the
        # horizon reuses position h % SEASONALITY of that cycle.
        last_cycle = train_series[-SEASONALITY:]
        forecast = last_cycle[np.arange(len(test_series)) % SEASONALITY]

    # Metrics (MASE with seasonality=1 as per M4 official benchmark; this is
    # also the scaling used by the other baseline cells, so scores compare)
    smape_score = smape(test_series, forecast)
    mase_score = mase(test_series, forecast, train_series, seasonality=1)

    seasonal_smape_scores.append(smape_score)
    # NaN means the MASE scaling term was undefined for this series.
    if not np.isnan(mase_score):
        seasonal_mase_scores.append(mase_score)

    seasonal_predictions.append({
        'series_id': series_id,
        'forecast': forecast
    })

print(f"\n Seasonal Naive Results:")
print(f"   sMAPE: {np.mean(seasonal_smape_scores):.4f}")
print(f"   MASE:  {np.mean(seasonal_mase_scores):.4f}")

# %% [markdown]
# ## 6. Baseline Model 3: Moving Average

# %%
# Number of trailing observations averaged to form the flat forecast.
# Defined before the header so the printed window always matches the value
# actually used.
MA_WINDOW = 7

print("\n" + "="*80)
print(f" MOVING AVERAGE (Window = {MA_WINDOW} days)")
print("="*80)

ma_predictions = []
ma_smape_scores = []
ma_mase_scores = []

for idx in tqdm(range(len(train_df)), desc="Moving Average"):
    # Column 0 is the series ID; the rest are NaN-padded observations.
    series_id = train_df.iloc[idx, 0]
    # Cast to float so the metric arithmetic is vectorised rather than
    # running on an object-dtype row slice.
    train_series = train_df.iloc[idx, 1:].dropna().to_numpy(dtype=float)
    test_series = test_df.iloc[idx, 1:].dropna().to_numpy(dtype=float)

    # Same guard as the naive baseline: without it train_series[-1] below
    # raises IndexError on an empty series.
    if len(train_series) == 0 or len(test_series) == 0:
        continue

    if len(train_series) < MA_WINDOW:
        # Fallback to naive
        forecast = np.full(len(test_series), train_series[-1])
    else:
        # Moving average: mean of last MA_WINDOW values, held constant over
        # the whole horizon.
        ma_value = np.mean(train_series[-MA_WINDOW:])
        forecast = np.full(len(test_series), ma_value)

    # Metrics
    smape_score = smape(test_series, forecast)
    mase_score = mase(test_series, forecast, train_series, seasonality=1)

    ma_smape_scores.append(smape_score)
    # NaN means the MASE scaling term was undefined for this series.
    if not np.isnan(mase_score):
        ma_mase_scores.append(mase_score)

    ma_predictions.append({
        'series_id': series_id,
        'forecast': forecast
    })

print(f"\n Moving Average Results:")
print(f"   sMAPE: {np.mean(ma_smape_scores):.4f}")
print(f"   MASE:  {np.mean(ma_mase_scores):.4f}")

# %% [markdown]
# ## 7. Results Comparison

# %%

# One entry per baseline: (display label, per-series sMAPE list, per-series MASE list).
model_scores = [
    ('Naive (Last Value)', naive_smape_scores, naive_mase_scores),
    (f'Seasonal Naive (Lag {SEASONALITY})', seasonal_smape_scores, seasonal_mase_scores),
    (f'Moving Average (w={MA_WINDOW})', ma_smape_scores, ma_mase_scores),
]

# Collapse each score list to its mean and build one summary row per model.
summary_rows = []
for label, smape_values, mase_values in model_scores:
    summary_rows.append({
        'Model': label,
        'sMAPE': np.mean(smape_values),
        'MASE': np.mean(mase_values)
    })

# Best (lowest) sMAPE first.
results_df = pd.DataFrame(summary_rows).sort_values('sMAPE')

banner = "=" * 80
print("\n" + banner)
print(" BASELINE MODELS COMPARISON")
print(banner)
print(results_df.to_string(index=False))

Libraries imported!
 Metrics defined: sMAPE, MASE
 LOADING DAILY DATA

 Train: (4227, 9920)
 Test: (4227, 15)
 Series: 4227
 Forecast horizon: 14 days
 Seasonality: 7 days

 NAIVE FORECAST


Naive Forecast:   0%|          | 0/4227 [00:00<?, ?it/s]


 Naive Forecast Results:
   sMAPE: 3.0453
   MASE:  3.2784

 SEASONAL NAIVE (Lag 7 days)


Seasonal Naive:   0%|          | 0/4227 [00:00<?, ?it/s]


 Seasonal Naive Results:
   sMAPE: 3.7419
   MASE:  4.1282

 MOVING AVERAGE (Window = 7 days)


Moving Average:   0%|          | 0/4227 [00:00<?, ?it/s]


 Moving Average Results:
   sMAPE: 3.5593
   MASE:  3.9293

 BASELINE MODELS COMPARISON
                 Model    sMAPE     MASE
    Naive (Last Value) 3.045252 3.278424
  Moving Average (w=7) 3.559255 3.929271
Seasonal Naive (Lag 7) 3.741925 4.128193
