<a href="https://colab.research.google.com/github/MiltosTsir/supply-chain-analysis-portfolio/blob/main/portfolio/01-demand-forecasting/notebooks/01_eda_forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Setup (install + imports) ---
!pip -q install statsmodels

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

pd.set_option("display.max_columns", None)


In [None]:
# --- Load data from GitHub RAW URL ---
# The URL has to be a string literal; left unquoted it is a SyntaxError and
# the cell cannot run at all.
RAW_URL = "https://raw.githubusercontent.com/MiltosTsir/supply-chain-analysis-portfolio/refs/heads/main/portfolio/01-demand-forecasting/data/raw/mock_kaggle.csv"

# Parse the `date` column as datetimes while reading, then order the rows
# chronologically and rebuild a clean 0..n-1 index.
df = pd.read_csv(RAW_URL, parse_dates=["date"])
df = df.sort_values("date").reset_index(drop=True)

print("Shape:", df.shape)
df.head()


In [None]:
# --- Basic EDA ---
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display() (that would render a stray "None" under the report).
df.info()
display(df.describe(include="all"))

# Missing values: per-column null counts, largest first; only columns that
# actually contain nulls are printed.
nulls = df.isna().sum().sort_values(ascending=False)
print("Missing values:\n", nulls[nulls>0])


In [None]:
# --- Total sales over time ---
# Collapse all rows sharing a date into one summed `sales` value, giving a
# Series indexed by date.
daily = df.groupby("date")["sales"].sum()

# Draw on an explicitly created Axes instead of the implicit current figure.
total_fig, total_ax = plt.subplots(figsize=(12, 5))
daily.plot(ax=total_ax)
total_ax.set_title("Total Sales Over Time")
total_ax.set_ylabel("Units")
total_ax.set_xlabel("Date")
plt.show()

# --- Sales by store (top 5 stores by mean sales) ---
# Only runs when the dataset actually has a `store` column.
if "store" in df.columns:
    # One row per (date, store) pair with the summed sales for that pair.
    tmp = df.groupby(["date","store"])["sales"].sum().reset_index()
    # Top-5 stores, ranked by their mean per-date sales.
    top_stores = (tmp.groupby("store")["sales"].mean().nlargest(5).index)
    # Explicit Axes so every store line lands on this figure, and so the
    # figure carries the same axis labels as the total-sales plot.
    store_fig, store_ax = plt.subplots(figsize=(12,5))
    for s in top_stores:
        series = tmp.loc[tmp["store"]==s].set_index("date")["sales"]
        # Resample to calendar days; days with no rows for the store sum to 0.
        series.resample("D").sum().plot(ax=store_ax, label=f"store {s}")
    store_ax.set_title("Sales by Top Stores")
    store_ax.set_ylabel("Units")
    store_ax.set_xlabel("Date")
    store_ax.legend()
    plt.show()


In [None]:
# Put the daily totals on an explicit daily frequency and replace every
# missing value (including dates introduced by the reindexing) with 0.
ts = (
    daily
    .asfreq("D")
    .fillna(0)
)

# If the data has daily seasonality, start with period=7 (weekly).
result = seasonal_decompose(ts, period=7, model="additive")

# result.plot() returns the figure it drew; enlarge it before showing.
fig = result.plot()
fig.set_size_inches(12, 8)
plt.show()


In [None]:
# --- Train/Test split (last 60 days held out for test) ---
# Series longer than 120 points hold out 60; shorter ones hold out a fifth of
# their length, but never fewer than one point.
if len(ts) > 120:
    horizon = 60
else:
    horizon = max(1, len(ts) // 5)

train = ts.head(-horizon)   # everything except the final `horizon` points
test = ts.tail(horizon)     # the final `horizon` points

# Naive baseline: the forecast for a day is the previous day's value.
naive_forecast = ts.shift(1).tail(horizon)

# 7-day moving average, shifted by one day so that it does not "look at"
# the future (the value for day t uses days t-7 .. t-1 only).
ma7 = ts.rolling(7).mean().shift(1).tail(horizon)

def mape(y_true, y_pred):
    """Return the mean absolute percentage error of `y_pred`, in percent.

    Both arguments are converted with ``np.array``. Wherever the actual value
    is exactly 0 the divisor is replaced by 1, so that position contributes
    its plain absolute error instead of causing a division by zero.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    # Guard the divisor: 0 -> 1, every other actual value is used as-is.
    divisor = np.where(actual == 0, 1, actual)
    abs_pct_error = np.abs((actual - predicted) / divisor)
    return np.mean(abs_pct_error) * 100

# Report each baseline's MAPE against the held-out test window.
naive_mape = mape(test, naive_forecast)
print(f"Naive MAPE: {naive_mape:.2f}%")
ma7_mape = mape(test, ma7)
print(f"MA(7)  MAPE: {ma7_mape:.2f}%")

# Plot last 120 days: actuals together with both baseline forecasts, all on
# one explicitly created Axes.
baseline_fig, baseline_ax = plt.subplots(figsize=(12, 5))
for curve, curve_label in ((ts, "Actual"), (naive_forecast, "Naive"), (ma7, "MA(7)")):
    curve.iloc[-120:].plot(ax=baseline_ax, label=curve_label)
baseline_ax.set_title("Actual vs Baselines (last 120 days)")
baseline_ax.legend()
plt.show()
