In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1. Read the raw records and order them by date, then by volgnr within a date
df = (
    pd.read_csv("data_final.csv", parse_dates=["datum"])
    .sort_values(["datum", "volgnr"])
    .reset_index(drop=True)
)

# 2. Per-date mean of the target, the financial regressors and the weather columns
monthly_columns = [
    "totale_kasstroom",
    "kas",
    "leningen.1",
    "overige_banken",
    "neerslag_(mm)",
    "gemiddelde_temperatuur",
]
df_grouped = df.groupby("datum")[monthly_columns].mean()

# Re-index onto a month-start ("MS") grid; months that are absent from the
# data become NaN rows and are then filled by interpolate() with its defaults
df_grouped = df_grouped.asfreq("MS").interpolate()

# 3. Train/test split by calendar year: everything before 2024 trains the
#    model, the 2024 rows are held out for evaluation
index_year = df_grouped.index.year
train = df_grouped.loc[index_year < 2024]
test = df_grouped.loc[index_year == 2024]

# 4. Rolling one-step-ahead forecast: refit on everything observed so far,
#    predict the next held-out month, then add that month's actuals to the
#    history before moving on
exog_columns = ["kas", "leningen.1", "overige_banken", "neerslag_(mm)", "gemiddelde_temperatuur"]
sarimax_spec = dict(
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)

history_y = train["totale_kasstroom"].copy()
history_exog = train[exog_columns].copy()

predictions = []
true_values = []

for step, month in enumerate(test.index):
    # One-row frame of regressors and the realised target for this month
    test_x = test[exog_columns].iloc[[step]]
    test_y = test["totale_kasstroom"].iloc[step]

    # A fresh model is estimated at every step on the grown history
    model = SARIMAX(history_y, exog=history_exog, **sarimax_spec)
    model_fit = model.fit(disp=False)

    pred = model_fit.forecast(steps=1, exog=test_x)
    predictions.append(pred.iloc[0])
    true_values.append(test_y)

    # Reveal the current test month to the next iteration's training data
    observed = pd.Series([test_y], index=[month])
    history_y = pd.concat([history_y, observed])
    history_exog = pd.concat([history_exog, test_x])

# 5. Evaluation of the rolling one-month-ahead forecasts against the actuals.
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it raises a TypeError on current releases.
rmse = np.sqrt(mean_squared_error(true_values, predictions))
mae = mean_absolute_error(true_values, predictions)
r2 = r2_score(true_values, predictions)

print(f"\nSARIMAX WITH WEATHER DATA – RMSE (rolling, 1-month): {rmse:,.2f} euros")
print(f"SARIMAX WITH WEATHER DATA – MAE (rolling, 1-month): {mae:,.2f} euros")
print(f"SARIMAX WITH WEATHER DATA – R² score (rolling, 1-month): {r2:.3f}")

# 6. Visualization: training history, held-out actuals and the rolling
#    forecast drawn on a single set of axes
fig, ax = plt.subplots(figsize=(10, 5))

ax.plot(train.index, train["totale_kasstroom"], label="Train (2020–2023)", color="blue")
ax.plot(test.index, true_values, label="Actual (2024)", color="blue")
ax.plot(
    test.index,
    predictions,
    label="Forecast (SARIMAX + weather data)",
    linestyle="--",
    color="orangered",
)

ax.set_title("SARIMAX Rolling Forecast (1-Month Ahead) – Average Cash Flow (with Weather Data)")
ax.set_xlabel("Date")
ax.set_ylabel("Average Cash Flow (€)")
ax.grid(True)
ax.legend()

fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# 1. Read the dataset and order rows by date, then by volgnr within a date
df = (
    pd.read_csv("data_final.csv", parse_dates=["datum"])
    .sort_values(["datum", "volgnr"])
    .reset_index(drop=True)
)

# 2. Distinct, non-missing cluster labels in ascending order
cluster_labels = df["bedrijf_cluster"].dropna()
clusters = sorted(cluster_labels.unique())

# Containers for the per-cluster metric rows and the per-cluster plot frames
all_results, plots_data = [], {}

print("SARIMAX performance per bedrijf_cluster (average cash flow):\n")

# 3. Fit and evaluate one rolling SARIMAX forecast per cluster.
# The regressor list and the model specification do not change between
# clusters or steps, so they are defined once up front.
exog_columns = ["kas", "leningen.1", "overige_banken"]
sarimax_spec = {
    "order": (1, 1, 1),
    "seasonal_order": (1, 1, 1, 12),
    "enforce_stationarity": False,
    "enforce_invertibility": False,
}

for cl in clusters:
    df_cluster = df[df["bedrijf_cluster"] == cl]

    # 4. Per-date means for this cluster, placed on a month-start grid;
    #    months missing from the data are filled by interpolate()
    df_grouped = (
        df_cluster.groupby("datum")[["totale_kasstroom", *exog_columns]]
        .mean()
        .asfreq("MS")
        .interpolate()
    )

    # 5. Train/test split based on year (before 2024 vs. 2024)
    train = df_grouped[df_grouped.index.year < 2024]
    test = df_grouped[df_grouped.index.year == 2024]

    history_y = train["totale_kasstroom"].copy()
    history_exog = train[exog_columns].copy()

    predictions = []
    true_values = []

    # 6. Rolling forecast: the model is re-estimated at each step on the
    #    history extended with the previous test month's actual values
    for step, month in enumerate(test.index):
        test_y = test["totale_kasstroom"].iloc[step]
        test_x = test[exog_columns].iloc[[step]]  # one-row frame, as forecast() expects

        model = SARIMAX(history_y, exog=history_exog, **sarimax_spec)
        model_fit = model.fit(disp=False)

        pred = model_fit.forecast(steps=1, exog=test_x)
        predictions.append(pred.iloc[0])
        true_values.append(test_y)

        history_y = pd.concat([history_y, pd.Series([test_y], index=[month])])
        history_exog = pd.concat([history_exog, test_x])

    # 7. Evaluation metrics.
    # RMSE is sqrt(MSE): the `squared=False` keyword of mean_squared_error
    # was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(true_values, predictions))
    r2 = r2_score(true_values, predictions)
    mae = mean_absolute_error(true_values, predictions)

    print(f"Cluster {cl}:")
    print(f"   RMSE: €{rmse:,.2f}")
    print(f"   R²:   {r2:.3f}")
    print(f"   MAE:  €{mae:,.2f}\n")

    all_results.append({
        "Cluster": cl,
        "RMSE": rmse,
        "R²": r2,
        "MAE": mae
    })

    # Keep the 2024 actuals together with their forecasts for plotting;
    # assign() returns a new frame, so df_grouped is not modified
    test = test.assign(prediction=predictions)
    plots_data[cl] = test

# 8. Visualization of actual vs predicted per cluster, one panel per cluster.
# squeeze=False makes subplots() always return a 2-D array of Axes; without it
# a single cluster yields a bare Axes object that can be neither zipped nor
# indexed. ravel() flattens the (1, n) array to the 1-D sequence used below.
fig, axes = plt.subplots(1, len(clusters), figsize=(18, 5), sharey=True, squeeze=False)
axes = axes.ravel()

# Identical for every panel, so computed once outside the loop
forecast_start = pd.to_datetime("2024-01-01")
month_ticks = pd.date_range(start="2024-01-01", end="2024-12-01", freq="MS")

for ax, cl in zip(axes, clusters):
    df_plot = plots_data[cl]
    ax.plot(df_plot.index, df_plot["totale_kasstroom"], label="Actual", color="blue", linewidth=2)
    ax.plot(df_plot.index, df_plot["prediction"], label="Forecast (2024)", color="orangered", linestyle="--", linewidth=2)
    ax.axvline(forecast_start, color="gray", linestyle=":", label="Forecast Start")
    # NOTE(review): int(cl) assumes numeric cluster labels — confirm against the data
    ax.set_title(f"Cluster {int(cl)}", fontsize=13)
    ax.set_xlabel("Date", fontsize=11)
    ax.grid(True, linestyle="--", alpha=0.6)
    ax.set_xticks(month_ticks)
    ax.tick_params(axis='x', rotation=45)

# The y-axis is shared, so only the left-most panel carries the label
axes[0].set_ylabel("Average Cash Flow (€)", fontsize=12)

# Global legend built from the first panel's handles and placed below the panels
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc="lower center", bbox_to_anchor=(0.5, -0.12), ncol=3, fontsize=12)
plt.tight_layout(rect=[0, 0.05, 1, 1])
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1. Read the raw records and order them by date, then by volgnr within a date
df = (
    pd.read_csv("data_final.csv", parse_dates=["datum"])
    .sort_values(["datum", "volgnr"])
    .reset_index(drop=True)
)

# 2. Per-date mean cash flow. The financial columns are averaged as well,
#    although the forecast loop in this cell reads only totale_kasstroom.
monthly_columns = ["totale_kasstroom", "kas", "leningen.1", "overige_banken"]
df_grouped = df.groupby("datum")[monthly_columns].mean()

# Re-index onto a month-start ("MS") grid; months that are absent from the
# data become NaN rows and are then filled by interpolate() with its defaults
df_grouped = df_grouped.asfreq("MS").interpolate()

# 3. Train/test split by calendar year: rows before 2024 train the model,
#    the 2024 rows are held out for evaluation
index_year = df_grouped.index.year
train = df_grouped.loc[index_year < 2024]
test = df_grouped.loc[index_year == 2024]

# 4. Rolling one-step-ahead forecast driven only by the target's own history
#    (no exog is passed to the model)
sarimax_spec = dict(
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)

history_y = train["totale_kasstroom"].copy()

predictions = []
true_values = []

for step, month in enumerate(test.index):
    # A fresh model is estimated at every step on the grown history
    model = SARIMAX(history_y, **sarimax_spec)
    model_fit = model.fit(disp=False)
    pred = model_fit.forecast(steps=1)

    test_y = test["totale_kasstroom"].iloc[step]
    predictions.append(pred.iloc[0])
    true_values.append(test_y)

    # Reveal the realised value for this month to the next iteration
    observed = pd.Series([test_y], index=[month])
    history_y = pd.concat([history_y, observed])

# 5. Evaluation of the rolling one-month-ahead forecasts against the actuals.
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it raises a TypeError on current releases.
rmse = np.sqrt(mean_squared_error(true_values, predictions))
mae = mean_absolute_error(true_values, predictions)
r2 = r2_score(true_values, predictions)

print(f"\nSARIMAX WITHOUT WEATHER DATA – RMSE (rolling, 1-month): {rmse:,.2f} euros")
print(f"SARIMAX WITHOUT WEATHER DATA – MAE (rolling, 1-month): {mae:,.2f} euros")
print(f"SARIMAX WITHOUT WEATHER DATA – R² score (rolling, 1-month): {r2:.3f}")

# 6. Visualization: training history, held-out actuals and the rolling
#    forecast drawn on a single set of axes
fig, ax = plt.subplots(figsize=(10, 5))

ax.plot(train.index, train["totale_kasstroom"], label="Train (2020–2023)", color="blue")
ax.plot(test.index, true_values, label="Actual (2024)", color="blue")
ax.plot(
    test.index,
    predictions,
    label="Forecast (SARIMAX without weather data)",
    linestyle="--",
    color="orangered",
)

ax.set_title("SARIMAX Rolling Forecast (1-Month Ahead) – Average Cash Flow (without Weather Data)")
ax.set_xlabel("Date")
ax.set_ylabel("Average Cash Flow (€)")
ax.grid(True)
ax.legend()

fig.tight_layout()
plt.show()
