In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats.mstats import winsorize
from tensorflow.keras.losses import Huber
# 1. Load and sort data (one row per farm `volgnr` and date `datum`)
df = pd.read_csv("data_final.csv", parse_dates=["datum"])
df = df.sort_values(["volgnr", "datum"]).reset_index(drop=True)

# 2. Time-based features
df["jaar"] = df["datum"].dt.year
df["maand"] = df["datum"].dt.month

# 3. Define target: next month's cash flow (per farm, so the shift never crosses farms)
df["target"] = df.groupby("volgnr")["totale_kasstroom"].shift(-1)
df = df.dropna(subset=["target"])  # the last row of every farm has no next month

# Apply winsorization (same logic as XGBoost model): clip the lowest/highest 1 % of targets.
# NOTE(review): the limits are computed over all rows, including the 2024 test rows.
df["target"] = pd.Series(winsorize(df["target"], limits=[0.01, 0.01]), index=df.index)

# 4. Add lagged and derived features (all lags are taken within a farm)
df["eindsaldo_liquide_middelen_lag_1"] = df.groupby("volgnr")["eindsaldo_liquide_middelen"].shift(1)
df["mutaties_vorderingen_en_schulden_lag_1"] = df.groupby("volgnr")["mutaties_vorderingen_en_schulden"].shift(1)
df["eindsaldo_liquide_middelen_lag_6"] = df.groupby("volgnr")["eindsaldo_liquide_middelen"].shift(6)
df["mutaties_lag_6"] = df.groupby("volgnr")["mutaties_vorderingen_en_schulden"].shift(6)
# The 1e-6 offset guards the ratios against division by exactly zero.
df["ratio_schulden_opbrengst"] = df["mutaties_vorderingen_en_schulden"] / (df["totaal_opbrengsten_lag_1"] + 1e-6)
df["kasratio"] = df["kas"] / (df["totale_kasstroom_lag_1"] + 1e-6)
# Fix: the 6-month difference must be grouped by farm like the other lags; an ungrouped
# shift(6) filled the first six rows of each farm with the previous farm's prices.
df["melkprijs_diff_6"] = df["melkprijs_per_kg"] - df.groupby("volgnr")["melkprijs_per_kg"].shift(6)

# 4b. Fill NaNs in lagged/diff columns per farm (backward fill first, then forward fill)
lagged_cols = [col for col in df.columns if any(p in col for p in ["_lag_", "_diff_", "ratio_", "kasratio"])]
df[lagged_cols] = df.groupby("volgnr")[lagged_cols].transform(lambda x: x.bfill().ffill())

# 5. Train/validation/test split by farm (farms are disjoint across the three sets)
farms = df["volgnr"].unique()
trainval_farms, test_farms = train_test_split(farms, test_size=0.2, random_state=42)
train_farms, val_farms = train_test_split(trainval_farms, test_size=0.2, random_state=42)

# Train/validation rows are restricted to years before 2024, test rows to 2024.
df["is_train"] = df["volgnr"].isin(train_farms) & (df["jaar"] < 2024)
df["is_val"] = df["volgnr"].isin(val_farms) & (df["jaar"] < 2024)
df["is_test"] = df["volgnr"].isin(test_farms) & (df["jaar"] == 2024)

# 6. Top 50 most important features according to SHAP values from XGBoost model
# Names that are absent from the loaded frame are dropped before the top-N cut.
NUM_FEATURES_TO_USE = 50
all_features = [f for f in [
    'eindsaldo_liquide_middelen', 'mutaties_vorderingen_en_schulden', 'overige_vorderingen', 'melkprijs_per_kg',
    'crediteuren', 'melkprijs_per_kg_lag_6', 'leningen.1', 'mutatie_crediteuren',
    'resultaat_vóór_bijzondere_resultaten', 'energiekosten', 'neerslag_(mm)', 'totale_kasstroom_lag_1',
    'voorschot_melkgeld', 'melkprijs_per_kg_lag_1', 'maand', 'totaal_opbrengsten_lag_3',
    'volgnr', 'debiteuren', 'grasland', 'accountantskosten',
    'koesaldo_per_kg_fosfaat', 'melkprijs_per_kg_lag_3', 'daadwerkelijke_aflossingen_in_het_jaar',
    'ruwvoeraankopen.1', 'gewasbeschermingsmiddelen', 'overige_mutaties_operationele_activiteiten',
    'krachtvoerkosten_lag_6', 'totale_kosten_excl_afschrijvingen', 'totaal_opbrengsten_lag_6',
    'overige_banken', 'schoonmaakkosten_gebouwen', 'saldo_omzetbelasting',
    'opfokkosten_en_weidegeld_per_100_kg_melk', 'melkkoeien_(€)', 'krachtvoerkosten', 'gebouwen',
    'overige_bedrijfsopbrengsten', 'eiwitgehalte', 'financiële_baten_en_lasten',
    'afschrijving_productierechten', 'totaal_opbrengsten_lag_1', 'resultaat_vóór_belastingen',
    'totale_uitgaven', 'marge', 'aantal_melkkoeien_per_ha', 'voerkosten',
    'gemiddelde_temperatuur', 'boekjaar', 'mutatie_debiteuren', 'totaal_opbrengsten',
    'afschrijving_auto(s)', 'opbrengst_nuka', 'personeelskosten_%_van_de_opbrengsten',
    '%_insteek_van_de_melkkoeien', 'bijzondere_resultaten', 'kas', 'gemiddelde_temperatuur_lag_6',
    'investering_grond_en_gebouwen', 'ureumgehalte', 'waarvan_loonwerk_per_ha',
    'onttrekkingen,_prive_xb9010', 'percentage_jongvee', 'ruwvoerkosten.1',
    'advieskosten', 'pensioenlasten', 'bedrijfskosten', 'gas,_water_en_electra',
    'overige_kosten_inventaris_en_machines', "eindsaldo_liquide_middelen_lag_1",
    "mutaties_vorderingen_en_schulden_lag_1", "eindsaldo_liquide_middelen_lag_6",
    "mutaties_lag_6", "ratio_schulden_opbrengst", "kasratio", "melkprijs_diff_6"
] if f in df.columns]
features = all_features[:NUM_FEATURES_TO_USE]

# 7. Build sequences: each sample is a window of `sequence_length` consecutive rows of one farm
sequence_length = 12
X_train, y_train, X_val, y_val, X_test, y_test, sequence_datums = [], [], [], [], [], [], []

def build_sequences(farm_df, flag_col):
    """Slice one farm's rows into fixed-length input windows with a label each.

    Reads the notebook-level ``sequence_length`` and ``features``.

    Args:
        farm_df: rows of a single farm, already ordered by date.
        flag_col: name of the boolean split column (e.g. ``"is_train"``).

    Returns:
        Three parallel lists: window feature arrays of shape
        ``(sequence_length, len(features))``, the ``target`` value of the row
        directly after each window, and that row's ``datum``.

    A window is kept when every row inside it carries the flag OR the row
    directly after it carries the flag.

    NOTE(review): ``target`` is built elsewhere with ``shift(-1)``, so the label
    taken from the row after the window refers to the month following that
    row's ``datum`` - confirm this is the intended forecast horizon.
    """
    windows, labels, label_dates = [], [], []
    n_windows = len(farm_df) - sequence_length
    for start in range(n_windows):
        stop = start + sequence_length
        history = farm_df.iloc[start:stop]
        following = farm_df.iloc[stop]
        keep = history[flag_col].all() or following[flag_col]
        if keep:
            windows.append(history[features].values)
            labels.append(following["target"])
            label_dates.append(following["datum"])
    return windows, labels, label_dates

# Collect the windows of every farm into the split-level lists.
for _, farm_df in df.groupby("volgnr"):
    ordered_farm = farm_df.sort_values("datum")
    train_windows, train_labels, _ = build_sequences(ordered_farm, "is_train")
    val_windows, val_labels, _ = build_sequences(ordered_farm, "is_val")
    test_windows, test_labels, test_dates = build_sequences(ordered_farm, "is_test")
    X_train += train_windows
    y_train += train_labels
    X_val += val_windows
    y_val += val_labels
    X_test += test_windows
    y_test += test_labels
    # Only the test dates are kept; they are used for the date-based plots.
    sequence_datums += test_dates

# Lists -> arrays: X_* become (samples, sequence_length, n_features), y_* become (samples,).
X_train, X_val, X_test = np.array(X_train), np.array(X_val), np.array(X_test)
y_train, y_val, y_test = np.array(y_train), np.array(y_val), np.array(y_test)


def _scale_windows(scale_fn, windows):
    """Run a 2-D scaler call over 3-D windows and restore the original shape."""
    return scale_fn(windows.reshape(-1, windows.shape[2])).reshape(windows.shape)


# 8. Normalize features and targets (both scalers are fitted on the training split only)
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.reshape(-1, 1))
y_val_scaled, y_test_scaled = (
    target_scaler.transform(labels.reshape(-1, 1)) for labels in (y_val, y_test)
)

feature_scaler = StandardScaler()
X_train_scaled = _scale_windows(feature_scaler.fit_transform, X_train)
X_val_scaled = _scale_windows(feature_scaler.transform, X_val)
X_test_scaled = _scale_windows(feature_scaler.transform, X_test)

# 9. Final shape check
for shape_label, scaled in (
    ("X_train:", X_train_scaled),
    ("X_val:  ", X_val_scaled),
    ("X_test: ", X_test_scaled),
):
    print(shape_label, scaled.shape)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber

# --------------------------
# Model Hyperparameters
# --------------------------
hidden_size = 236       # units per LSTM layer
num_layers = 3          # number of stacked LSTM layers (now actually drives the architecture)
dropout = 0.10          # dropout rate applied after every LSTM layer
batch_size = 32
num_epochs = 10
learning_rate = 0.0006

# --------------------------
# Model Architecture
# --------------------------
# Fix: the stack used to be hard-coded to three LSTM layers, so changing `num_layers`
# silently had no effect. With num_layers = 3 the resulting model is unchanged.
if num_layers < 1:
    raise ValueError("num_layers must be at least 1")

model = models.Sequential()
for layer_idx in range(num_layers):
    # Every LSTM except the last returns the full sequence so the next LSTM can consume it.
    lstm_kwargs = {"return_sequences": layer_idx < num_layers - 1}
    if layer_idx == 0:
        # (sequence_length, n_features), taken from the scaled training windows
        lstm_kwargs["input_shape"] = X_train_scaled.shape[1:]
    model.add(layers.LSTM(hidden_size, **lstm_kwargs))
    model.add(layers.Dropout(dropout))
model.add(layers.Dense(1))  # single regression output (scaled cash flow)

# The Huber loss is computed on the standardized target, so delta=1.0 is in scaled units.
model.compile(optimizer=Adam(learning_rate=learning_rate), loss=Huber(delta=1.0))
model.summary()

# --------------------------
# Model Training (without early stopping)
# --------------------------
history = model.fit(
    X_train_scaled, y_train_scaled,
    validation_data=(X_val_scaled, y_val_scaled),
    epochs=num_epochs,
    batch_size=batch_size,
    verbose=1
)

# --------------------------
# Predictions (scaled)
# --------------------------
y_train_pred_scaled = model.predict(X_train_scaled)
y_val_pred_scaled = model.predict(X_val_scaled)
y_test_pred_scaled = model.predict(X_test_scaled)

# Inverse transform predictions to original cash flow (€)
y_train_pred = target_scaler.inverse_transform(y_train_pred_scaled)
y_val_pred = target_scaler.inverse_transform(y_val_pred_scaled)
y_test_pred = target_scaler.inverse_transform(y_test_pred_scaled)

# --------------------------
# Evaluation Function
# --------------------------
def evaluate(y_true, y_pred, label):
    """Print and return regression metrics for one data split.

    Args:
        y_true: actual values (unscaled).
        y_pred: predicted values (unscaled).
        label: split name used in the printed header (e.g. ``"Test"``).

    Returns:
        Tuple ``(rmse, r2, mae, mape)`` - note the order differs from the
        printed order; ``mape`` is expressed in percent.
    """
    # Fix: `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # and removed in 1.6; taking the square root of the MSE works on every version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    print(f"\n{label} Set Performance:")
    print(f"RMSE: €{rmse:,.2f}")
    print(f"MAE:  €{mae:,.2f}")
    print(f"R²:    {r2:.3f}")
    print(f"MAPE:  {mape:.2f}%")
    return rmse, r2, mae, mape

# Report the metrics for each split, always on the unscaled (€) values.
for split_label, actual, predicted in (
    ("Train", y_train, y_train_pred),
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
):
    evaluate(actual, predicted, split_label)

# --------------------------
# Scatter Plot: Predicted vs Actual (Test Set)
# --------------------------
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_test_pred, alpha=0.6)
# Dashed diagonal = perfect prediction.
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'k--')
ax.set_xlabel("Actual Cash Flow (€)")
ax.set_ylabel("Predicted Cash Flow (€)")
ax.set_title("LSTM: Predicted vs Actual (Test Set)")
ax.grid(True)
fig.tight_layout()
plt.show()

# --------------------------
# Monthly Average Plot (2024)
# --------------------------
# One row per test sequence: its recorded date, actual value and prediction.
df_preds = pd.DataFrame({
    "date": sequence_datums[-len(y_test):],
    "y_true": y_test,
    "y_pred": y_test_pred.flatten()
})
df_preds["month"] = pd.to_datetime(df_preds["date"]).dt.month
monthly_avg = df_preds.groupby("month")[["y_true", "y_pred"]].mean()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(monthly_avg.index, monthly_avg["y_true"], label="Actual", marker="o")
ax.plot(monthly_avg.index, monthly_avg["y_pred"], label="Predicted", marker="o", linestyle="--")
ax.set_title("LSTM Forecast – Monthly Average Cash Flow (2024)")
ax.set_xlabel("Month")
ax.set_ylabel("Average Cash Flow (€)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Step 1: Recover the label date of every training sequence.
# The window walk mirrors build_sequences so the dates line up with y_train.
train_sequence_dates = []
for _, farm_rows in df.groupby("volgnr"):
    farm_rows = farm_rows.sort_values("datum")
    for start in range(len(farm_rows) - sequence_length):
        stop = start + sequence_length
        if not farm_rows.iloc[start:stop]["is_train"].all():
            continue
        train_sequence_dates.append(farm_rows.iloc[stop]["datum"])

# Step 2: One frame with training actuals followed by test actuals/predictions.
# Predictions are NaN over the training period so only the forecast part is drawn.
train_actuals = y_train.flatten()
no_forecast = np.full_like(train_actuals, np.nan)
df_plot = pd.DataFrame({
    "date": pd.to_datetime(train_sequence_dates + sequence_datums),
    "y_true": np.concatenate([train_actuals, y_test.flatten()]),
    "y_pred": np.concatenate([no_forecast, y_test_pred.flatten()]),
})

# Step 3: Index by date and average per calendar month
df_plot = df_plot.sort_values("date").set_index("date")
monthly_avg = df_plot.resample("M")[["y_true", "y_pred"]].mean()

# Step 4: Monthly averages (2020–2024)
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(monthly_avg.index, monthly_avg["y_true"], label="Actual", color="blue", linewidth=2)
ax.plot(monthly_avg.index, monthly_avg["y_pred"], label="Forecast (2024)", color="orangered", linestyle="--", linewidth=2)

# Vertical marker at the start of the forecast period
ax.axvline(pd.to_datetime("2024-01-01"), color="gray", linestyle=":", label="Forecast Start")

ax.set_title("LSTM Forecast with Exogenous Variables – Monthly Average Cash Flow (2020–2024)")
ax.set_xlabel("Date")
ax.set_ylabel("Average Cash Flow (€)")
ax.grid(True, linestyle="--", alpha=0.7)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import copy
import tqdm

def permutation_feature_importance(model, X, y_true, scaler, baseline_preds, feature_names, n_repeats=5,
                                   random_state=None):
    """
    Computes permutation-based feature importance for a sequence model (e.g., LSTM).

    For each feature index along the last axis of ``X`` the values are shuffled
    across samples (independently at every time step), the model is re-run and
    the RMSE increase over the baseline is averaged across ``n_repeats`` shuffles.

    Args:
        model: fitted model exposing ``predict(X, verbose=0)``.
        X: scaled input of shape (samples, time steps, features).
        y_true: unscaled targets.
        scaler: fitted target scaler; its ``inverse_transform`` is applied to the predictions.
        baseline_preds: unscaled predictions of the model on the unpermuted ``X``.
        feature_names: one name per feature, in the order of the last axis of ``X``.
        n_repeats: number of shuffles per feature.
        random_state: optional seed for reproducible shuffles; ``None`` keeps the
            previous behaviour of drawing from NumPy's global random state.

    Returns:
        DataFrame with columns "Feature" and "Importance (RMSE increase)",
        sorted from most to least important.

    Raises:
        ValueError: if ``feature_names`` does not have one entry per feature of ``X``.
    """
    feature_names = list(feature_names)
    # Fail before the expensive prediction loop instead of at DataFrame construction.
    if len(feature_names) != X.shape[2]:
        raise ValueError(
            f"feature_names has {len(feature_names)} entries but X has {X.shape[2]} features"
        )

    shuffle = np.random.shuffle if random_state is None else np.random.default_rng(random_state).shuffle

    def _rmse(y_pred):
        # `squared=False` was removed from mean_squared_error in scikit-learn 1.6.
        return np.sqrt(mean_squared_error(y_true, y_pred))

    baseline_rmse = _rmse(baseline_preds)
    importances = []

    for i in tqdm.tqdm(range(X.shape[2]), desc="Calculating feature importance"):
        rmse_diffs = []
        for _ in range(n_repeats):
            X_permuted = X.copy()  # an array copy is enough; deepcopy is not needed
            for t in range(X.shape[1]):
                # In-place shuffle of feature i at time step t across the sample axis.
                shuffle(X_permuted[:, t, i])
            y_pred_scaled = model.predict(X_permuted, verbose=0)
            y_pred = scaler.inverse_transform(y_pred_scaled)
            rmse_diffs.append(_rmse(y_pred) - baseline_rmse)
        importances.append(np.mean(rmse_diffs))

    return pd.DataFrame({
        "Feature": feature_names,
        "Importance (RMSE increase)": importances
    }).sort_values(by="Importance (RMSE increase)", ascending=False)
# Baseline predictions (unscaled)
baseline_preds = y_test_pred

# Compute importances
# Fix: `feature_cols` is not defined at this point in the notebook (it only appears in a
# later cell), so a fresh Restart & Run All raised NameError. The model evaluated here
# was built from `features`, whose order matches the last axis of X_test_scaled.
importance_df = permutation_feature_importance(
    model=model,
    X=X_test_scaled,
    y_true=y_test,
    scaler=target_scaler,
    baseline_preds=baseline_preds,
    feature_names=features,
    n_repeats=3
)

# Plot top 10
top10 = importance_df.head(10)

plt.figure(figsize=(10, 5))
# Reverse the order so the most important feature ends up at the top of the bar chart.
plt.barh(top10["Feature"][::-1], top10["Importance (RMSE increase)"][::-1])
plt.xlabel("Increase in RMSE when permuted")
plt.title("Top 10 Feature Importances (LSTM – Permutation)")
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Dutch -> English display names for the features shown in the importance chart.
translation_dict = dict(
    eindsaldo_liquide_middelen="ending_cash_balance",
    totaal_opbrengsten="total_revenue",
    debiteuren="accounts_receivable",
    melkprijs_per_kg_lag_1="milk_price_lag_1",
    totaal_opbrengsten_lag_3="total_revenue_lag_3",
    totaal_opbrengsten_lag_6="total_revenue_lag_6",
    schoonmaakkosten_gebouwen="building_cleaning_costs",
    overige_vorderingen="other_receivables",
    overige_banken="other_banks",
    koesaldo_per_kg_fosfaat="cow_balance_per_kg_phosphate",
    aantal_melkkoeien_per_ha="dairy_cows_per_hectare",
    overige_mutaties_operationele_activiteiten="other_operational_changes",
    opfokkosten_en_weidegeld_per_100_kg_melk="raising_and_grazing_costs_per_100kg_milk",
    overige_bedrijfsopbrengsten="other_operating_income",
    totaal_opbrengsten_lag_1="total_revenue_lag_1",
)

# Names without a dictionary entry are kept unchanged.
importance_df["Feature"] = [
    translation_dict.get(feature_name, feature_name) for feature_name in importance_df["Feature"]
]

# Top 10 features, drawn bottom-up so the most important bar is on top.
top10 = importance_df.head(10)
bars = top10.iloc[::-1]

fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(bars["Feature"], bars["Importance (RMSE increase)"])
ax.set_xlabel("Increase in RMSE when permuted")
ax.set_title("Top 10 Feature Importances (LSTM – Permutation)")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from scipy.stats.mstats import winsorize

# 1. Load and prepare the dataset
df = pd.read_csv("data_final.csv", parse_dates=["datum"])
df = df.sort_values(["volgnr", "datum"]).reset_index(drop=True)
df["year"] = df["datum"].dt.year
df["month"] = df["datum"].dt.month
# The ranked feature list refers to the month under its Dutch name 'maand'; without this
# alias that feature would be silently dropped by the `in df.columns` filter below.
df["maand"] = df["month"]

# 2. Construct the target variable (next month's cash flow, shifted within each farm)
df["target"] = df.groupby("volgnr")["totale_kasstroom"].shift(-1)
df = df.dropna(subset=["target"])  # the last row of every farm has no next month

# Apply winsorization (same logic as XGBoost model): clip the lowest/highest 1 % of targets
df["target"] = pd.Series(winsorize(df["target"], limits=[0.01, 0.01]), index=df.index)

# 3. Create engineered features (lags, ratios, etc.) - all lags are taken within a farm
df["eindsaldo_liquide_middelen_lag_1"] = df.groupby("volgnr")["eindsaldo_liquide_middelen"].shift(1)
df["mutaties_vorderingen_en_schulden_lag_1"] = df.groupby("volgnr")["mutaties_vorderingen_en_schulden"].shift(1)
df["eindsaldo_liquide_middelen_lag_6"] = df.groupby("volgnr")["eindsaldo_liquide_middelen"].shift(6)
df["mutaties_lag_6"] = df.groupby("volgnr")["mutaties_vorderingen_en_schulden"].shift(6)
# The 1e-6 offset guards the ratios against division by exactly zero.
df["ratio_schulden_opbrengst"] = df["mutaties_vorderingen_en_schulden"] / (df["totaal_opbrengsten_lag_1"] + 1e-6)
df["kasratio"] = df["kas"] / (df["totale_kasstroom_lag_1"] + 1e-6)
# Fix: group the 6-month shift by farm like the other lags; the ungrouped shift pulled
# the previous farm's prices into the first six rows of each farm.
df["melkprijs_diff_6"] = df["melkprijs_per_kg"] - df.groupby("volgnr")["melkprijs_per_kg"].shift(6)

# 4. Fill missing values for lagged and derived features (per farm, backward then forward)
lagged_cols = [col for col in df.columns if any(pat in col for pat in ["_lag_", "_diff_", "ratio_", "kasratio"])]
df[lagged_cols] = df.groupby("volgnr")[lagged_cols].transform(lambda x: x.bfill().ffill())

# 5. Split farms into train, validation, and test sets (same seed as the first cell)
boeren = df["volgnr"].unique()
trainval_boeren, test_boeren = train_test_split(boeren, test_size=0.2, random_state=42)
train_boeren, val_boeren = train_test_split(trainval_boeren, test_size=0.2, random_state=42)

df["is_train"] = df["volgnr"].isin(train_boeren) & (df["year"] < 2024)
df["is_val"] = df["volgnr"].isin(val_boeren) & (df["year"] < 2024)
df["is_test"] = df["volgnr"].isin(test_boeren) & (df["year"] == 2024)

# 6. Select top features
# Fix: the previous `[...]` placeholder was a list holding only Ellipsis, which produced
# an empty feature set and made the scaler fail on zero-width input. Reuse the SHAP-ranked
# `features` list from the first cell so the windows match the model trained above.
# NOTE(review): replace with a different ranked list if this cell is meant to stand alone.
top_features = list(features)
feature_cols = [col for col in top_features if col in df.columns][:50]

# 7. Build sequences
sequence_length = 12
X_train, y_train, X_val, y_val, X_test, y_test = [], [], [], [], [], []
sequence_datums, sequence_volgnrs = [], []

def build_sequences(boer_df, flag_col):
    """Slice one farm's rows into fixed-length input windows with a label each.

    Reads the notebook-level ``sequence_length`` and ``feature_cols``.

    Args:
        boer_df: rows of a single farm, already ordered by date.
        flag_col: name of the boolean split column (e.g. ``"is_test"``).

    Returns:
        Four parallel lists: window feature arrays of shape
        ``(sequence_length, len(feature_cols))``, and - taken from the row
        directly after each window - its ``target``, ``datum`` and ``volgnr``.

    A window is kept when every row inside it carries the flag OR the row
    directly after it carries the flag.

    NOTE(review): ``target`` is built elsewhere with ``shift(-1)``, so the label
    refers to the month following the recorded ``datum`` - confirm the horizon.
    """
    windows, labels, label_dates, farm_ids = [], [], [], []
    n_windows = len(boer_df) - sequence_length
    for start in range(n_windows):
        stop = start + sequence_length
        history = boer_df.iloc[start:stop]
        following = boer_df.iloc[stop]
        keep = history[flag_col].all() or following[flag_col]
        if keep:
            windows.append(history[feature_cols].values)
            labels.append(following["target"])
            label_dates.append(following["datum"])
            farm_ids.append(following["volgnr"])
    return windows, labels, label_dates, farm_ids

# Collect the windows of every farm into the split-level lists.
for _, boer_df in df.groupby("volgnr"):
    boer_df = boer_df.sort_values("datum")
    x_tr, y_tr, _, _ = build_sequences(boer_df, "is_train")
    x_va, y_va, _, _ = build_sequences(boer_df, "is_val")
    x_te, y_te, dts, vols = build_sequences(boer_df, "is_test")
    X_train.extend(x_tr)
    y_train.extend(y_tr)
    X_val.extend(x_va)
    # Fix: this used to be `y_val.extend(y_val)`, which extended the (empty) list with
    # itself and left the validation targets empty while X_val was populated.
    y_val.extend(y_va)
    X_test.extend(x_te)
    y_test.extend(y_te)
    # Dates and farm ids are kept for the test sequences only (used by the cluster plot).
    sequence_datums.extend(dts)
    sequence_volgnrs.extend(vols)

# Lists -> arrays: X_* become (samples, sequence_length, n_features), y_* become (samples,).
X_train, y_train = np.array(X_train), np.array(y_train)
X_val, y_val = np.array(X_val), np.array(y_val)
X_test, y_test = np.array(X_test), np.array(y_test)

# 8. Scaling (both scalers are fitted on the training split only)
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.reshape(-1, 1))
y_val_scaled = target_scaler.transform(y_val.reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.reshape(-1, 1))

# StandardScaler works on 2-D input: flatten the time axis, scale, restore the 3-D shape.
feature_scaler = StandardScaler()
X_train_scaled = feature_scaler.fit_transform(X_train.reshape(-1, X_train.shape[2])).reshape(X_train.shape)
X_val_scaled = feature_scaler.transform(X_val.reshape(-1, X_val.shape[2])).reshape(X_val.shape)
X_test_scaled = feature_scaler.transform(X_test.reshape(-1, X_test.shape[2])).reshape(X_test.shape)


In [None]:
import matplotlib.pyplot as plt

# Recover the label date and farm id of every training sequence.
# The window walk mirrors build_sequences so the rows line up with y_train.
train_sequence_datums = []
train_sequence_volgnrs = []
for _, farm_rows in df.groupby("volgnr"):
    farm_rows = farm_rows.sort_values("datum")
    for start in range(len(farm_rows) - sequence_length):
        stop = start + sequence_length
        if not farm_rows.iloc[start:stop]["is_train"].all():
            continue
        label_row = farm_rows.iloc[stop]
        train_sequence_datums.append(label_row["datum"])
        train_sequence_volgnrs.append(label_row["volgnr"])

# Training actuals followed by test actuals/predictions; predictions are NaN for training rows.
train_actuals = y_train.flatten()
no_forecast = np.full_like(train_actuals, np.nan)
df_plot = pd.DataFrame({
    "datum": pd.to_datetime(train_sequence_datums + sequence_datums),
    "volgnr": np.concatenate([train_sequence_volgnrs, sequence_volgnrs]),
    "y_true": np.concatenate([train_actuals, y_test.flatten()]),
    "y_pred": np.concatenate([no_forecast, y_test_pred.flatten()]),
})

# Attach the cluster of each (date, farm) pair, then index by date.
df_clusters = df[["datum", "volgnr", "bedrijf_cluster"]].drop_duplicates()
df_plot = (
    df_plot
    .merge(df_clusters, on=["datum", "volgnr"], how="left")
    .sort_values("datum")
    .set_index("datum")
)

# One panel per cluster, sharing the y-axis.
unique_clusters = sorted(df_plot["bedrijf_cluster"].dropna().unique())
n_clusters = len(unique_clusters)

fig, axes = plt.subplots(1, n_clusters, figsize=(6 * n_clusters, 5), sharey=True)

# With a single cluster plt.subplots returns a bare Axes; atleast_1d makes it iterable.
for i, (cluster, ax) in enumerate(zip(unique_clusters, np.atleast_1d(axes))):
    in_cluster = df_plot["bedrijf_cluster"] == cluster
    df_cluster = df_plot[in_cluster]

    # Average per calendar month
    monthly_avg = df_cluster.resample("M")[["y_true", "y_pred"]].mean()

    ax.plot(monthly_avg.index, monthly_avg["y_true"], label="Actual (2020–2024)", color="blue")
    ax.plot(monthly_avg.index, monthly_avg["y_pred"], label="Predicted (2024)", color="orangered", linestyle="--")
    ax.set_title(f"Cluster {int(cluster)}")
    ax.set_xlabel("Date")
    if i == 0:
        # sharey=True: the label on the first panel serves all panels
        ax.set_ylabel("Average Cash Flow (€)")
    ax.grid(True)
    ax.tick_params(axis='x', rotation=45)
    ax.legend()

plt.suptitle("LSTM Forecast – Monthly Average Cash Flow per Cluster (2020–2024)", fontsize=16)
# Leave the top 5 % of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats.mstats import winsorize

# 1. Load and sort data (one row per farm `volgnr` and date `datum`)
df = pd.read_csv("data_final.csv", parse_dates=["datum"])
df = df.sort_values(["volgnr", "datum"]).reset_index(drop=True)

# 2. Create time features
df["jaar"] = df["datum"].dt.year
df["maand"] = df["datum"].dt.month

# 3. Create target (next month cash flow, shifted within each farm)
df["target"] = df.groupby("volgnr")["totale_kasstroom"].shift(-1)
df = df.dropna(subset=["target"])  # the last row of every farm has no next month
# Winsorize: clip the lowest/highest 1 % of targets
df["target"] = pd.Series(winsorize(df["target"], limits=[0.01, 0.01]), index=df.index)

# 4. Add engineered features (all lags are taken within a farm)
df["eindsaldo_liquide_middelen_lag_1"] = df.groupby("volgnr")["eindsaldo_liquide_middelen"].shift(1)
df["mutaties_vorderingen_en_schulden_lag_1"] = df.groupby("volgnr")["mutaties_vorderingen_en_schulden"].shift(1)
df["eindsaldo_liquide_middelen_lag_6"] = df.groupby("volgnr")["eindsaldo_liquide_middelen"].shift(6)
df["mutaties_lag_6"] = df.groupby("volgnr")["mutaties_vorderingen_en_schulden"].shift(6)
# The 1e-6 offset guards the ratios against division by exactly zero.
df["ratio_schulden_opbrengst"] = df["mutaties_vorderingen_en_schulden"] / (df["totaal_opbrengsten_lag_1"] + 1e-6)
df["kasratio"] = df["kas"] / (df["totale_kasstroom_lag_1"] + 1e-6)
# Fix: group the 6-month shift by farm like the other lags; the ungrouped shift pulled
# the previous farm's prices into the first six rows of each farm.
df["melkprijs_diff_6"] = df["melkprijs_per_kg"] - df.groupby("volgnr")["melkprijs_per_kg"].shift(6)

# 5. Fill engineered features within each farm (backward fill first, then forward fill)
lagged_cols = [col for col in df.columns if any(pat in col for pat in ["_lag_", "_diff_", "ratio_", "kasratio"])]
df[lagged_cols] = df.groupby("volgnr")[lagged_cols].transform(lambda x: x.bfill().ffill())

# 6. Train/val/test split (by farm; farms are disjoint across the three sets)
boeren = df["volgnr"].unique()
trainval_boeren, test_boeren = train_test_split(boeren, test_size=0.2, random_state=42)
train_boeren, val_boeren = train_test_split(trainval_boeren, test_size=0.2, random_state=42)

# Train/validation rows are restricted to years before 2024, test rows to 2024.
df["is_train"] = df["volgnr"].isin(train_boeren) & (df["jaar"] < 2024)
df["is_val"] = df["volgnr"].isin(val_boeren) & (df["jaar"] < 2024)
df["is_test"] = df["volgnr"].isin(test_boeren) & (df["jaar"] == 2024)

# 7. Select top features (without exogenous variables)
top_features = [
    'eindsaldo_liquide_middelen', 'mutaties_vorderingen_en_schulden', 'overige_vorderingen', 'melkprijs_per_kg',
    'crediteuren', 'melkprijs_per_kg_lag_6', 'leningen.1', 'mutatie_crediteuren',
    'resultaat_vóór_bijzondere_resultaten', 'energiekosten', 'totale_kasstroom_lag_1',
    'voorschot_melkgeld', 'melkprijs_per_kg_lag_1', 'maand', 'totaal_opbrengsten_lag_3',
    'debiteuren', 'grasland', 'accountantskosten', 'koesaldo_per_kg_fosfaat',
    'melkprijs_per_kg_lag_3', 'daadwerkelijke_aflossingen_in_het_jaar', 'ruwvoeraankopen.1',
    'gewasbeschermingsmiddelen', 'overige_mutaties_operationele_activiteiten', 'krachtvoerkosten_lag_6',
    'totale_kosten_excl_afschrijvingen', 'totaal_opbrengsten_lag_6', 'overige_banken',
    'schoonmaakkosten_gebouwen', 'saldo_omzetbelasting', 'opfokkosten_en_weidegeld_per_100_kg_melk',
    'melkkoeien_(€)', 'krachtvoerkosten', 'gebouwen', 'overige_bedrijfsopbrengsten',
    'eiwitgehalte', 'financiële_baten_en_lasten', 'afschrijving_productierechten',
    'totaal_opbrengsten_lag_1', 'resultaat_vóór_belastingen', 'totale_uitgaven', 'marge',
    'aantal_melkkoeien_per_ha', 'voerkosten', 'boekjaar', 'mutatie_debiteuren', 'totaal_opbrengsten',
    'afschrijving_auto(s)', 'opbrengst_nuka', 'personeelskosten_%_van_de_opbrengsten',
    '%_insteek_van_de_melkkoeien', 'bijzondere_resultaten', 'kas',
    "eindsaldo_liquide_middelen_lag_1", "mutaties_vorderingen_en_schulden_lag_1",
    "eindsaldo_liquide_middelen_lag_6", "mutaties_lag_6", "ratio_schulden_opbrengst",
    "kasratio", "melkprijs_diff_6"
]
NUM_FEATURES_TO_USE = 50
# Names that are absent from the loaded frame are dropped before the top-N cut.
feature_cols = [f for f in top_features if f in df.columns][:NUM_FEATURES_TO_USE]

# 8. Sequence building: each sample is a window of `sequence_length` consecutive rows of one farm
sequence_length = 12
X_train, y_train, X_val, y_val, X_test, y_test, sequence_datums = [], [], [], [], [], [], []

def build_sequences(boer_df, flag_col):
    """Slice one farm's rows into fixed-length input windows with a label each.

    Reads the notebook-level ``sequence_length`` and ``feature_cols``.

    Args:
        boer_df: rows of a single farm, already ordered by date.
        flag_col: name of the boolean split column (e.g. ``"is_train"``).

    Returns:
        Three parallel lists: window feature arrays of shape
        ``(sequence_length, len(feature_cols))``, the ``target`` value of the
        row directly after each window, and that row's ``datum``.

    A window is kept when every row inside it carries the flag OR the row
    directly after it carries the flag.

    NOTE(review): ``target`` is built elsewhere with ``shift(-1)``, so the label
    refers to the month following the recorded ``datum`` - confirm the horizon.
    """
    windows, labels, label_dates = [], [], []
    n_windows = len(boer_df) - sequence_length
    for start in range(n_windows):
        stop = start + sequence_length
        history = boer_df.iloc[start:stop]
        following = boer_df.iloc[stop]
        keep = history[flag_col].all() or following[flag_col]
        if keep:
            windows.append(history[feature_cols].values)
            labels.append(following["target"])
            label_dates.append(following["datum"])
    return windows, labels, label_dates

# Collect the windows of every farm into the split-level lists.
for _, boer_df in df.groupby("volgnr"):
    ordered_farm = boer_df.sort_values("datum")
    train_windows, train_labels, _ = build_sequences(ordered_farm, "is_train")
    val_windows, val_labels, _ = build_sequences(ordered_farm, "is_val")
    test_windows, test_labels, test_dates = build_sequences(ordered_farm, "is_test")
    X_train += train_windows
    y_train += train_labels
    X_val += val_windows
    y_val += val_labels
    X_test += test_windows
    y_test += test_labels
    # Only the test dates are kept; they are used for the date-based plots.
    sequence_datums += test_dates

# Lists -> arrays: X_* become (samples, sequence_length, n_features), y_* become (samples,).
X_train, X_val, X_test = np.array(X_train), np.array(X_val), np.array(X_test)
y_train, y_val, y_test = np.array(y_train), np.array(y_val), np.array(y_test)


def _scale_windows(scale_fn, windows):
    """Run a 2-D scaler call over 3-D windows and restore the original shape."""
    return scale_fn(windows.reshape(-1, windows.shape[2])).reshape(windows.shape)


# 9. Scaling (both scalers are fitted on the training split only)
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.reshape(-1, 1))
y_val_scaled, y_test_scaled = (
    target_scaler.transform(labels.reshape(-1, 1)) for labels in (y_val, y_test)
)

feature_scaler = StandardScaler()
X_train_scaled = _scale_windows(feature_scaler.fit_transform, X_train)
X_val_scaled = _scale_windows(feature_scaler.transform, X_val)
X_test_scaled = _scale_windows(feature_scaler.transform, X_test)

# 10. Check shapes
for shape_label, scaled in (
    ("X_train:", X_train_scaled),
    ("X_val:", X_val_scaled),
    ("X_test:", X_test_scaled),
):
    print(shape_label, scaled.shape)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber

# --------------------------
# Model Hyperparameters
# --------------------------
hidden_size = 236       # units per LSTM layer
num_layers = 3          # number of stacked LSTM layers (now actually drives the architecture)
dropout = 0.10          # dropout rate applied after every LSTM layer
batch_size = 32
num_epochs = 10
learning_rate = 0.0006

# --------------------------
# Model Architecture
# --------------------------
# Fix: the stack used to be hard-coded to three LSTM layers, so changing `num_layers`
# silently had no effect. With num_layers = 3 the resulting model is unchanged.
if num_layers < 1:
    raise ValueError("num_layers must be at least 1")

model = models.Sequential()
for layer_idx in range(num_layers):
    # Every LSTM except the last returns the full sequence so the next LSTM can consume it.
    lstm_kwargs = {"return_sequences": layer_idx < num_layers - 1}
    if layer_idx == 0:
        # (sequence_length, n_features), taken from the scaled training windows
        lstm_kwargs["input_shape"] = X_train_scaled.shape[1:]
    model.add(layers.LSTM(hidden_size, **lstm_kwargs))
    model.add(layers.Dropout(dropout))
model.add(layers.Dense(1))  # single regression output (scaled cash flow)

# The Huber loss is computed on the standardized target, so delta=1.0 is in scaled units.
model.compile(optimizer=Adam(learning_rate=learning_rate), loss=Huber(delta=1.0))
model.summary()

# --------------------------
# Model Training (without early stopping)
# --------------------------
history = model.fit(
    X_train_scaled, y_train_scaled,
    validation_data=(X_val_scaled, y_val_scaled),
    epochs=num_epochs,
    batch_size=batch_size,
    verbose=1
)

# --------------------------
# Predictions (scaled)
# --------------------------
y_train_pred_scaled = model.predict(X_train_scaled)
y_val_pred_scaled = model.predict(X_val_scaled)
y_test_pred_scaled = model.predict(X_test_scaled)

# Inverse transform predictions to original cash flow (€)
y_train_pred = target_scaler.inverse_transform(y_train_pred_scaled)
y_val_pred = target_scaler.inverse_transform(y_val_pred_scaled)
y_test_pred = target_scaler.inverse_transform(y_test_pred_scaled)

# --------------------------
# Evaluation Function
# --------------------------
def evaluate(y_true, y_pred, label):
    """Print and return regression metrics for one data split.

    Args:
        y_true: actual values (unscaled).
        y_pred: predicted values (unscaled).
        label: split name used in the printed header (e.g. ``"Test"``).

    Returns:
        Tuple ``(rmse, r2, mae, mape)`` - note the order differs from the
        printed order; ``mape`` is expressed in percent.
    """
    # Fix: `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # and removed in 1.6; taking the square root of the MSE works on every version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    print(f"\n{label} Set Performance:")
    print(f"RMSE: €{rmse:,.2f}")
    print(f"MAE:  €{mae:,.2f}")
    print(f"R²:    {r2:.3f}")
    print(f"MAPE:  {mape:.2f}%")
    return rmse, r2, mae, mape

# Report the metrics for each split, always on the unscaled (€) values.
for split_label, actual, predicted in (
    ("Train", y_train, y_train_pred),
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
):
    evaluate(actual, predicted, split_label)

# --------------------------
# Scatter Plot: Predicted vs Actual (Test Set)
# --------------------------
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_test_pred, alpha=0.6)
# Dashed diagonal = perfect prediction.
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'k--')
ax.set_xlabel("Actual Cash Flow (€)")
ax.set_ylabel("Predicted Cash Flow (€)")
ax.set_title("LSTM: Predicted vs Actual (Test Set)")
ax.grid(True)
fig.tight_layout()
plt.show()

# --------------------------
# Monthly Average Plot (2024)
# --------------------------
# One row per test sequence: its recorded date, actual value and prediction.
df_preds = pd.DataFrame({
    "date": sequence_datums[-len(y_test):],
    "y_true": y_test,
    "y_pred": y_test_pred.flatten()
})
df_preds["month"] = pd.to_datetime(df_preds["date"]).dt.month
monthly_avg = df_preds.groupby("month")[["y_true", "y_pred"]].mean()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(monthly_avg.index, monthly_avg["y_true"], label="Actual", marker="o")
ax.plot(monthly_avg.index, monthly_avg["y_pred"], label="Predicted", marker="o", linestyle="--")
ax.set_title("LSTM Forecast – Monthly Average Cash Flow (2024)")
ax.set_xlabel("Month")
ax.set_ylabel("Average Cash Flow (€)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Recover the label date of every training sequence.
# The window walk mirrors build_sequences so the dates line up with y_train.
train_sequence_datums = []
for _, farm_rows in df.groupby("volgnr"):
    farm_rows = farm_rows.sort_values("datum")
    for start in range(len(farm_rows) - sequence_length):
        stop = start + sequence_length
        if not farm_rows.iloc[start:stop]["is_train"].all():
            continue
        train_sequence_datums.append(farm_rows.iloc[stop]["datum"])

# Training actuals followed by test actuals/predictions; predictions are NaN for training rows.
train_actuals = y_train.flatten()
no_forecast = np.full_like(train_actuals, np.nan)
df_plot = pd.DataFrame({
    "datum": pd.to_datetime(train_sequence_datums + sequence_datums),
    "y_true": np.concatenate([train_actuals, y_test.flatten()]),
    "y_pred": np.concatenate([no_forecast, y_test_pred.flatten()]),
})

# Index by date and average per calendar month
df_plot = df_plot.sort_values("datum").set_index("datum")
monthly_avg = df_plot.resample("M")[["y_true", "y_pred"]].mean()

# Actual vs predicted cash flow (monthly average)
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(monthly_avg.index, monthly_avg["y_true"], label="Actual", color="blue")
ax.plot(monthly_avg.index, monthly_avg["y_pred"], label="Predicted (2024)", color="orangered", linestyle="--")
ax.set_title("LSTM Forecast Without Exogenous Variables – Monthly Average Cash Flow (2020–2024)")
ax.set_xlabel("Date")
ax.set_ylabel("Average Cash Flow (€)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()
