In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from matplotlib.ticker import ScalarFormatter

# Read the clustered dataset, parse `datum` as a date and put the rows in
# chronological order with a fresh 0..n-1 index.
file_path = "data_with_clusters.csv"
df = (
    pd.read_csv(file_path, parse_dates=["datum"])
    .sort_values("datum")
    .reset_index(drop=True)
)

# Dutch source column name -> English label shown in tables and figures.
# Columns that are not listed here keep their original name when displayed.
column_name_map = {
    # financial
    "mutaties_vorderingen_en_schulden": "Changes in Receivables and Debts",
    "crediteuren": "Creditors",
    "kas": "Cash",
    "resultaat_vóór_bijzondere_resultaten": "Result Before Special Items",
    "eindsaldo_liquide_middelen": "Final Cash Balance",
    # milk
    "melkprijs_per_kg": "Milk Price per kg",
    "voorschot_melkgeld": "Milk Money Advance",
    "totaal_opbrengsten": "Total Revenue",
    # livestock
    "aantal_melkkoeien": "Number of Dairy Cows",
    "aantal_kalveren_dood_tot_14_dagen": "Calves Died <14 Days",
    # weather
    "gemiddelde_temperatuur": "Average Temperature (°C)",
    "neerslag_(mm)": "Rainfall (mm)",
    # target and cost columns used in later plots
    "totale_kasstroom": "Total Cash Flow",
    "voerkosten": "Feed Costs",
}

# Thematic feature lists (Dutch column names) explored group by group below.
financial_features = [
    "mutaties_vorderingen_en_schulden",
    "crediteuren",
    "kas",
    "resultaat_vóór_bijzondere_resultaten",
    "eindsaldo_liquide_middelen",
]
milk_features = ["melkprijs_per_kg", "voorschot_melkgeld", "totaal_opbrengsten"]
livestock_features = ["aantal_melkkoeien", "aantal_kalveren_dood_tot_14_dagen"]
weather_features = ["gemiddelde_temperatuur", "neerslag_(mm)"]

# Column every correlation below is measured against.
target_column = "totale_kasstroom"

# Group title -> feature list; iteration order is the display order.
feature_groups = dict(
    Financial=financial_features,
    Milk=milk_features,
    Livestock=livestock_features,
    Weather=weather_features,
)

# For every feature group: print descriptive statistics, then draw one
# histogram per feature and one combined boxplot.
for group_name, features in feature_groups.items():
    # Relabelled copy so that tables and figures show the English names.
    english_labels = [column_name_map.get(name, name) for name in features]
    data = df[features].copy()
    data.columns = english_labels

    print(f"\n===== {group_name} =====")
    display(data.describe())

    # Histograms: pandas creates the figure, which we then annotate.
    data.hist(bins=30, figsize=(12, 6))
    hist_fig = plt.gcf()
    hist_fig.suptitle(f"Histograms of {group_name} Features", fontsize=16)
    # Axes are paired with labels positionally; surplus grid axes are left
    # untouched because zip stops at the shorter sequence.
    for panel, panel_label in zip(hist_fig.axes, english_labels):
        panel.set_xlabel(panel_label)
        panel.set_ylabel("Frequency")
        panel.xaxis.set_major_formatter(ScalarFormatter(useMathText=True))
    # Leave room at the top for the suptitle.
    hist_fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

    # Boxplots: all features of the group share one y-axis.
    box_fig, box_ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=data, ax=box_ax)
    box_ax.set_title(f"Boxplots of {group_name} Features", fontsize=14)
    box_ax.set_ylabel("Value")
    box_ax.set_xticks(range(len(english_labels)))
    box_ax.set_xticklabels(english_labels, rotation=45)
    box_fig.tight_layout()
    plt.show()

# Pairwise correlations between all numeric columns.
numeric_df = df.select_dtypes(include=["number"]).copy()
corr_matrix = numeric_df.corr()

# The 20 columns with the largest absolute correlation to the target.
# The target column itself is part of `numeric_df`, so it takes part in the
# ranking as well.
top_corr = (
    corr_matrix[target_column]
    .abs()
    .sort_values(ascending=False)
    .head(20)
)
top_corr_features = top_corr.index
top_corr_english = [column_name_map.get(col, col) for col in top_corr_features]

# Subset of the data with English column labels, used only for the heatmap.
corr_subset = df[top_corr_features].copy()
corr_subset.columns = top_corr_english

# Annotated heatmap of the correlations among the selected columns.
heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_subset.corr(),
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    ax=heat_ax,
)
heat_ax.set_title("Top 20 Features Most Correlated with Total Cash Flow", fontsize=16)
heat_fig.tight_layout()
plt.show()

# One line chart over time per selected column.
time_series_cols = ['totale_kasstroom', 'melkprijs_per_kg', 'voerkosten']
for col in time_series_cols:
    english_col = column_name_map.get(col, col)

    ts_fig, ts_ax = plt.subplots(figsize=(14, 5))
    sns.lineplot(data=df, x='datum', y=col, ax=ts_ax)

    ts_ax.set_title(f"Time Series of {english_col}", fontsize=14)
    ts_ax.set_xlabel("Date")
    ts_ax.set_ylabel(english_col)

    # One major tick per calendar year, with slanted labels.
    ts_ax.xaxis.set_major_locator(mdates.YearLocator())
    plt.xticks(rotation=45)

    ts_ax.grid()
    ts_fig.tight_layout()
    plt.show()


In [None]:
import pandas as pd

# Reload the dataset (no date parsing in this cell) and print the
# descriptive statistics of the target column.
df = pd.read_csv("data_with_clusters.csv")
summary = df.loc[:, "totale_kasstroom"].describe()
print(summary)


In [None]:
import pandas as pd

# Reload the dataset for a structural overview.
# `datum` is parsed as a date, like in the other loads of this file: the `df`
# created here is reused by later cells that group and plot by `datum`, and
# those need real timestamps rather than strings.
df = pd.read_csv("data_with_clusters.csv", parse_dates=["datum"])

# Rows x columns.
print(df.shape)
# How many columns there are of each dtype.
print(df.dtypes.value_counts())
# The 15 columns with the most missing values.
print(df.isnull().sum().sort_values(ascending=False).head(15))


In [None]:

# Correlation of every numeric column with the target, highest first.
corrs = df.corr(numeric_only=True)["totale_kasstroom"].sort_values(ascending=False)

# Select the 15 variables most strongly related to the target:
#  - the target is removed by label instead of assuming it is at position 0;
#  - undefined correlations (NaN, e.g. constant columns) are discarded;
#  - ranking uses the absolute value so strong negative relationships are not
#    pushed out by weak positive ones. The signed values are kept for the plot.
candidates = corrs.drop(labels="totale_kasstroom").dropna()
strongest_first = candidates.abs().sort_values(ascending=False).index[:15]
top_15 = candidates.loc[strongest_first]

# Show English labels where a translation exists.
top_15_english = top_15.rename(index=lambda x: column_name_map.get(x, x))

# Horizontal bar chart, strongest relationship at the top.
plt.figure(figsize=(10, 6))
sns.barplot(x=top_15_english.values, y=top_15_english.index)
plt.title("Top 15 Variables Correlated with Total Cash Flow", fontsize=14)
plt.xlabel("Correlation")
plt.ylabel("Variable")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The rolling window below is positional (3 consecutive rows per farm), so the
# rows of every farm must be in chronological order first. `datum` is also
# converted to datetime here because the `df` in scope may have been loaded
# without date parsing; the conversion is a no-op if it is already datetime.
df["datum"] = pd.to_datetime(df["datum"])
df = df.sort_values(["volgnr", "datum"]).reset_index(drop=True)

# Rolling mean of total cash flow per farm over 3 consecutive observations.
# NOTE(review): this equals a 3-month window only if every farm has exactly
# one row per month without gaps - confirm against the data.
df['cash_flow_rolling'] = (
    df.groupby("volgnr")["totale_kasstroom"]
    .transform(lambda farm_series: farm_series.rolling(3).mean())
)

# Average of the raw and the smoothed series across all farms, per date.
df_avg = df.groupby("datum")[["totale_kasstroom", "cash_flow_rolling"]].mean()

# Overall time series with rolling average
avg_fig, avg_ax = plt.subplots(figsize=(12, 5))
avg_ax.plot(df_avg.index, df_avg["totale_kasstroom"], label="Raw", linewidth=1.5)
avg_ax.plot(
    df_avg.index,
    df_avg["cash_flow_rolling"],
    label="Rolling Mean (3 months)",
    color="black",
    linewidth=2,
)
avg_ax.set_title("Total Cash Flow with 3-Month Rolling Average", fontsize=14)
avg_ax.set_xlabel("Date", fontsize=12)
avg_ax.set_ylabel("Total Cash Flow (€)", fontsize=12)
avg_ax.legend(fontsize=11)
avg_ax.tick_params(axis="x", labelrotation=45)
avg_fig.tight_layout()
plt.show()

# Per-cluster time series plots (one facet per cluster, independent y-axes)
g = sns.FacetGrid(df, col="bedrijf_cluster", col_wrap=3, height=4, sharey=False)
g.map_dataframe(sns.lineplot, x="datum", y="totale_kasstroom")
g.set_titles("Cluster {col_name}", size=12)
g.set_axis_labels("Date", "Total Cash Flow (€)")
g.fig.suptitle("Total Cash Flow Over Time by Cluster", fontsize=16, y=1.05)
# Rotate the date labels on every facet; plt.xticks would only affect the
# facet that happens to be the current axes.
for facet_ax in g.axes.flat:
    facet_ax.tick_params(axis="x", labelrotation=45)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

# Read the dataset with parsed dates, ordered chronologically.
df = pd.read_csv("data_with_clusters.csv", parse_dates=["datum"])
df = df.sort_values("datum")

# Build the series to decompose:
#   1. mean total cash flow across all farms per date,
#   2. English series name (used as the title of the first subplot),
#   3. regular month-start ("MS") index - dates without data become NaN,
#   4. gaps filled by interpolation.
# NOTE(review): assumes `datum` values fall on the first day of the month;
# other days would not line up with the "MS" grid - confirm.
df_monthly = (
    df.groupby("datum")["totale_kasstroom"]
    .mean()
    .rename("Total Cash Flow")
    .asfreq("MS")
    .interpolate()
)

# Additive decomposition with a 12-period (yearly) seasonal cycle.
result = seasonal_decompose(df_monthly, model="additive", period=12)

# Draw the standard decomposition figure and enlarge it.
fig = result.plot()
fig.set_size_inches(12, 8)
fig.suptitle("Seasonal Decomposition of Total Cash Flow", fontsize=13)

# Label the panels from top to bottom with a larger font.
subplot_labels = ["Observed", "Trend", "Seasonal", "Residual"]
for panel, panel_label in zip(fig.axes, subplot_labels):
    panel.set_ylabel(panel_label, fontsize=12)

# Keep space at the top for the main title.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
from statsmodels.tsa.stattools import adfuller
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset and aggregate it to the mean total cash flow per date.
df = pd.read_csv("data_with_clusters.csv", parse_dates=["datum"])
df_monthly = df.groupby("datum")["totale_kasstroom"].mean()

# Plot the aggregated time series.
adf_fig, adf_ax = plt.subplots(figsize=(10, 4))
adf_ax.plot(df_monthly, label="Totale kasstroom (gemiddeld per maand)")
adf_ax.set_title("Tijdreeks van Totale Kasstroom (Gemiddeld)")
adf_ax.set_xlabel("Datum")
adf_ax.set_ylabel("Totale Kasstroom (€)")
adf_ax.grid(True)
adf_fig.tight_layout()
adf_ax.legend()
plt.show()

# Augmented Dickey-Fuller test on the series without missing values.
result = adfuller(df_monthly.dropna())
# Positions read from the result tuple: 0 = test statistic, 1 = p-value,
# 4 = mapping of significance level -> critical value.
adf_statistic, p_value, critical_values = result[0], result[1], result[4]

print("🔎 ADF Test Resultaten")
print(f"ADF Statistic: {adf_statistic:.4f}")
print(f"p-waarde: {p_value:.4f}")
for key, value in critical_values.items():
    print(f"Critical Value ({key}): {value:.4f}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acovf

# Load and prepare dataset
df = pd.read_csv("data_with_clusters.csv", parse_dates=["datum"])
df = df.sort_values("datum")

# Average total cash flow per date across all farms; drop dates without a value.
df = df.groupby("datum")[["totale_kasstroom"]].mean()
df = df.dropna()
cash_flow = df["totale_kasstroom"]

# Number of lags scales with the series length (40% of the observations).
max_lags = int(len(df) * 0.4)

# Autocorrelation Function (ACF) plot.
# The axes are created explicitly and passed via `ax=`: without it statsmodels
# opens a figure of its own, which leaves an empty figure behind and ignores
# the figsize chosen here.
acf_fig, acf_ax = plt.subplots(figsize=(10, 4))
plot_acf(cash_flow, lags=max_lags, alpha=0.05, ax=acf_ax)
acf_ax.set_title("Autocorrelation Function (ACF) of Total Cash Flow", fontsize=13)
acf_ax.set_xlabel("Lag")
acf_ax.set_ylabel("Correlation")
acf_fig.tight_layout()
plt.show()

# Partial Autocorrelation Function (PACF) plot (same explicit-axes approach).
pacf_fig, pacf_ax = plt.subplots(figsize=(10, 4))
plot_pacf(cash_flow, lags=max_lags, alpha=0.05, method="ywm", ax=pacf_ax)
pacf_ax.set_title("Partial Autocorrelation Function (PACF) of Total Cash Flow", fontsize=13)
pacf_ax.set_xlabel("Lag")
pacf_ax.set_ylabel("Partial Correlation")
pacf_fig.tight_layout()
plt.show()

# Autocovariance plot.
# Lags 0..max_lags inclusive, so the x-range matches the ACF/PACF plots above.
acov = acovf(cash_flow, demean=True, fft=True)
lags = range(max_lags + 1)

acov_fig, acov_ax = plt.subplots(figsize=(10, 4))
acov_ax.stem(lags, acov[:max_lags + 1], basefmt=" ")
acov_ax.set_title("Autocovariance of Total Cash Flow", fontsize=13)
acov_ax.set_xlabel("Lag")
acov_ax.set_ylabel("Autocovariance")
acov_ax.grid(True)
acov_fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
from statsmodels.tsa.stattools import grangercausalitytests

# Read the dataset and order it by farm, then by date.
df = pd.read_csv("data_with_clusters.csv", parse_dates=["datum"])
df = df.sort_values(["volgnr", "datum"]).reset_index(drop=True)

# Dutch source column -> English name. The keys also define which columns are
# averaged; the first entry is the target, the rest are candidate drivers.
english_names = {
    "totale_kasstroom": "total_cash_flow",
    "melkprijs_per_kg": "milk_price_per_kg",
    "voerkosten": "feed_costs",
    "totaal_opbrengsten": "total_revenue",
    "krachtvoerkosten": "concentrate_feed_costs",
    "neerslag_(mm)": "rainfall_mm",
    "gemiddelde_temperatuur": "average_temperature",
}

# Mean per date across all farms; dates with any missing mean are dropped,
# then the columns get their English names.
monthly_avg = (
    df.groupby("datum")[list(english_names)]
    .mean()
    .dropna()
    .rename(columns=english_names)
)

# Every averaged column except the target is tested, up to 6 lags.
max_lag = 6
variables = [name for name in monthly_avg.columns if name != "total_cash_flow"]

# Run the tests. Column order matters: the target comes first and the
# candidate cause second. Full results are kept per variable.
results = {}

for var in variables:
    print(f"\nGranger Causality Test: Does '{var}' cause 'total_cash_flow'?")
    pair = monthly_avg[["total_cash_flow", var]]
    test_result = grangercausalitytests(pair, maxlag=max_lag, verbose=True)
    results[var] = test_result


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load dataset
df = pd.read_csv("data_winsorized_financials.csv", parse_dates=["datum"])

# Calendar month (1-12) of every row
df["month"] = df["datum"].dt.month

# Count the zero-valued cells per row, ignoring the date, month and farm id.
# NOTE(review): every remaining column is checked, so a zero in an id-like or
# categorical column would be counted too - confirm that is intended.
columns_to_check = df.drop(columns=["datum", "month", "volgnr"], errors="ignore")
df["zero_count"] = (columns_to_check == 0).sum(axis=1)

# Total number of zeros per calendar month.
# Reindexing to 1..12 guarantees exactly twelve bars in calendar order, so the
# fixed month labels below stay correct even if a month is absent from the data.
month_labels = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
zeros_per_month = (
    df.groupby("month")["zero_count"]
    .sum()
    .reindex(range(1, 13), fill_value=0)
)

# Visualization
zero_fig, zero_ax = plt.subplots(figsize=(10, 5))
zeros_per_month.plot(kind="bar", color="steelblue", ax=zero_ax)

# Title and axis labels
zero_ax.set_title("Total Number of Zero Values per Calendar Month", fontsize=14)
zero_ax.set_xlabel("Month", fontsize=12)
zero_ax.set_ylabel("Number of Zeros", fontsize=12)

# Month abbreviations instead of the numeric index (bars sit at 0..11)
zero_ax.set_xticks(range(len(month_labels)))
zero_ax.set_xticklabels(month_labels, rotation=45, fontsize=10)

# Style
zero_ax.grid(axis='y', linestyle='--', alpha=0.7)
zero_fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np

# Read the winsorized dataset.
df = pd.read_csv("data_winsorized_financials.csv")

# Keep every numeric column (nothing is excluded here, the target included).
numeric_df = df.select_dtypes(include=[np.number])

# Absolute pairwise correlations.
corr_matrix = numeric_df.corr().abs()

# Keep only the entries strictly above the diagonal so that each pair of
# columns appears once and self-correlations are ignored.
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(above_diagonal)

# A column is flagged when it correlates above 0.9 with any column that
# precedes it in the matrix.
flagged = (upper > 0.9).any(axis=0)
to_drop = upper.columns[flagged].tolist()

# Report results
print(f"{len(to_drop)} highly correlated features found (correlation > 0.9):")
print(to_drop)


In [None]:
import pandas as pd

def _impute_out_of_range(values, periods, lower, upper):
    """Replace out-of-range values with the mean of the in-range values of the same period.

    Parameters
    ----------
    values : pandas.Series
        Numeric observations to clean.
    periods : pandas.Series
        Grouping key (here: year-month) aligned with ``values``.
    lower, upper : float
        Inclusive bounds; values strictly below ``lower`` or strictly above
        ``upper`` are treated as outliers. Missing values are not outliers.

    Returns
    -------
    tuple
        ``(imputed, outliers, period_means)``: the cleaned series, the boolean
        outlier mask, and the per-period mean of the in-range values.
    """
    outliers = (values < lower) | (values > upper)
    valid = values[~outliers]
    period_means = valid.groupby(periods[~outliers]).mean()
    # A period may contain no usable in-range value (or the date may be
    # missing); fall back to the overall in-range mean so that imputation
    # never swaps an outlier for a NaN.
    replacement = periods.map(period_means).fillna(valid.mean())
    return values.mask(outliers, replacement), outliers, period_means


# Load the dataset
df = pd.read_csv("data_met_lagged_features.csv")

# Parse the date column (unparseable dates become NaT) and derive year-month.
# The period is kept as a separate series, so no temporary column is added to df.
df['datum'] = pd.to_datetime(df['datum'], errors='coerce')
year_month = df['datum'].dt.to_period('M')

# Define realistic price boundaries for milk and feed prices
min_price = 0
max_price = 5

# Identify and impute values outside the realistic range
milk_clean, milk_price_outliers, monthly_milk_mean = _impute_out_of_range(
    df['melkprijs_per_kg'], year_month, min_price, max_price
)
feed_clean, feed_price_outliers, monthly_feed_mean = _impute_out_of_range(
    df['voerprijs'], year_month, min_price, max_price
)

# Count and print the number of outliers
milk_outlier_count = milk_price_outliers.sum()
feed_outlier_count = feed_price_outliers.sum()

print(f"Milk price outliers: {milk_outlier_count} ({milk_outlier_count / len(df):.2%} of total)")
print(f"Feed price outliers: {feed_outlier_count} ({feed_outlier_count / len(df):.2%} of total)")

# Write the cleaned columns back
df['melkprijs_per_kg'] = milk_clean
df['voerprijs'] = feed_clean

# Optional: Save the corrected dataset
# df.to_csv("data_corrected.csv", index=False)


In [None]:
import pandas as pd

# Read the final dataset with parsed dates.
df = pd.read_csv("data_final.csv", parse_dates=["datum"])

# Stop early with a clear error if a column needed below is absent.
required_cols = {"bedrijf_cluster", "totale_kasstroom"}
missing_cols = required_cols - set(df.columns)
if missing_cols:
    raise ValueError(f"Vereiste kolommen ontbreken: {missing_cols}")

# Descriptive statistics of total cash flow, one row per cluster.
cash_flow_by_cluster = df.groupby("bedrijf_cluster")["totale_kasstroom"]
kasstroom_stats = cash_flow_by_cluster.describe()

print("Descriptieve statistieken van totale kasstroom per cluster:\n")
print(kasstroom_stats)

# Persist the table next to the notebook.
kasstroom_stats.to_csv("kasstroom_stats_per_cluster.csv")
