# Compare Local and KNMI Precipitation/Evaporation
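This notebook fetches KNMI precipitation and evaporation data for station 249 (Berkhout) from 2022-01-01 to today, scans all CSV files in `output_files/output_sheets`, and generates Plotly figures comparing the local series with the KNMI precipitation/evaporation series. Each figure is saved as an HTML file in `output_files/output_graphs`; a variant that shows the daily median water level is saved in `output_files/output_graphs_median`, and a data-validation summary is written to `output_files/data_validation.xlsx`.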

In [1]:
from datetime import date
from scripts.knmi_pull import fetch_knmi_prec_evap
import os
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go



# KNMI station number and the date window (YYYYMMDD strings) handed to the fetch helper.
station = 249  # Berkhout
start_date = "20220101"
today = date.today()
end_date = today.strftime("%Y%m%d")

# One call returns both series; they are reused by the plotting cells further down.
knmi_series = fetch_knmi_prec_evap(station, start_date, end_date)
prec, evap = knmi_series

# Short summary of what was retrieved.
print(f"KNMI data opgehaald voor station {station} van {start_date} tot {end_date}.")
print(f"Aantal dagen neerslag: {len(prec)}")
print(f"Aantal dagen verdamping: {len(evap)}")

# Show the first few rows of each series
for series in (prec, evap):
    print(series.head())

KNMI data opgehaald voor station 249 van 20220101 tot 20250707.
Aantal dagen neerslag: 1282
Aantal dagen verdamping: 1282
DATE
2022-01-01     0.3
2022-01-02    12.2
2022-01-03     0.0
2022-01-04     3.9
2022-01-05     4.0
Name: Precipitation, dtype: float64
DATE
2022-01-01    0.256794
2022-01-02    0.328475
2022-01-03    0.320264
2022-01-04    0.178555
2022-01-05    0.219556
Name: ET, dtype: float64


In [2]:
# The project root is taken to be the parent of the current working directory
# (the notebook is expected to run from a sub-folder such as notebooks/).
project_root = Path.cwd().resolve().parent

# Full path of the single local sheet that is inspected in the next cell
local_csv = project_root.joinpath('output_files', 'output_sheets', '87097-1 HB178PB01.csv')

print(local_csv)


D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_sheets\87097-1 HB178PB01.csv


In [3]:

# Load the local sheet selected in the previous cell; 'Timestamp' is parsed to datetimes.
df_local = pd.read_csv(local_csv, parse_dates=['Timestamp'])

# Prepare KNMI DataFrame for the full KNMI range
knmi_full = pd.DataFrame({'KNMI_Precipitation': prec, 'KNMI_Evapotranspiration': evap})
knmi_full.index.name = 'Timestamp'
knmi_full = knmi_full.reset_index()
knmi_full['Timestamp'] = pd.to_datetime(knmi_full['Timestamp'])
# Recharge is computed as precipitation minus evapotranspiration.
knmi_full['KNMI_Recharge'] = knmi_full['KNMI_Precipitation'] - knmi_full['KNMI_Evapotranspiration']

# Merge local with KNMI (for local overlays).
# suffixes=(None, '_knmi') keeps every local column name unchanged even when the
# sheet already contains a column called like one of the KNMI columns; with the
# default suffixes such a column would be renamed to '<name>_x' and the lookups
# merged[prec_col] / merged[evap_col] below would raise KeyError.
merged = pd.merge(
    df_local,
    knmi_full,
    on='Timestamp',
    how='left',
    suffixes=(None, '_knmi'),
)

# Guess local columns: first column whose lower-cased name contains a keyword
# (None when the sheet has no such column).
y2_col = next((c for c in df_local.columns if 'waterniveau' in c.lower()), None)
prec_col = next((c for c in df_local.columns if 'neerslag' in c.lower() or 'precip' in c.lower()), None)
evap_col = next((c for c in df_local.columns if 'verdamp' in c.lower() or 'evap' in c.lower()), None)

# The initial x-axis view is cropped to the period covered by the local sheet.
crop_start = df_local['Timestamp'].min()
crop_end = df_local['Timestamp'].max()

# Plot
fig = go.Figure()

# KNMI series (always full range, but cropped in view)
for column, label in (
    ('KNMI_Precipitation', 'KNMI Precipitation'),
    ('KNMI_Evapotranspiration', 'KNMI Evapotranspiration'),
    ('KNMI_Recharge', 'KNMI Recharge'),
):
    fig.add_trace(go.Scatter(
        x=knmi_full['Timestamp'], y=knmi_full[column],
        name=label, yaxis='y1', line=dict(width=1.5)
    ))

# Local series (only where present); the water level goes on the secondary axis.
for column, label, axis in (
    (prec_col, 'Local Precipitation', 'y1'),
    (evap_col, 'Local Evaporation', 'y1'),
    (y2_col, 'Waterniveau', 'y2'),
):
    if column:
        fig.add_trace(go.Scatter(
            x=merged['Timestamp'], y=merged[column],
            name=label, yaxis=axis, line=dict(width=2)
        ))

fig.update_layout(
    title=f'KNMI vs Local Series: {local_csv.stem}',
    xaxis_title='Datum',
    yaxis=dict(title='mm/dag (KNMI & Local)', side='left'),
    yaxis2=dict(title='Waterniveau', overlaying='y', side='right', showgrid=False),
    legend_title='Variabele',
    hovermode='x unified',
    template='plotly_white',
    width=1200,
    height=600,
    xaxis=dict(range=[crop_start, crop_end])
)

# Define the output directory relative to project root
output_dir = project_root / 'output_files' / 'output_graphs'
output_dir.mkdir(parents=True, exist_ok=True)

# Use the CSV stem to name the output file
out_html = output_dir / f'{local_csv.stem}_plot.html'

# Save the plot
fig.write_html(out_html)
print(f'✅ Plot saved to: {out_html}')

# Optional: show the plot in notebook
fig.show()

✅ Plot saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs\87097-1 HB178PB01_plot.html


In [4]:
import pandas as pd
import plotly.graph_objects as go
from pathlib import Path

# Define input and output directories relative to the project
input_dir  = project_root / 'output_files' / 'output_sheets'
output_dir = project_root / 'output_files' / 'output_graphs'

# Make sure the output folder exists
output_dir.mkdir(parents=True, exist_ok=True)

# Build the KNMI frame once; it is identical for every local sheet.
# (Assumes prec and evap are already defined in this session;
#  otherwise call fetch_knmi_prec_evap as before and build knmi_full.)
knmi_full = pd.DataFrame({
    'KNMI_Precipitation': prec,
    'KNMI_Evapotranspiration': evap
})
knmi_full.index.name = 'Timestamp'
knmi_full = knmi_full.reset_index()
knmi_full['Timestamp'] = pd.to_datetime(knmi_full['Timestamp'])
# Recharge is computed as precipitation minus evapotranspiration.
knmi_full['KNMI_Recharge'] = (
    knmi_full['KNMI_Precipitation'] - knmi_full['KNMI_Evapotranspiration']
)

# Loop over every CSV in the input folder.
# sorted() makes the processing (and log) order deterministic; the order in
# which glob() yields files depends on the operating system.
for local_csv in sorted(input_dir.glob('*.csv')):
    df_local = pd.read_csv(local_csv, parse_dates=['Timestamp'])

    # Merge with full-range KNMI.
    # suffixes=(None, '_knmi') keeps local column names unchanged when a sheet
    # already has a column named like a KNMI column; the default suffixes would
    # rename it to '<name>_x' and break the merged[...] lookups below.
    merged = pd.merge(
        df_local,
        knmi_full,
        on='Timestamp',
        how='left',
        suffixes=(None, '_knmi')
    )

    # Identify columns: first column whose lower-cased name contains a keyword,
    # or None when the sheet has no such column.
    y2_col   = next((c for c in df_local.columns if 'waterniveau' in c.lower()), None)
    prec_col = next((c for c in df_local.columns if 'neerslag' in c.lower() or 'precip' in c.lower()), None)
    evap_col = next((c for c in df_local.columns if 'verdamp' in c.lower() or 'evap' in c.lower()), None)

    # The initial x-axis view is cropped to the period covered by the local sheet
    crop_start = df_local['Timestamp'].min()
    crop_end   = df_local['Timestamp'].max()

    # Build figure
    fig = go.Figure()

    # KNMI traces (full date index, but view-cropped)
    for column, label in (
        ('KNMI_Precipitation', 'KNMI Precipitation'),
        ('KNMI_Evapotranspiration', 'KNMI Evapotranspiration'),
        ('KNMI_Recharge', 'KNMI Recharge'),
    ):
        fig.add_trace(go.Scatter(
            x=knmi_full['Timestamp'], y=knmi_full[column],
            name=label, yaxis='y1', line=dict(width=1.5)
        ))

    # Local traces (only where present); the water level uses the secondary axis
    for column, label, axis in (
        (prec_col, 'Local Precipitation', 'y1'),
        (evap_col, 'Local Evaporation', 'y1'),
        (y2_col, 'Waterniveau', 'y2'),
    ):
        if column:
            fig.add_trace(go.Scatter(
                x=merged['Timestamp'], y=merged[column],
                name=label, yaxis=axis, line=dict(width=2)
            ))

    # Layout
    fig.update_layout(
        title=f'KNMI vs Local Series: {local_csv.stem}',
        xaxis_title='Datum',
        yaxis=dict(title='mm/dag (KNMI & Local)', side='left'),
        yaxis2=dict(
            title='Waterniveau', overlaying='y',
            side='right', showgrid=False
        ),
        legend_title='Variabele',
        hovermode='x unified',
        template='plotly_white',
        width=1200,
        height=600,
        xaxis=dict(range=[crop_start, crop_end])
    )

    # Save the plot, using the CSV stem to name the output file
    out_html = output_dir / f'{local_csv.stem}.html'
    fig.write_html(out_html)
    print(f'✅ Plot saved to: {out_html}')

✅ Plot saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs\83034-1 HB015PB01.html
✅ Plot saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs\86349-1 HB002PB01.html
✅ Plot saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs\86349-1 HB007PB01.html
✅ Plot saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs\86349-1 HB009PB01.html
✅ Plot saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs\86349-1 HB010PB01.html
✅ Plot saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs\86349-1 HB011PB01.html
✅ Plot saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs\86349-1 HB013PB01.html
✅ Plot saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs\86349-1 HB014PB01.html
✅ Plot saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs\86349-1 HB015PB01.html
✅ Plot saved to: D:\Users\jvanruitenbeek\pastas-wv2030\

## Daily Median Values

In [7]:
import pandas as pd
import plotly.graph_objects as go
from pathlib import Path

# Define input and output directories relative to the project
input_dir  = project_root / 'output_files' / 'output_sheets'
output_dir = project_root / 'output_files' / 'output_graphs_median'

# Make sure the output folder exists
output_dir.mkdir(parents=True, exist_ok=True)

# Build the KNMI frame once; it is identical for every local sheet.
# (Relies on prec and evap from the KNMI fetch cell.)
knmi_full = pd.DataFrame({
    'KNMI_Precipitation': prec,
    'KNMI_Evapotranspiration': evap
})
knmi_full.index.name = 'Timestamp'
knmi_full = knmi_full.reset_index()
knmi_full['Timestamp'] = pd.to_datetime(knmi_full['Timestamp'])
# Recharge is computed as precipitation minus evapotranspiration.
knmi_full['KNMI_Recharge'] = (
    knmi_full['KNMI_Precipitation'] - knmi_full['KNMI_Evapotranspiration']
)

# Loop over every CSV in the input folder.
# sorted() makes the processing (and log) order deterministic; the order in
# which glob() yields files depends on the operating system.
for local_csv in sorted(input_dir.glob('*.csv')):
    df_local = pd.read_csv(local_csv, parse_dates=['Timestamp'])

    # Identify relevant columns: first column whose lower-cased name contains a
    # keyword, or None when the sheet has no such column.
    y2_col   = next((c for c in df_local.columns if 'waterniveau' in c.lower()), None)
    prec_col = next((c for c in df_local.columns if 'neerslag' in c.lower() or 'precip' in c.lower()), None)
    evap_col = next((c for c in df_local.columns if 'verdamp' in c.lower() or 'evap' in c.lower()), None)

    # Resample water level to daily median.
    # Non-numeric entries are coerced to NaN first so they do not break median().
    if y2_col:
        df_local[y2_col] = pd.to_numeric(df_local[y2_col], errors='coerce')
        df_y2_median = (
            df_local.set_index('Timestamp')[[y2_col]]
            .resample('D')
            .median()
            .reset_index()
        )
    else:
        df_y2_median = None

    # Merge full local with KNMI.
    # suffixes=(None, '_knmi') keeps local column names unchanged when a sheet
    # already has a column named like a KNMI column; the default suffixes would
    # rename it to '<name>_x' and break the merged[...] lookups below.
    merged = pd.merge(
        df_local,
        knmi_full,
        on='Timestamp',
        how='left',
        suffixes=(None, '_knmi')
    )

    # The initial x-axis view is cropped to the period covered by the local sheet
    crop_start = df_local['Timestamp'].min()
    crop_end   = df_local['Timestamp'].max()

    # Build figure
    fig = go.Figure()

    # KNMI traces (full date index, but view-cropped)
    for column, label in (
        ('KNMI_Precipitation', 'KNMI Precipitation'),
        ('KNMI_Evapotranspiration', 'KNMI Evapotranspiration'),
        ('KNMI_Recharge', 'KNMI Recharge'),
    ):
        fig.add_trace(go.Scatter(
            x=knmi_full['Timestamp'], y=knmi_full[column],
            name=label, yaxis='y1', line=dict(width=1.5)
        ))

    # Local precipitation / evaporation traces at their original resolution
    for column, label in (
        (prec_col, 'Local Precipitation'),
        (evap_col, 'Local Evaporation'),
    ):
        if column:
            fig.add_trace(go.Scatter(
                x=merged['Timestamp'], y=merged[column],
                name=label, yaxis='y1', line=dict(width=2)
            ))

    # Water level is drawn from the daily-median frame on the secondary axis
    if df_y2_median is not None:
        fig.add_trace(go.Scatter(
            x=df_y2_median['Timestamp'], y=df_y2_median[y2_col],
            name='Waterniveau (Daily Median)', yaxis='y2', line=dict(width=2)
        ))

    # Layout
    fig.update_layout(
        title=f'KNMI vs Local Series (Daily Median Waterniveau): {local_csv.stem}',
        xaxis_title='Datum',
        yaxis=dict(title='mm/dag (KNMI & Local)', side='left'),
        yaxis2=dict(
            title='Waterniveau', overlaying='y',
            side='right', showgrid=False
        ),
        legend_title='Variabele',
        hovermode='x unified',
        template='plotly_white',
        width=1200,
        height=600,
        xaxis=dict(range=[crop_start, crop_end])
    )

    # Save the plot
    out_html = output_dir / f'{local_csv.stem}.html'
    fig.write_html(out_html)
    print(f'✅ Plot with daily median saved to: {out_html}')

✅ Plot with daily median saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs_median\83034-1 HB015PB01.html
✅ Plot with daily median saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs_median\86349-1 HB002PB01.html
✅ Plot with daily median saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs_median\86349-1 HB007PB01.html
✅ Plot with daily median saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs_median\86349-1 HB009PB01.html
✅ Plot with daily median saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs_median\86349-1 HB010PB01.html
✅ Plot with daily median saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs_median\86349-1 HB011PB01.html
✅ Plot with daily median saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\output_graphs_median\86349-1 HB013PB01.html
✅ Plot with daily median saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\outp

## Data Validation Output (Excel)

In [8]:
# Define input and output directories
input_dir  = project_root / 'output_files' / 'output_sheets'
output_dir = project_root / 'output_files'
output_dir.mkdir(parents=True, exist_ok=True)

# Absolute day-to-day change of the daily median above which a series is
# flagged; the value matches the 'Jumps >0.5m' column header written below.
JUMP_THRESHOLD = 0.5

# List to collect one validation record (dict) per sheet
validation_data = []

# Loop over each CSV (sorted so the rows of the summary have a stable order)
for local_csv in sorted(input_dir.glob('*.csv')):
    df_local = pd.read_csv(local_csv, parse_dates=['Timestamp'])

    # Identify the water level column
    y2_col = next((c for c in df_local.columns if 'waterniveau' in c.lower()), None)

    if y2_col is None:
        # Files without water level data cannot be validated; report and skip.
        print(f'⚠️ Skipped (no water level column): {local_csv.name}')
        continue

    if df_local.empty:
        # Without rows the first/last entry would be NaT and pd.date_range
        # below would raise, aborting the validation of all remaining files.
        print(f'⚠️ Skipped (no rows): {local_csv.name}')
        continue

    # Convert to numeric (non-numeric entries become NaN) and resample to daily median
    df_local[y2_col] = pd.to_numeric(df_local[y2_col], errors='coerce')
    df_y2_median = (
        df_local.set_index('Timestamp')[[y2_col]]
        .resample('D')
        .median()
        .rename(columns={y2_col: 'Waterniveau'})
    )

    # Basic date range info
    first_entry = df_y2_median.index.min()
    last_entry = df_y2_median.index.max()
    full_range = pd.date_range(start=first_entry, end=last_entry, freq='D')

    # Check for large jumps between consecutive days.
    # diff() is NaN next to a day without data, and NaN > threshold is False,
    # so changes across gaps are not flagged.
    jumps = df_y2_median['Waterniveau'].diff().abs() > JUMP_THRESHOLD
    has_jumps = jumps.any()

    # Calculate number of missing days (no median value after resample)
    missing_days = df_y2_median['Waterniveau'].isna().sum()

    # Compile validation record
    validation_data.append({
        'ID': local_csv.stem,
        'Name': local_csv.name,
        'Jumps >0.5m': has_jumps,
        'First Entry': first_entry.date(),
        'Last Entry': last_entry.date(),
        'Length (days)': len(full_range),
        'Days without data': missing_days
    })

# Convert to DataFrame and export to Excel
validation_df = pd.DataFrame(validation_data)
output_file = output_dir / 'data_validation.xlsx'
validation_df.to_excel(output_file, index=False)
print(f'✅ Data validation summary saved to: {output_file}')


✅ Data validation summary saved to: D:\Users\jvanruitenbeek\pastas-wv2030\output_files\data_validation.xlsx
