# Setting up the notebook and relative paths

In [15]:
# Locate the repository root, make it importable, import the project plotting
# helpers, and define the input/output paths shared by the cells below.
from pathlib import Path
import pandas as pd
import sys

# Find repo root: walk upwards from the working directory until a folder
# containing pyproject.toml or .git is found.
start_dir = Path.cwd()
for candidate in [start_dir, *start_dir.parents]:
    if (candidate / 'pyproject.toml').exists() or (candidate / '.git').exists():
        repo_root = candidate
        break
else:
    # No marker found: keep the original fallback (the working directory), but
    # say so, because every path below is derived from repo_root.
    repo_root = start_dir
    print(f'⚠ No pyproject.toml or .git found at or above {start_dir}; using it as repo root.')

# Add repo root to sys.path (BEFORE the import attempt) so that the local
# `functions` package can be imported.
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))
    print(f'Added {repo_root} to sys.path')

# Import the plotting helpers used by the plotting cells below.
# A failure is reported but deliberately not re-raised; the plotting cells
# cannot work until this import succeeds.
try:
    from functions.plot_functions import plot_head_distribution
    from functions.plot_functions import plot_flagged_head_timeseries
except ImportError as e:
    print('✗ Failed to import plotting functions from functions.plot_functions:', e)
    print('  Verify that functions/plot_functions.py exists and defines')
    print('  plot_head_distribution and plot_flagged_head_timeseries.')
finally:
    print('Import attempt finished.')

functions_dir = repo_root / 'functions'

# Input folders with the hourly per-series CSV files
wiertsema_dir = repo_root / 'output_data' / 'hourly_csv_wiertsema_knmi_unfiltered'
fugro_dir = repo_root / 'output_data' / 'hourly_csv_fugro_knmi_unfiltered'

# Directory containing meteorological/stressor CSVs
stressor_dir = repo_root / 'input_stressors'
# Explicit stressor file paths used elsewhere in notebooks
precip_path = stressor_dir / 'Neerslag_2021_2025.csv'
evap_path = stressor_dir / 'Verdamping_2021_2025.csv'

# Output folder for all generated figures (created if it does not exist yet)
out_fig = repo_root / 'output_data' / 'figures'
out_fig.mkdir(parents=True, exist_ok=True)

print('wiertsema_dir ->', wiertsema_dir)
print('fugro_dir    ->', fugro_dir)
print('precip_path ->', precip_path)
print('evap_path  ->', evap_path)

Import attempt finished.
wiertsema_dir -> d:\Users\jvanruitenbeek\data_validation\output_data\hourly_csv_wiertsema_knmi_unfiltered
fugro_dir    -> d:\Users\jvanruitenbeek\data_validation\output_data\hourly_csv_fugro_knmi_unfiltered
precip_path -> d:\Users\jvanruitenbeek\data_validation\input_stressors\Neerslag_2021_2025.csv
evap_path  -> d:\Users\jvanruitenbeek\data_validation\input_stressors\Verdamping_2021_2025.csv


# Creating the boxplots for a folder

In [None]:
# Loop over all Wiertsema CSV files and generate head distribution plots.
# Each plot is written as a standalone HTML file named after its source CSV.

# Create output directory for plots
boxplot_output_dir = out_fig / 'wiertsema_data_distribution'
boxplot_output_dir.mkdir(parents=True, exist_ok=True)

# FIXME: Set folder here (keep the output folder name above in sync with it)
# Get all CSV files from the Wiertsema folder
input_boxplot_csv_folder = sorted(wiertsema_dir.glob('*.csv'))
n_boxplot_files = len(input_boxplot_csv_folder)
print(f'Found {n_boxplot_files} CSV files\n')

# Count successful saves so the final summary does not claim success for
# files that were skipped or raised an error.
n_boxplots_saved = 0

# Loop over each file
for i, csv_file in enumerate(input_boxplot_csv_folder, start=1):
    try:
        print(f'[{i}/{n_boxplot_files}] Processing: {csv_file.name}')

        # Read the CSV; the first column is used as the (date-parsed) index
        df = pd.read_csv(
            csv_file,
            index_col=0,
            parse_dates=True,
            encoding="utf-8-sig",
            encoding_errors="replace"
        )

        # Coerce all columns to numeric (unparseable values become NaN)
        for col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) == 0:
            print('  ✗ No numeric columns found\n')
            continue

        # The function expects a DataFrame with a "head" column. Use an
        # existing "head" column when there is one (same rule as the flagged
        # time series cell); otherwise fall back to the first numeric column.
        source_col = 'head' if 'head' in df.columns else numeric_cols[0]
        head_df = df[[source_col]].rename(columns={source_col: 'head'})

        # Generate plot using the imported function
        fig = plot_head_distribution(head_df, title=f'Head Distribution - {csv_file.stem}')

        # Save as HTML
        output_file = boxplot_output_dir / f'{csv_file.stem}.html'
        fig.write_html(str(output_file))
        n_boxplots_saved += 1
        print(f'  ✓ Saved: {output_file.name}\n')

    except Exception as e:
        # Best effort: report the failure and continue with the next file
        print(f'  ✗ Error processing {csv_file.name}: {e}\n')

if n_boxplots_saved == n_boxplot_files:
    print(f'✓ All plots saved to {boxplot_output_dir}')
else:
    print(f'⚠ Saved {n_boxplots_saved}/{n_boxplot_files} plots to {boxplot_output_dir}; see the messages above')

# Creating the marked head plots for a folder 

In [20]:
# Loop over all Fugro CSV files and generate flagged head time series plots.
# Each plot is written as a standalone HTML file named after its source CSV.

# Create output directory for plots
timeseries_output_dir = out_fig / 'fugro_outliers_marked'
timeseries_output_dir.mkdir(parents=True, exist_ok=True)

# Get all CSV files from the Fugro folder
input_timeseries_csv_folder = sorted(fugro_dir.glob('*.csv'))
n_timeseries_files = len(input_timeseries_csv_folder)
print(f'Found {n_timeseries_files} CSV files\n')

# Count successful saves so the final summary does not claim success for
# files that were skipped or raised an error.
n_timeseries_saved = 0

# Loop over each file
for i, csv_file in enumerate(input_timeseries_csv_folder, start=1):
    try:
        print(f'[{i}/{n_timeseries_files}] Processing: {csv_file.name}')

        # Read the CSV; the first column is used as the (date-parsed) index
        df = pd.read_csv(
            csv_file,
            index_col=0,
            parse_dates=True,
            encoding="utf-8-sig",
            encoding_errors="replace"
        )

        # Coerce all columns to numeric (unparseable values become NaN)
        for col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) == 0:
            print('  ✗ No numeric columns found\n')
            continue

        # Rename the first numeric column to "head" for compatibility, but
        # only when the file does not already provide a "head" column.
        if 'head' not in df.columns:
            df = df.rename(columns={numeric_cols[0]: 'head'})

        # Generate time series plot using the imported function with full dataframe
        fig = plot_flagged_head_timeseries(df, title=f'Head Time Series - {csv_file.stem}')

        # Save as HTML
        output_file = timeseries_output_dir / f'{csv_file.stem}.html'
        fig.write_html(str(output_file))
        n_timeseries_saved += 1
        print(f'  ✓ Saved: {output_file.name}\n')

    except Exception as e:
        # Best effort: report the failure and continue with the next file
        print(f'  ✗ Error processing {csv_file.name}: {e}\n')

if n_timeseries_saved == n_timeseries_files:
    print(f'✓ All time series plots saved to {timeseries_output_dir}')
else:
    print(f'⚠ Saved {n_timeseries_saved}/{n_timeseries_files} time series plots to {timeseries_output_dir}; see the messages above')

Found 164 CSV files

[1/164] Processing: NL-2412417-HWM_B09-PB1_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B09-PB1_m_NAP_avg_processed.html

[2/164] Processing: NL-2412417-HWM_B09-PB2_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B09-PB2_m_NAP_avg_processed.html

[3/164] Processing: NL-2412417-HWM_B12-PB1_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B12-PB1_m_NAP_avg_processed.html

[4/164] Processing: NL-2412417-HWM_B13-PB1_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B13-PB1_m_NAP_avg_processed.html

[5/164] Processing: NL-2412417-HWM_B13-PB2_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B13-PB2_m_NAP_avg_processed.html

[6/164] Processing: NL-2412417-HWM_B14-PB1_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B12-PB1_m_NAP_avg_processed.html

[4/164] Processing: NL-2412417-HWM_B13-PB1_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B13-PB1_m_NAP_avg_processed.html

[5/164] Processing: NL-2412417-HWM_B13-PB2_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-H

# Creating Validated CSV files

In [19]:
# Loop over all Fugro CSV files and add outlier/jump/drop detection columns.
# For every input CSV a copy is written with two extra columns:
#   outliers_boxplot - the head value where it falls outside the IQR fences
#   jumps_drops      - the head value where head_t1 exceeds the jump/drop limits
# Both columns are empty (NaN) on rows that are not flagged.

# Detection settings (same values as the previously hard-coded numbers)
IQR_FACTOR = 1.5        # fence distance from Q1/Q3 in multiples of the IQR
JUMP_THRESHOLD = 0.3    # head_t1 above this value is flagged as a jump
DROP_THRESHOLD = -0.05  # head_t1 below this value is flagged as a drop
# NOTE(review): head_t1 is presumably the step-to-step change in head, in the
# same unit as head - confirm against the code that produces these CSVs.

# Columns copied from the input, in output order; the two flag columns are
# appended after them.
REQUIRED_INPUT_COLUMNS = ['head', 'Precipitation', 'Evapotranspiration', 'recharge', 'head_t1']

# Create output directory for filtered CSVs
filtered_output_dir = repo_root / 'output_data' / 'hourly_csv_fugro_filtered'
filtered_output_dir.mkdir(parents=True, exist_ok=True)

# Get all CSV files from the Fugro folder
input_csv_files = sorted(fugro_dir.glob('*.csv'))
n_input_csv_files = len(input_csv_files)
print(f'Found {n_input_csv_files} CSV files\n')

# Count successful saves so the final summary does not claim success for
# files that were skipped or raised an error.
n_filtered_saved = 0

# Loop over each file
for i, csv_file in enumerate(input_csv_files, start=1):
    try:
        print(f'[{i}/{n_input_csv_files}] Processing: {csv_file.name}')

        # Read the CSV; the first column is used as the (date-parsed) index
        df = pd.read_csv(
            csv_file,
            index_col=0,
            parse_dates=True,
            encoding="utf-8-sig",
            encoding_errors="replace"
        )

        # Coerce all columns to numeric (unparseable values become NaN)
        for col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

        # Report missing inputs explicitly instead of via an opaque KeyError
        missing_cols = [c for c in REQUIRED_INPUT_COLUMNS if c not in df.columns]
        if missing_cols:
            print(f'  ✗ Missing required columns: {missing_cols}\n')
            continue

        head = df['head']

        # Detect outliers using boxplot IQR method on head column.
        # quantile() skips NaN; if head holds no values the bounds are NaN,
        # every comparison is False and nothing is flagged.
        q1 = head.quantile(0.25)
        q3 = head.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - IQR_FACTOR * iqr
        upper_bound = q3 + IQR_FACTOR * iqr
        outlier_mask = (head < lower_bound) | (head > upper_bound)

        # Detect jumps and drops on head_t1 column
        jump_drop_mask = (df['head_t1'] > JUMP_THRESHOLD) | (df['head_t1'] < DROP_THRESHOLD)

        # Select and reorder columns; where() keeps the head value on flagged
        # rows and leaves NaN elsewhere, so the flag columns stay numeric.
        output_df = (
            df[REQUIRED_INPUT_COLUMNS]
            .assign(
                outliers_boxplot=head.where(outlier_mask),
                jumps_drops=head.where(jump_drop_mask),
            )
            .rename_axis('timestamp')
        )

        # Save to CSV
        output_file = filtered_output_dir / f'{csv_file.stem}.csv'
        output_df.to_csv(output_file, encoding="utf-8-sig")
        n_filtered_saved += 1

        print(f'  ✓ Saved: {output_file.name}\n')

    except Exception as e:
        # Best effort: report the failure and continue with the next file
        print(f'  ✗ Error processing {csv_file.name}: {e}\n')

if n_filtered_saved == n_input_csv_files:
    print(f'✓ All filtered CSVs saved to {filtered_output_dir}')
else:
    print(f'⚠ Saved {n_filtered_saved}/{n_input_csv_files} filtered CSVs to {filtered_output_dir}; see the messages above')

Found 164 CSV files

[1/164] Processing: NL-2412417-HWM_B09-PB1_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B09-PB1_m_NAP_avg_processed.csv

[2/164] Processing: NL-2412417-HWM_B09-PB2_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B09-PB2_m_NAP_avg_processed.csv

[3/164] Processing: NL-2412417-HWM_B12-PB1_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B12-PB1_m_NAP_avg_processed.csv

[4/164] Processing: NL-2412417-HWM_B13-PB1_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B13-PB1_m_NAP_avg_processed.csv

[5/164] Processing: NL-2412417-HWM_B13-PB2_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B13-PB2_m_NAP_avg_processed.csv

[6/164] Processing: NL-2412417-HWM_B14-PB1_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B12-PB1_m_NAP_avg_processed.csv

[4/164] Processing: NL-2412417-HWM_B13-PB1_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B13-PB1_m_NAP_avg_processed.csv

[5/164] Processing: NL-2412417-HWM_B13-PB2_m_NAP_avg_processed.csv
  ✓ Saved: NL-2412417-HWM_B13-