In [None]:
# Discover the repo root and define the input/output directories used by the cells below
from pathlib import Path
import pandas as pd
import sys

# Locate the repository root: the nearest directory, starting at the current
# working directory and walking upwards, that holds a 'pyproject.toml' or a
# '.git' entry. If no such directory exists, the working directory is used.
start_dir = Path.cwd()
repo_markers = ('pyproject.toml', '.git')
repo_root = next(
    (
        folder
        for folder in (start_dir, *start_dir.parents)
        if any((folder / marker).exists() for marker in repo_markers)
    ),
    start_dir,
)

# Put the repo root at the front of sys.path (only once) so modules located
# there can be imported from this notebook.
repo_root_str = str(repo_root)
if repo_root_str not in sys.path:
    sys.path.insert(0, repo_root_str)
    print(f'Added {repo_root} to sys.path')

# Input folders with the filtered hourly CSVs, plus the figure output folder
# (created on demand, including missing parents).
output_data_dir = repo_root / 'output_data'
wiertsema_dir = output_data_dir / 'hourly_csv_wiertsema_filtered'
fugro_dir = output_data_dir / 'hourly_csv_fugro_filtered'
out_fig = output_data_dir / 'figures'
out_fig.mkdir(parents=True, exist_ok=True)

print('wiertsema_dir ->', wiertsema_dir)
print('fugro_dir    ->', fugro_dir)

wiertsema_dir -> d:\Users\jvanruitenbeek\data_validation\output_data\hourly_csv_wiertsema_filtered
fugro_dir    -> d:\Users\jvanruitenbeek\data_validation\output_data\hourly_csv_fugro_filtered


# Loop over the selected folder and generate a validation report from the filtered files

In [7]:
# Loop over filtered CSV files and generate validation report

import numpy as np

# Folder whose CSVs are summarised in this run.
# Currently the Fugro set; assign wiertsema_dir instead to process the other one.
folder_to_process = fugro_dir

# Collect every CSV directly inside the chosen folder, in sorted path order
csv_files = list(folder_to_process.glob('*.csv'))
csv_files.sort()
print(f'Found {len(csv_files)} CSV files in {folder_to_process.name}\n')

# One metrics dict per successfully processed file is appended here
report_data = list()

# Loop over each file and collect one row of validation metrics per series.
# Files without any numeric 'head' value are skipped; any other failure is
# reported and the loop carries on with the next file (best-effort run).
for i, csv_file in enumerate(csv_files, start=1):
    try:
        print(f'[{i}/{len(csv_files)}] Processing: {csv_file.name}')

        # Read the CSV: first column becomes the (date-parsed) index.
        # 'utf-8-sig' strips a leading BOM; undecodable bytes are replaced
        # instead of aborting the read.
        df = pd.read_csv(
            csv_file,
            index_col=0,
            parse_dates=True,
            encoding="utf-8-sig",
            encoding_errors="replace"
        )

        # Coerce head column to numeric (non-numeric cells become NaN) and
        # keep only the rows that actually carry a value
        df['head'] = pd.to_numeric(df['head'], errors='coerce')
        head_series = df['head'].dropna()

        if len(head_series) == 0:
            print('  ✗ No valid head data\n')
            continue

        # Calculate metrics
        first_entry = head_series.index[0]
        last_entry = head_series.index[-1]
        num_entries = len(head_series)

        # Time span between first and last valid entry; the reported length
        # is the whole-day part of that span
        time_span = last_entry - first_entry
        time_span_days = time_span.days

        # Completeness: entries with values / possible entries (hourly).
        # The number of hourly slots is taken from the full span in hours.
        # Using the whole-day count here would drop the partial last day and
        # undercount the slots, which can push completeness above 100 %.
        possible_entries = int(time_span.total_seconds() // 3600) + 1  # +1 to include both endpoints
        completeness_pct = (num_entries / possible_entries * 100) if possible_entries > 0 else 0

        # Largest gap: biggest time difference (in hours) between consecutive
        # non-NaN entries; a single-entry series has no gap
        time_diffs = head_series.index.to_series().diff().dt.total_seconds() / 3600
        largest_gap_hours = time_diffs.max() if len(time_diffs) > 1 else 0

        # Min and max values
        lowest_value = head_series.min()
        highest_value = head_series.max()

        # Largest jump: maximum absolute change between consecutive entries
        head_diff = head_series.diff().abs()
        largest_jump = head_diff.max() if len(head_diff) > 1 else 0

        # Count outliers (non-NaN values in outliers_boxplot column, if present)
        outliers_count = df['outliers_boxplot'].notna().sum() if 'outliers_boxplot' in df.columns else 0

        # Count jumps/drops (non-NaN values in jumps_drops column, if present)
        jumps_drops_count = df['jumps_drops'].notna().sum() if 'jumps_drops' in df.columns else 0

        # Add to report
        report_data.append({
            'Filename': csv_file.stem,
            'First Entry': first_entry,
            'Last Entry': last_entry,
            'Number of Entries': num_entries,
            'Completeness (%)': round(completeness_pct, 2),
            'Length (days)': time_span_days,
            'Largest Gap (hours)': round(largest_gap_hours, 2),
            'Lowest Value (m)': round(lowest_value, 4),
            'Highest Value (m)': round(highest_value, 4),
            'Largest Jump (m)': round(largest_jump, 4),
            'Outliers (count)': outliers_count,
            'Jumps/Drops (count)': jumps_drops_count
        })

        print('  ✓ Metrics calculated\n')

    except Exception as e:
        # Deliberately broad: one unreadable or malformed file must not stop
        # the whole batch; the failure is reported and the file is left out.
        print(f'  ✗ Error processing {csv_file.name}: {e}\n')

# Create DataFrame from report data (one row per processed file)
report_df = pd.DataFrame(report_data)

# Save to Excel next to the input folders; the file name carries the name of
# the processed folder so separate runs do not overwrite each other
output_file = repo_root / 'output_data' / f'validation_report_{folder_to_process.name}.xlsx'
report_df.to_excel(output_file, index=False, sheet_name='Validation Report')

print(f'✓ Report saved to: {output_file}')
if report_df.empty:
    # With no collected rows the frame has no columns, and describe() raises
    # ValueError on such a frame; report the situation instead of crashing.
    print('\nNo series produced metrics; summary statistics skipped.')
else:
    print('\nSummary Statistics:')
    print(report_df.describe())

Found 164 CSV files in hourly_csv_fugro_filtered

[1/164] Processing: NL-2412417-HWM_B09-PB1_m_NAP_avg_processed.csv
  ✓ Metrics calculated

[2/164] Processing: NL-2412417-HWM_B09-PB2_m_NAP_avg_processed.csv
  ✓ Metrics calculated

[3/164] Processing: NL-2412417-HWM_B12-PB1_m_NAP_avg_processed.csv
  ✓ Metrics calculated

[4/164] Processing: NL-2412417-HWM_B13-PB1_m_NAP_avg_processed.csv
  ✓ Metrics calculated

[5/164] Processing: NL-2412417-HWM_B13-PB2_m_NAP_avg_processed.csv
  ✓ Metrics calculated

[6/164] Processing: NL-2412417-HWM_B14-PB1_m_NAP_avg_processed.csv
  ✓ Metrics calculated

[7/164] Processing: NL-2412417-HWM_B14-PB2_m_NAP_avg_processed.csv
  ✓ Metrics calculated

[8/164] Processing: NL-2412417-HWM_B15-PB1_m_NAP_avg_processed.csv
  ✓ Metrics calculated

[9/164] Processing: NL-2412417-HWM_B15-PB2_m_NAP_avg_processed.csv
  ✓ Metrics calculated

[10/164] Processing: NL-2412417-HWM_B16-PB1_m_NAP_avg_processed.csv
  ✓ Metrics calculated

[11/164] Processing: NL-2412417-HWM_B16