# Fugro CSV export: inspect the input files and split them into one CSV per time series

In [13]:
from pathlib import Path
from pprint import pprint

# `data_dir` is relative to the current working directory, not to the repository
# root: the leading '..' steps one directory up from wherever the notebook runs
# (e.g. from `notebooks/`). Adjust it when running from another location.
data_dir = Path('..', 'input_data', 'Fugro')

# Collect the CSV exports in a stable (sorted) order.
files = list(data_dir.glob('*.csv'))
files.sort()

print(f'Looking in: {data_dir.resolve()}')
print(f'Found {len(files)} CSV file(s)')
pprint(list(map(str, files)))

Looking in: D:\Users\jvanruitenbeek\data_validation\input_data\Fugro
Found 4 CSV file(s)
['..\\input_data\\Fugro\\4423-241417_PB_HHW_01-01-2023 00_00_00_07-10-2025 '
 '00_00_00_Uur_20251007112142.csv',
 '..\\input_data\\Fugro\\4423-241417_PB_HOORN_01-01-2023 00_00_00_07-10-2025 '
 '00_00_00_Uur_20251007112422.csv',
 '..\\input_data\\Fugro\\4424-260484_HHW_normaal_01-01-2023 '
 '00_00_00_07-10-2025 00_00_00_Uur_20251007113401.csv',
 '..\\input_data\\Fugro\\4424_Hoorn_Zuiderdijk_01-01-2023 00_00_00_07-10-2025 '
 '00_00_00_Uur_20251007112613.csv']


In [12]:
from pathlib import Path
import sys

def find_repo_root(start=None):
    """Return the nearest ancestor of `start` (including `start`) that holds a repo marker.

    A directory qualifies when it contains a `.git` entry or a `pyproject.toml`.

    Args:
        start: Directory (str or Path) to start searching from. When omitted,
            the current working directory at call time is used.

    Returns:
        The first qualifying directory as a resolved Path, or the resolved
        `start` itself when no ancestor contains a marker.
    """
    # Resolve the default here rather than in the signature: a `Path.cwd()`
    # default is evaluated only once, when the defining cell runs, and would
    # go stale after a later `os.chdir` / `%cd`.
    origin = Path.cwd() if start is None else Path(start)
    origin = origin.resolve()
    for candidate in (origin, *origin.parents):
        if (candidate / '.git').exists() or (candidate / 'pyproject.toml').exists():
            return candidate
    # No marker found anywhere up the tree: fall back to the starting directory.
    return origin

repo_root = find_repo_root()
print('Repository root detected as:', repo_root)

# Build the path to the desired file under the repo root. Change filename if needed.
filename = '4423-241417_PB_HHW_01-01-2023 00_00_00_07-10-2025 00_00_00_Uur_20251007112142.csv'
csv_path = repo_root / 'input_data' / 'Fugro' / filename

# Number of lines to preview. The message and the loop bound both derive from
# this value so they cannot disagree.
N_PREVIEW_LINES = 5

if not csv_path.exists():
    print('File not found:', csv_path.resolve())
else:
    print(f'Reading first {N_PREVIEW_LINES} lines of:', csv_path.name)
    # errors='replace' keeps the preview going even if the file is not clean UTF-8.
    with csv_path.open('r', encoding='utf-8', errors='replace') as f:
        # zip() stops as soon as the range is exhausted, so at most
        # N_PREVIEW_LINES lines are pulled from the file.
        for i, line in zip(range(1, N_PREVIEW_LINES + 1), f):
            print(f'{i}: {line.rstrip()}')

Repository root detected as: D:\Users\jvanruitenbeek\data_validation
Reading first 5 lines of: 4423-241417_PB_HHW_01-01-2023 00_00_00_07-10-2025 00_00_00_Uur_20251007112142.csv
1: ﻿Time,NL-253677-FB-FLB5073#: HB-SS060_BIT_PB1 [-4.7|-5.7][m NAP] (avg),NL-253677-FB-FLB5176#: HB-SS059_BIT_PB1 [-4.3|-5.3][m NAP] (avg),NL-253677-FB-FLB5207#: HB-WM044_KR_PB1 [-3.2|-4.2][m NAP] (avg),NL-253677-FB-LB1004#: HB-GA035_BIT_PB1 [-4.2|-5.2][m NAP] (avg),NL-253677-FB-LB1064#: HB-GA039_BIT_PB1 [-3.2|-4.2][m NAP] (avg),NL-253677-FB-LB1083#: HB-GA040_BIT_PB1 [-3.3|-4.3][m NAP] (avg),NL-253677-FB-LB1112#: HB-SS077_BITA_PB1 [-3.1|-4.1][m NAP] (avg),NL-253677-FB-LB1157#: HB-GP019_BITA_PB1 [-4.3|-5.3][m NAP] (avg),NL-253677-FB-LB1188#: PB_SCH3_18_BIT_PB1 [-2.5|-3.5][m NAP] (avg),NL-253677-FB-LB1267#: HB-GP026_BITA_PB1 [-3.1|-4.1][m NAP] (avg),NL-253677-FB-LB1274#: HB_GP010_BITA_PB1 [-3.7|-4.7][m NAP] (avg),NL-253677-FB-LB1298#: PB_SCH3_12_KR_PB1 [-0.9|-1.9][m NAP] (avg),NL-253677-FB-LB1303#: HB-WM046_AL_PB1

In [16]:
from pathlib import Path
import re
import io
import pandas as pd

def find_repo_root(start=None):
    """Return the nearest ancestor of `start` (including `start`) that holds a repo marker.

    A directory qualifies when it contains a `.git` entry or a `pyproject.toml`.

    Args:
        start: Directory (str or Path) to start searching from. When omitted,
            the current working directory at call time is used.

    Returns:
        The first qualifying directory as a resolved Path, or the resolved
        `start` itself when no ancestor contains a marker.
    """
    # Resolve the default here rather than in the signature: a `Path.cwd()`
    # default is evaluated only once, when the defining cell runs, and would
    # go stale after a later `os.chdir` / `%cd`.
    origin = Path.cwd() if start is None else Path(start)
    origin = origin.resolve()
    for candidate in (origin, *origin.parents):
        if (candidate / '.git').exists() or (candidate / 'pyproject.toml').exists():
            return candidate
    # No marker found anywhere up the tree: fall back to the starting directory.
    return origin

# Name of the Fugro export this notebook works on.
filename = '4423-241417_PB_HHW_01-01-2023 00_00_00_07-10-2025 00_00_00_Uur_20251007112142.csv'

# Locate the repository root and report it.
repo_root = find_repo_root()
print('Repository root detected as:', repo_root)

# Full path of the export, anchored at the repository root.
csv_path = repo_root.joinpath('input_data', 'Fugro', filename)

Repository root detected as: D:\Users\jvanruitenbeek\data_validation


In [18]:
from pathlib import Path
import re
import pandas as pd


def sanitize_for_filename(name: str) -> str:
    """
    Turn an arbitrary label into a string usable as a file name.

    Every run of characters other than ASCII letters, digits, '.', '_' and '-'
    becomes a single underscore; leading/trailing spaces, dots and underscores
    are then trimmed. If nothing is left, "series" is returned.
    """
    text = str(name)
    cleaned = re.sub(r'[^0-9A-Za-z._-]+', '_', text).strip(' ._')
    if not cleaned:
        return "series"
    return cleaned


def process_fugro_csv(input_file, repo_root, dayfirst=True, drop_all_nan=True):
    """
    Split a Fugro-style CSV into one CSV file per time-series column.

    The first column is treated as the timestamp column, however its header is
    spelled. Every other column is converted to numeric values and written to
    `<repo_root>/output_data/only_csv_fugro/<sanitized column name>.csv`, with
    the timestamps as index (labelled "Time") and the original column name as
    header.

    Args:
        input_file: Path (str or Path) of the comma-separated CSV to read; the
            first line is the header.
        repo_root: Directory (str or Path) under which
            `output_data/only_csv_fugro` is created if missing.
        dayfirst: Passed to `pandas.to_datetime`; parse ambiguous dates as
            day-first.
        drop_all_nan: When true, columns without a single numeric value are
            skipped instead of written.

    Returns:
        List of the output file paths, in column order.
    """
    input_path = Path(input_file)
    repo_root = Path(repo_root)

    # Output directory
    out_dir = repo_root / "output_data" / "only_csv_fugro"
    out_dir.mkdir(parents=True, exist_ok=True)

    # "utf-8-sig" strips a leading byte-order mark from the header;
    # dtype="object" postpones numeric conversion to the per-column step below.
    df = pd.read_csv(
        input_path,
        sep=",",
        header=0,
        dtype="object",
        encoding="utf-8-sig",
        engine="python",
    )

    # Name the first column "Time" positionally and unconditionally. Renaming
    # only when the header does not look like "time" leaves variants such as
    # "time" or " Time" untouched, and the lookup below then raises KeyError.
    df.columns = ["Time", *df.columns[1:]]

    # Parse datetime and set as index; unparseable stamps become NaT and are dropped.
    timestamps = pd.to_datetime(df["Time"], dayfirst=dayfirst, errors="coerce")
    n_bad = int(timestamps.isna().sum())
    if n_bad:
        print(f"Dropped {n_bad} row(s) with an unparseable timestamp")
    df["Time"] = timestamps
    df = df.dropna(subset=["Time"]).set_index("Time")

    written = []
    # Case-folded file stems already used in this run. Folding case avoids two
    # columns overwriting each other on case-insensitive file systems.
    used_names = set()

    for col in df.columns:
        ser = pd.to_numeric(df[col], errors="coerce")
        n_valid = int(ser.notna().sum())

        # Skip empty columns if requested
        if drop_all_nan and n_valid == 0:
            continue

        # Sanitize, then append _1, _2, ... until the stem is unused. Checking
        # the final stem (not just the base) also covers a column whose own
        # name already ends in such a suffix.
        base_name = sanitize_for_filename(col)
        out_name, suffix = base_name, 0
        while out_name.casefold() in used_names:
            suffix += 1
            out_name = f"{base_name}_{suffix}"
        used_names.add(out_name.casefold())

        out_path = out_dir / f"{out_name}.csv"
        ser.to_csv(out_path, index=True, header=[col], index_label="Time")
        print(f"Saved {out_path} ({n_valid} rows with data)")
        written.append(out_path)

    print(f"Done. Saved {len(written)} series to {out_dir}")
    return written

In [22]:
# --- Example usage ---
# Paths are built from `repo_root` (detected in an earlier cell) rather than
# hard-coded to one user's drive, so the cell runs on any checkout.
example_input = (
    repo_root
    / "input_data"
    / "Fugro"
    / "4424-260484_HHW_normaal_01-01-2023 00_00_00_07-10-2025 00_00_00_Uur_20251007113401.csv"
)
process_fugro_csv(input_file=example_input, repo_root=repo_root)

Saved D:\Users\jvanruitenbeek\data_validation\output_data\only_csv_fugro\NL-260484-FB-FLB5014_HB11PB01_-2.0_-3.0_m_NAP_avg.csv (6244 rows with data)
Saved D:\Users\jvanruitenbeek\data_validation\output_data\only_csv_fugro\NL-260484-FB-FLB5017_B26PB01_-6.7_-7.7_m_NAP_avg.csv (5974 rows with data)
Saved D:\Users\jvanruitenbeek\data_validation\output_data\only_csv_fugro\NL-260484-FB-FLB5017_B26PB02_-3.2_-4.2_m_NAP_avg.csv (5932 rows with data)
Saved D:\Users\jvanruitenbeek\data_validation\output_data\only_csv_fugro\NL-260484-FB-FLB5022_HB16PB01_-3.2_-4.2_m_NAP_avg.csv (6117 rows with data)
Saved D:\Users\jvanruitenbeek\data_validation\output_data\only_csv_fugro\NL-260484-FB-FLB5026_HB19PB01_-2.5_-3.5_m_NAP_avg.csv (5517 rows with data)
Saved D:\Users\jvanruitenbeek\data_validation\output_data\only_csv_fugro\NL-260484-FB-FLB5032_HB22PB01_-3.6_-4.6_m_NAP_avg.csv (5951 rows with data)
Saved D:\Users\jvanruitenbeek\data_validation\output_data\only_csv_fugro\NL-260484-FB-FLB5033_B19PB01_-5.8_-

[WindowsPath('D:/Users/jvanruitenbeek/data_validation/output_data/only_csv_fugro/NL-260484-FB-FLB5014_HB11PB01_-2.0_-3.0_m_NAP_avg.csv'),
 WindowsPath('D:/Users/jvanruitenbeek/data_validation/output_data/only_csv_fugro/NL-260484-FB-FLB5017_B26PB01_-6.7_-7.7_m_NAP_avg.csv'),
 WindowsPath('D:/Users/jvanruitenbeek/data_validation/output_data/only_csv_fugro/NL-260484-FB-FLB5017_B26PB02_-3.2_-4.2_m_NAP_avg.csv'),
 WindowsPath('D:/Users/jvanruitenbeek/data_validation/output_data/only_csv_fugro/NL-260484-FB-FLB5022_HB16PB01_-3.2_-4.2_m_NAP_avg.csv'),
 WindowsPath('D:/Users/jvanruitenbeek/data_validation/output_data/only_csv_fugro/NL-260484-FB-FLB5026_HB19PB01_-2.5_-3.5_m_NAP_avg.csv'),
 WindowsPath('D:/Users/jvanruitenbeek/data_validation/output_data/only_csv_fugro/NL-260484-FB-FLB5032_HB22PB01_-3.6_-4.6_m_NAP_avg.csv'),
 WindowsPath('D:/Users/jvanruitenbeek/data_validation/output_data/only_csv_fugro/NL-260484-FB-FLB5033_B19PB01_-5.8_-6.8_m_NAP_avg.csv'),
 WindowsPath('D:/Users/jvanruitenbeek