# 1.0 Environment setup, module import

In [None]:
# add toolbox to src (THIS WILL NOT BE REQUIRED ONCE DEPLOYED)
import sys
from pathlib import Path
src_path = Path("../../src").resolve()

if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

from toolbox.steps.base_step import BaseStep, register_step
import toolbox.utils.diagnostics as diag
import polars as pl
import matplotlib.pyplot as plt
import matplotlib as mpl
import tkinter as tk
import numpy as np

# 2.0 Data import
Non-OG1 formatted ALR data is read in, formatted as a .log text file </br>
For compatability, looking to format into this input format: </br>
```
    Parameters
    ----------
    df : polars.DataFrame
        Input dataframe containing time and depth measurements
    gradient_thresholds : list
        Two-element list [positive_threshold, negative_threshold] defining the vertical velocity
        range (in meters/second) that is NOT considered part of a profile. typical values are around [0.02, -0.02]
    filter_win_sizes : list, default= ['20s', '10s']
        Window sizes for the compound filter applied to gradient calculations, in Polars duration format.
        index 0 controls the rolling median window size and index 1 controls the rolling mean window size.
    time_col : str, default='TIME'
        Name of the column containing timestamp data
    depth_col : str, default='DEPTH'
        Name of the column containing depth measurements
```

In [20]:
# Define the folder containing files
folder = Path("../../test_data")
file_type = '*.log'

# Get a list of all .log files
data_files = [str(file.resolve()) for file in folder.glob(file_type)]

data_files


['C:\\Users\\hanshil\\Documents\\GitHub\\toolbox\\test_data\\CtdSbe52mp_2024-06-11_10-02-50_file_1.log',
 'C:\\Users\\hanshil\\Documents\\GitHub\\toolbox\\test_data\\CtdSbe52mp_2024-06-11_12-55-23_file_1.log',
 'C:\\Users\\hanshil\\Documents\\GitHub\\toolbox\\test_data\\CtdSbe52mp_2024-06-11_18-26-32_file_1.log',
 'C:\\Users\\hanshil\\Documents\\GitHub\\toolbox\\test_data\\CtdSbe52mp_2024-06-11_23-52-39_file_1.log',
 'C:\\Users\\hanshil\\Documents\\GitHub\\toolbox\\test_data\\CtdSbe52mp_2024-06-21_07-36-11_file_1.log',
 'C:\\Users\\hanshil\\Documents\\GitHub\\toolbox\\test_data\\CtdSbe52mp_2024-06-21_20-09-28_file_1.log']

In [22]:
import polars as pl
from pathlib import Path
import re

# Load the first file
file_path = data_files[0]  # assuming you've already built 'data_files'

with open(file_path, "r") as f:
    lines = f.readlines()

# --- Step 1: Find where each data section starts ---
pressure_start = None
full_data_start = None

for i, line in enumerate(lines):
    if "[Timestamp, Pressure]" in line:
        pressure_start = i + 1  # data starts after the header
    elif "[Timestamp, Conductivity" in line:
        full_data_start = i + 1

print(f"Pressure data starts at line: {pressure_start}")
print(f"Full data starts at line: {full_data_start}")

# --- Step 2: Split into two lists of raw data ---
pressure_lines = lines[pressure_start:full_data_start - 1]  # up to just before full data
full_data_lines = lines[full_data_start:]

# --- Step 3: Parse Pressure data ---
pressure_data = pl.read_csv(
    file_path,
    has_header=False,
    skip_rows=pressure_start,
    n_rows=len(pressure_lines),
    new_columns=["Timestamp", "Pressure"],
    separator=",",
    ignore_errors=True
)

# --- Step 4: Parse Full data ---
# Remove the weird 'S>' marker in the second column if it exists
cleaned_full_lines = [re.sub(r"S>\s*", "", line) for line in full_data_lines]

# Temporarily write to memory for polars to read
import io
full_data = pl.read_csv(
    io.StringIO("".join(cleaned_full_lines)),
    has_header=False,
    new_columns=["Timestamp", "Conductivity", "Temperature", "Pressure", "Oxygen"],
    separator=",",
    ignore_errors=True
)

# --- Step 5: Merge the two datasets ---
combined = pressure_data.join(full_data, on="Timestamp", how="outer", suffix="_full")

# Prefer the full-data Pressure when available
combined = combined.with_columns(
    pl.when(pl.col("Pressure_full").is_not_null())
      .then(pl.col("Pressure_full"))
      .otherwise(pl.col("Pressure"))
      .alias("Pressure")
).drop("Pressure_full")

# --- Step 6: Sort by Timestamp
combined = combined.sort("Timestamp")

# Final dataframe
combined

Pressure data starts at line: 26932
Full data starts at line: 26678


Timestamp,Pressure,Timestamp_full,Conductivity,Temperature,Oxygen
f64,str,f64,str,str,str
,""" 0.42""",1.7181e9,"""37.0472""",""" 8.9702""",""" 3444.3"""
,""" 0.43""",1.7181e9,""" 37.0673""",""" 8.9748""",""" 3572.0"""
,""" 0.30""",1.7181e9,""" 37.0392""",""" 8.9917""",""" 3663.1"""
,""" 0.35""",1.7181e9,""" 37.0732""",""" 9.0168""",""" 3726.6"""
,""" 0.35""",1.7181e9,""" 37.0132""",""" 8.9779""",""" 3769.0"""
…,…,…,…,…,…
,,1.7181e9,""" 0.106""",,
,,1.7181e9,""" 0.136""",,
,,1.7181e9,""" 0.134""",,
,,1.7181e9,""" 0.166""",,
