In [None]:
import os
import pandas as pd
from dataclasses import  asdict
from datetime import datetime, timedelta
from dscreator.storage import get_storage_handler
from dscreator.sources.ferrybox.extractor import TrajectoryExtractor
from sqlalchemy import create_engine
from dscreator.config import SETTINGS
from dscreator.datasets.trajectories.ferrybox import NorsoopFantasy
from dscreator.sources.ferrybox.uuid_variable_code_mapper import MAPPER
import numpy as np

import xarray as xr

In [None]:
def _read_bad_timestamps(path: str) -> set:
    """Return the first column of a header-less QC file as a set of datetimes.

    An empty file is interpreted as "no flagged timestamps" instead of
    raising ``pandas.errors.EmptyDataError``.
    """
    try:
        frame = pd.read_csv(path, header=None)
    except pd.errors.EmptyDataError:
        return set()
    return {ts.to_pydatetime() for ts in pd.to_datetime(frame[frame.columns[0]])}


def apply_manual_qc(traj_raw: dict, boat: str, year: str) -> dict:
    """Apply manually checked QC timestamps to one extracted trajectory.

    Two files are read from ``Norsoop-manual-qc-files/``:
    ``{boat}{year}_bad_ox_sat_dates.txt`` and
    ``{boat}{year}_bad_inletT_dates.txt``. Samples whose time is listed in the
    first file get ``oxygen_sat`` set to ``None``; samples whose time is listed
    in the second file are removed from every entry of ``traj_raw``.

    Args:
        traj_raw: Mapping of variable name to a sequence of samples. Must
            contain ``"time"``, ``"oxygen_sat"`` and ``"longitude"``.
            Assumes ``"time"`` holds ``datetime`` objects comparable with the
            parsed QC dates -- TODO confirm against the extractor.
        boat: Vessel code used as file name prefix.
        year: Year used in the file name; it is only interpolated into the
            path, so an ``int`` works as well as a ``str``.

    Returns:
        The same ``traj_raw`` dict, modified in place. After the call every
        value is a numpy array (the result of ``np.delete``).
    """
    qc_dir = 'Norsoop-manual-qc-files'
    bad_ox = _read_bad_timestamps(f'{qc_dir}/{boat}{year}_bad_ox_sat_dates.txt')
    bad_T = _read_bad_timestamps(f'{qc_dir}/{boat}{year}_bad_inletT_dates.txt')

    # Match on every sample rather than on unique values, so that repeated
    # timestamps are all flagged (np.intersect1d only reports the first hit).
    i_bad_T = []
    i_bad_ox = set()
    for i, stamp in enumerate(traj_raw["time"]):
        if stamp in bad_T:
            i_bad_T.append(i)
        if stamp in bad_ox:
            i_bad_ox.add(i)

    print(f"Found {len(i_bad_T)} timestamp with no flow, and {len(i_bad_ox)} with bad oxygen_sat")
    print(f"{len(i_bad_ox.intersection(i_bad_T))} timestamps overlap")
    print("About to set values for oxygen to None for bad oxygen timestamps")

    # Set membership keeps this pass linear in the number of samples.
    traj_raw["oxygen_sat"] = [
        None if i in i_bad_ox else val for i, val in enumerate(traj_raw["oxygen_sat"])
    ]

    print(f"About to remove bad flow data. Before removal size of data based on location is "
          f"{len(traj_raw['longitude'])}")
    # Indices refer to the original sample order, so drop them from all
    # variables in one go to keep the entries aligned.
    drop_idx = np.array(i_bad_T, dtype=np.intp)
    for k in traj_raw:
        traj_raw[k] = np.delete(traj_raw[k], drop_idx)
    print(f"After removal size of data based on location is {len(traj_raw['longitude'])}")

    return traj_raw

In [None]:
# Vessel code: prefixes the manual-QC file names and the MAPPER keys.
boat = "FA"
projectname = "NorSoop"
stationname = "color_fantasy"
# The dataset is named after the station.
datasetname = stationname

# Variables pulled from the time-series database for every year.
measurement_parameters = ["temperature", "salinity", "oxygen_sat"]
# 2017 through 2022, inclusive.
years = list(range(2017, 2023))

# Dataset builder for this vessel.
tb = NorsoopFantasy(
    uuid="no.niva:14bb8759-81d8-4a1a-948a-14219d374fab",
    dataset_name=datasetname,
    station_name=stationname,
    grouping=projectname,
    is_acdd=False,
)

In [None]:
engine = create_engine(SETTINGS.tsb_connection_str)

# Yearly files go to a local catalog directory; the path is the same for every
# year, so set it once before the loop.
SETTINGS.storage_path = os.path.join(os.getcwd(), "..", "catalog")

nc_paths = []
active_mapper_key = None
for year in years:
    print(f"Extracting year {year}")

    # MAPPER has separate entries for the periods starting 2020 and 2022.
    # Choosing by threshold keeps the selection correct even if `years` is
    # edited so that it no longer passes through 2020 or 2022 in order.
    if year >= 2022:
        mapper_key = f"{boat}_22"
    elif year >= 2020:
        mapper_key = f"{boat}_20"
    else:
        mapper_key = f"{boat}_19"
    # Rebuild the extractor only when the mapping actually changes.
    if mapper_key != active_mapper_key:
        variable_uuid_map = MAPPER[mapper_key]
        traj_extractor = TrajectoryExtractor(engine, measurement_parameters, variable_uuid_map, [1])
        active_mapper_key = mapper_key

    # Use the next 1 January as the end of the slice: a fixed 365-day window
    # would cut off 31 December in leap years such as 2020.
    start_time = datetime(year, 1, 1, 0, 0, 0)
    end_time = datetime(year + 1, 1, 1, 0, 0, 0)
    traj_raw = traj_extractor.fetch_slice(start_time=start_time, end_time=end_time)
    ds = tb.create(apply_manual_qc(traj_raw, boat, year))

    # Set missing value flags (9) wherever a measurement is null.
    for param in measurement_parameters:
        ds[f"{param}_qc"][ds[param].isnull()] = 9

    # Store each year on disk after extraction
    sh = get_storage_handler(
        grouping=projectname,
        dataset_name=datasetname,
        unlimited_dims=["time"],
        filename_prefix=f"{boat}_{year}",
    )
    fname = sh.save_dataset(ds)
    print(f"Dumped {os.path.basename(fname)}")
    nc_paths.append(fname)

# Merge each year of data

In [None]:

# Open every yearly file recorded in nc_paths and combine them into one dataset.
ds = xr.merge(list(map(xr.open_dataset, nc_paths)))

# Refresh attributes based on the full dataset

In [None]:
# Recompute the global attributes from the merged dataset, then set the
# dataset id to the builder's uuid (overriding any computed 'id').
ds.attrs = {**asdict(tb.dataset_attributes(ds)), 'id': tb.uuid}
ds

# Flag data outside bounding box

In [None]:
# True where a sample lies inside the extent given by the geospatial_*
# attributes of the dataset.
lat_mask = (ds["latitude"] >= ds.attrs["geospatial_lat_min"]) & (
    ds["latitude"] <= ds.attrs["geospatial_lat_max"]
)
lon_mask = (ds["longitude"] >= ds.attrs["geospatial_lon_min"]) & (
    ds["longitude"] <= ds.attrs["geospatial_lon_max"]
)
bounding_mask = lon_mask & lat_mask

# Set flag 3 on suspicious positions, i.e. everything outside the box.
for qc_name in ("temperature_qc", "salinity_qc", "oxygen_sat_qc"):
    ds[qc_name][~bounding_mask] = 3

# Store dataset on object storage

In [None]:
# Redirect the storage layer to the gs:// location before building the handler.
SETTINGS.storage_path = "gs://nivaprod-1-senda"
sh = get_storage_handler(
    grouping=projectname,
    dataset_name=datasetname,
    unlimited_dims=["time"],
    filename_prefix="merged",
)
# Write the merged, flagged dataset.
sh.save_dataset(ds)

# Store local csv version

In [None]:
# Write one CSV per year, using relative file names.
for year in years:
    yearly_ds = ds.sel(time=str(year))
    yearly_ds.to_dataframe().to_csv(f"{year}_acdd_color_fantasy.csv")