In [1]:
import os
import pandas as pd
from dataclasses import  asdict
from datetime import datetime, timedelta
from dscreator.storage import get_storage_handler
from dscreator.sources.ferrybox.extractor import TrajectoryExtractor, NamedTrajectory, NamedArray
from sqlalchemy import create_engine
from dscreator.config import SETTINGS
from dscreator.datasets.trajectories.ferrybox import NorsoopFantasy
from dscreator.sources.ferrybox.uuid_variable_code_mapper import MAPPER
import numpy as np

import xarray as xr

In [2]:
def _read_bad_dates(path):
    """Read the first column of a headerless text file as an array of ``datetime`` objects."""
    frame = pd.read_csv(path, header=None)
    stamps = pd.to_datetime(frame[frame.columns[0]]).to_list()
    return np.array([stamp.to_pydatetime() for stamp in stamps])


def apply_manual_qc(traj_raw, boat, year) -> NamedTrajectory:
    """Apply manually checked QC date lists to a raw named trajectory.

    Two headerless text files are read from ``Norsoop-manual-qc-files/``:
    ``{boat}{year}_bad_ox_sat_dates.txt`` and
    ``{boat}{year}_bad_inletT_dates.txt``. The first column of each file is
    parsed as timestamps and matched against ``traj_raw.datetime_list``.

    Args:
        traj_raw: Trajectory exposing ``array_list``, ``datetime_list`` and
            ``locations``.
        boat: Vessel code used as the prefix of the QC file names.
        year: Year used in the QC file names.

    Returns:
        A new ``NamedTrajectory`` in which "Oxygen" values at bad-oxygen
        timestamps are ``None`` and every sample at a bad inlet temperature
        timestamp (logged below as "no flow") has been removed from all
        arrays, the datetime list and the locations.
    """
    bad_ox = _read_bad_dates(f'Norsoop-manual-qc-files/{boat}{year}_bad_ox_sat_dates.txt')
    bad_T = _read_bad_dates(f'Norsoop-manual-qc-files/{boat}{year}_bad_inletT_dates.txt')

    # Positions in the raw trajectory whose timestamp appears in a QC list.
    # NOTE(review): np.intersect1d reports only the first occurrence of each
    # common value, so a timestamp duplicated in traj_raw is flagged once.
    timestamps = np.array(traj_raw.datetime_list)
    i_bad_T = np.intersect1d(timestamps, bad_T, return_indices=True)[1]
    i_bad_ox = np.intersect1d(timestamps, bad_ox, return_indices=True)[1]
    print(f"Found {len(i_bad_T)} timestamp with no flow, and {len(i_bad_ox)} with bad oxygen")
    print(f"{len(np.intersect1d(i_bad_T, i_bad_ox))} timestamps overlap")
    print("About to set values for oxygen to None for bad oxygen timestamps")

    # A set gives constant-time lookups; testing `i in ndarray` scans the
    # whole index array once per sample.
    bad_ox_positions = set(i_bad_ox.tolist())
    traj = NamedTrajectory(
        array_list=[
            NamedArray(
                nta.variable_name,
                [None if i in bad_ox_positions else val for i, val in enumerate(nta.values)],
            )
            if nta.variable_name == "Oxygen" else nta
            for nta in traj_raw.array_list
        ],
        datetime_list=traj_raw.datetime_list,
        locations=traj_raw.locations)

    print(f"About to remove bad flow data. Before removal size of data based on location is "
          f"{len(traj.locations)}")
    # Drop the same positions from every per-sample sequence so they stay aligned.
    traj = NamedTrajectory(
        array_list=[NamedArray(nta.variable_name, list(np.delete(nta.values, i_bad_T)))
                    for nta in traj.array_list],
        datetime_list=list(np.delete(traj.datetime_list, i_bad_T)),
        locations=list(np.delete(traj.locations, i_bad_T)))
    print(f"After removal size of data based on location is {len(traj.locations)}")

    return traj

In [3]:
# Identifiers describing the vessel and the dataset being built.
boat = "FA"
uuid = "14bb8759-81d8-4a1a-948a-14219d374fab"
stationname = "color_fantasy"
datasetname = stationname
projectname = "NorSoop"

# Parameters requested from the trajectory extractor.
measurement_parameters = ["Temperature", "Salinity", "Oxygen"]

# Dataset builder used to turn each yearly trajectory into a dataset.
tb = NorsoopFantasy(
    uuid=uuid,
    dataset_name=datasetname,
    station_name=stationname,
    project_name=projectname,
    is_acdd=False,
)

# Years to extract, 2017 through 2022 inclusive.
years = list(range(2017, 2023))

In [4]:
engine = create_engine(SETTINGS.database_url)
variable_uuid_map = MAPPER[f"{boat}_19"]
traj_extractor = TrajectoryExtractor(engine, measurement_parameters, variable_uuid_map, [1])
nc_paths = []
for year in years:
    print(f"Extracting year {year}")
    if year == 2020:
        # Switch to the "_20" mapper entry; it stays in effect for later years
        # because `years` is iterated in ascending order.
        variable_uuid_map = MAPPER[f"{boat}_20"]
        traj_extractor = TrajectoryExtractor(engine, measurement_parameters, variable_uuid_map, [1])

    # Cover the whole calendar year. A fixed 365-day window would end on
    # 31 December 00:00 in leap years (e.g. 2020) and drop that last day.
    start_time = datetime(year, 1, 1, 0, 0, 0)
    end_time = datetime(year + 1, 1, 1, 0, 0, 0)
    traj_raw = traj_extractor.fetch_slice(start_time=start_time, end_time=end_time)
    ds = tb.create(apply_manual_qc(traj_raw, boat, year))

    # Set missing value flags: QC value 9 wherever the measurement is null
    ds.temperature_qc[ds.temperature.isnull()] = 9
    ds.salinity_qc[ds.salinity.isnull()] = 9
    ds.oxygen_qc[ds.oxygen.isnull()] = 9
    # Store each year on disk after extraction
    SETTINGS.storage_path = os.path.join(os.getcwd(), "..", "catalog")
    sh = get_storage_handler(
        project_name=projectname,
        dataset_name=datasetname,
        unlimited_dims=["time"],
        # Use the configured vessel code rather than a hard-coded "FA".
        filename_prefix=f"{boat}_{year}"
    )
    fname = sh.save_dataset(ds)
    print(f"Dumped {fname.split('/')[-1]}")
    # Remember the per-year file paths in extraction order.
    nc_paths.append(fname)

Extracting year 2017
Found 11568 timestamp with no flow, and 12638 with bad oxygen
6025 timestamps overlap
About to set values for oxygen to None for bad oxygen timestamps
About to remove bad flow data. Before removal size of data based on location is 351685
After removal size of data based on location is 340117
Dumped FA_2017_color_fantasy.nc
Extracting year 2018
Found 24961 timestamp with no flow, and 4276 with bad oxygen
1409 timestamps overlap
About to set values for oxygen to None for bad oxygen timestamps
About to remove bad flow data. Before removal size of data based on location is 352954
After removal size of data based on location is 327993
Dumped FA_2018_color_fantasy.nc
Extracting year 2019
Found 23216 timestamp with no flow, and 454 with bad oxygen
117 timestamps overlap
About to set values for oxygen to None for bad oxygen timestamps
About to remove bad flow data. Before removal size of data based on location is 323603
After removal size of data based on location is 30038

# Merge each year of data

In [5]:

# Open every per-year file written by the extraction loop and merge them
# into a single dataset.
yearly_datasets = []
for nc_path in nc_paths:
    yearly_datasets.append(xr.open_dataset(nc_path))
ds = xr.merge(yearly_datasets)

Refresh the dataset attributes based on the full dataset.

In [6]:
# Recompute the attributes from the merged dataset and store them as a plain dict.
merged_attributes = tb.dataset_attributes(ds)
ds.attrs = asdict(merged_attributes)
ds

In [7]:
# Point storage at the bucket and save the merged dataset there.
SETTINGS.storage_path = "gs://nivatest-1-senda"
handler_options = {
    "project_name": projectname,
    "dataset_name": datasetname,
    "unlimited_dims": ["time"],
    "filename_prefix": "merged",
}
sh = get_storage_handler(**handler_options)
sh.save_dataset(ds)

'gs://nivatest-1-senda/datasets/norsoop/color_fantasy/merged_acdd_color_fantasy.nc'

# Store a local CSV version

In [8]:
# Write one CSV file per year, selected from the merged dataset by time.
for y in years:
    yearly_frame = ds.sel(time=str(y)).to_dataframe()
    yearly_frame.to_csv(f"{y}_acdd_color_fantasy.csv")