In [7]:
import pandas as pd
from datetime import datetime, timedelta
from dscreator.storage import get_storage_handler
from dscreator.sources.ferrybox.extractor import TrajectoryExtractor, NamedTrajectory, NamedArray
from sqlalchemy import create_engine
from dscreator.config import SETTINGS
from dscreator.datasets.trajectories.ferrybox import FerryboxTrajBuilder
import numpy as np

In [17]:
def _load_bad_timestamps(path):
    """Return the first column of the header-less CSV at ``path`` as an array of ``datetime`` objects."""
    first_column = pd.read_csv(path, header=None).iloc[:, 0]
    return np.array([stamp.to_pydatetime() for stamp in pd.to_datetime(first_column).to_list()])


def apply_manual_qc(traj_raw: NamedTrajectory, boat: str, year: int) -> NamedTrajectory:
    """Apply the manually curated QC timestamp lists to a raw trajectory.

    Two header-less text files are read from ``Norsoop-manual-qc-files/``:

    * ``{boat}{year}_bad_ox_sat_dates.txt`` - samples at these timestamps get
      the value ``None`` in the ``"Oxygen"`` array; other arrays are untouched.
    * ``{boat}{year}_bad_inletT_dates.txt`` - samples at these timestamps are
      deleted from every array, from ``datetime_list`` and from ``locations``
      (the progress messages call these "no flow" / "bad flow" timestamps).

    Args:
        traj_raw: Trajectory providing ``array_list``, ``datetime_list`` and
            ``locations``. It is not modified; new objects are built.
        boat: Boat code used as prefix of the QC file names.
        year: Year used in the QC file names.

    Returns:
        A new ``NamedTrajectory`` with bad oxygen values set to ``None`` and
        the samples listed in the inlet-T file removed.

    NOTE(review): ``np.intersect1d(..., return_indices=True)`` reports only the
    first occurrence of each matching value, so if ``traj_raw.datetime_list``
    contained repeated timestamps the later duplicates would not be flagged -
    confirm the timestamps are unique.
    """
    bad_ox = _load_bad_timestamps(f'Norsoop-manual-qc-files/{boat}{year}_bad_ox_sat_dates.txt')
    bad_T = _load_bad_timestamps(f'Norsoop-manual-qc-files/{boat}{year}_bad_inletT_dates.txt')

    # Build the timestamp array once; element [1] of the result holds the
    # positions of the matches inside `timestamps`.
    timestamps = np.array(traj_raw.datetime_list)
    i_bad_T = np.intersect1d(timestamps, bad_T, return_indices=True)[1]
    i_bad_ox = np.intersect1d(timestamps, bad_ox, return_indices=True)[1]
    print(f"Found {len(i_bad_T)} timestamp with no flow, and {len(i_bad_ox)} with bad oxygen")
    print(f"{len(np.intersect1d(i_bad_T, i_bad_ox))} timestamps overlap")

    print("About to set values for oxygen to None for bad oxygen timestamps")
    # A set gives constant-time membership tests; testing `i in <ndarray>` for
    # every sample scans the whole index array each time.
    bad_ox_positions = set(i_bad_ox.tolist())
    traj = NamedTrajectory(
        array_list=[
            NamedArray(
                nta.variable_name,
                [None if i in bad_ox_positions else val for i, val in enumerate(nta.values)],
            )
            if nta.variable_name == "Oxygen"
            else nta
            for nta in traj_raw.array_list
        ],
        datetime_list=traj_raw.datetime_list,
        locations=traj_raw.locations,
    )

    print(f"About to remove bad flow data. Before removal size of data based on location is "
          f"{len(traj.locations)}")
    # The same positions are removed from every per-sample sequence so that
    # values, timestamps and locations stay aligned.
    traj = NamedTrajectory(
        array_list=[
            NamedArray(nta.variable_name, list(np.delete(nta.values, i_bad_T)))
            for nta in traj.array_list
        ],
        datetime_list=list(np.delete(traj.datetime_list, i_bad_T)),
        locations=list(np.delete(traj.locations, i_bad_T)),
    )
    print(f"After removal size of data based on location is {len(traj.locations)}")

    return traj

In [15]:
# Boat code; also used as prefix of the manual-QC file names.
boat = "FA"

# Variables requested from the trajectory extractor.
measurement_parameters = [
    "Temperature",
    "Salinity",
    "Oxygen",
]

# Identifiers handed to the dataset builder and the storage handler.
uuid = "29b7de62-e1fa-4dce-90e4-7ff8a0931397"
datasetname = "colorfantasy_ferrybox"
stationname = "color_fantasy"
projectname = "NorSoop"

In [18]:
year = 2017

# Slice bounds: 1 January of `year` to 1 January of the following year.
# Using the calendar boundary instead of a fixed 365-day offset keeps
# 31 December inside the slice when `year` is a leap year.
start_time = datetime(year, 1, 1, 0, 0, 0)
end_time = datetime(year + 1, 1, 1, 0, 0, 0)

engine = create_engine(SETTINGS.database_url)
te = TrajectoryExtractor(engine, boat, measurement_parameters)
traj_raw = te.fetch_slice(start_time=start_time, end_time=end_time)

In [19]:
# Builder configured with the identifiers from the configuration cell.
tb = FerryboxTrajBuilder(
    uuid=uuid, dataset_name=datasetname, station_name=stationname,
    project_name=projectname, is_acdd=True,
)

# Keep the quality-controlled trajectory under its own name so it can be
# inspected before the dataset is built from it.
traj_qc = apply_manual_qc(traj_raw, boat, year)
ds = tb.create(traj_qc)

Found 11566 timestamp with no flow, and 12638 with bad oxygen
6025 timestamps overlap
About to set values for oxygen to None for bad oxygen timestamps
About to remove bad flow data. Before removal size of data based on location is 351678
After removal size of data based on location is 340112


In [216]:
# NOTE(review): `project_name` receives `datasetname` and `dataset_name`
# receives `stationname`, while `projectname` is not used here - confirm this
# mapping is what the storage layout expects before changing it.
sh = get_storage_handler(
    project_name=datasetname,
    dataset_name=stationname,
    unlimited_dims=["time"],
    # Reuse the configured boat code rather than repeating the literal, so the
    # file prefix follows the configuration cell.
    filename_prefix=boat,
)
sh.save_dataset(ds)


Saving dataset slice 2017-01-01T13:25:03.000000000 --> 2017-01-30T23:59:23.000000000
Dataset was successfully saved
