In [26]:
import argparse
import numpy as np
import logging
import datetime
import xarray as xr
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import re
import shutil

from src.data.data_reading import TRS_common_read_H, TRS_common_read_Q, rename_wavelengths

# Match-up campaigns; each entry is used as a sub-directory name under `raw_data_dir`.
matchups_TRS = [
    "matchup_TRS2",
    "matchup_TRS6",
    "matchup_TRS1",
    "matchup_TRS3",
    "eastern_med",
]

# Full list of rrs bands and the reduced subset used for the "mini" variables.
wavelengths = [
    400, 412, 442, 490, 510, 560, 620,
    665, 673, 681, 708, 778, 865,
]
mini_wavelengths = [412, 442, 490, 560, 673]

# Pigment column names; the order defines the `pigment` axis downstream.
pigments = [
    'chlide_a[mg*m^3]',
    'chla[mg*m^3]',
    'chlb[mg*m^3]',
    'chlc1+c2[mg*m^3]',
    'fucox[mg*m^3]',
    "19'hxfcx[mg*m^3]",
    "19'btfcx[mg*m^3]",
    "diadino[mg*m^3]",
    "allox[mg*m^3]",
    "diatox[mg*m^3]",
    "zeaxan[mg*m^3]",
    "beta_car[mg*m^3]",
    "peridinin[mg*m^3]",
]

# Reduced pigment set (a subset of `pigments`, same relative order).
mini_pigments = [
    'chla[mg*m^3]',
    'chlb[mg*m^3]',
    'chlc1+c2[mg*m^3]',
    'fucox[mg*m^3]',
    "19'hxfcx[mg*m^3]",
    "diadino[mg*m^3]",
    "diatox[mg*m^3]",
    "beta_car[mg*m^3]",
    "peridinin[mg*m^3]",
]

# Positions in `pigments` of the entries that are absent from `mini_pigments`.
drop_pigm_indx = [pos for pos, label in enumerate(pigments) if label not in mini_pigments]

# Names of the one-hot month variables, January first.
months_keys = [
    "January", "February", "March",
    "April", "May", "June",
    "July", "August", "September",
    "October", "November", "December",
]

# Data locations, relative to the notebook's working directory.
raw_data_dir = Path('../../data/raw/dataset_hplc_multi/')
processed_data_dir = Path('../../data/processed/dataset_hplc_multi/')
dataset_dir = Path('../../data/datasets/dataset_hplc_multi')

In [27]:
# Build the paired datasets: rrs from the Q files (x) and station records from the H files (y).
raw_dir = raw_data_dir

x_list = []    # one TRS_common_read_Q result per kept station
y_list = []    # the matching record yielded by TRS_common_read_H
all_refs = []  # station_ref values already kept, used to skip duplicates
for match in matchups_TRS:
    raw_files = [x.name for x in (raw_dir / match).iterdir() if x.is_file()]
    for raw_data in tqdm(raw_files):
        if "Q" in raw_data:
            # Q files are only read through their companion H file below.
            # NOTE(review): this is a substring test on the whole file name — confirm
            # that no H file name can contain the letter "Q".
            continue
        elif "H" in raw_data:
            file_path = str(raw_dir / match / raw_data)
            # Companion path: the last character of the H path is replaced by 'Q.srf'.
            file_path_Q = Path(file_path[:-1] + 'Q.srf')
            # Distinct names for the H records and the Q data: the original rebound the
            # iterated variable inside its own loop.
            h_records = TRS_common_read_H(file_path)
            for data in h_records:
                # Keep a station once only, and only when its Q file is present.
                if data['station_ref'] not in all_refs and file_path_Q.exists():
                    y_list.append(data)
                    all_refs.append(data['station_ref'])
                    q_data = TRS_common_read_Q(file_path_Q)
                    x_list.append(q_data)
        else:
            logging.info(f'File: {raw_data} is not treated')

# Stack the per-station Q data along a new `Id` dimension and clip negative rrs to 0.
x_list = [rename_wavelengths(x) for x in x_list]
x_ar = xr.concat(x_list, dim='Id').assign_coords({'Id': range(len(x_list))})
x_ar['rrs'] = np.maximum(x_ar['rrs'], 0)

# x_list and y_list are appended together above, so both share the same length and Id range.
y_ar = pd.DataFrame(y_list).to_xarray()
y_ar = y_ar.rename({'index': 'Id'}).assign_coords({'Id': range(len(y_list))})

# Drop every sample that has a missing rrs value along axis 1.
indices = ~x_ar['rrs'].isnull().any(axis=1)
x_ar = x_ar.sel(Id=indices)
y_ar = y_ar.sel(Id=indices)

# Gather the individual pigment variables into one (Id, pigment) array, ordered as `pigments`.
y_ar["pigments"] = (("Id", "pigment"), y_ar[pigments].to_array().values.T)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 309/309 [00:00<00:00, 381.73it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 124/124 [00:00<00:00, 3111.74it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 136/136 [00:00<00:00, 1680.82it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 2026.29it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 276.03it/s]


#### Location

In [28]:
import geopandas as gpd
from shapely.geometry import Point

def points_in_region(lats, lons, region_polygon):
    """Test which (lat, lon) points lie inside a region polygon.

    Parameters
    ----------
    lats, lons : iterable
        Latitudes and longitudes, paired element-wise with ``zip`` (surplus
        elements of the longer input are ignored).
    region_polygon : object
        Geometry exposing ``contains(point)``; it is called once per point.

    Returns
    -------
    list
        One ``region_polygon.contains(...)`` result per point, in input order.
    """
    # Shapely uses (x, y) => (longitude, latitude)
    return [region_polygon.contains(Point(lon, lat)) for lat, lon in zip(lats, lons)]

In [29]:

# Region outlines, each collapsed into a single geometry.
med = gpd.read_file('../../data/raw/Mediterranean.geojson').union_all()
black_sea = gpd.read_file('../../data/raw/BlackSea.geojson').union_all()

# One membership flag per sample, copied onto the pigment dataset as well.
for region_name, region_geom in (("Med", med), ("Black Sea", black_sea)):
    x_ar[region_name] = ("Id", points_in_region(x_ar['lat'], x_ar['lon'], region_geom))
    y_ar[region_name] = ("Id", x_ar[region_name].values)

# Sum of the two flags; on boolean arrays `+` acts as an element-wise OR.
in_either_region = x_ar["Med"].values + x_ar["Black Sea"].values
x_ar["Med and Black Sea"] = ("Id", in_either_region)
y_ar["Med and Black Sea"] = ("Id", in_either_region.copy())


#### Month

In [30]:
# Month number of each sample, read from the `sampling_date` variable.
sample_months = x_ar['sampling_date'].dt.month.values
x_ar["month"] = ('Id', sample_months)

In [31]:
from sklearn.preprocessing import OneHotEncoder

# Categories are fixed to 1..12 so that all twelve columns exist, whatever months occur.
ohe = OneHotEncoder(categories=[list(range(1, 13))])
month_column = x_ar['month'].values.reshape(-1, 1)
months_arr = ohe.fit_transform(month_column).toarray()

# One indicator variable per month name; column k of `months_arr` goes to months_keys[k].
month_indicators = {
    label: ('Id', column)
    for label, column in zip(months_keys, months_arr.T)
}
x_ar = x_ar.assign(month_indicators)

#### Pigments Threshold

In [32]:
# Per-pigment floor: 5% quantile taken after non-positive entries are replaced by +inf.
y_vals = y_ar[pigments]
y_threshold = xr.where(y_vals > 0, y_vals, np.inf)
quantile_per_pigment = y_threshold.quantile(0.05)
y_subthreshold = quantile_per_pigment.to_array().values

# Label the `pigment` axis, then raise every value below its floor up to that floor.
y_ar = y_ar.assign_coords(pigment=pigments)
floored_pigments = np.maximum(y_ar["pigments"].values, y_subthreshold)
y_ar["th_pigments"] = (("Id", "pigment"), floored_pigments)

In [33]:
# Show the floor value computed for each pigment (one entry per item of `pigments`).
y_subthreshold

array([0.00248, 0.05878, 0.003  , 0.00518, 0.003  , 0.01302, 0.0036 ,
       0.00968, 0.001  , 0.0018 , 0.00844, 0.00242, 0.001  ])

#### RRS Threshold

In [34]:
# Per-wavelength floor: 5% quantile along axis 0, with non-positive rrs replaced by +inf.
x_vals = x_ar['rrs']
x_threshold = xr.where(x_vals > 0, x_vals, np.inf)
x_subthreshold = np.quantile(x_threshold, 0.05, axis=0)

# Raise every rrs value below its wavelength's floor up to that floor.
floored_rrs = np.maximum(x_vals.values, x_subthreshold)
x_ar["th_rrs"] = (("Id", "wavelength"), floored_rrs)

In [35]:
# Show the floor value computed for each rrs wavelength.
x_subthreshold

array([2.3161020e-03, 2.4659540e-03, 2.7858400e-03, 3.1531280e-03,
       2.9284400e-03, 1.5058740e-03, 2.3582312e-04, 1.2848600e-04,
       1.2293400e-04, 1.1542496e-04, 4.2764140e-05, 1.0518880e-05,
       3.5480000e-06])

#### Mini data

In [36]:
# mini rrs: raw and floored rrs restricted to the `mini_wavelengths` bands
x_ar = x_ar.assign_coords(mini_wavelength=mini_wavelengths)
for target, source in (("mini_rrs", "rrs"), ("mini_th_rrs", "th_rrs")):
    band_subset = x_ar[source].sel(wavelength=mini_wavelengths).values
    x_ar[target] = (("Id", "mini_wavelength"), band_subset)

# mini pigments: same layout as `pigments`, limited to the `mini_pigments` columns
y_ar = y_ar.assign_coords(mini_pigment=mini_pigments)
mini_pigment_values = y_ar[mini_pigments].to_array().values.T
y_ar["mini_pigments"] = (("Id", "mini_pigment"), mini_pigment_values)

# Floors of the kept pigments: the full floor vector minus the dropped positions.
y_subthreshold_mini = np.delete(y_subthreshold, drop_pigm_indx)
floored_mini_pigments = np.maximum(y_ar["mini_pigments"].values, y_subthreshold_mini)
y_ar["th_mini_pigments"] = (("Id", "mini_pigment"), floored_mini_pigments)

#### Log transformation

In [37]:
## Log transformations
# Natural log of the floored variables: (dataset, new variable, dims, source variable).
log_targets = (
    (x_ar, "log_rrs", ("Id", "wavelength"), 'th_rrs'),
    (x_ar, "log_mini_rrs", ("Id", "mini_wavelength"), 'mini_th_rrs'),
    (y_ar, "log_pigments", ("Id", "pigment"), 'th_pigments'),
    (y_ar, "log_mini_pigments", ("Id", "mini_pigment"), 'th_mini_pigments'),
)
for dataset, target, dim_names, source in log_targets:
    dataset[target] = (dim_names, np.log(dataset[source].values))

In [38]:
# Per-sample sum of the January, February and March indicators (preview of the "winter" flag).
first_quarter = x_ar[months_keys[:3]].to_array()
first_quarter.sum(axis=0).data

array([1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

#### Seasons

In [39]:
# Each season flag is the sum of three consecutive month indicators (calendar quarters).
season_months = {
    "winter": ['January', 'February', 'March'],
    "spring": ['April', 'May', 'June'],
    "summer": ['July', 'August', 'September'],
    "autumn": ['October', 'November', 'December'],
}
for season, month_names in season_months.items():
    x_ar[season] = ("Id", x_ar[month_names].to_array().sum(axis=0).data)


#### Depth

In [40]:
# Gridded elevation file; the `elevation` variable is looked up at each sample position.
depth_grid = xr.load_dataset('../../data/raw/GEBCO_24_Mar_2025_0cb80897321b/GEBCO_24_Mar_2025_0cb80897321b/gebco_2024_n62.1035_s27.0879_w-17.1562_e42.3281.nc')
# One (lat, lon) row per sample.
lat_lon_pair = np.array((x_ar['lat'].values, x_ar['lon'].values)).T

# Pointwise nearest-neighbour lookup: both indexers share the `Id` dimension, so a single
# `.sel` call returns one grid value per sample instead of one call per sample.
sample_lats = xr.DataArray(lat_lon_pair[:, 0], dims="Id")
sample_lons = xr.DataArray(lat_lon_pair[:, 1], dims="Id")
depths = depth_grid['elevation'].sel(lat=sample_lats, lon=sample_lons, method="nearest").values

# `depth` stores the grid's `elevation` value unchanged (no sign flip is applied).
x_ar["depth"] = ("Id", depths)


In [41]:
# Coordinates of the grid cell actually selected for every sample (one lookup per point).
nearest_cells = [
    depth_grid.sel(lon=lon, lat=lat, method="nearest")
    for lat, lon in lat_lon_pair
]
lat_lon_match = np.array([[cell.lat.values, cell.lon.values] for cell in nearest_cells])

# Number of coordinates whose matched grid value is more than 0.01 away from the sample's.
coordinate_gap = np.abs(lat_lon_pair - lat_lon_match)
np.sum(coordinate_gap > 0.01)


0

#### Save preprocessed data 

In [42]:
# Start from an empty output directory: anything already under it is deleted first.
if processed_data_dir.exists():
    shutil.rmtree(processed_data_dir)
processed_data_dir.mkdir(parents=True, exist_ok=True)

# Write the rrs/feature dataset and the pigment dataset as netCDF files.
for dataset, filename in ((x_ar, 'x.nc'), (y_ar, 'y.nc')):
    dataset.to_netcdf(processed_data_dir / filename)

# Datasets creation

__input 1__
- __name__: log_rrs_lat_lon_month_season_depth_loc.csv
- __features__: log RRs, lat, lon, month (one-hot encoded), season (one-hot encoded), depth, Med, Black Sea, Med and Black Sea (0/1 flags)

__input 2__
- __name__: rrs_lat_lon_month_season_depth_loc.csv
- __features__: RRs, lat, lon, month (one-hot encoded), season (one-hot encoded), depth, Med, Black Sea, Med and Black Sea (0/1 flags)

__output 1__
- __name__: log_pigments.csv
- __features__: log pigments

__output 2__
- __name__: pigments.csv
- __features__: pigments

__output 3__
- __name__: th_pigments.csv
- __features__: pigments floored at a per-pigment threshold (the 5% quantile, computed with non-positive values replaced by +inf)



In [43]:
## Load back preprocessed data
# Both datasets are re-read from the netCDF files under `processed_data_dir`.
x_ar, y_ar = (
    xr.load_dataset(processed_data_dir / filename)
    for filename in ('x.nc', 'y.nc')
)

# Make sure the CSV output directory exists.
dataset_dir.mkdir(parents=True, exist_ok=True)

In [44]:
##########     INPUT 1      ################
# Feature table: log rrs bands, position, month and season indicators, depth, region flags.

## Log_rrs: one column per wavelength
log_rrs = x_ar['log_rrs'].values
ds = pd.DataFrame(log_rrs, columns=wavelengths)

## lat, lon
for column in ('lat', 'lon'):
    ds[column] = x_ar[column]

## month: one indicator column per entry of `months_keys`
ds[months_keys] = x_ar[months_keys].to_array().values.T

## season
for season in ('spring', 'winter', 'autumn', 'summer'):
    ds[season] = x_ar[season]

## depth
ds['depth'] = x_ar['depth']

## location: region flags cast to integers, written under lower-case column names
region_columns = (
    ('med', 'Med'),
    ('black sea', 'Black Sea'),
    ('med and black sea', 'Med and Black Sea'),
)
for column, flag in region_columns:
    ds[column] = x_ar[flag].astype(int)

## Save dataset
name = 'log_rrs_lat_lon_month_season_depth_loc.csv'
ds.to_csv(dataset_dir / name, index=False)



In [45]:
##########     INPUT 2      ################
# Feature table identical to INPUT 1 except that the band columns hold the raw `rrs` values.
ds = pd.DataFrame()

## rrs
# Stored under its own name: the original assigned these raw values to `log_rrs`,
# silently replacing the log-transformed array kept under that name.
rrs = x_ar['rrs'].values

ds[wavelengths] = rrs

## lat, lon
ds['lat'] = x_ar['lat']
ds['lon'] = x_ar['lon']

## month: one indicator column per entry of `months_keys`
ds[months_keys] = x_ar[months_keys].to_array().values.T

## season
ds['spring'] = x_ar['spring']
ds['winter'] = x_ar['winter']
ds['autumn'] = x_ar['autumn']
ds['summer'] = x_ar['summer']

## depth
ds['depth'] = x_ar['depth']

## location: region flags cast to integers
ds['med'] = x_ar['Med'].astype(int)
ds['black sea'] = x_ar['Black Sea'].astype(int)
ds['med and black sea'] = x_ar['Med and Black Sea'].astype(int)

## Save dataset
name = 'rrs_lat_lon_month_season_depth_loc.csv'
ds.to_csv(dataset_dir / name, index=False)

In [46]:
###########    OUTPUT 1    ################
## log pigments: one column per entry of `pigments`
ds = pd.DataFrame(y_ar['log_pigments'].values, columns=pigments)

## Save dataset
name = 'log_pigments.csv'
ds.to_csv(dataset_dir / name, index=False)

In [47]:
###########    OUTPUT 2    ################
## pigments (untransformed values): one column per entry of `pigments`
ds = pd.DataFrame(y_ar['pigments'].values, columns=pigments)

## Save dataset
name = 'pigments.csv'
ds.to_csv(dataset_dir / name, index=False)

In [48]:
###########    OUTPUT 3    ################
## floored pigments (`th_pigments`): one column per entry of `pigments`
ds = pd.DataFrame(y_ar['th_pigments'].values, columns=pigments)

## Save dataset
name = 'th_pigments.csv'
ds.to_csv(dataset_dir / name, index=False)

## Read Datasets

In [49]:
# Read back the floored pigment table. The file name is set here explicitly so the cell
# does not depend on whatever value `name` was left with by an earlier cell.
name = 'th_pigments.csv'
pd.read_csv(dataset_dir / name)

Unnamed: 0,chlide_a[mg*m^3],chla[mg*m^3],chlb[mg*m^3],chlc1+c2[mg*m^3],fucox[mg*m^3],19'hxfcx[mg*m^3],19'btfcx[mg*m^3],diadino[mg*m^3],allox[mg*m^3],diatox[mg*m^3],zeaxan[mg*m^3],beta_car[mg*m^3],peridinin[mg*m^3]
0,0.02330,0.7040,0.0544,0.0754,0.0577,0.1292,0.0425,0.0669,0.0681,0.0135,0.0721,0.0300,0.0124
1,0.00930,0.3988,0.0283,0.0391,0.0306,0.0925,0.0322,0.0384,0.0214,0.0182,0.0693,0.0183,0.0050
2,0.00510,0.1795,0.0091,0.0168,0.0147,0.0507,0.0149,0.0172,0.0020,0.0035,0.0441,0.0082,0.0022
3,0.00470,0.1851,0.0124,0.0171,0.0140,0.0533,0.0172,0.0184,0.0021,0.0053,0.0421,0.0115,0.0025
4,0.01710,0.5505,0.0958,0.0547,0.0590,0.0891,0.0376,0.0440,0.0414,0.0222,0.0435,0.0195,0.0075
...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,0.00248,0.0650,0.0030,0.0070,0.0030,0.0180,0.0110,0.0110,0.0010,0.0040,0.0160,0.0030,0.0010
181,0.00248,0.1290,0.0070,0.0190,0.0150,0.0300,0.0160,0.0200,0.0020,0.0030,0.0100,0.0040,0.0040
182,0.00248,0.0880,0.0040,0.0090,0.0050,0.0220,0.0110,0.0120,0.0010,0.0040,0.0140,0.0030,0.0010
183,0.00248,0.0820,0.0030,0.0090,0.0040,0.0220,0.0120,0.0120,0.0010,0.0070,0.0170,0.0030,0.0010


In [50]:
# Read back the log-rrs feature table written for INPUT 1.
name = 'log_rrs_lat_lon_month_season_depth_loc.csv'
pd.read_csv(dataset_dir.joinpath(name))

Unnamed: 0,400,412,442,490,510,560,620,665,673,681,...,November,December,spring,winter,autumn,summer,depth,med,black sea,med and black sea
0,-5.108140,-5.032818,-4.854141,-4.725271,-4.803107,-5.040214,-6.155717,-6.541807,-6.579596,-6.623231,...,0.0,0.0,0.0,1.0,0.0,0.0,-82,1,0,1
1,-5.260528,-5.234329,-5.148963,-5.100014,-5.254969,-5.637012,-6.957282,-7.374468,-7.405763,-7.456975,...,0.0,0.0,0.0,1.0,0.0,0.0,-353,1,0,1
2,-5.312197,-5.281124,-5.201425,-5.257310,-5.592267,-6.254191,-8.173213,-8.835541,-8.838639,-8.902635,...,0.0,0.0,0.0,1.0,0.0,0.0,-873,1,0,1
3,-5.143225,-5.134852,-5.099821,-5.205915,-5.551990,-6.221809,-7.853990,-8.380349,-8.427855,-8.487392,...,0.0,0.0,0.0,1.0,0.0,0.0,-504,1,0,1
4,-5.344345,-5.312536,-5.206167,-5.070675,-5.226125,-5.668222,-7.404925,-7.915476,-7.919566,-7.945734,...,0.0,0.0,0.0,1.0,0.0,0.0,-96,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,-5.570361,-5.078425,-4.971365,-5.058155,-5.475491,-6.418885,-8.352429,-8.775769,-8.763782,-9.011722,...,0.0,0.0,1.0,0.0,0.0,0.0,-1730,1,0,1
181,-5.713587,-5.190264,-5.045464,-5.049529,-5.400993,-6.183589,-7.955670,-8.355874,-8.351250,-8.512473,...,0.0,0.0,1.0,0.0,0.0,0.0,-66,1,0,1
182,-5.397095,-4.889877,-4.764209,-4.812132,-5.204164,-6.101185,-7.985244,-8.398476,-8.385315,-8.585318,...,0.0,0.0,1.0,0.0,0.0,0.0,-214,1,0,1
183,-5.027305,-4.536544,-4.442547,-4.534829,-4.944352,-5.859141,-7.772785,-8.179134,-8.153397,-8.354489,...,0.0,0.0,1.0,0.0,0.0,0.0,-358,1,0,1
