In [1]:
import os, sys
from netCDF4 import Dataset
import numpy as np
import zipfile 
import plotly.express as px
#import xarray as xr
sys.path.append('../../../../../Biologging_Toolkit/src/')
from Biologging_Toolkit.auxiliary import Api_ERA
from Biologging_Toolkit.utils import *
from collections import defaultdict
from glob import glob
from utils.data_reading.sound_data.station import StationsCatalog
import netCDF4 as nc
import numpy as np
import os
import pandas as pd
import re
import xarray as xr

/home/imonge/PycharmProjects/PythonProject/toolbox/src/notebooks/others/../../../../../Biologging_Toolkit/src/Biologging_Toolkit


In [None]:
# This notebook requires the "Biologging_Toolkit" repository to be set as Source Root
# (https://github.com/gmanatole/Biologging_Toolkit.git).

# --- Initialisation ---
# Personal access token = the API token shown in your personal profile once logged in
# to the copernicus.eu website.
# It is read from the CDS_API_TOKEN environment variable so that no real secret has to
# be stored in the notebook; the literal below is only a placeholder fallback and must
# never be replaced by a real token in a committed file.
personal_access_token = os.environ.get('CDS_API_TOKEN', 'a0...f6')
path = '/toolbox/data/test_correlation/ERA5_data_temp'  # Directory receiving the data downloaded from ERA5

# --- Choice of the ERA5 data ---
variables = ['sea_surface_temperature']
years = ['2020', '2021', '2022', '2023', '2024']
months = 'all'
days = 'all'
hours = 'all'

In [None]:
catalog_path = "/media/imonge/CORSAIR"  # Path of the drive holding the MAHY data, used to retrieve the geographic coordinates of the hydrophones

# ERA5 data is downloaded as NetCDF in 6-month periods
# (the data is too large to be downloaded for more than 6 months at a time).
Api_ERA.make_cds_file(personal_access_token, path)
# Only MAHY00, 01, 02 and 04 are fetched: downloading the data for MAHY1*, 2*, 3* or 4*
# would be redundant because the stations barely moved.
stations = StationsCatalog(catalog_path).by_dataset("MAHY0")
# selected_stations = [s for s in stations if s.name =="MAHY04"] # If you need to select a given hydrophone

# First month of each half-year -> list of months requested for that period.
half_year_months = {
    1: [1, 2, 3, 4, 5, 6],
    7: [7, 8, 9, 10, 11, 12],
}

for station in stations:
    # Bounding box of +/- 0.01 around the station position (index 0 = lat, index 1 = lon).
    position = station.get_pos()
    lat = position[0]
    lon = position[1]
    south_boundary = lat - 0.01
    north_boundary = lat + 0.01
    west_boundary = lon - 0.01
    east_boundary = lon + 0.01

    filename_base = station.name

    # 6-month loop
    for year in years:
        # A dedicated name is used for the month list so the configuration-level
        # `months` setting is not overwritten by this loop.
        for start_month, period_months in half_year_months.items():
            filename = f"{filename_base}_{year}_{start_month:02d}"

            # Skip periods that are already on disk. The original check built
            # "<filename>nc" (missing dot), which can never exist, so everything was
            # re-downloaded on each run. Both the ".nc" file and the "<filename>"
            # directory are accepted: the concatenation step of this notebook reads
            # the downloads from directories named "<station>_<year>_<month>".
            # NOTE(review): confirm which of the two Api_ERA.return_cdsbeta produces.
            already_downloaded = any(
                os.path.exists(candidate)
                for candidate in (
                    os.path.join(path, filename + ".nc"),
                    os.path.join(path, filename),
                )
            )
            if already_downloaded:
                print(f"Déja téléchargé : {filename}")
                continue

            print(f"Téléchargement : {filename} | Année : {year} | Mois : {period_months}")

            Api_ERA.return_cdsbeta(
                filename,
                variables,
                [year],
                period_months,
                days,
                hours,
                [north_boundary, west_boundary, south_boundary, east_boundary],
            )

In [6]:
base_dir_nc = "/toolbox/data/test_correlation/ERA5_data_temp"  # Path of the directory containing the NetCDF data
output_dir_nc = "/home/imonge/PycharmProjects/PythonProject/toolbox/data/test_correlation/ERA5_concat"  # Path of the directory for the concatenated NetCDF data

# Concatenation of NetCDF files: data type -> name of the NetCDF file looked up
# inside each "<station>_<year>_<month>" download directory.
file_map = {
    "temperature": "data_stream-oper_stepType-instant.nc"
    # "waves": "data_stream-wave_stepType-instant.nc",
    # "wind": "data_stream-oper_stepType-instant.nc",
    # "rain": "data_stream-oper_stepType-accum.nc"
}

# Finding directories named MAHYxx_YYYY_MM
all_dirs = [d for d in os.listdir(base_dir_nc) if os.path.isdir(os.path.join(base_dir_nc, d))]

# Extracting the station id (MAHY01, MAHY02 ...) from the directory names
dir_pattern = re.compile(r"^(MAHY\d{2})_\d{4}_\d{2}$")
hydrophones_found = set()
for dirname in all_dirs:
    match = dir_pattern.match(dirname)
    if match:
        hydrophones_found.add(match.group(1))

hydrophones_sorted = sorted(hydrophones_found)
print(f"Detected hydrophones : {hydrophones_sorted}")

# Storing concatenated datasets: hydrophone -> {data type -> dataset}
datasets_by_hydrophone = defaultdict(dict)

# For each hydrophone, concatenate every data type listed in file_map
for hydrophone in hydrophones_sorted:
    for data_type, filename in file_map.items():
        glob_pattern = os.path.join(base_dir_nc, f"{hydrophone}_*", filename)
        files = sorted(glob(glob_pattern))

        if not files:
            print(f"No files found in {data_type} for {hydrophone}")
            continue

        try:
            ds = xr.open_mfdataset(files, combine="by_coords")
            datasets_by_hydrophone[hydrophone][data_type] = ds
            print(f"{data_type} : {len(files)} fichiers concatenated")
        except Exception as e:
            # Best effort: report the failure and carry on with the other datasets.
            print(f"Error concatenation {data_type} for {hydrophone} : {e}")

# Make sure the destination exists before writing the merged files.
os.makedirs(output_dir_nc, exist_ok=True)

# Merge and save concatenated files
for hydrophone, data_dict in datasets_by_hydrophone.items():
    if not data_dict:
        continue

    for data_type, ds in data_dict.items():
        try:
            # The hydrophone id must be part of the file name: with a literal "*"
            # every hydrophone was written to the same path, each one overwriting
            # the previous result.
            output_filename = f"ERA5_{hydrophone}_{data_type}_merged.nc"
            output_path = os.path.join(output_dir_nc, output_filename)
            ds.to_netcdf(output_path)
            print(f"Merging saved : {output_path}")
        except Exception as e:
            print(f"Merging failed for  {hydrophone} :{e}")

Detected hydrophones : ['MAHY01', 'MAHY02', 'MAHY03', 'MAHY04']
temperature : 10 fichiers concatenated
temperature : 10 fichiers concatenated
temperature : 10 fichiers concatenated
temperature : 10 fichiers concatenated
Merging saved : /home/imonge/PycharmProjects/PythonProject/toolbox/data/test_correlation/ERA5_concat/ERA5_MAHY01_temperature_merged.nc
Merging saved : /home/imonge/PycharmProjects/PythonProject/toolbox/data/test_correlation/ERA5_concat/ERA5_MAHY02_temperature_merged.nc
Merging saved : /home/imonge/PycharmProjects/PythonProject/toolbox/data/test_correlation/ERA5_concat/ERA5_MAHY03_temperature_merged.nc
Merging saved : /home/imonge/PycharmProjects/PythonProject/toolbox/data/test_correlation/ERA5_concat/ERA5_MAHY04_temperature_merged.nc


In [None]:
# Saving NetCDF files with format
base_dir_csv = output_dir_nc # Path of the directory with the concatenated NetCDF data
output_dir_csv = os.path.join(base_dir_csv, "csv_outputs") # Path of the directory for the concatenated csv files
os.makedirs(output_dir_csv, exist_ok=True)

# Variables à extraire selon le type
var_map = {
    # "waves": ["mwp", "swh"],
    # "wind": ["u10", "v10"],
    # "rain": ["tp"]
}

for file in os.listdir(base_dir_csv):
    if not file.endswith(".nc"):
        continue

    for dtype in var_map:
        if f"_{dtype}_" in file:
            file_path = os.path.join(base_dir_csv, file)
            data = nc.Dataset(file_path)

            # Extraction des variables
            try:
                variables = var_map[dtype]
                df_data = {}
                for var in variables:
                    df_data[var] = np.array(data.variables[var])[:, 0, 0]

                # Gérer les timestamps
                dates = np.array(data.variables["valid_time"])
                datetime_index = pd.to_datetime(dates, unit='s')

                # Création du dataframe
                df = pd.DataFrame(df_data, index=datetime_index)
                df.index.name = 'date'

                # Définir nom de sortie
                csv_name = file.replace(".nc", ".csv")
                output_path = os.path.join(output_dir_csv, csv_name)

                # Sauvegarde
                df.to_csv(output_path)
                print(f"csv généré : {output_path}")

            except Exception as e:
                print(f"Erreur traitepent {file} :{e}")