In [37]:
import pandas as pd
import os
from utils.data_reading.sound_data.station import StationsCatalog
from scipy import signal
from utils.transformation.signal import butter_bandpass_filter
from tqdm.notebook import tqdm
import datetime
import numpy as np

In [42]:
# aplose_dir = "../../../../../data/SPL/df_segments_updated.pkl"
# Directory that holds the input annotation table and receives the PSD pickles
# written by the cells below.
output_path = "../../../../../data/SPL"
# Root folder handed to StationsCatalog.
# NOTE(review): machine-specific absolute path - adjust when running elsewhere.
catalog_path = "/media/imonge/CORSAIR"
stations = StationsCatalog(catalog_path)
# Annotation table; the cells below read its 'start_datetime', 'end_datetime',
# 'filename', 'type', 'mean_start_frequency' and 'mean_end_frequency' columns.
# The path is built from output_path so the data directory is defined only once.
# SECURITY: unpickling can execute arbitrary code - only load files you trust.
df = pd.read_pickle(os.path.join(output_path, 'aplose_not_annotated_cleaned.pkl'))

In [3]:
### Aplose PSD computing

In [None]:
# Worked example on the first annotation row: find which station recorded it.
example_line = df.iloc[0]

# Stations available at the (timezone-naive) start date, e.g. [MAHY11, MAHY13, MAHY14]
example_start = example_line["start_datetime"].tz_localize(None)
possible_stations = stations.by_date(example_start)

# Last character of the second "_"-separated field of the file name, e.g. "1"
filename_fields = example_line["filename"].split("_")
station_number = filename_fields[1][-1]

# Map the last character of each station name to the station itself,
# e.g. {'1': MAHY11, '3': MAHY13, '4': MAHY14}
station_number_to_station = dict((s.name[-1], s) for s in possible_stations)
station = station_number_to_station[station_number]  # e.g. MAHY11

In [None]:
# NOTE: this whole cell is disabled (commented out). It computes the per-annotation
# PSDs without background subtraction and would write 'psd_aplose_fq_filtered.pkl'.
#
# ### Compute psds for each annotation (dataframe lines) and between mean start and end frequencies
#
# psds = []
# index = []
# nperseg = 512
# nperseg_sec = 515/240
#
#
# for i in tqdm(range(len(df))):
#     line = df.iloc[i]
#
#     # Retrieve station name from the available data in the aplose dataset
#     possible_stations = stations.by_date(line["start_datetime"].tz_localize(None))  # [MAHY11, MAHY13, MAHY14]
#     station_number = line["filename"].split("_")[1][-1]  # "1"
#     station_number_to_station = {s.name[-1]:s for s in possible_stations}  # {'1':MAHY11, '3':MAHY13, '4':MAHY14}
#     station = station_number_to_station[station_number]  # MAHY11
#
#     # Compute PSDs on the annotation boxes
#     manager = station.get_manager()
#
#     date_start = line['start_datetime'].tz_localize(None)
#     date_end = line['end_datetime'].tz_localize(None)
#     delta = (date_end - date_start).total_seconds()
#
#     # Widen boxes which are too short
#     if delta < nperseg_sec:
#         diff = nperseg_sec - delta
#         date_end = date_end + datetime.timedelta(seconds=diff)
#
#     # Frequency range when we have weak annotations ('ship_noise')
#     if line['type'] == 'WEAK':
#         start_frequency = 1e-3 # does not tolerate strict 0 Hz
#         end_frequency=120 - 1e-3 # does not tolerate strict 120 Hz
#     else:
#         start_frequency = line['mean_start_frequency']
#         end_frequency = line['mean_end_frequency']
#
#     data = manager.get_segment(date_start, date_end)
#     # data = butter_bandpass_filter(data, start_frequency, end_frequency, manager.sampling_f)  # band-pass filter (original note: between 40 and 45 Hz), specifying the sampling frequency (240 could also be hard-coded)
#
#
#     psd_frequencies, psd = signal.welch(data, fs=240, nperseg=nperseg) # Frequency bins in psd_frequencies, values in psd
#
#     # Filter between mean start and end frequencies. Nan values when out of bounds
#     psd_filtered = np.where((psd_frequencies >= start_frequency) & (psd_frequencies <= end_frequency), psd, np.nan)
#
#     psds.append(psd_filtered)
#     index.append(i)
#
# # Save computed psds in a dataframe
# df_psd = pd.DataFrame(psds, index=df.index, columns=psd_frequencies)
# df_psd.to_pickle(os.path.join(output_path, 'psd_aplose_fq_filtered.pkl'))
#
# # With this script we can find out at which frequencies the sounds are and at which frequency the signal is the most powerful

In [None]:
### Compute psds for each annotation (dataframe lines) and between mean start and end frequencies - remove background noise
#
# For every row of `df`:
#   1. resolve the recording station from the start date and the file name,
#   2. compute the Welch PSD inside the annotation box,
#   3. compute the Welch PSDs of the `segment_duration` seconds before and after
#      the box and average them (background),
#   4. subtract the background from the box PSD inside the annotation's frequency
#      band (NaN outside the band) and replace negative values by zero.
# Rows that cannot be processed are stored as all-NaN rows so that the result
# stays aligned with `df.index`.

psds_cleaned = []
index = []
fs = 240  # sampling frequency passed to welch (Hz)
nperseg = 512
# Boxes shorter than this many seconds are widened.
# NOTE(review): 515/240 is slightly more than nperseg / fs (512/240) - confirm
# whether 515 is a deliberate margin or a typo for 512.
nperseg_sec = 515/240
segment_duration = 200

# Frequency axis, taken once from welch itself so that it is defined even if no
# row can be processed and always matches the bins welch returns for `nperseg`.
f, _ = signal.welch(np.zeros(nperseg), fs=fs, nperseg=nperseg)
# Placeholder row for failed annotations: always one value per frequency bin.
nan_row = np.full(f.shape, np.nan)


def _welch_psd(samples):
    """Return the Welch PSD of `samples`, one value per bin of `f`.

    Raises ValueError when fewer than `nperseg` samples are available: welch
    would then shrink its segment length and return a PSD with fewer bins.
    """
    n_samples = np.asarray(samples).shape[-1]
    if n_samples < nperseg:
        raise ValueError(f"only {n_samples} samples available, need at least {nperseg}")
    return signal.welch(samples, fs=fs, nperseg=nperseg)[1]


for i in tqdm(range(len(df))):
    line = df.iloc[i]

    try:
        ## Station
        # Retrieve station name from the available data in the aplose dataset
        possible_stations = stations.by_date(line["start_datetime"].tz_localize(None))  # [MAHY11, MAHY13, MAHY14]
        station_number = line["filename"].split("_")[1][-1]  # "1"
        station_number_to_station = {s.name[-1]:s for s in possible_stations}  # {'1':MAHY11, '3':MAHY13, '4':MAHY14}
        station = station_number_to_station[station_number]  # MAHY11
        manager = station.get_manager()

        ## Date
        date_start = line['start_datetime'].tz_localize(None)
        date_end = line['end_datetime'].tz_localize(None)
        delta = (date_end - date_start).total_seconds()

        # Widen boxes which are too short
        if delta < nperseg_sec:
            date_end = date_end + datetime.timedelta(seconds=nperseg_sec - delta)

        ## Frequencies
        # Frequency range when we have weak annotations ('ship_noise')
        if line['type'] == 'WEAK':
            start_frequency = 1e-3 # does not tolerate strict 0 Hz
            end_frequency = 120 - 1e-3 # does not tolerate strict 120 Hz
        else:
            start_frequency = line['mean_start_frequency']
            end_frequency = line['mean_end_frequency']
        in_band = (f >= start_frequency) & (f <= end_frequency)

        ## Main PSD (inside annotation box)
        psd_signal = _welch_psd(manager.get_segment(date_start, date_end))

        ## Background PSD (segment_duration seconds before and after the box)
        background_window = datetime.timedelta(seconds=segment_duration)
        psd_before = _welch_psd(manager.get_segment(date_start - background_window, date_start))
        psd_after = _welch_psd(manager.get_segment(date_end, date_end + background_window))

        ## Mean of both segments
        psd_background = (psd_before + psd_after) / 2

        # Subtraction, restricted to the annotation band (NaN elsewhere)
        psd_cleaned = np.where(in_band, psd_signal - psd_background, np.nan)

        # Replace negative values by zeros (NaN values are left untouched)
        psd_cleaned = np.where(psd_cleaned < 0, 0, psd_cleaned)

    except Exception as e:
        # Best effort: if anything fails for this row (e.g. segment not
        # available), report it and keep an all-NaN row of the right length.
        print(f"[WARNING] Segment {i} failed: {e}")
        psd_cleaned = nan_row.copy()

    # Storing (exactly one row per annotation, in dataframe order)
    psds_cleaned.append(psd_cleaned)
    index.append(i)

# Save computed psds in a dataframe
df_psd_cleaned = pd.DataFrame(psds_cleaned, index=df.index, columns=f)
df_psd_cleaned.to_pickle(os.path.join(output_path, 'psd_aplose_fq_filtered_no_background.pkl'))



# With this script we can find out at which frequencies the sounds are and at which frequency the signal is the most powerful

In [43]:
### Compute psds on the whole 10min time window for the dataset with or without ship_noise (change input data and output name -> ship_only / no_ship)
#
# For every row of `df`, compute the Welch PSD of the full segment between
# 'start_datetime' and 'end_datetime' and keep only the bins inside a fixed
# frequency band (NaN elsewhere). Rows are indexed by their start datetime.

psds = []
index = []
fs = 240  # sampling frequency passed to welch (Hz)
nperseg = 512
# File written in `output_path`; change it together with the input dataset.
output_name = 'psd_aplose_no_ship.pkl'

# Fixed analysis band, identical for every row
start_frequency = 5
end_frequency = 120 - 1e-3 # does not tolerate strict 120 Hz

# Frequency axis and band mask do not depend on the row: compute them once.
# Taking the axis from welch itself keeps it identical to the per-row bins and
# defines it even when `df` is empty.
psd_frequencies, _ = signal.welch(np.zeros(nperseg), fs=fs, nperseg=nperseg)
in_band = (psd_frequencies >= start_frequency) & (psd_frequencies <= end_frequency)


for i in tqdm(range(len(df))):
    line = df.iloc[i]

    # Retrieve station name from the available data in the aplose dataset
    possible_stations = stations.by_date(line["start_datetime"].tz_localize(None))  # [MAHY11, MAHY13, MAHY14]
    station_number = line["filename"].split("_")[1][-1]  # "1"
    station_number_to_station = {s.name[-1]:s for s in possible_stations}  # {'1':MAHY11, '3':MAHY13, '4':MAHY14}
    station = station_number_to_station[station_number]  # MAHY11

    # Compute the PSD on the whole time window of the row
    manager = station.get_manager()

    date_start = line['start_datetime'].tz_localize(None)
    date_end = line['end_datetime'].tz_localize(None)

    data = manager.get_segment(date_start, date_end)
    # data = butter_bandpass_filter(data, start_frequency, end_frequency, manager.sampling_f)  # band-pass filter (original note: between 40 and 45 Hz), specifying the sampling frequency (240 could also be hard-coded)

    _, psd = signal.welch(data, fs=fs, nperseg=nperseg)  # PSD values, one per bin of psd_frequencies

    # Keep only the bins inside the band. Nan values when out of bounds
    psd_filtered = np.where(in_band, psd, np.nan)

    psds.append(psd_filtered)
    index.append(line['start_datetime'])

# Save computed psds in a dataframe
df_psd = pd.DataFrame(psds, index=index, columns=psd_frequencies)
df_psd.to_pickle(os.path.join(output_path, output_name))

# NB: rename the output from 'ship_only' to 'with_ship', because these are not spectrograms containing ONLY ship noise

  0%|          | 0/32 [00:00<?, ?it/s]