### This module computes the SPL, the SEL and the zero-to-peak level on each annotation box
The input file is the **cleaned 'annotation result'** database 'aplose_cleaned'.<br>
The output file is the dataframe 'df_segments_updated' with the additional columns 'signal', 'SPL_db', 'SEL_db', 'zero_to_pk_db', 'duration', 'mean_start_frequency', 'mean_end_frequency', and 'species'.

In [1]:
import pandas as pd
import os
from utils.data_reading.sound_data.station import StationsCatalog
from tqdm.notebook import tqdm
import numpy as np

In [2]:
# Input: pickled dataframe of the cleaned annotations (a file path, despite the "_dir" name)
aplose_dir = "../../../../../data/SPL/aplose_cleaned.pkl" # clean pkl dataframe
# Folder that receives the pickles written by this notebook
output_path = "../../../../../data/SPL"
# Location handed to StationsCatalog -- presumably the root of the recordings drive; confirm
catalog_path = "/media/imonge/CORSAIR"
stations = StationsCatalog(catalog_path)
# NOTE: unpickling can execute arbitrary code; only load files produced by a trusted pipeline
df = pd.read_pickle(aplose_dir)

In [None]:
### Preparation ###

In [3]:
# Extract the raw signal of every annotation box (one dict per box)
segments = []
index = []  # annotation indices that were successfully extracted, in the same order as `segments`

# Retrieve station name from the available data in the aplose dataset
for i in tqdm(df.index):
    line = df.loc[i]
    # Drop the timezone information (the wall-clock time is kept).
    # NOTE(review): presumably the catalog/manager expects naive datetimes -- confirm
    start_time = line['start_datetime'].tz_localize(None)
    end_time = line['end_datetime'].tz_localize(None)

    # The filename only carries the last digit of the station name, so it is
    # matched against the stations available at the start date of the box.
    possible_stations = stations.by_date(start_time)  # [MAHY11, MAHY13, MAHY14]
    station_number = line["filename"].split("_")[1][-1]  # "1"
    station_number_to_station = {s.name[-1]: s for s in possible_stations}  # {'1':MAHY11, '3':MAHY13, '4':MAHY14}
    station = station_number_to_station.get(station_number)  # MAHY11

    if station is None:
        print(f"Station{station_number} not found for {line['filename']}")
        continue

    manager = station.get_manager()

    try:
        segment = manager.get_segment(start_time, end_time)
    except Exception as e:
        # Best effort: report the actual error for this box and move on to the next one
        print(f" Error charging segment {i}: {e}")
        continue

    segments.append({
        "signal": segment,
        "sampling_rate": manager.sampling_f,
        "start_datetime": line["start_datetime"],
        "end_datetime": line["end_datetime"],
        "station": station.name,
        "label": line["annotation"],
        "start_frequency": line["start_frequency"],
        "end_frequency": line["end_frequency"],
        "type": line["type"],
        "filename": line["filename"],
        "confidence": line["confidence_indicator_label"],
    })
    index.append(i)

# Save the extracted segments in a dataframe that keeps the original annotation indices
df_segments = pd.DataFrame(segments, index=index)
df_segments.to_pickle(os.path.join(output_path, 'df_segments.pkl'))

  0%|          | 0/5419 [00:00<?, ?it/s]

In [None]:
### Add new columns to dataframe ###

In [4]:
### Compute SPL (SPL = Lp = Lp,rms)
def compute_SPL(signal):
    """
    Calculate the mean-square sound pressure level (SPL = Lp = Lp,rms).

    SPL = 10 * log10(mean(p**2) / p0**2) with p0 = 1 µPa, so the result is
    in dB re 1 µPa² (Ainslie et al., 2022).

    Parameters
    ----------
    signal : array-like
        Pressure samples. No calibration is applied here, so they are assumed
        to already be expressed in µPa -- TODO confirm against the data reader.

    Returns
    -------
    float
        SPL in dB; ``-np.inf`` for an all-zero signal, ``np.nan`` for an empty one.
    """
    # Work in float64 so that squaring integer samples cannot overflow
    samples = np.asarray(signal, dtype=np.float64)
    if samples.size == 0:
        return np.nan  # the level of an empty segment is undefined
    mean_square = np.mean(samples**2)
    if mean_square == 0:
        return -np.inf  # silence
    return 10 * np.log10(mean_square)  # p0 = 1 µPa, hence no explicit division by p0**2

# Store the SPL of every extracted signal in the 'SPL_db' column
df_segments['SPL_db'] = df_segments['signal'].map(compute_SPL)

In [5]:
### Compute cumulated sound exposure levels unweighted (SEL)
def compute_SEL(signal, sampling_rate):
    """
    Calculate the unweighted cumulative sound exposure level (SEL).

    SEL = 10 * log10(E / (p0**2 * t0)) with E = sum(p**2) * dt,
    p0 = 1 µPa and t0 = 1 s, so the result is in dB re 1 µPa²·s
    (Ainslie et al., 2022).

    Parameters
    ----------
    signal : array-like
        Pressure samples. No calibration is applied here, so they are assumed
        to already be expressed in µPa -- TODO confirm against the data reader.
    sampling_rate : float
        Sampling frequency of ``signal`` in Hz; must be strictly positive.

    Returns
    -------
    float
        SEL in dB; ``-np.inf`` when the exposure is zero (all-zero or empty signal).

    Raises
    ------
    ValueError
        If ``sampling_rate`` is zero or negative.
    """
    if sampling_rate <= 0:
        raise ValueError(f"sampling_rate must be positive, got {sampling_rate}")
    # Work in float64 so that squaring integer samples cannot overflow
    samples = np.asarray(signal, dtype=np.float64)
    dt = 1 / sampling_rate  # duration of one sample, in seconds
    E = np.sum(samples**2) * dt  # sound exposure (time-integrated squared pressure)
    if E == 0:
        return -np.inf
    return 10 * np.log10(E)  # p0 = 1 µPa and t0 = 1 s, hence no explicit division

# Add column for SEL unweighted
def _sel_of_row(row):
    """Return the SEL of one df_segments row from its signal and sampling rate."""
    return compute_SEL(row['signal'], row['sampling_rate'])


df_segments['SEL_db'] = df_segments.apply(_sel_of_row, axis=1)

In [6]:
### Compute 0-to-peak sound pressure level (Lp, 0-pk)
def compute_zero_to_pk(signal):
    """
    Calculate the zero-to-peak sound pressure level (Lp,0-pk).

    L0pk = 10 * log10(ppk**2 / p0**2) with ppk = max(|p|) and p0 = 1 µPa,
    i.e. a level in dB re 1 µPa (Ainslie et al., 2022).

    Parameters
    ----------
    signal : array-like
        Pressure samples. No calibration is applied here, so they are assumed
        to already be expressed in µPa -- TODO confirm against the data reader.

    Returns
    -------
    float
        Zero-to-peak level in dB; ``-np.inf`` for an all-zero signal,
        ``np.nan`` for an empty one.
    """
    # Work in float64: abs() and the square can both overflow on integer dtypes
    samples = np.asarray(signal, dtype=np.float64)
    if samples.size == 0:
        return np.nan  # no sample, hence no peak
    ppk = np.max(np.abs(samples))  # peak value
    if ppk == 0:
        return -np.inf  # null signal -> log(0)
    return 10 * np.log10(ppk**2)  # p0 = 1 µPa, hence no explicit division by p0**2

# Store the zero-to-peak level of every extracted signal in the 'zero_to_pk_db' column
df_segments['zero_to_pk_db'] = df_segments['signal'].map(compute_zero_to_pk)

In [7]:
### Add a signal duration column
# Make sure both bounds are datetime64 so that they can be subtracted
df_segments['start_datetime'] = pd.to_datetime(df_segments['start_datetime'])
df_segments['end_datetime'] = pd.to_datetime(df_segments['end_datetime'])

# Compute the duration in seconds (except for ship noise, which is left missing).
# The missing value is NaN rather than None so that the column stays float64
# instead of becoming an object column mixing floats and None.
mask = df_segments['label'] != 'ship_noise'
elapsed_seconds = (df_segments['end_datetime'] - df_segments['start_datetime']).dt.total_seconds()
df_segments['duration'] = elapsed_seconds.where(mask)

In [8]:
### Add the mean_start_frequency and the mean_end_frequency
# Average the start/end frequencies over all boxes sharing the same label
freq_columns = ['start_frequency', 'end_frequency']
mean_freq = (
    df_segments
    .groupby('label')[freq_columns]
    .mean()
    .rename(columns=lambda name: f'mean_{name}')
)
# Attach the per-label means to every row carrying that label
# NOTE(review): the merged frame may not keep the original index of df_segments -- confirm if it matters downstream
df_segments = df_segments.merge(mean_freq, how='inner', on='label')

In [9]:
### Add column 'species'
def map_species(label):
    """
    Map an annotation label to a species name.

    The rules are tried in this order and the first match wins:
    labels starting with 'Omura' -> 'omura'; starting with 'FW' -> 'fin_whale';
    containing 'Minke' -> 'minke'; having 'BW' at position 0 or 1 -> 'blue_whale';
    starting with 'P_call', 'ind_42Hz' or 'ind_8s' -> 'ind'.

    Parameters
    ----------
    label : str
        Annotation label. Non-string values (None, NaN) are accepted.

    Returns
    -------
    str or float
        The species name, or ``np.nan`` when the label is missing or matches no rule.
    """
    if not isinstance(label, str):
        return np.nan  # missing annotation: nothing to match against
    if label.startswith('Omura'):
        return 'omura'
    if label.startswith('FW'):
        return 'fin_whale'
    if 'Minke' in label:
        return 'minke'
    # 'BW' may be preceded by a single prefix character
    if label.startswith('BW') or label[1:3] == 'BW':
        return 'blue_whale'
    if label.startswith(('P_call', 'ind_42Hz', 'ind_8s')):
        return 'ind'
    return np.nan

# Derive the species of every box from its annotation label
df_segments['species'] = df_segments['label'].map(map_species)

In [10]:
# Write the enriched dataframe to the output folder
updated_pkl = os.path.join(output_path, 'df_segments_updated.pkl')
df_segments.to_pickle(updated_pkl)

In [16]:
# Display only the rows annotated as ship noise
df_segments.loc[df_segments['label'] == 'ship_noise']

Unnamed: 0,signal,sampling_rate,start_datetime,end_datetime,station,label,start_frequency,end_frequency,type,filename,confidence,SPL_db,SEL_db,zero_to_pk_db,duration,mean_start_frequency,mean_end_frequency,species
56,"[-937042.6875462213, -863974.1254842329, -5676...",240.0,2021-08-15 16:06:18+00:00,2021-08-15 16:16:18+00:00,MAHY11,ship_noise,0.0,120.0,WEAK,00838_MAHY1_20210815_160618.wav,Sure,112.023905,139.805327,123.801765,,0.0,120.0,
67,"[-1895008.9440373676, -1718037.923156712, -157...",240.0,2021-08-16 07:35:59+00:00,2021-08-16 07:45:59+00:00,MAHY11,ship_noise,0.0,120.0,WEAK,01496_MAHY1_20210816_073559.wav,Sure,113.703171,141.484593,126.914176,,0.0,120.0,
70,"[448973.21725354524, 167970.183366192, 314861....",240.0,2021-08-19 02:38:46+00:00,2021-08-19 02:48:46+00:00,MAHY13,ship_noise,0.0,120.0,WEAK,01079_MAHY3_20210819_023846.wav,Sure,112.179241,139.960693,125.237080,,0.0,120.0,
72,"[483024.83517473395, 883357.1671121997, 661108...",240.0,2021-08-20 10:29:27+00:00,2021-08-20 10:39:27+00:00,MAHY14,ship_noise,0.0,120.0,WEAK,01226_MAHY4_20210820_102927.wav,Sure,115.480670,143.262092,128.019856,,0.0,120.0,
77,"[1185068.3732099168, 1133117.1438005886, 99114...",240.0,2021-08-21 01:08:05+00:00,2021-08-21 01:18:05+00:00,MAHY13,ship_noise,0.0,120.0,WEAK,01595_MAHY3_20210821_010805.wav,Sure,112.835312,140.616734,126.577684,,0.0,120.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5414,"[1213147.6754758859, 1238540.2963343253, 13713...",240.0,2024-09-04 22:56:06+00:00,2024-09-04 23:06:06+00:00,MAHY43,ship_noise,0.0,120.0,WEAK,01127_MAHY3_20240904_225606.wav,Sure,111.360309,139.141731,124.083464,,0.0,120.0,
5415,"[409069.39004213345, 310737.7613097829, 252180...",240.0,2024-09-11 04:31:20+00:00,2024-09-11 04:41:20+00:00,MAHY43,ship_noise,0.0,120.0,WEAK,01242_MAHY3_20240911_043120.wav,Sure,112.260670,140.042092,125.084105,,0.0,120.0,
5416,"[778946.4954186922, 927933.8116798701, 636083....",240.0,2024-09-11 09:20:50+00:00,2024-09-11 09:30:50+00:00,MAHY41,ship_noise,0.0,120.0,WEAK,01640_MAHY1_20240911_092050.wav,Sure,109.540405,137.321827,122.289855,,0.0,120.0,
5417,"[-490493.99359672423, -192778.4694505288, -287...",240.0,2024-09-15 09:01:38+00:00,2024-09-15 09:11:38+00:00,MAHY41,ship_noise,0.0,120.0,WEAK,01013_MAHY1_20240915_090138.wav,Sure,110.408427,138.189849,123.077013,,0.0,120.0,
