In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os
from utils.data_reading.sound_data.station import StationsCatalog
from scipy import signal
from utils.transformation.signal import butter_bandpass_filter
from tqdm.notebook import tqdm
import datetime

In [23]:
# Folder holding the APLOSE export and the files written by this notebook
output_path = "../../../../data/SPL"
# APLOSE annotation export; note this is a CSV file despite the "_dir" name.
# Built from output_path so the two locations cannot drift apart.
aplose_dir = os.path.join(output_path, "aplose.csv")
# Root passed to StationsCatalog. The default is a machine-specific mount point;
# set the STATIONS_CATALOG_PATH environment variable to run elsewhere.
catalog_path = os.environ.get("STATIONS_CATALOG_PATH", "/media/imonge/CORSAIR")
stations = StationsCatalog(catalog_path)

In [24]:
# Convert the APLOSE CSV to pickle format, then reload the annotations from that pickle.
# A single path variable is used for both the write and the read so they always
# target the same file, whatever output_path is set to.
aplose_pkl_path = os.path.join(output_path, 'aplose.pkl')
df = pd.read_csv(aplose_dir, parse_dates=["start_datetime", "end_datetime"])
df.to_pickle(aplose_pkl_path)
df = pd.read_pickle(aplose_pkl_path)

In [25]:
# Label renaming table: original APLOSE label -> (short name, long name)
labels_info = {
    # --- Whales ---
    "Dcall": ('BW_dcall', 'Blue Whale D-call'),
    "Antarctic blue whale song": ('ABW', 'Antarctic blue whale'),
    "Australian pygmy blue whale song": ('PBW_SEIO', 'Australian pygmy blue whale'),
    "Madagascan pygmy blue whale song": ('PBW_SWIO', 'Madagascan pygmy blue whale'),
    "Sri Lanka pygmy blue whale song": ('PBW_CIO', 'Sri Lankan pygmy blue whale'),
    "Omura Australia (19-25 Hz)": ('Omura_SEIO', "Australian Omura's whale"),
    "Omura DGC LF (20 Hz)": ('Omura_SWIO', "Madagascan Omura's whale"),
    "Omura DGC HF (30-40Hz)": ('Omura_CIO', "Diego Garcian Omura's whale"),
    "Minke whale": ('Minke', 'Minke whale'),
    "Fin whale 40 Hz": ('FW_nsp', 'Fin whale non stereotyped pulse'),
    "Fin whale 20 Hz": ('FW_20Hz', 'Fin whale 20-Hz pulse'),

    # --- Whales, indeterminate species ---
    "P-call": ('P_call', 'P-call'),
    "Ind 42 Hz": ('ind_42Hz', 'Indeterminate species - 42 Hz'),
    "LF 8 sec pulse": ('ind_8s', 'Indeterminate species -  8 sec'),

    # --- Geophony ---
    "T-wave": ('eq', 'earthquake'),
    "impulse_geo": ('impuls_geo', 'Impulsive volcanic event'),

    # --- Anthropophony ---
    "career_shot": ('career_shot', 'Career shot'),
    "Airgun": ('seismic_shot', 'Seismic shot'),
    "ship_noise": ('ship_noise', 'Ship noise'),
    "anthropophony": ('ind_anthro', 'Indeterminate anthropophony'),
}

# Mapping original label -> short name (first element of each tuple)
short_name = {original: short for original, (short, _long) in labels_info.items()}

# Substitute the short names in the annotation column; labels that are not
# keys of the mapping are left unchanged by replace()
df['annotation'] = df['annotation'].replace(short_name)

# Optional (disabled): add a "long_name" column derived from the short name.
# Iterate over the (short, long) tuples, i.e. labels_info.values().
# long_name = {short: long for short, long in labels_info.values()}
# df['long_name'] = df['annotation'].map(long_name)

In [26]:
# Keep a row when it is not of type "weak", or when it is a ship_noise annotation
# (i.e. drop every "weak" row except the ship_noise ones)
keep_row = (df['type'] != 'weak') | (df['annotation'] == 'ship_noise')
df = df[keep_row]

In [27]:
# Drop rows whose annotation label is missing (rows that only carry a comment)
df = df.dropna(subset=['annotation'])

In [28]:
# Remove empty columns
columns_to_delete = [
    'signal_quality',
    'signal_start_frequency',
    'signal_end_frequency',
    'signal_relative_max_frequency_count',
    'signal_relative_min_frequency_count',
    'signal_has_harmonics',
    'signal_trend',
    'signal_steps_count',
    'annotator_expertise',
]
df = df.drop(columns_to_delete, axis=1)

In [29]:
# Worked example on the first row: resolve which station recorded the annotated file.
example_line = df.iloc[0]

# Stations returned by the catalog for the (timezone-naive) annotation start date,
# e.g. [MAHY11, MAHY13, MAHY14]
possible_stations = stations.by_date(example_line["start_datetime"].tz_localize(None))

# Last character of the second "_"-separated field of the filename, e.g. "1"
station_number = example_line["filename"].split("_")[1][-1]

# Index the candidate stations by the last character of their name,
# e.g. {'1': MAHY11, '3': MAHY13, '4': MAHY14}
station_number_to_station = dict((s.name[-1], s) for s in possible_stations)

# Station matching the number found in the filename, e.g. MAHY11
station = station_number_to_station[station_number]

In [40]:
# Compute one Welch PSD per annotation and store them in a DataFrame that shares
# the index labels of df.
psds = []
index = []
psd_frequencies = None  # stays None when df is empty, so the DataFrame below can still be built
nperseg = 512
# Minimum annotation duration (s) used before calling welch.
# NOTE(review): 515 rather than nperseg (512) samples at 240 Hz — possibly a
# deliberate safety margin, possibly a typo; confirm before changing.
nperseg_sec = 515/240

# Retrieve station name from the available data in the aplose dataset
for i in tqdm(df.index):
    # i is an index *label*: df has been filtered above, so its index has gaps and
    # positional access (iloc) would read another row or run past the end.
    line = df.loc[i]
    possible_stations = stations.by_date(line["start_datetime"].tz_localize(None))  # e.g. [MAHY11, MAHY13, MAHY14]
    station_number = line["filename"].split("_")[1][-1]  # e.g. "1"
    station_number_to_station = {s.name[-1]:s for s in possible_stations}  # e.g. {'1':MAHY11, '3':MAHY13, '4':MAHY14}
    station = station_number_to_station[station_number]  # e.g. MAHY11

    # Compute PSDs on the annotation boxes
    manager = station.get_manager()

    date_start = line['start_datetime'].tz_localize(None)
    date_end = line['end_datetime'].tz_localize(None)
    delta = (date_end - date_start).total_seconds()

    # Widen boxes which are too short (only the end date is pushed back)
    if delta < nperseg_sec:
        diff = nperseg_sec - delta
        date_end = date_end + datetime.timedelta(seconds=diff)

    # Frequency range when we have weak annotations ('ship_noise')
    if line['is_box'] == 0:
        start_frequency = 1e-3 # does not tolerate strict 0 Hz
        end_frequency=120 - 1e-3 # does not tolerate strict 120 Hz
    else:
        # Shrink the annotated band slightly on both sides
        start_frequency = line['start_frequency'] + 1e-3
        end_frequency = line['end_frequency'] - 1e-3

    data = manager.get_segment(date_start, date_end)
    # Band-pass the segment to the annotation's frequency range, at the manager's sampling frequency
    data = butter_bandpass_filter(data, start_frequency, end_frequency, manager.sampling_f)

    # Welch PSD of the filtered segment (used to get the SPL peaks); same sampling
    # frequency as the filter above instead of a hard-coded 240
    psd_frequencies, psd = signal.welch(data, fs=manager.sampling_f, nperseg=nperseg)
    psds.append(psd)
    index.append(i)

# Save computed psds in a dataframe (rows: annotation index labels, columns: PSD frequencies)
df_psd = pd.DataFrame(psds, index=index, columns=psd_frequencies)
df_psd.to_pickle(os.path.join(output_path, 'psd_aplose.pkl'))

# Taking the mean -> rms
# Taking the max -> peak frequency

  0%|          | 0/6063 [00:00<?, ?it/s]

0


In [None]:
# Add columns for the peak frequency
## Ongoing work ##