### This notebook is used for cleaning the Aplose database.
The input files are one Aplose 'Annotation results' file and one Aplose 'Annotation status' file.

The output files are:
- df_annot: a dataset with all **annotations** (columns are: 'filename', 'start_datetime', 'end_datetime', 'start_frequency', 'end_frequency', etc.)
- df_ship: a dataset with the **ship annotations**, with one ship annotation per spectrogram (columns are the same as df_annot)
- df_no_ship (saved as 'aplose_not_annotated_cleaned.pkl'): a dataframe with the **spectrograms that have no ship annotation** (columns are 'dataset', 'filename', 'start_datetime', 'end_datetime')

In [1]:
import pandas as pd
import os
import re
from datetime import datetime, timedelta
import pytz

In [2]:
# Data folder: defined once so the input and output paths cannot drift apart.
output_path = "../../../../../data/SPL"
# NOTE: despite the "_dir" suffix, the two variables below hold CSV *file* paths.
# Aplose 'Annotation results' export
aplose_result_dir = output_path + "/aplose.csv"
# Aplose 'Annotation status' export
aplose_status_dir = output_path + "/aplose_wth_umpty_spectro.csv"

In [3]:
# Convert the 'Annotation results' CSV to pickle format, then reload it from the pickle.
# The same path is used for writing and reading, so both always follow `output_path`.
annot_pkl_path = os.path.join(output_path, 'aplose.pkl')
df_annot = pd.read_csv(aplose_result_dir, parse_dates=["start_datetime", "end_datetime"])
df_annot.to_pickle(annot_pkl_path)
df_annot = pd.read_pickle(annot_pkl_path)

In [4]:
# Label renaming table: Aplose label -> (short name, long name)
labels_info = {
    # Whales
    "Dcall": ('BW_dcall', 'Blue Whale D-call'),
    "Antarctic blue whale song": ('ABW', 'Antarctic blue whale'),
    "Australian pygmy blue whale song": ('PBW_SEIO', 'Australian pygmy blue whale'),
    "Madagascan pygmy blue whale song": ('PBW_SWIO', 'Madagascan pygmy blue whale'),
    "Sri Lanka pygmy blue whale song": ('PBW_CIO', 'Sri Lankan pygmy blue whale'),
    "Omura Australia (19-25 Hz)": ('Omura_SEIO', "Australian Omura's whale"),
    "Omura DGC LF (20 Hz)": ('Omura_SWIO', "Madagascan Omura's whale"),
    "Omura DGC HF (30-40Hz)": ('Omura_CIO', "Diego Garcian Omura's whale"),
    "Minke whale": ('Minke', 'Minke whale'),
    "Fin whale 40 Hz": ('FW_nsp', 'Fin whale non stereotyped pulse'),
    "Fin whale 20 Hz": ('FW_20Hz', 'Fin whale 20-Hz pulse'),
    # Whales - indeterminate species
    "P-call": ('P_call', 'P-call'),
    "Ind 42 Hz": ('ind_42Hz', 'Indeterminate species - 42 Hz'),
    "LF 8 sec pulse": ('ind_8s', 'Indeterminate species -  8 sec'),

    # Geophony
    "T-wave": ('eq', 'earthquake'),
    # NOTE(review): the short name is spelled 'impuls_geo' (no final 'e') - confirm this is intended
    "impulse_geo": ('impuls_geo', 'Impulsive volcanic event'),

    # Anthropophony
    "career_shot": ('career_shot', 'Career shot'),
    "Airgun": ('seismic_shot', 'Seismic shot'),
    "ship_noise": ('ship_noise', 'Ship noise'),
    "anthropophony": ('ind_anthro', 'Indeterminate anthropophony')
}

# Keep only the first element of each tuple: {Aplose label: short name}
short_name = {aplose_label: names[0] for aplose_label, names in labels_info.items()}

# Swap the Aplose labels for their short names (labels absent from the table are kept as they are)
df_annot['annotation'] = df_annot['annotation'].replace(short_name)

# Disabled: add a 'long_name' column looked up from the short name
# long_name = {short: long for short, long in labels_info.values()}
# df_annot['long_name'] = df_annot['annotation'].map(long_name)

#### Clean Aplose dataset - Annotation results

In [5]:
# Drop the "WEAK" rows, but keep the ones labelled ship_noise
is_weak = df_annot['type'] == 'WEAK'
is_ship_noise = df_annot['annotation'] == 'ship_noise'
# A row is kept when it is not WEAK, or when it is a ship_noise annotation
df_annot = df_annot[~is_weak | is_ship_noise]

In [6]:
# Drop the rows that only carry a comment (their 'annotation' value is missing)
df_annot = df_annot.dropna(subset=['annotation'])

In [7]:
# Drop the file recorded during the transition between two moorings
# (no annotation: the hydrophone is moving, so nothing can be seen)
# NOTE(review): the not-annotated dataset further down excludes a second file
# ("01147_MAHY2_20230916_044002.wav") - confirm it does not need to be excluded here too
transition_file = "01682_MAHY1_20240925_103740.wav"
df_annot = df_annot[~df_annot['filename'].eq(transition_file)]

In [8]:
# Remove empty columns (drop raises a KeyError if one of them is absent)
columns_to_delete = [
    'signal_quality',
    'signal_start_frequency',
    'signal_end_frequency',
    'signal_relative_max_frequency_count',
    'signal_relative_min_frequency_count',
    'signal_has_harmonics',
    'signal_trend',
    'signal_steps_count',
    'annotator_expertise',
]
df_annot = df_annot.drop(columns=columns_to_delete)

In [9]:
# Write the cleaned annotations to a pickle file in the output folder
cleaned_annot_path = os.path.join(output_path, 'aplose_cleaned.pkl')
df_annot.to_pickle(cleaned_annot_path)

In [10]:
# Display the cleaned annotation table (the rendering below is truncated by pandas)
df_annot

Unnamed: 0,dataset,filename,result_id,is_update_of_id,start_time,end_time,start_frequency,end_frequency,annotation,annotator,start_datetime,end_datetime,is_box,type,confidence_indicator_label,confidence_indicator_level,comments,created_at_phase
1,MAHY_2k_random_v2,00838_MAHY1_20210815_160618.wav,434470.0,,68.061655,70.072449,59.0,91.0,ind_8s,MongeInes,2021-08-15 16:07:26.061000+00:00,2021-08-15 16:07:28.072000+00:00,1.0,BOX,Sure,1/1,,ANNOTATION
2,MAHY_2k_random_v2,00838_MAHY1_20210815_160618.wav,434471.0,,76.551674,78.711416,56.0,85.0,ind_8s,MongeInes,2021-08-15 16:07:34.551000+00:00,2021-08-15 16:07:36.711000+00:00,1.0,BOX,Sure,1/1,,ANNOTATION
3,MAHY_2k_random_v2,00838_MAHY1_20210815_160618.wav,434472.0,,85.339589,87.797226,61.0,107.0,ind_8s,MongeInes,2021-08-15 16:07:43.339000+00:00,2021-08-15 16:07:45.797000+00:00,1.0,BOX,Sure,1/1,,ANNOTATION
4,MAHY_2k_random_v2,00838_MAHY1_20210815_160618.wav,434473.0,,93.308291,96.063824,58.0,106.0,ind_8s,MongeInes,2021-08-15 16:07:51.308000+00:00,2021-08-15 16:07:54.063000+00:00,1.0,BOX,Sure,1/1,,ANNOTATION
5,MAHY_2k_random_v2,00838_MAHY1_20210815_160618.wav,434474.0,,119.001771,120.491248,57.0,105.0,ind_8s,MongeInes,2021-08-15 16:08:17.001000+00:00,2021-08-15 16:08:18.491000+00:00,1.0,BOX,Sure,1/1,,ANNOTATION
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6090,MAHY_2k_random_v2,01127_MAHY3_20240904_225606.wav,448178.0,,0.000000,600.000000,0.0,120.0,ship_noise,MongeInes,2024-09-04 22:56:06+00:00,2024-09-04 23:06:06+00:00,0.0,WEAK,Sure,1/1,,ANNOTATION
6091,MAHY_2k_random_v2,01242_MAHY3_20240911_043120.wav,448179.0,,0.000000,600.000000,0.0,120.0,ship_noise,MongeInes,2024-09-11 04:31:20+00:00,2024-09-11 04:41:20+00:00,0.0,WEAK,Sure,1/1,,ANNOTATION
6092,MAHY_2k_random_v2,01640_MAHY1_20240911_092050.wav,448180.0,,0.000000,600.000000,0.0,120.0,ship_noise,MongeInes,2024-09-11 09:20:50+00:00,2024-09-11 09:30:50+00:00,0.0,WEAK,Sure,1/1,,ANNOTATION
6093,MAHY_2k_random_v2,01013_MAHY1_20240915_090138.wav,448181.0,,0.000000,600.000000,0.0,120.0,ship_noise,MongeInes,2024-09-15 09:01:38+00:00,2024-09-15 09:11:38+00:00,0.0,WEAK,Sure,1/1,,ANNOTATION


#### Create dataset with only ship annotations

In [11]:
# Keep only the rows whose annotation is ship_noise
is_ship_noise = df_annot['annotation'].eq("ship_noise")
df_ship = df_annot.loc[is_ship_noise]

In [12]:
# Write the ship annotations to a pickle file in the output folder
ship_pkl_path = os.path.join(output_path, 'aplose_cleaned_ship.pkl')
df_ship.to_pickle(ship_pkl_path)

In [13]:
# Display the ship_noise annotations (the rendering below is truncated by pandas)
df_ship

Unnamed: 0,dataset,filename,result_id,is_update_of_id,start_time,end_time,start_frequency,end_frequency,annotation,annotator,start_datetime,end_datetime,is_box,type,confidence_indicator_label,confidence_indicator_level,comments,created_at_phase
57,MAHY_2k_random_v2,00838_MAHY1_20210815_160618.wav,507658.0,,0.0,600.0,0.0,120.0,ship_noise,MongeInes,2021-08-15 16:06:18+00:00,2021-08-15 16:16:18+00:00,0.0,WEAK,Sure,1/1,,ANNOTATION
70,MAHY_2k_random_v2,01496_MAHY1_20210816_073559.wav,442626.0,,0.0,600.0,0.0,120.0,ship_noise,MongeInes,2021-08-16 07:35:59+00:00,2021-08-16 07:45:59+00:00,0.0,WEAK,Sure,1/1,,ANNOTATION
74,MAHY_2k_random_v2,01079_MAHY3_20210819_023846.wav,446809.0,,0.0,600.0,0.0,120.0,ship_noise,MongeInes,2021-08-19 02:38:46+00:00,2021-08-19 02:48:46+00:00,0.0,WEAK,Sure,1/1,,ANNOTATION
77,MAHY_2k_random_v2,01226_MAHY4_20210820_102927.wav,446812.0,,0.0,600.0,0.0,120.0,ship_noise,MongeInes,2021-08-20 10:29:27+00:00,2021-08-20 10:39:27+00:00,0.0,WEAK,Sure,1/1,,ANNOTATION
85,MAHY_2k_random_v2,01595_MAHY3_20210821_010805.wav,446813.0,,0.0,600.0,0.0,120.0,ship_noise,MongeInes,2021-08-21 01:08:05+00:00,2021-08-21 01:18:05+00:00,0.0,WEAK,Sure,1/1,,ANNOTATION
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6090,MAHY_2k_random_v2,01127_MAHY3_20240904_225606.wav,448178.0,,0.0,600.0,0.0,120.0,ship_noise,MongeInes,2024-09-04 22:56:06+00:00,2024-09-04 23:06:06+00:00,0.0,WEAK,Sure,1/1,,ANNOTATION
6091,MAHY_2k_random_v2,01242_MAHY3_20240911_043120.wav,448179.0,,0.0,600.0,0.0,120.0,ship_noise,MongeInes,2024-09-11 04:31:20+00:00,2024-09-11 04:41:20+00:00,0.0,WEAK,Sure,1/1,,ANNOTATION
6092,MAHY_2k_random_v2,01640_MAHY1_20240911_092050.wav,448180.0,,0.0,600.0,0.0,120.0,ship_noise,MongeInes,2024-09-11 09:20:50+00:00,2024-09-11 09:30:50+00:00,0.0,WEAK,Sure,1/1,,ANNOTATION
6093,MAHY_2k_random_v2,01013_MAHY1_20240915_090138.wav,448181.0,,0.0,600.0,0.0,120.0,ship_noise,MongeInes,2024-09-15 09:01:38+00:00,2024-09-15 09:11:38+00:00,0.0,WEAK,Sure,1/1,,ANNOTATION


#### Clean Aplose dataset - Annotation status (with unannotated spectrograms)

In [14]:
# Convert the 'Annotation status' CSV to pickle format, then reload it from the pickle.
# The same path is used for writing and reading, so both always follow `output_path`.
status_pkl_path = os.path.join(output_path, 'aplose_wth_umpty_spectro.pkl')
df_all_spectro = pd.read_csv(aplose_status_dir)
df_all_spectro.to_pickle(status_pkl_path)
df_all_spectro = pd.read_pickle(status_pkl_path)

In [15]:
# Drop the rows whose 'MongeInes' column is 'UNASSIGNED'
is_assigned = df_all_spectro['MongeInes'].ne('UNASSIGNED')
df_all_spectro = df_all_spectro.loc[is_assigned]
# Debug helper: print(df_all_spectro['MongeInes'].unique())

#### Create dataset without ship annotations

In [16]:
# Cross both datasets: keep the spectrograms whose filename never appears in df_ship
# NOTE(review): this only excludes files with a ship_noise annotation; files carrying
# other annotations are kept - confirm this matches the intended "not annotated" set
has_ship_annotation = df_all_spectro['filename'].isin(df_ship['filename'])
df_no_ship = df_all_spectro.loc[~has_ship_annotation].copy()

In [18]:
## Start datetime
# Extract date from the filename
def extract_datetime(filename):
    """Parse the 'YYYYMMDD_hhmmss' timestamp embedded in a file name.

    Parameters
    ----------
    filename : str
        File name containing the timestamp,
        e.g. '01682_MAHY1_20240925_103740.wav'.

    Returns
    -------
    datetime.datetime or None
        Timezone-aware datetime tagged as UTC, or None when `filename` is not
        a string (e.g. None/NaN coming from a missing cell) or when it holds no
        'YYYYMMDD_hhmmss' pattern.

    Raises
    ------
    ValueError
        If the matched digits do not form a valid date and time.
    """
    # Missing cells reach this function as None/NaN; re.search would raise a
    # TypeError on them, so they are reported as "no datetime" instead.
    if not isinstance(filename, str):
        return None

    match = re.search(r'(\d{8}_\d{6})', filename)
    if match is None:
        return None

    raw = match.group(1)  # 'YYYYMMDD_hhmmss'
    dt = datetime.strptime(raw, "%Y%m%d_%H%M%S")
    # The parsed value is tagged as UTC as-is (no time conversion is applied)
    return dt.replace(tzinfo=pytz.UTC)

# Compute the start datetime of every row from its filename
start_datetimes = df_no_ship['filename'].apply(extract_datetime)
df_no_ship = df_no_ship.assign(start_datetime=start_datetimes)

In [19]:
## End datetime
# 'start_datetime' has already been extracted from the filenames in the previous
# cell, so it is not computed a second time here.
# The end of each spectrogram is taken as its start + 10 min
spectro_duration = timedelta(minutes=10)
# Vectorised addition: rows with a missing start datetime keep a missing end datetime
df_no_ship['end_datetime'] = df_no_ship['start_datetime'] + spectro_duration

In [20]:
# Drop the columns that are not used afterwards (presumably one status column per annotator);
# errors='ignore' skips any of them that is absent from the frame
unused_columns = ['emorin', 'MongeInes', 'pyleroll']
df_no_ship = df_no_ship.drop(columns=unused_columns, errors='ignore')

In [21]:
# Drop the files recorded during the transition between two moorings
# (no annotation: the hydrophone is moving, so nothing can be seen)
transition_files = [
    "01147_MAHY2_20230916_044002.wav",
    "01682_MAHY1_20240925_103740.wav",
]
df_no_ship = df_no_ship[~df_no_ship['filename'].isin(transition_files)]

In [22]:
# Write the spectrograms without ship annotation to a pickle file in the output folder
no_ship_pkl_path = os.path.join(output_path, 'aplose_not_annotated_cleaned.pkl')
df_no_ship.to_pickle(no_ship_pkl_path)

In [23]:
# Display the spectrograms that have no ship annotation
df_no_ship

Unnamed: 0,dataset,filename,start_datetime,end_datetime
84,MAHY_2k_random_v2,01571_MAHY1_20211127_090454.wav,2021-11-27 09:04:54+00:00,2021-11-27 09:14:54+00:00
129,MAHY_2k_random_v2,01210_MAHY1_20220125_001538.wav,2022-01-25 00:15:38+00:00,2022-01-25 00:25:38+00:00
198,MAHY_2k_random_v2,01455_MAHY3_20220422_061047.wav,2022-04-22 06:10:47+00:00,2022-04-22 06:20:47+00:00
354,MAHY_2k_random_v2,01123_MAHY4_20220921_050658.wav,2022-09-21 05:06:58+00:00,2022-09-21 05:16:58+00:00
425,MAHY_2k_random_v2,01220_MAHY1_20221109_051636.wav,2022-11-09 05:16:36+00:00,2022-11-09 05:26:36+00:00
519,MAHY_2k_random_v2,01481_MAHY4_20221221_184649.wav,2022-12-21 18:46:49+00:00,2022-12-21 18:56:49+00:00
523,MAHY_2k_random_v2,01418_MAHY4_20221225_185212.wav,2022-12-25 18:52:12+00:00,2022-12-25 19:02:12+00:00
588,MAHY_2k_random_v2,01228_MAHY4_20230125_113209.wav,2023-01-25 11:32:09+00:00,2023-01-25 11:42:09+00:00
877,MAHY_2k_random_v2,01472_MAHY4_20230506_022938.wav,2023-05-06 02:29:38+00:00,2023-05-06 02:39:38+00:00
903,MAHY_2k_random_v2,01087_MAHY1_20230513_233447.wav,2023-05-13 23:34:47+00:00,2023-05-13 23:44:47+00:00
