In [1]:
import pandas as pd
import numpy as np
import os
import librosa
from tqdm.notebook import tqdm

In [None]:
# set paths
# audio_path:  directory whose entries are listed and loaded as audio files.
# labels_path: directory searched for the label file matching each audio file.
# save_path:   target handed to np.save for the finished dataset
#              (np.save appends '.npy' when the name has no such extension).
# NOTE(review): audio_path / labels_path are absolute, machine-specific
# locations and are concatenated with file names directly, so they must keep
# their trailing '/'.
audio_path = '/hyena/cc16_366a/cc16_366a_audio/'
labels_path = '/hyena/cc16_366a/cc16_366a_audit/'
save_path = 'cc16_366a'

In [2]:
def create_label_dataframe(label, begin_time, end_time,
                           window_size, timesteps_per_second):
    """Create dataframe, reformatted and containing relevant information.

    Reads a tab-separated label file, keeps only the rows whose label is one
    of the known call types and whose time span touches the window
    [begin_time, end_time], clips those spans to the window and adds their
    position in spectrogram timesteps.

    # Arguments
        label: label path (tab-separated file with 'Selection',
            'Begin Time (s)' and 'End Time (s)' columns).
        begin_time: start time for the related spectrogram.
        end_time: end time for the related spectrogram.
        window_size: end time - start time. Not used by the body; kept so
            existing callers keep working.
        timesteps_per_second: spectrogram timesteps / spectrogram window_size.

    # Returns
        Dataframe with relevant label information for
        the spectrogram file, indexed by 'Selection'. It carries the extra
        columns 'Begin Time(t)' (floored) and 'End Time(t)' (ceiled), both
        measured in timesteps from begin_time.
        If the file has no label column, a message is printed and the
        unfiltered dataframe is returned without those extra columns.
    """
    labels_df = pd.read_csv(label,
                            sep='\t',
                            index_col='Selection')
    # the label column is spelled 'Label', 'Labels' or 'Lable' depending on
    # the file, so normalise it to 'Label'
    labels_df.columns = ['Label' if x in ('Labels', 'Lable')
                         else x for x in labels_df.columns]
    if 'Label' not in labels_df.columns:
        print('No label column for file: ' + label)
        return labels_df

    call_labels = ['GIG', 'SQL', 'GRL', 'GRN', 'SQT', 'MOO', 'RUM', 'WHP']
    # keep only first 3 letters of label; the cast to str keeps the .str
    # accessor usable when the column was parsed as non-text (e.g. a column
    # that is empty in every row is read as float NaN)
    labels_df['Label'] = labels_df['Label'].astype(str).str[0:3]

    # keep calls only, restricted to spans that touch the window
    in_window = (labels_df['Label'].isin(call_labels)
                 & (labels_df['Begin Time (s)'] <= end_time)
                 & (labels_df['End Time (s)'] >= begin_time))
    # explicit copy: the columns below are written on this frame, which would
    # otherwise be a filtered view of the parent (SettingWithCopyWarning)
    labels_df = labels_df[in_window].copy()

    # clip spans that stick out of the window
    labels_df['End Time (s)'] = labels_df['End Time (s)'].clip(upper=end_time)
    labels_df['Begin Time (s)'] = \
        labels_df['Begin Time (s)'].clip(lower=begin_time)

    # convert from seconds to timesteps (relative to the window start)
    labels_df['Begin Time(t)'] = np.floor(
        (labels_df['Begin Time (s)'] - begin_time) * timesteps_per_second)
    labels_df['End Time(t)'] = np.ceil(
        (labels_df['End Time (s)'] - begin_time) * timesteps_per_second)
    return labels_df

In [3]:
def create_label_matrix(dataframe, timesteps):
    """Build the (number of classes, timesteps) 0/1 label matrix.

    # Arguments
        dataframe: dataframe of label information. When it has a 'Label'
            column, each row is read through its 'Begin Time(t)',
            'End Time(t)' and 'Label' entries.
        timesteps: number of timesteps (columns of the matrix).

    # Returns
        Matrix of 0s and 1s. Each column represents a timestep,
        each row represents a different call type:
        Row 0 = Giggle (GIG)
        Row 1 = Squeal (SQL)
        Row 2 = Growl (GRL)
        Row 3 = Groan (GRN)
        Row 4 = Squitter (SQT)
        Row 5 = Low / Moo (MOO)
        Row 6 = Alarm rumble (RUM)
        Row 7 = Whoop (WHP)
        A dataframe without a 'Label' column gives an all-zero matrix.

    # Example
        [[0, 0, 0, 0, 0, 0 ...],
        [0, 0, 0, 0, 0, 0 ...],
        [0, 0, 0, 1, 1, 1 ...], Growl in timesteps 3, 4, 5.
        [0, 0, 0, 0, 0, 0 ...],
        [0, 0, 0, 0, 0, 0 ...],
        [0, 0, 0, 0, 0, 0 ...],
        [0, 0, 0, 0, 0, 0 ...],
        [1, 1, 1, 1, 0, 0 ...],] Whoop in timesteps 0, 1, 2, 3.
    """
    # call code -> matrix row
    row_of_call = {'GIG': 0, 'SQL': 1, 'GRL': 2, 'GRN': 3,
                   'SQT': 4, 'MOO': 5, 'RUM': 6, 'WHP': 7}
    label = np.zeros((len(row_of_call), timesteps))
    if 'Label' not in list(dataframe):
        return label

    # gather (begin, end, call) for every labelled span first
    spans = [(record['Begin Time(t)'], record['End Time(t)'], record['Label'])
             for _, record in dataframe.iterrows()]

    for first_step, last_step, call in spans:
        lo = int(first_step)
        # the slice end is exclusive, so +1 also marks the timestep at
        # 'End Time(t)' itself
        hi = int(last_step) + 1
        target_row = row_of_call.get(call)
        if target_row is not None:
            label[target_row, lo:hi] = 1
    return label

In [5]:
# length (in seconds) of the audio slice behind each spectrogram
window_size = 6
# step (in seconds) between the starts of consecutive windows
slide = 6
# number of mel bands per spectrogram
mels = 64
# sample rate (Hz) the audio is resampled to on load
sample_rate = 22_050
# hop (in samples) between spectrogram frames; 512 is librosa's default for
# melspectrogram and must match whatever hop the spectrogram call uses
hop_length = 512
# frames produced for one window with centred framing:
# 1 + n_samples // hop_length (259 for 6 s at 22 050 Hz). Derived instead of
# hard-coded so it stays correct when window_size or sample_rate change.
timesteps = 1 + (window_size * sample_rate) // hop_length
timesteps_per_second = timesteps / window_size

In [6]:
# Accumulator for the dataset: one [ds_id, spectro, label_matrix] entry is
# appended per spectrogram window. Re-run this cell before re-running the
# loop below, otherwise every entry is appended a second time.
dataset = []

In [None]:
# Build one [ds_id, spectrogram, label_matrix] entry per window of every
# audio file and append it to `dataset`.
label_files = os.listdir(labels_path)
for w in tqdm(os.listdir(audio_path), desc='load_audio'):
    # NOTE(review): the id is the file name minus its last 7 characters;
    # this assumes every audio file name ends in a 7-character suffix --
    # confirm against the naming scheme of the recordings.
    audio_id = w[:-7]
    # find related labels once per file (it does not depend on the window)
    label_list = [l for l in label_files if audio_id in l]
    if not label_list:
        # without labels no entry can be built; skip before the costly load
        print('No label file for audio: ' + w)
        continue
    label = os.path.join(labels_path, label_list[0])
    # load audio
    filepath = os.path.join(audio_path, w)
    y, sr = librosa.load(filepath, sr=sample_rate, mono=True)
    # whole seconds of audio available
    length = int(len(y) / sr)
    # every window start for which a full window fits; the +1 keeps the last
    # full window, which an exclusive stop of `length - window_size` drops
    for start in tqdm(range(0, length - window_size + 1, slide),
                      desc='create_spectros'):
        stop = start + window_size
        current_y = y[sr * start:sr * stop]
        # create spectrogram
        spectro = librosa.feature.melspectrogram(y=current_y, sr=sr,
                                                 n_mels=mels, fmax=sr/2)
        ds_id = audio_id + '_' + str(start) + 'sto' + str(stop) + 's'
        # create label matrix
        df = create_label_dataframe(label, start, stop, window_size,
                                    timesteps_per_second)
        label_matrix = create_label_matrix(df, timesteps)
        dataset.append([ds_id, spectro, label_matrix])

In [None]:
# save dataset
# Each entry is [ds_id (str), spectrogram (2-D array), label matrix (2-D
# array)], i.e. ragged, so the array has to be built with dtype=object:
# without it NumPy >= 1.24 raises ValueError (inhomogeneous shape).
# The saved file holds pickled objects; read it back with
# np.load(..., allow_pickle=True).
ds_array = np.asarray(dataset, dtype=object)
np.save(save_path, ds_array)