This notebook converts a given set of WAV files to spectrograms. The spectrograms are stored in the "data" directory.

In [None]:
# Install librosa (used below for mel spectrograms) from the conda-forge channel; -y skips the confirmation prompt.
!conda install -y -c conda-forge librosa

In [None]:
# Install MXNet with pip (no version pinned).
# NOTE(review): presumably required by the im2rec.py script that is invoked later to build the .rec file — confirm.
!pip install mxnet

In [None]:
import math
import pandas as pd
import numpy as np
import librosa
import warnings
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from scipy.io import wavfile
from collections import OrderedDict
from tqdm import tqdm
import pickle
import json
import glob
import os
from os import path
import boto3
from PIL import Image
import json
from os.path import exists
import io


In [None]:
# ANNOTATION TABLE COLUMNS
# Time extent of a call (x axis of the spectrogram)
left_col = "Begin Time (s)"
right_col = "End Time (s)"
# Frequency extent of a call (y axis of the spectrogram)
top_col = "High Freq (Hz)"
bot_col = "Low Freq (Hz)"
# Class label and the annotator's confidence in it
class_col = "Species"
class_conf_col = "Species Confidence"

# PATHS AND FILE NAMES
recording_dir = "./"
annotation_dir = "./"
output_dir = "./data"
JSON_dir = "./json"
label_map_name = "label_map.pbtxt"
metadata_name = "dataset_metadata.txt"
manifest_file_name = "manifest_file.jsonl"
lst_file_name = 'lst_file.lst'

# SPECTROGRAM CONSTANTS
# Window size (n_fft) in seconds
WINDOW_SIZE_SEC = 3/20
# Hop Length in seconds
HOP_LEN_SEC = 15/300
# Number of frequency bands (y dimension of spectrogram)
N_MELS = 300
# Maximum frequency considered (highest value in y dimension)
FREQUENCY_MAX = 1600

# CHUNK CONSTANTS
# Length of one chunk in seconds, for training and for evaluation
TRAIN_CHUNK_SIZE_SEC = 45
EVAL_CHUNK_SIZE_SEC = 15
# Minimum % visibility of a call to keep annotation
MIN_BOX_PERCENT = 0.3

# DATASET SETTINGS
dataset_name = "dataset.record"
NUM_TRAIN_SHARDS = 3
NUM_EVAL_SHARDS = 5
NUM_EVAL_FILES = 2

# Classes that are left out when the dataset is constructed
DISALLOWED_CLASSES = ["?", "rf", "sl"]

In [None]:
# Record the spectrogram and chunking settings as JSON inside the output
# directory so the parameters used for a run are stored alongside its images.
# The directory is created first: on a fresh checkout "./data" does not exist
# yet and open(..., 'w') would fail with FileNotFoundError.
os.makedirs(output_dir, exist_ok=True)
with open(path.join(output_dir, metadata_name), 'w') as metafile:
    json.dump(
        {
            "WINDOW_SIZE_SEC": WINDOW_SIZE_SEC,
            "HOP_LEN_SEC": HOP_LEN_SEC,
            "N_MELS": N_MELS,
            "FREQUENCY_MAX": FREQUENCY_MAX,
            "TRAIN_CHUNK_SIZE_SEC": TRAIN_CHUNK_SIZE_SEC,
            "EVAL_CHUNK_SIZE_SEC": EVAL_CHUNK_SIZE_SEC,
            # Evaluation chunks are stepped by half a chunk length
            "EVAL_CHUNK_STEP_SEC": EVAL_CHUNK_SIZE_SEC / 2.0
        },
        metafile
    )

In [None]:
# Create the boto3 S3 resource and a handle to the bucket that the read_*
# helpers below download recordings and selection tables from.
# NOTE(review): no credentials or region are passed here, so they presumably
# come from the environment the notebook runs in — confirm.
s3 = boto3.resource('s3')
bucket = s3.Bucket('monitoring-whale-recordings')


In [None]:
#reads in a wav file
def read_wavfile(wav_name, normalize=True, verbose=False):
    """
    Download "<wav_name>_processed.wav" from the recordings bucket and load it.

    The file is fetched from "wav-files/decimated_files/" into the current
    working directory, read with scipy, and the local copy is always removed
    again, even when reading fails.

    Parameters
    ----------
    wav_name : str
        Recording name without the "_processed.wav" suffix.
    normalize : bool
        When True, samples are converted to float and min-max scaled so the
        smallest sample maps to -0.5 and the largest to 0.5.
    verbose : bool
        Print the file name, sample count, sample rate and duration.

    Returns
    -------
    (sr, data) : the sample rate and the sample array returned by
    scipy.io.wavfile.read (optionally normalized).
    """
    file_name = f"{wav_name}_processed.wav"
    bucket_path = f"wav-files/decimated_files/{file_name}"
    try:
        bucket.download_file(bucket_path, file_name)
        if verbose:
            print("Reading {}".format(file_name))
        sr, data = wavfile.read(file_name)
    finally:
        # Never leave the downloaded copy behind, even if the download or the
        # read raised.
        if os.path.exists(file_name):
            os.remove(file_name)
    if verbose:
        print("{} samples at {} samples/sec --> {} seconds".format(data.shape[0], sr, data.shape[0]/sr))

    if normalize:
        data = data.astype(float)
        data = data - data.min()
        peak = data.max()
        if peak > 0:
            data = data / peak
            data = data - 0.5
        else:
            # Constant signal: min == max, so the min-max scaling would be
            # 0/0 and fill the array with NaN. Return the mid-range value.
            data = np.zeros_like(data)
    return sr, data

#Tries to find corresponding annotation file for all of the annotators
def read_annotations(fname, verbose=False):
    """
    Download and parse the selection table for one recording.

    Tables are stored as "selection-tables/<fname>-<annotator>.txt". The
    annotator suffixes are tried in a fixed order and the first table that
    downloads successfully is used.

    Parameters
    ----------
    fname : str
        Recording name (the part of the table name before "-<annotator>.txt").
    verbose : bool
        Print the number of rows kept and the column names.

    Returns
    -------
    pandas.DataFrame
        Rows of the tab-separated table whose species column equals "hb".
        Both the correct "Species" header and the misspelled "Spcies" header
        are recognised; a table with neither column is returned unfiltered.

    Raises
    ------
    FileNotFoundError
        If no annotator's table could be downloaded for ``fname``.
    """
    annotators = ['AS.txt', 'AW.txt', 'JW.txt', 'MS.txt', 'SS.txt']

    file_name = None
    last_error = None
    for annotator in annotators:
        candidate = f"{fname}-{annotator}"
        try:
            bucket.download_file(f"selection-tables/{candidate}", candidate)
        except Exception as err:
            # This annotator has no table for the recording; try the next one.
            last_error = err
            continue
        file_name = candidate
        break

    if file_name is None:
        # Previously the code fell through and tried to read a file that was
        # never downloaded; fail with an explicit message instead.
        raise FileNotFoundError(
            f"No selection table found for '{fname}' (tried: {', '.join(annotators)})"
        ) from last_error

    try:
        annotations = pd.read_csv(file_name, sep="\t")
    finally:
        # Remove the local copy even when parsing fails.
        os.remove(file_name)

    # Keep only "hb" rows, whichever spelling of the column the table uses.
    for species_col in ("Species", "Spcies"):
        if species_col in annotations.columns:
            annotations = annotations.loc[annotations[species_col] == "hb"]

    if verbose:
        print("Read {} annotations from {}".format(len(annotations), fname))
        print("Columns:", ",".join([" {} ({})".format(c, type(c)) for c in annotations.columns]))
    return annotations

This section defines a function that builds the training, validation, and testing sets used for our training. It also defines an "incorrect_dataset" list, which contains the names of WAV files that had something wrong with them that was causing problems (for example, misspelled column names in their annotation tables). The function below lists every recording that has a selection table in the "monitoring-whale-recordings" bucket. It then removes the hardcoded recordings reserved for testing and validation, as well as the incorrect ones. The remaining recordings become the training set.

In [None]:
#gets all of the data sets. The test and validation sets have been hardcoded, feel free to modify.
def get_data_sets():
    """
    Split the annotated recordings into train / incorrect / validation / test.

    Every object under "selection-tables/" in the recordings bucket is listed
    and its recording name (the part of the file name before the first "-")
    is collected. Recordings named in the hard-coded testing, validation and
    incorrect lists are excluded; what remains is the training set.

    Returns
    -------
    (train_set, incorrect_dataset, validation_set, testing_set) : four lists
    of recording names. ``train_set`` keeps listing order and contains each
    recording once, even when several annotators have a table for it.
    """
    testing_set = ['671658014.181008003414']

    #dataset with misspelled columns
    incorrect_dataset = ['671658014.181003123500']

    validation_set = ['671658014.181008033412']

    held_out = set(testing_set + incorrect_dataset + validation_set)

    train_set = []
    seen = set()
    # Only list the selection-table prefix instead of walking the whole bucket.
    for obj in bucket.objects.filter(Prefix="selection-tables/"):
        key = obj.key
        if key.endswith("/"):
            # Folder placeholder object, not a table.
            continue
        recording = key.split("/")[1].split("-")[0]
        # Exact membership test (the old check was a substring test against
        # each held-out name), and skip recordings already collected so a
        # recording with several annotator tables is not processed twice.
        if recording in held_out or recording in seen:
            continue
        seen.add(recording)
        train_set.append(recording)

    return train_set, incorrect_dataset, validation_set, testing_set

train_set, incorrect_dataset, validation_set, testing_set = get_data_sets()

In [None]:
# Display the recording names that ended up in the training set as a quick sanity check.
train_set

In [None]:
def get_all_classes(annotation_fnames, verbose=False):
    """
    Returns a list of all classes seen in the annotation files sorted
    alphabetically.

    Each name in ``annotation_fnames`` is passed to ``read_annotations`` and
    the unique values of its class column are collected. A file that cannot
    be read (or that lacks the class column) is skipped with a warning rather
    than silently ignored, so missing data is visible in the notebook output.
    When ``verbose`` is True the final class list is printed.
    """
    classes = set()
    for annot_fname in annotation_fnames:
        try:
            classes.update(read_annotations(annot_fname)[class_col].unique())
        except Exception as err:
            warnings.warn("Skipping annotations for '{}': {!r}".format(annot_fname, err))
    classes = sorted(classes)
    if verbose:
        print("Classes: ", classes)
    return classes


def create_label_map(classes, output_path):
    """
    Write the prototext class-mapping file for ``classes`` to ``output_path``.

    Every class gets the integer id one greater than its position in
    ``classes`` (ids start at 1); the class string is used both as the item
    name and as its display name.

    NOTE(review): ``string_int_label_map_pb2`` and ``text_format`` are not
    imported anywhere in the visible notebook, so calling this as-is would
    raise NameError — confirm where they are meant to come from.
    """
    label_map = string_int_label_map_pb2.StringIntLabelMap()
    for class_id, class_name in enumerate(classes, start=1):
        entry = label_map.item.add()  # StringIntLabelMapItem
        entry.name = class_name          # machine-readable name
        entry.id = class_id              # integer id, starting from 1
        entry.display_name = class_name  # human-readable label
    serialized = text_format.MessageToString(label_map)
    with open(output_path, "w") as out_file:
        out_file.write(serialized)
        

# Classes found in the training annotations, minus the disallowed ones.
classes = [
    label
    for label in get_all_classes(train_set, verbose=True)
    if label not in DISALLOWED_CLASSES
]

# class_map: integer id (starting at 1) -> class name; rev_class_map is the inverse.
class_map = dict(enumerate(classes, start=1))
rev_class_map = {label: class_id for class_id, label in class_map.items()}


In [None]:
def get_area(annotation):
    """
    Return the signed area of one annotation box.

    The area is (end time - begin time) * (high frequency - low frequency),
    read from the annotation via the column-name constants.
    """
    time_extent = annotation[right_col] - annotation[left_col]
    freq_extent = annotation[top_col] - annotation[bot_col]
    return time_extent * freq_extent


def PCEN(spec, M_return_timestep, init_val=None, epsilon=1e-6, s=0.001, alpha=0.80, delta=2.0, r=0.5):
    """
    Per-channel energy normalization of a spectrogram.

    ``spec`` is indexed as ``spec[:, t]``: axis 0 is the channel and axis 1
    the time step. A running average ``M`` of each channel is updated with
    smoothing coefficient ``s``, and every column is transformed to
    ``(spec / (M + epsilon) ** alpha) ** r - delta ** r``.

    ``init_val`` seeds the running average (zeros when None). The average as
    it stood right after time step ``M_return_timestep`` is returned next to
    the normalized spectrogram; if that step is out of range a warning is
    printed and None is returned in its place.
    """
    n_channels, n_steps = spec.shape[0], spec.shape[1]
    normalized = np.zeros_like(spec)

    if M_return_timestep > n_steps - 1 or M_return_timestep < 0:
        print("Warning! M return timestep is outside bounds. Not returning any M.")

    running_avg = np.zeros(shape=(n_channels)) if init_val is None else np.array(init_val)
    assert running_avg.shape[0] == n_channels

    offset = delta ** r
    saved_avg = None
    for step in range(n_steps):
        column = spec[:, step]
        running_avg = (1 - s) * running_avg + s * column
        normalized[:, step] = ((column / ((running_avg + epsilon) ** alpha)) ** r) - offset
        if step == M_return_timestep:
            saved_avg = running_avg
    return normalized, saved_avg


# Returns the min and max db observed in all wav files
def get_minmax_bounds(wav_filenames, chunk_size=TRAIN_CHUNK_SIZE_SEC):
    """
    Scan every recording and return the smallest and largest dB value seen
    in its chunk-wise mel spectrograms.

    Parameters
    ----------
    wav_filenames : iterable of str
        Recording names accepted by ``read_wavfile``.
    chunk_size : number
        Chunk length in seconds.

    Returns
    -------
    (min_val, max_val) : both None when no chunk was processed.

    Raises
    ------
    ValueError
        If the chunk length is too short for the spectrogram settings, which
        would make the step between chunks zero or negative.
    """
    min_val, max_val = None, None
    for wfname in wav_filenames:
        sr, data = read_wavfile(wfname, normalize=True)
        n_fft = int(WINDOW_SIZE_SEC * sr)
        hop_len = int(HOP_LEN_SEC * sr)
        # Keep the length in samples in its own variable: overwriting
        # ``chunk_size`` here meant it was multiplied by ``sr`` again for
        # every file after the first.
        chunk_len = int(chunk_size * sr)
        step = chunk_len - (hop_len * (N_MELS-2) + n_fft)
        if step <= 0:
            raise ValueError(
                "chunk_size of {} s gives a non-positive chunk step ({} samples) for '{}'".format(
                    chunk_size, step, wfname))
        for start_i in range(0, len(data), step):
            chunk = data[start_i:min(len(data), start_i + chunk_len)]
            if len(chunk) < n_fft:
                # With center=False a spectrogram needs at least one full
                # window; a shorter tail cannot be transformed.
                continue
            mel_spec = librosa.feature.melspectrogram(y=chunk,
                                                      sr=sr,
                                                      n_fft=n_fft,
                                                      hop_length=hop_len,
                                                      n_mels=N_MELS,
                                                      fmax=FREQUENCY_MAX,
                                                      center=False)
            mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
            temp_min = mel_spec.min()
            temp_max = mel_spec.max()
            if min_val is None or temp_min < min_val:
                min_val = temp_min
            if max_val is None or temp_max > max_val:
                max_val = temp_max
    return min_val, max_val


#this is the function that creates the spectrograms and the lst file
def process_file(wav_filename, annot_filename, min_bound, max_bound, chunk_size, lst_file_name, chunk_layout="dense",
                 drop_last_chunk=False, verbose=False):
    """
    Cut one recording into chunks, save each chunk as a greyscale spectrogram
    PNG in ``output_dir`` and append one line per annotated chunk to a .lst file.

    Parameters
    ----------
    wav_filename : str
        Recording name, passed to ``read_wavfile``; also the PNG name prefix.
    annot_filename : str
        Recording name, passed to ``read_annotations``.
    min_bound, max_bound : number
        dB values that are mapped to pixel values 0 and 255; values outside
        the range are clipped.
    chunk_size : number
        Chunk length in seconds.
    lst_file_name : str
        Path of the .lst file that lines are appended to.
    chunk_layout : {"dense", "sparse"}
        "dense" steps by the chunk length minus
        ``hop_len * (N_MELS - 2) + n_fft`` samples; "sparse" steps by half a
        chunk.
    drop_last_chunk : bool
        When False the final start index is also processed, with the chunk
        running to the end of the recording.
    verbose : bool
        Print progress information.

    Returns
    -------
    list of dict
        One dict per saved chunk with the image name, its size and the
        normalized boxes/classes of the annotations that touch the chunk.
        Empty when the recording is shorter than half a chunk.

    Raises
    ------
    ValueError
        For an unknown ``chunk_layout`` or when the resulting step between
        chunks is not positive.
    """
    sr, data = read_wavfile(wav_filename, normalize=True, verbose=verbose)
    annotations = read_annotations(annot_filename, verbose=verbose)

    n_fft = int(WINDOW_SIZE_SEC * sr)
    hop_len = int(HOP_LEN_SEC * sr)
    chunk_size = int(chunk_size * sr)

    if chunk_layout == "dense":
        step = chunk_size - (hop_len * (N_MELS-2) + n_fft)
    elif chunk_layout == "sparse":
        step = chunk_size // 2
    else:
        # Previously ``step`` was simply left undefined for other values.
        raise ValueError("chunk_layout must be 'dense' or 'sparse', got {!r}".format(chunk_layout))
    if step <= 0:
        raise ValueError("chunk_size is too small: step between chunks would be {} samples".format(step))

    # Start Indices of each chunk
    start_vals = list(range(0, len(data), step))

    # If last cut point creates a tiny chunk, remove it
    if start_vals and len(data) - start_vals[-1] < int(chunk_size / 2):
        start_vals = start_vals[:-1]

    if not start_vals:
        # Recording shorter than half a chunk (or empty): nothing to extract.
        # Indexing start_vals[-1] below used to raise IndexError here.
        if verbose:
            print("'{}' is too short to produce a chunk, skipping".format(wav_filename))
        return []

    # Images are written here; make sure the folder exists.
    os.makedirs(output_dir, exist_ok=True)

    def extract_chunk(start_i, end_i, spec_name, index, M_init=None):
        """Save samples [start_i, end_i) as a PNG and append its .lst line if it has annotations."""
        mel_spec = librosa.feature.melspectrogram(y=data[start_i:end_i],
                                                  sr=sr,
                                                  n_fft=n_fft,
                                                  hop_length=hop_len,
                                                  n_mels=N_MELS,
                                                  fmax=FREQUENCY_MAX,
                                                  center=False)
        # PCEN is currently disabled, so there is no smoother state to hand
        # on to the next chunk.
        next_M_init = None
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        # Map [min_bound, max_bound] dB to [0, 255] and clip everything else.
        mel_spec = np.clip((mel_spec - min_bound) / (max_bound - min_bound) * 255, a_min=0, a_max=255)
        mel_spec = mel_spec.astype(np.uint8)
        spec_height, spec_width = mel_spec.shape

        # Get annotations to those inside chunk
        start_s, end_s = start_i/sr, end_i/sr
        freq_axis_low, freq_axis_high = librosa.hz_to_mel(0.0), librosa.hz_to_mel(FREQUENCY_MAX)
        chunk_annotations = annotations.loc[~((annotations[left_col] > end_s)
                                              | (annotations[right_col] < start_s))].copy()
        if verbose:
            print(start_s, end_s)

        # Rescale axes to 0.0-1.0 based on location inside chunk
        chunk_annotations.loc[:,[left_col,right_col]] = ((chunk_annotations[[left_col,right_col]]
                                                         - start_s) / (end_s - start_s))

        # Frequencies go to the mel scale and are flipped (1.0 - ...) so that
        # high frequencies end up at small y values.
        chunk_annotations.loc[:,[bot_col,top_col]] = (1.0 - ((librosa.hz_to_mel(chunk_annotations[[bot_col,top_col]])
                                                      - freq_axis_low) / (freq_axis_high - freq_axis_low)))
        chunk_annotations = chunk_annotations.loc[chunk_annotations[class_col].isin(classes)]

        # Clip every box to the chunk
        trimmed_annots = chunk_annotations.copy()
        for col in (left_col, right_col, bot_col, top_col):
            trimmed_annots[col] = trimmed_annots[col].clip(lower=0, upper=1.0)

        # NOTE(review): the MIN_BOX_PERCENT filter below only affects the
        # verbose count. The boxes written to the .lst file and returned in
        # the dict come from ``trimmed_annots`` (unfiltered), and the ratio is
        # additionally scaled by spec_height * spec_width. This looks
        # unintended, but fixing it changes the generated dataset, so the
        # behavior is kept as-is until confirmed.
        overlaps = []
        for i in trimmed_annots.index:
            intersection = trimmed_annots.loc[i]
            original = chunk_annotations.loc[i]
            original_area = get_area(original)
            overlaps.append((get_area(intersection)*spec_height*spec_width) / original_area)
        chunk_annotations = trimmed_annots.loc[np.array(overlaps) > MIN_BOX_PERCENT]

        if verbose:
            print("Found {} annotations in chunk".format(len(chunk_annotations)))

        example_dict = {
            "filepath": spec_name,
            "height": spec_height,
            "width": spec_width,
            "xmins": trimmed_annots[left_col].tolist(),
            "xmaxs": trimmed_annots[right_col].tolist(),
            "ymins": trimmed_annots[top_col].tolist(),
            "ymaxs": trimmed_annots[bot_col].tolist(),
            "classes_text": trimmed_annots[class_col].tolist(),
            "classes": trimmed_annots[class_col].map(rev_class_map).tolist()
        }

        # Save Chunk as PNG image (lossless compression); rows are reversed
        # so the first spectrogram row ends up at the bottom of the image.
        im = Image.fromarray(mel_spec[::-1, :])
        im = im.convert("L")
        image_filepath = path.join(output_dir, spec_name)
        im.save(image_filepath)
        if verbose:
            print("Saved spectrogram to '{}'".format(spec_name))

        # Chunks without annotations get an image but no .lst entry.
        if len(example_dict["xmins"]) == 0:
            return example_dict, next_M_init

        # .lst row: index, header width (2), values per box (5), then
        # [class, xmin, ymin, xmax, ymax] for every box, then the image path.
        # The class id is always written as 0.
        # NOTE(review): ``index`` is the chunk number within this recording,
        # so rows from different recordings share index values — confirm the
        # consumer of the .lst file tolerates that.
        res = [index, 2, 5]
        for box in zip(example_dict["xmins"], example_dict["ymins"],
                       example_dict["xmaxs"], example_dict["ymaxs"]):
            res.append(0)
            res.extend(box)
        res.append(image_filepath)

        with open(lst_file_name, "a") as f:
            f.write("\t".join(str(el) for el in res))
            f.write('\n')

        return example_dict, next_M_init

    # Actually iterate through the file and extract chunks
    examples = []
    M_init = None
    for ind, start_i in enumerate(start_vals[:-1]):
        spec_name = "{}-{}.png".format(wav_filename, ind)
        ex, M_init = extract_chunk(start_i, start_i+chunk_size, spec_name, ind, M_init=M_init)
        examples.append(ex)
    if not drop_last_chunk:
        # The final chunk runs to the end of the recording.
        last_ind = len(start_vals) - 1
        spec_name = "{}-{}.png".format(wav_filename, last_ind)
        ex, _ = extract_chunk(start_vals[-1], len(data), spec_name, last_ind, M_init=M_init)
        examples.append(ex)
    return examples

This function removes all the spectrograms from the data folder.

In [None]:
def cleanup(directory=None):
    """
    Delete every regular file directly inside the spectrogram folder.

    ``directory`` defaults to ``output_dir``. Sub-directories and hidden
    files are left alone (same as the former ``rm data/*``), and an empty or
    missing folder is not an error. Note that everything in the folder is
    removed, not just PNG images, so any other file stored there is deleted
    as well.
    """
    target_dir = output_dir if directory is None else directory
    for entry in glob.glob(os.path.join(target_dir, "*")):
        if os.path.isfile(entry):
            os.remove(entry)



In [None]:

def create_lst_file(dataset, lst_file_name):
    """
    Run ``process_file`` on every recording in ``dataset``.

    Each recording is processed in dense training-size chunks with the dB
    range fixed to [-80.0, 0], keeping the last chunk; its rows are appended
    to ``lst_file_name``. A progress line is printed before each recording.
    """
    for position, recording in enumerate(dataset, start=1):
        print(f"{position}/{len(dataset)} wav files converted")
        process_file(
            recording,
            recording,
            -80.0,
            0,
            TRAIN_CHUNK_SIZE_SEC,
            lst_file_name,
            chunk_layout="dense",
            drop_last_chunk=False,
            verbose=False,
        )

#takes in the name of a .lst file and runs im2rec.py on it to build the corresponding rec file.
#Make sure to call cleanup before generating the spectrograms that the .lst file refers to.
def create_rec_file(lst_file_name, resize_size=256):
    """
    Pack the images listed in ``lst_file_name`` by running im2rec.py.

    Runs ``python im2rec.py --resize <resize_size> --pack-label <lst_file_name> .``
    from the current directory and echoes the script's output.

    Parameters
    ----------
    lst_file_name : str
        Path of the .lst file to pack.
    resize_size : int
        Value passed to ``--resize`` (default 256, the value used before).

    Raises
    ------
    RuntimeError
        If im2rec.py exits with a non-zero status. The shell magic used
        previously ignored failures, so a later upload step could pick up a
        stale or missing .rec file.
    """
    import subprocess

    # An argument list (no shell) keeps file names with spaces or shell
    # metacharacters intact.
    command = ["python", "im2rec.py", "--resize", str(resize_size), "--pack-label", lst_file_name, "."]
    completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                               universal_newlines=True)
    if completed.stdout:
        print(completed.stdout, end="")
    if completed.returncode != 0:
        raise RuntimeError("im2rec.py failed with exit code {} for '{}'".format(
            completed.returncode, lst_file_name))


In [None]:


def remove_Lst_fileIfOpen(file_name):
    """
    Delete ``file_name`` if it exists, printing a notice first.

    Nothing happens when the file is absent. Uses ``os.remove`` rather than a
    shell ``rm`` so names containing spaces or shell metacharacters are
    handled correctly.
    """
    if exists(file_name):
        print(f"{file_name} exists, removing now")
        os.remove(file_name)

    
#copies file from local notebook instance to sagemaker bucket
def copy_to_bucket(fileSource, fileDestination, bucket_name='sagemaker-us-west-2-959616474350'):
    """
    Upload a local file to an S3 bucket.

    Parameters
    ----------
    fileSource : str
        Path of the local file to upload.
    fileDestination : str
        Object key the file is stored under in the bucket.
    bucket_name : str
        Destination bucket. Defaults to the bucket that was previously
        hard-coded, so existing calls behave the same.
    """
    write_bucket = s3.Bucket(bucket_name)
    write_bucket.upload_file(fileSource, fileDestination)
    

This function takes a list of WAV file names and a file prefix. It creates the spectrograms for those WAV files, writes their annotations to "<prefix>.lst", and then builds the corresponding rec file from that list.

In [None]:
def final_rec_file(dataset, file_prefix):
    """
    Build "<file_prefix>.lst" for ``dataset`` and pack it into a rec file.

    Any existing .lst file with that name is removed first, because the rows
    are appended; the spectrograms and rows are then generated and the list
    is handed to ``create_rec_file``.
    """
    lst_path = "{}.lst".format(file_prefix)
    # Start from a clean list file
    remove_Lst_fileIfOpen(lst_path)
    # Spectrograms + .lst rows, then the rec file
    create_lst_file(dataset, lst_path)
    create_rec_file(lst_path)

Remember to call cleanup before any call to final_rec_file

In [None]:
# Remove everything in the data folder before generating the training spectrograms.
cleanup()

In [None]:
# Create the spectrograms and train_full.lst for the training recordings, then run im2rec.py on that list
# (expected to produce train_full.rec, which the next cell uploads).
final_rec_file(train_set, "train_full")

In [None]:
# Upload the local train_full.rec to the SageMaker bucket under the key train/train.rec.
copy_to_bucket("train_full.rec", "train/train.rec")

In [None]:
# Empty the data folder again so the training spectrograms are gone before the validation set is generated.
cleanup()

In [None]:
# Create the spectrograms and val.lst for the validation recordings, then run im2rec.py on that list
# (expected to produce val.rec, which the next cell uploads).
final_rec_file(validation_set, "val")

In [None]:
# Upload the local val.rec to the SageMaker bucket under the key validation/validation.rec.
copy_to_bucket("val.rec", "validation/validation.rec")