In [14]:
import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

2.5.1+cu124
2.5.1+cu124


In [15]:
filename = 'dr_manns_sound.wav'

Let's load the file:

In [16]:
waveform, sample_rate = torchaudio.load(filename)
print(waveform.shape)
print(sample_rate)

torch.Size([1, 86400000])
24000


Ok, now let's split it up into clips that are 10 seconds long and overlap with each other for 5 seconds:

In [17]:
clip_len = 10   # length of a clip in seconds
overlap = 5      # overlap between clips
step = clip_len-overlap
w = waveform.unfold(dimension=1, size=clip_len*sample_rate, step=step*sample_rate)
print(waveform.shape)
print(w.shape)
w = torch.transpose(w, 0, 1)
print(w.shape)

torch.Size([1, 86400000])
torch.Size([1, 719, 240000])
torch.Size([719, 1, 240000])


Alright, let's generate a mel transformation and another thing to transform the aplitude to decibels:

In [18]:
import torchaudio.transforms as tr

win_length = 2048
hop_length = 512
n_fft = 2048
n_mels = 128
ffts_per_sec = sample_rate/hop_length

mel_transform = tr.MelSpectrogram(sample_rate=sample_rate, n_fft=n_fft, win_length=win_length,
                                  hop_length=hop_length, n_mels=n_mels)
print(mel_transform)
db_transform = tr.AmplitudeToDB()
print(db_transform)

MelSpectrogram(
  (spectrogram): Spectrogram()
  (mel_scale): MelScale()
)
AmplitudeToDB()


To see what the mel transformation does to the shape of our block, let's apply it to the first block. You can see that instead of 240,000 moments, we have 469 moments, and for each we have 128 mel values, one for each bin:

In [19]:
first_block = w[0]
print(first_block.shape)
mel = mel_transform(first_block)
print(mel.shape)

torch.Size([1, 240000])
torch.Size([1, 128, 469])


But our network will take 416, not 469, so we need to interpolate:

In [20]:
import torch.nn.functional as F
mel = F.interpolate(mel, size=416, mode='linear', align_corners=False)
print(mel.shape)

torch.Size([1, 128, 416])


Nice! Now let's actually apply the mel transformation, the decibel thing and the interpolation to all our clips:

In [21]:
mel_samples=[]
for block in w:
    mel_spec = mel_transform(block)
    mel_spec = db_transform(mel_spec)
    mel_spec = F.interpolate(mel_spec, size=416, mode='linear', align_corners=False)
    mel_samples.append(mel_spec)
print(len(mel_samples))

719


Just to confirm everything is as we expect:

In [22]:
sample = mel_samples[0]
print(sample.shape)

torch.Size([1, 128, 416])


Now that we've gotten our mel spectrograms, it's time to try and convert our Raven annotations to YOLO format. In the original version of this notebook, we actually converted the annotations fully, changing both their time and frequency components to YOLO format. But since Prof. Chiu gave us this new direction of using channels as frequency, we will convert just the time parts.

Let's first load the annotations from the Raven file:

In [23]:
import numpy as np

annotations_filename = 'dr_manns_annotations.txt'

annotations=[]

with open(annotations_filename, "r") as f:
    lines_raw = [line.strip('\n').split('\t') for line in f.readlines()]
    
    del lines_raw[0]
    
    for raw_line in lines_raw:
        time_begin = float(raw_line[3])
        time_end = float(raw_line[4])
        
        # we can use 0 for call and 1 for song
        obj_class = 0 if raw_line[10] == "call" else 1
        
        annotations.append([obj_class, time_begin, time_end])
    
print(len(annotations))

427


Alright, now we can generate the YOLO boxes (1-d boxes) from our annotations. One annotation can correspond to multiple YOLO boxes because it might extend beyond the end of a clip. To solve this problem we use... a hashmap!

In [24]:
from math import floor

yolo_box_map = dict()    # a hashmap

# insert a box into the yolo box map
def insert_box(clip_num, annotation):
    
    # get the absolute time in the audio when a clip begins
    abs_clip_begin_time = clip_num*step
    # when does the box begin within the clip?
    box_begin_time = annotation[1]-abs_clip_begin_time
    # when does the box end?
    clip_end = abs_clip_begin_time + clip_len
    
    if annotation[2] <= clip_end:
        box_end_time = annotation[2]-abs_clip_begin_time
    else:
        box_end_time = clip_len     # clip_end - abs_clip_begin_time
        insert_box(clip_num+1, annotation)
    
    # ok, now we need to normalize and find the center
    # so first normalize...
    box_begin_x = box_begin_time/clip_len
    box_end_x = box_end_time/clip_len
    
    # then get the center and width
    center_x = (box_begin_x+box_end_x)/2
    width = box_end_x - box_begin_x
    
    # and... record it!
    yolo_box = [annotation[0], center_x, width]
    
    if clip_num in yolo_box_map:
        yolo_box_map[clip_num].append(yolo_box)
    else:
        yolo_box_map[clip_num] = [yolo_box]

    
for annotation in annotations:
    
    # there may be multiple clips going on right now (because of overlap)
    # so first order of business is to figure out which clip started most recently
    time_begin = annotation[1]
    clip_num = floor(time_begin/step)
    
    while (clip_num*step + clip_len) > time_begin:
        insert_box(clip_num, annotation)
        clip_num -= 1

print(len(yolo_box_map.keys()))

total_boxes=0
for k in yolo_box_map.keys():
    total_boxes += len(yolo_box_map[k])
print(total_boxes)

240
929


Ok, hopefully those numbers are right. There's isn't really an easy way to check.

Now here's a function to save the bounding boxes for a given clip. We actually will save these to the "labels" directory and use them as labels.

In [25]:
def save_boxes(clip_num):
    
    if clip_num not in yolo_box_map:
        return
    
    boxes = yolo_box_map[clip_num]
    
    with open(f"labels/{clip_num}.txt", "w") as f:
        for box in boxes:
            f.write(f"{box[0]} {box[1]} {box[2]}\n")

Finally we are ready to iterate through everything and save the spectrograms (as pytorch tensors) and the bounding boxes (using our function above). But before we save, we make a view without the original (audio) channels dimension (which is just 1 in our case), so that the pytorch Dataset class in our model can manage the batch dimension on its own.

In [26]:
for i in range(len(mel_samples)):
    
    spectrogram = mel_samples[i]
    
    # shape goes from [1, 128, 416] to [128, 416]
    spectrogram = spectrogram.view(128, 416)
    
    # save the tensor
    torch.save(spectrogram, f"images/{i}.pt")
    # save the bounding boxes
    save_boxes(i)