<a href="https://colab.research.google.com/github/LianaN/local_humpback_vocalization/blob/main/1_data_acquisition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Acquisition
This notebook downloads the labeled humpback vocalization data from [Orcasound's AWS open data repository](https://open.quiltdata.com/b/acoustic-sandbox/tree/humpbacks/Emily-Vierling-Orcasound-data/Em_HW_data/flac_files/). The dataset was prepared by Emily Vierling. It includes ~9,000 labels and is based on ~YY hours (TODO: fill in the total duration) of audio recorded on 3 days between October 3 and October 28, 2021.

In [None]:
# Third-party packages imported by this notebook (quilt3, pydub).
# Uncomment and run these lines on a fresh runtime where they are not installed yet.
#!pip install 'quilt3[pyarrow]'
#!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
# Documentation of quilt3 package: https://docs.quiltdata.com/walkthrough/working-with-a-bucket
import quilt3

import os
import pandas as pd
from IPython.display import Audio, display
from pydub import AudioSegment

In [None]:
# Open a handle to the public AWS S3 bucket used by the download cells
bucket_uri = "s3://acoustic-sandbox"
b = quilt3.Bucket(bucket_uri)

In [None]:
# Fetch the annotation files stored under the remote prefix into a local folder
annotations_key = "humpbacks/Emily-Vierling-Orcasound-data/Em_HW_data/Annotations/"
annotations_dir = "./sample_data/annotations/"
b.fetch(annotations_key, annotations_dir)

100%|██████████| 76.8k/76.8k [00:03<00:00, 25.0kB/s]


In [None]:
# Fetch the FLAC recordings stored under the remote prefix into a local folder
audio_key = "humpbacks/Emily-Vierling-Orcasound-data/Em_HW_data/flac_files/"
audio_dir = "./sample_data/audio/"
b.fetch(audio_key, audio_dir)

100%|██████████| 993M/993M [00:13<00:00, 72.4MB/s]


In [None]:
# List the downloaded annotation files and remember their names
folder_path = './sample_data/annotations/'

# An absent folder yields an empty list instead of an error
filenames = os.listdir(folder_path) if os.path.exists(folder_path) else []

for filename in filenames:
    print(filename)

OS_10_28_2021_18_54_00_.Table.1.selections.txt
OS_10_28_2021_19_55_00_.Table.1.selections.txt
OS_10_28_2021_20_25_00_HB.Table.1.selections.txt
211026-133018-OS-humpback-47min-clip.Table.1.selections.txt
OS_10_28_2021_1900_HB.Table.1.selections.txt
OS_10_03_2021_19_34_00_.Table.1.selections.txt
OS_10_28_2021_19_24_00_.Table.1.selections.txt


In [None]:
# Preview the first rows of one annotation file (tab-separated text)
path = "./sample_data/annotations"
annotation_file = f"{path}/{filenames[0]}"

df = pd.read_csv(annotation_file, delimiter="\t")
df.head()

Unnamed: 0,Selection,Begin Time (s),End Time (s),Low Freq (Hz),High Freq (Hz),Call Type
0,1,370.710682,371.418228,40.533,607.996,Growl
1,2,606.940232,607.411929,364.798,1337.592,Upsweep
2,3,612.735841,615.157221,283.732,2026.654,Moan
3,4,620.314446,622.965406,0.0,1742.923,Moan
4,5,628.248417,630.764136,243.199,1823.989,Descending moan


In [None]:
# Review audio data
def list_audio_files(folder, extension=".flac"):
    """Return the sorted names of the files in `folder` ending with `extension`.

    Parameters
    ----------
    folder : str
        Directory to scan.
    extension : str
        File extension to keep; compared case-insensitively.

    Returns
    -------
    list of str
        Matching file names (not full paths), sorted alphabetically.
        Empty if `folder` does not exist.
    """
    if not os.path.isdir(folder):
        return []
    wanted = extension.lower()
    # os.listdir returns entries in arbitrary order; sorting makes index-based
    # access (e.g. audio_filenames[0]) reproducible across runs.
    return sorted(name for name in os.listdir(folder) if name.lower().endswith(wanted))


folder_path = './sample_data/audio/'

# Keep only the FLAC recordings: WAV copies are later written to this same
# folder, and on a re-run they must not be treated as FLAC input.
audio_filenames = list_audio_files(folder_path)

for audio_filename in audio_filenames:
    print(audio_filename)

OS_10_03_2021_19_34_00_.flac
211026-133018-OS-humpback-47min-clip.flac
OS_10_28_2021_19_55_00_.flac
OS_10_28_2021_18_54_00_.flac
OS_10_28_2021_19_24_00_.flac
OS_10_28_2021_1900_HB.flac
OS_10_28_2021_20_25_00_HB.flac


In [None]:
# IPython.display.Audio doesn't support FLAC format directly.
# Therefore we should convert FLAC files to WAV format using pydub and then play them.
path = "./sample_data/audio"

for flac_name in audio_filenames:
  # Split off the extension so that only the suffix is swapped; a plain
  # str.replace("flac", "wav") would also rewrite "flac" inside the file stem.
  stem, extension = os.path.splitext(flac_name)
  if extension.lower() != ".flac":
    # Skip anything that is not a FLAC file (e.g. WAV copies from an earlier run)
    continue
  audio = AudioSegment.from_file(f"{path}/{flac_name}", format="flac")
  audio_filename_wav = f"{stem}.wav"
  audio.export(f"{path}/{audio_filename_wav}", format="wav")

In [None]:
# Play an arbitrary audio file
# Derive the WAV name by swapping only the extension; str.replace("flac", "wav")
# would also rewrite "flac" occurring elsewhere in the file name.
audio_filename_wav = os.path.splitext(audio_filenames[0])[0] + ".wav"

display(Audio(f"{path}/{audio_filename_wav}", autoplay=True))