<a href="https://colab.research.google.com/github/LianaN/local_humpback_vocalization/blob/main/1_data_acquisition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Acquisition
This notebook downloads labeled humpback whale vocalization data from [Orcasound's AWS open data repository](https://open.quiltdata.com/b/acoustic-sandbox/tree/humpbacks/Emily-Vierling-Orcasound-data/Em_HW_data/flac_files/). The dataset was prepared by Emily Vierling. It includes ~9,000 labels and is based on ~YY hours of audio (TODO: replace YY with the actual total) recorded on 3 days between October 3 and October 28, 2021.

In [None]:
#!pip install quilt3[pyarrow]

In [None]:
#!pip install pydub

In [None]:
# Documentation of quilt3 package: https://docs.quiltdata.com/walkthrough/working-with-a-bucket
import quilt3

import os
import pandas as pd
from IPython.display import Audio, display
from pydub import AudioSegment

In [None]:
# Root folder for everything this notebook downloads. It is a relative path, so it
# is resolved against the kernel's working directory. The cells below write into
# its raw/ and preprocessed/ sub-folders.
data_download_folder = "../data"

In [None]:
# Connect to a public AWS S3 bucket.
# `b` is the bucket handle that the download cells below call `fetch` on.
b = quilt3.Bucket("s3://acoustic-sandbox")

In [None]:
# Fetch the annotation files from the bucket: first the raw annotations,
# then the preprocessed ones, each into its own local folder.
raw_annotations_dir = f"{data_download_folder}/raw/annotations/"
preprocessed_annotations_dir = f"{data_download_folder}/preprocessed/annotations/"

b.fetch(
    "humpbacks/Emily-Vierling-Orcasound-data/Em_HW_data/Annotations/",
    raw_annotations_dir,
)
b.fetch(
    "humpbacks/Emily-Vierling-Orcasound-data/Em_HW_Processed/annotationfiles/specs/",
    preprocessed_annotations_dir,
)

In [None]:
# Fetch the audio files from the bucket: first the raw FLAC recordings,
# then the preprocessed audio, each into its own local folder.
raw_audio_dir = f"{data_download_folder}/raw/audio/"
preprocessed_audio_dir = f"{data_download_folder}/preprocessed/audio/"

b.fetch(
    "humpbacks/Emily-Vierling-Orcasound-data/Em_HW_data/flac_files/",
    raw_audio_dir,
)
b.fetch(
    "humpbacks/Emily-Vierling-Orcasound-data/Em_HW_Processed/annotationfiles/audio/",
    preprocessed_audio_dir,
)

In [None]:
# Review annotations data: collect and print the names of the downloaded
# raw annotation files. `filenames` is read by the next cell.
folder_path = f"{data_download_folder}/raw/annotations/"
filenames = []

if os.path.exists(folder_path):
    # os.listdir returns entries in arbitrary order, so sort them: the next cell
    # opens filenames[0], which should be the same file on every run.
    # Keep regular files only, so that filenames[0] can never be a sub-folder.
    filenames = sorted(
        entry
        for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    )
    for filename in filenames:
        print(filename)
else:
    # Say why the list is empty instead of silently leaving it empty.
    print(f"Folder not found: {folder_path} (run the download cells first)")

In [None]:
# Show the content of an arbitrary annotation file (the first entry of `filenames`).
path = f"{data_download_folder}/raw/annotations/"

# Fail with an actionable message rather than a bare IndexError on filenames[0].
if not filenames:
    raise FileNotFoundError(
        f"No annotation files found in {path!r}; run the download and listing cells first."
    )

# os.path.join does not double the separator that `path` already ends with.
# The file is parsed as tab-separated text.
df = pd.read_csv(os.path.join(path, filenames[0]), sep="\t")
df.head()

In [None]:
# Review audio data: collect and print the names of the downloaded FLAC recordings.
# `audio_filenames` is read by the conversion and playback cells below.
folder_path = f"{data_download_folder}/raw/audio/"
audio_filenames = []

if os.path.exists(folder_path):
    # Keep only *.flac entries. The conversion cell below writes its WAV output
    # into this same folder, so without the filter a re-run of this cell would
    # list those WAVs too; they would then be decoded as FLAC and, because
    # .replace("flac", "wav") leaves their names unchanged, exported over themselves.
    # The match is case-sensitive on purpose: the downstream rename only
    # rewrites a lowercase "flac".
    # Sorting makes audio_filenames[0] the same file on every run
    # (os.listdir order is arbitrary).
    audio_filenames = sorted(
        entry for entry in os.listdir(folder_path) if entry.endswith(".flac")
    )
    for audio_filename in audio_filenames:
        print(audio_filename)
else:
    # Say why the list is empty instead of silently leaving it empty.
    print(f"Folder not found: {folder_path} (run the download cells first)")

In [None]:
# IPython.display.Audio doesn't support FLAC format directly.
# Therefore we should convert FLAC files to WAV format using pydub and then play them.
# The WAV copies are written next to the FLAC files; `path` is reused by the playback cell.
path = f"{data_download_folder}/raw/audio"

for audio_filename in audio_filenames:
    # Skip anything that is not a FLAC file. WAVs written by an earlier run live in
    # the same folder; decoding one as FLAC and exporting it under its unchanged
    # name would overwrite it in place.
    if not audio_filename.endswith(".flac"):
        continue
    audio = AudioSegment.from_file(f"{path}/{audio_filename}", format="flac")
    # Same naming rule as the playback cell: every "flac" in the name becomes "wav".
    audio_filename_wav = audio_filename.replace("flac", "wav")
    # export() returns the output file handle; close it instead of leaving it open.
    audio.export(f"{path}/{audio_filename_wav}", format="wav").close()

In [None]:
# Listen to one recording: the WAV copy of the first entry in `audio_filenames`.
# `path` is the audio folder set by the conversion cell.
audio_filename_wav = audio_filenames[0].replace("flac","wav")
wav_player = Audio(f"{path}/{audio_filename_wav}", autoplay=True)

display(wav_player)