In [2]:
pip install librosa

Collecting librosa
  Using cached librosa-0.10.1-py3-none-any.whl (253 kB)
Collecting lazy-loader>=0.1
  Using cached lazy_loader-0.3-py3-none-any.whl (9.1 kB)
Collecting numba>=0.51.0
  Using cached numba-0.57.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.6 MB)
Collecting soundfile>=0.12.1
  Using cached soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl (1.2 MB)
Collecting soxr>=0.3.2
  Downloading soxr-0.3.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting msgpack>=1.0
  Using cached msgpack-1.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (316 kB)
Collecting audioread>=2.1.9
  Using cached audioread-3.0.0-py3-none-any.whl
Collecting pooch>=1.0
  Using cached pooch-1.7.0-py3-none-any.whl (60 kB)
Collecting llvmlite<0.41,>=0.40.0dev0
  Using cached llvmlite-0.40.1-cp310-cp310-manylinu

In [73]:
import sys
sys.path.append('../')

import librosa
import boto3
import os
from src.s3_utils import download_s3_object_to_memory, read_audio_fromS3, read_selection_table_fromS3, trim_audio_file
import yaml
import pandas as pd
import soundfile as sf
import io

In [42]:
# Bucket that every S3 call in this notebook targets
bucket_name = 'sabah'

# Parse the YAML connection config that stores the S3 credentials
with open('config/connection_config.yaml', 'r') as f:
    credentials = yaml.safe_load(f)

# Pull both keys out of the parsed config in one step
access_key, secret_access_key = credentials['access_key'], credentials['secret_access_key']

# Build the S3 client used by the rest of the notebook
s3 = boto3.client(
    's3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_access_key,
)

In [43]:
# Build a data frame that maps each selection table (which contains the labels)
# to its corresponding sound file. The CSV is fetched from S3 into memory first.
slctn_tbls_to_sndfls_bytes = download_s3_object_to_memory(
    bucket_name,
    'labels/selection_tables_to_soundfiles.csv',
    s3,
)
slctn_tbls_to_sndfls_df = pd.read_csv(slctn_tbls_to_sndfls_bytes)
slctn_tbls_to_sndfls_df.head()

Unnamed: 0,selection_table,soundfile,selection_table_directory,soundfile_directory
0,ELOC26_1677321565102_2023-03-03_08-39-25.Table...,ELOC26_1677321565102_2023-03-03_08-39-25.wav,labels/selection_tables/ELOC26_1678094402251/E...,soundfiles/ELOC26_1678094402251/ELOC26_1677321...
1,ELOC26_1677321565102_2023-03-03_09-39-25.Table...,ELOC26_1677321565102_2023-03-03_09-39-25.wav,labels/selection_tables/ELOC26_1678094402251/E...,soundfiles/ELOC26_1678094402251/ELOC26_1677321...
2,ELOC26_1677321565102_2023-03-03_10-39-25.Table...,ELOC26_1677321565102_2023-03-03_10-39-25.wav,labels/selection_tables/ELOC26_1678094402251/E...,soundfiles/ELOC26_1678094402251/ELOC26_1677321...
3,ELOC26_1677321565102_2023-03-03_13-39-25.Table...,ELOC26_1677321565102_2023-03-03_13-39-25.wav,labels/selection_tables/ELOC26_1678094402251/E...,soundfiles/ELOC26_1678094402251/ELOC26_1677321...
4,ELOC6_1678095504743_2023-03-06_11-38-25.Table....,ELOC6_1678095504743_2023-03-06_11-38-25.wav,labels/selection_tables/ELOC6_1678095504743/EL...,soundfiles/ELOC6_1678095504743/ELOC6_167809550...


In [45]:
# Alias for the boto3 S3 client; the clip-extraction loop passes `client` to the S3 read helpers
client = s3

In [83]:
# For every (selection table, sound file) pair: load the recording and its
# labels from S3, cut one clip per label, and upload each clip back to the
# bucket under soundfiles_trimmed/<sound_category>/<sound_type>/<label_id>.wav.
for file_idx, row in slctn_tbls_to_sndfls_df.iterrows():
    # read the audio file (samples plus the sample rate reused when writing clips)
    audio_file_path = row['soundfile_directory']
    audio, sr = read_audio_fromS3(audio_file_path, bucket_name, client)
    # read the selection table that holds the labels for this recording
    selection_table_path = row['selection_table_directory']
    selection_table = read_selection_table_fromS3(selection_table_path, bucket_name, client)

    # A distinct index name keeps the inner loop from clobbering the outer one.
    for label_idx, label in selection_table.iterrows():
        start, end = label["Begin Time (s)"], label["End Time (s)"]
        # presumably cuts `audio` down to the [start, end] window -- confirm in src.s3_utils
        clip = trim_audio_file(audio, start, end, sr)
        sound_category = label['sound_category']
        sound_type = label['sound_type']
        label_id = label["label_id"]

        output_dir = f"soundfiles_trimmed/{sound_category}/{sound_type}/"

        filename = f"{label_id}.wav"  # Name the file using label_id
        # S3 keys always use '/', so build the key with plain string formatting
        # rather than os.path.join, whose separator depends on the host OS.
        output_path = f"{output_dir}{filename}"

        # Encode the clip as 24-bit PCM WAV into an in-memory buffer.
        with io.BytesIO() as audio_file:
            sf.write(audio_file, clip, sr, format='WAV', subtype='PCM_24')
            audio_file.seek(0)  # rewind so the upload reads from the first byte
            print(output_path)
            # Upload the audio file to S3 through the same client handle used for the reads
            client.upload_fileobj(audio_file, bucket_name, output_path, ExtraArgs={'ContentType': "audio/wav"})

soundfiles/ELOC26_1678094402251/ELOC26_1677321565102_2023-03-03_08-39-25.wav
labels/selection_tables/ELOC26_1678094402251/ELOC26_1677321565102_2023-03-03_08-39-25.Table.1.selections.txt
soundfiles_trimmed/elephant_vocalization/roar/roar_ELOC26_1677321565102_2023-03-03_08-39-25.Table.1.selections_1.wav
soundfiles_trimmed/elephant_vocalization/longroar/longroar_ELOC26_1677321565102_2023-03-03_08-39-25.Table.1.selections_2.wav
soundfiles/ELOC26_1678094402251/ELOC26_1677321565102_2023-03-03_09-39-25.wav
labels/selection_tables/ELOC26_1678094402251/ELOC26_1677321565102_2023-03-03_09-39-25.Table.1.selections.txt
soundfiles_trimmed/elephant_vocalization/longroar/longroar_ELOC26_1677321565102_2023-03-03_09-39-25.Table.1.selections_1.wav
soundfiles_trimmed/elephant_vocalization/longroar/longroar_ELOC26_1677321565102_2023-03-03_09-39-25.Table.1.selections_2.wav
soundfiles_trimmed/elephant_vocalization/longroar/longroar_ELOC26_1677321565102_2023-03-03_09-39-25.Table.1.selections_3.wav
soundfiles_

In [63]:
# Duration in seconds of the clip left over from the trimming loop. Divide by
# the sample rate the clip was written with (`sr`) rather than a hard-coded
# 16000, so the value stays correct for recordings at other sample rates.
len(clip) / sr

3.888375

In [81]:
# Display the S3 bucket name currently configured for this notebook
bucket_name

'sabah'