In [1]:
import numpy as np
import pydub
import requests
import os
import json

### MP3 to NumPy array
pydub requires ffmpeg to be installed and available on your PATH.

In [2]:
def mp3_to_np(mp3_file, normalized=False):
    """Decode an MP3 file into a NumPy array of PCM samples.

    Parameters
    ----------
    mp3_file
        Passed unchanged to ``pydub.AudioSegment.from_mp3``.
    normalized : bool, optional
        When True, samples are returned as float32 scaled into [-1, 1) by
        dividing by the full-scale value of the decoded sample width
        (2**15 for 16-bit audio). When False, the raw integer samples are
        returned.

    Returns
    -------
    tuple
        ``(frame_rate, samples)``. ``samples`` is 1-D for single-channel
        audio and has shape ``(n_frames, n_channels)`` otherwise.
    """
    a = pydub.AudioSegment.from_mp3(mp3_file)
    y = np.array(a.get_array_of_samples())
    if a.channels > 1:
        # The flat sample array is interleaved by frame; give each channel
        # its own column regardless of how many channels there are.
        y = y.reshape((-1, a.channels))
    if normalized:
        # sample_width is in bytes; a signed sample of that width spans
        # [-full_scale, full_scale).
        full_scale = 2 ** (8 * a.sample_width - 1)
        return a.frame_rate, np.float32(y) / full_scale
    return a.frame_rate, y

def np_to_mp3(dest_file, sr, x, normalized=False):
    """Encode a NumPy array of audio samples as a 320 kbit/s MP3 file.

    Parameters
    ----------
    dest_file
        Destination passed to ``AudioSegment.export``.
    sr : int
        Sample (frame) rate in Hz.
    x : array-like
        Audio samples. An array of shape ``(n_frames, 2)`` is written as
        stereo; anything else is written as a single channel.
    normalized : bool, optional
        When True, ``x`` holds floats in [-1, 1) which are scaled by 2**15.
        When False, ``x`` already holds 16-bit sample values.

    Notes
    -----
    Values outside the 16-bit range are clipped to [-32768, 32767] instead
    of wrapping around, so a normalized sample of exactly 1.0 becomes 32767
    rather than -32768.
    """
    # Work in float64 so that scaling and clipping cannot overflow the
    # input dtype before the final cast.
    samples = np.asarray(x, dtype=np.float64)
    channels = 2 if (samples.ndim == 2 and samples.shape[1] == 2) else 1
    if normalized:  # normalized array - each item should be a float in [-1, 1)
        samples = samples * 2 ** 15
    limits = np.iinfo(np.int16)
    # astype truncates toward zero, matching the previous np.int16(...) cast.
    y = np.clip(samples, limits.min, limits.max).astype(np.int16)
    song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
    song.export(dest_file, format="mp3", bitrate="320k")

#### Download select mp3 files for local machine testing

In [17]:
SELECT_SPECIES = ["Pale-legged_Leaf_Warbler", "Black-crested_Bulbul", "Blue_Whistling_Thrush", 
                  "Orange-bellied_Flowerpecker", "Great_Slaty_Woodpecker"]
PATH = "./data/mp3/"
MAX_RECORDINGS_PER_SPECIES = 3  # just grabbing a few for now

# The downloads below fail if the target directory is missing.
os.makedirs(PATH, exist_ok=True)

for filename in os.listdir("./data"):
    # Only "<species>.json" metadata files for the selected species are used.
    species, extension = os.path.splitext(filename)
    if extension != ".json" or species not in SELECT_SPECIES:
        continue

    # Keep the metadata file open only for as long as it takes to parse it.
    with open(os.path.join("./data", filename), 'r') as speciesFile:
        species_data = json.load(speciesFile)

    # Slicing (rather than indexing 0..2) tolerates species with fewer
    # records than the limit. The index `i` still counts every record, so
    # output file names are unchanged.
    for i, d in enumerate(species_data[:MAX_RECORDINGS_PER_SPECIES]):
        if not d["file-name"].endswith(".mp3"):
            continue
        mp3_url = d["file"]
        try:
            r = requests.get(mp3_url, allow_redirects=True, timeout=30)
            # Do not save an HTTP error page under an .mp3 name.
            r.raise_for_status()
        except requests.RequestException as err:
            print(f"Skipping {mp3_url}: {err}")
            continue
        with open(PATH + species + f"_{i}.mp3", 'wb') as mp3_out:
            mp3_out.write(r.content)

In [3]:
# mp3_to_np returns a (frame_rate, samples) tuple, so numpy_audio_sample[0] is
# the sample rate and numpy_audio_sample[1] is the decoded sample array.
numpy_audio_sample = mp3_to_np("data/mp3/Blue_Whistling_Thrush_0.mp3")

In [6]:
# Round-trip check: re-encode the decoded samples to MP3 at their original
# sample rate (index 0 is the sample rate, index 1 is the sample array).
np_to_mp3("data/mp3/test.mp3", numpy_audio_sample[0], numpy_audio_sample[1])

In [9]:
# Duration = number of samples (frames) divided by the sample rate.
print(
    "Audio length in seconds: ",
    len(numpy_audio_sample[1]) / numpy_audio_sample[0],
    sep="\n",
)

Audio length in seconds: 
31.978666666666665


In [4]:
# *** Perhaps in the future we should punish window values near the start and end. This would help center the bird call ***

def find_max_window(array, sr, samples_per_window, stride_coeff=.5):
    """Find the fixed-length window with the largest summed absolute value.

    Windows start at 0, stride, 2 * stride, ... and only windows that fit
    entirely inside ``array`` are considered. If several windows tie, the
    earliest one wins.

    Parameters
    ----------
    array : list or numpy array
        Samples to scan. For a 2-D array the first axis is scanned and the
        absolute values of every entry in a row are added together.
    sr : int
        Sample rate.
    samples_per_window : int
        Length of the desired window, in samples.
    stride_coeff : float, optional
        Controls the stride of the sliding window:
        ``stride = int(stride_coeff * sr)``, but never less than one sample.
        A stride_coeff of .5 means a stride of half a second.

    Returns
    -------
    tuple of int
        ``(i, j)`` such that ``array[i:j]`` is the best window, or
        ``(-1, -1)`` when the window or the stride is longer than the array,
        or when no window has a sum greater than zero.

    Raises
    ------
    ValueError
        If ``stride_coeff * sr`` is not positive (the window could never
        advance).
    """
    n_samples = len(array)
    window = int(samples_per_window)
    if window <= 0 or window > n_samples:
        return (-1, -1)

    step = stride_coeff * sr
    if step <= 0:
        raise ValueError("stride_coeff * sr must be positive")
    # A fractional step would truncate to 0 and never advance; move at least
    # one sample at a time.
    stride = max(1, int(step))
    if stride > n_samples:
        return (-1, -1)

    # float64 avoids integer overflow: abs(-32768) does not fit in int16, and
    # summing narrow integers can wrap around.
    magnitude = np.abs(np.asarray(array, dtype=np.float64))
    if magnitude.ndim > 1:
        # Collapse channels so every frame contributes a single value.
        magnitude = magnitude.reshape(n_samples, -1).sum(axis=1)

    # running[k] is the sum of the first k magnitudes, so each window sum is
    # a difference of two entries instead of a fresh pass over the window.
    running = np.concatenate(([0.0], np.cumsum(magnitude)))
    starts = np.arange(0, n_samples - window + 1, stride)
    window_sums = running[starts + window] - running[starts]

    best = int(np.argmax(window_sums))  # first occurrence wins ties
    if not window_sums[best] > 0:
        return (-1, -1)
    best_i = int(starts[best])
    return (best_i, best_i + window)
    

In [5]:
def extract_best_windows(audio_array, sr, window_size=5):
    """Split audio into non-overlapping windows, loudest first.

    The window with the largest total magnitude is taken first; the audio
    to its left and to its right is then processed the same way, for as
    long as a piece is at least one window long.

    Parameters
    ----------
    audio_array : 1D numpy array
        Samples of the audio file.
    sr : int
        Sample rate.
    window_size : int, optional
        Desired length of each window in seconds.

    Returns
    -------
    list
        Slices of ``audio_array``, each ``sr * window_size`` samples long.
        Order: the best window of a segment, then everything extracted from
        the part before it, then everything from the part after it.
        Segments in which find_max_window finds no window (for example pure
        silence) contribute nothing.
    """
    samples_per_window = int(sr * window_size)
    if samples_per_window <= 0:
        return []

    ret = []
    # Explicit stack of (lo, hi) index ranges still to be searched. This
    # replaces recursion so long recordings cannot hit the recursion limit.
    pending = [(0, len(audio_array))]
    while pending:
        lo, hi = pending.pop()
        if hi - lo < samples_per_window:
            continue

        start, end = find_max_window(audio_array[lo:hi], sr, samples_per_window)
        if start < 0:
            # No usable window here; slicing with the (-1, -1) sentinel
            # would produce empty windows and near-endless subdivision.
            continue

        ret.append(audio_array[lo + start:lo + end])
        # Push the right part first so the left part is popped (and its
        # windows emitted) before the right part.
        pending.append((lo + end, hi))
        pending.append((lo, lo + start))

    return ret
    
    

In [6]:
# Run this cell to test: split the sample recording into windows using the
# default window_size (5 seconds). `a` is the returned list of sample arrays.
a = extract_best_windows(numpy_audio_sample[1], numpy_audio_sample[0])
# Uncomment to write individual windows out as MP3 files for listening:
# np_to_mp3("data/mp3/test0.mp3", numpy_audio_sample[0], a[0])
# np_to_mp3("data/mp3/test1.mp3", numpy_audio_sample[0], a[1])
# np_to_mp3("data/mp3/test2.mp3", numpy_audio_sample[0], a[2])
# np_to_mp3("data/mp3/test3.mp3", numpy_audio_sample[0], a[3])

In [8]:
# Shape of the first extracted window; its length is sample rate * window_size.
np.shape(a[0])

(240000,)