In [1]:
import numpy as np
import pydub
import requests
import os
import json
import noisereduce as nr
import librosa
import librosa.display
from scipy.io import wavfile
import matplotlib.pyplot as plt
from PreprocessingFunctions import *

#### Download select mp3 files for local machine testing

In [None]:
PATH = "./data/mp3/"

for filename in os.listdir("./data"):
    if filename.endswith(".json"):
        with open("./data/" + filename, 'r') as speciesFile:
            species_data = json.load(speciesFile)
            for i in range(0, len(species_data)):
                d = species_data[i]
                if d["file-name"].endswith(".mp3") and d["smp"]=="48000" and d["q"] != "E" and d["q"] != "F":
                    mp3_url = d["file"]
                    r = requests.get(mp3_url, allow_redirects=True)
                    open(PATH + filename[:-5] + f"_{i}.mp3", 'wb').write(r.content)

#### Make data windows

In [None]:
PATH_TO_DATA_MP3 = "./data/mp3"
desired_files = [x for x in os.listdir(PATH_TO_DATA_MP3) if "Identity_unknown" not in x]
window_root_names = ["_".join(x.split("_")[:-1]) for x in os.listdir("./data/mp3_windows")]
desired_files = [x for x in desired_files if x[:-4] not in window_root_names]
num_samples = len(desired_files)
c = 0
print(f"starting with {desired_files[1]}")
print(f"{num_samples} files to convert")

for filename in desired_files:
    try:
        if filename.endswith(".mp3"):
            c+=1
            if c % 25 == 0:
                print(round(c/num_samples, 5))
            sr, numpy_audio_array = mp3_to_np(PATH_TO_DATA_MP3 + "/" + filename)
            if len(numpy_audio_array.shape) == 2:
                numpy_audio_array = np.mean(numpy_audio_array, axis=1)
            windows = extract_best_windows(numpy_audio_array, sr, max_power=0)
            for i in range(0, len(windows)):
                np_to_mp3("./data/mp3_windows/" + filename[:-4] + f"_{i}.mp3", sr, windows[i])
    except:
        print(filename + " failed to convert")

starting with Abbott's_Babbler_154.mp3
3654 files to convert
0.00684
0.01368
0.02053
0.02737
0.03421
0.04105
0.04789
0.05473
0.06158
0.06842
0.07526
0.0821
0.08894
0.09579
0.10263
0.10947
0.11631
Common_Waxbill_176.mp3 failed to convert
0.12315
0.12999
0.13684
0.14368
0.15052
0.15736
0.1642
0.17105
0.17789
0.18473
0.19157
0.19841
0.20525
Indian_Paradise_Flycatcher_13.mp3 failed to convert
0.2121
0.21894
0.22578
0.23262
0.23946
0.24631
0.25315
0.25999
0.26683
0.27367
0.28051
0.28736
0.2942
0.30104
0.30788
0.31472


### Mp3 To Wav

In [None]:
PATH_TO_DATA_WAV='./data/wav_windows"
PATH_TO_MP3_WINDOWS = "./data/mp3_windows"

convert_mp3s_to_wav(os.listdir(PATH_TO_MP3_WINDOWS),PATH_TO_MP3_WINDOWS+"/", PATH_TO_DATA_WAV+"/")

### Denoise all wav files

In [16]:
WAV_PATH = PATH_TO_DATA_WAV
DEST_PATH = "data/dn_wav_windows"
for filename in os.listdir(WAV_PATH):
    if filename.endswith(".wav"):
        # load data
        data, sr = librosa.load(WAV_PATH +"/"+ filename, sr=None)
        # perform noise reduction
        reduced_noise = nr.reduce_noise(y=data, sr=sr)
        wavfile.write(DEST_PATH +"/"+ filename[:-4] + "dn.wav", sr, reduced_noise)
        

### Wav Window To Spectrogram 

In [None]:
PATH_TO_SPECTRO = '/data/spectrograms"

get_spectro_from_wav(os.listdir(PATH_TO_DATA_WAV),PATH_TO_DATA_WAV+"/",PATH_TO_SPECTRO+"/")



#### Sort data into training and validation splits. Testing data will come from real recordings

In [14]:
import random
import shutil

SPEC_PATH = "data/denoised_spectrogram"
TRAIN_PATH = "data/train"
VAL_PATH = "data/validation"
TRAIN_RATIO = .9

if not os.path.exists(TRAIN_PATH):
    os.mkdir(TRAIN_PATH)
if not os.path.exists(VAL_PATH):
    os.mkdir(VAL_PATH)
    
for directory in os.listdir(SPEC_PATH):
    if not os.path.exists(TRAIN_PATH+"/"+directory):
        os.mkdir(TRAIN_PATH+"/"+directory)
    if not os.path.exists(VAL_PATH+"/"+directory):
        os.mkdir(VAL_PATH+"/"+directory)
    file_list = os.listdir(SPEC_PATH+"/"+directory)
    random.shuffle(file_list)
    t_idx = int(len(file_list)*TRAIN_RATIO)
    train_files = file_list[:t_idx]
    val_files = file_list[t_idx:]
    train_dest = TRAIN_PATH+"/"+directory
    val_dest = VAL_PATH+"/"+directory
    for filename in train_files:
        if not filename.endswith(".ipynb_checkpoints"):
            shutil.copy(SPEC_PATH+"/"+directory+"/"+filename, train_dest)
    for filename in val_files:
        if not filename.endswith(".ipynb_checkpoints"):
            shutil.copy(SPEC_PATH+"/"+directory+"/"+filename, val_dest)
            

### Make denoised spectrograms of testing data

In [2]:
wav_file_path = "data/costa_rica/test_wav/Bird_1dn.wav"
export_path = "data/costa_rica/test/"
file_name_root = "Bird1dn"
convert_long_wav_to_spectro_windows(wav_file_path, export_path, file_name_root, window_size=5)