In [1]:
import numpy as np
import pydub
import requests
import os
import json
import noisereduce as nr
import librosa
import librosa.display
from scipy.io import wavfile
import matplotlib.pyplot as plt
from PIL import Image
from PreprocessingFunctions import *

#### Download select mp3 files for local machine testing

In [None]:
# Download the recordings listed in the per-species JSON metadata files found in
# ./data, saving each one as ./data/mp3/<json name without extension>_<index>.mp3.
PATH = "./data/mp3/"
REQUEST_TIMEOUT_S = 60  # give up on a stalled download instead of hanging the cell

# Opening a file inside a missing directory would raise on the first download.
os.makedirs(PATH, exist_ok=True)

for filename in os.listdir("./data"):
    if not filename.endswith(".json"):
        continue
    with open("./data/" + filename, 'r') as speciesFile:
        species_data = json.load(speciesFile)
    for i, d in enumerate(species_data):
        # Keep only entries whose file is an mp3, whose "smp" field is "48000"
        # (presumably the sample rate - confirm against the metadata source) and
        # whose "q" field (presumably a quality grade) is neither "E" nor "F".
        if not (d["file-name"].endswith(".mp3") and d["smp"] == "48000" and d["q"] not in ("E", "F")):
            continue
        mp3_url = d["file"]
        try:
            r = requests.get(mp3_url, allow_redirects=True, timeout=REQUEST_TIMEOUT_S)
            # Without this check an HTTP error page would be saved as a ".mp3" file.
            r.raise_for_status()
        except requests.RequestException as err:
            print(f"{mp3_url} failed to download: {err}")
            continue
        # filename[:-5] strips the ".json" suffix; the context manager guarantees
        # the file is flushed and closed before the next download starts.
        with open(PATH + filename[:-5] + f"_{i}.mp3", 'wb') as mp3_file:
            mp3_file.write(r.content)

#### Make data windows

In [2]:
# Cut every not-yet-processed recording in ./data/mp3 into windows and save each
# window as ./data/mp3_windows/<recording name>_<window index>.mp3.
PATH_TO_DATA_MP3 = "./data/mp3"
# Window files are named <recording>_<i>.mp3, so dropping the last "_" field
# recovers the recording name; recordings that already have windows are skipped.
window_root_names = {"_".join(x.split("_")[:-1]) for x in os.listdir("./data/mp3_windows")}
# Filter to mp3 files up front so num_samples (and the progress fraction below)
# only counts files that will actually be converted.
desired_files = [
    x for x in os.listdir(PATH_TO_DATA_MP3)
    if x.endswith(".mp3")
    and "Identity_unknown" not in x
    and x[:-4] not in window_root_names
]
num_samples = len(desired_files)
c = 0
if desired_files:
    # Index 0 is the first file; guarding avoids an IndexError when nothing is left.
    print(f"starting with {desired_files[0]}")
print(f"{num_samples} files to convert")

for filename in desired_files:
    c += 1
    if c % 25 == 0:
        # Fraction of files started so far.
        print(round(c/num_samples, 5))
    try:
        sr, numpy_audio_array = mp3_to_np(PATH_TO_DATA_MP3 + "/" + filename)
        if len(numpy_audio_array.shape) == 2:
            # Collapse a 2-D array to 1-D by averaging over axis 1
            # (presumably stereo -> mono; confirm against mp3_to_np).
            numpy_audio_array = np.mean(numpy_audio_array, axis=1)
        windows = extract_best_windows(numpy_audio_array, sr, max_power=0)
        for i, window in enumerate(windows):
            np_to_mp3("./data/mp3_windows/" + filename[:-4] + f"_{i}.mp3", sr, window)
    except Exception as err:
        # Best effort: report the failure and move on. Catching Exception (not a
        # bare except) keeps Ctrl-C / kernel interrupt working.
        print(filename + " failed to convert: " + str(err))

### Convert MP3 windows to WAV

In [2]:
# Convert every mp3 window that does not yet have a wav counterpart.
PATH_TO_DATA_WAV= "./data/wav_windows"
PATH_TO_MP3_WINDOWS = "./data/mp3_windows"
# A set makes each "already converted?" lookup O(1) instead of scanning a list.
current_wav_windows = set(os.listdir(PATH_TO_DATA_WAV))
# Only hand real mp3 files to the converter; other directory entries
# (e.g. .ipynb_checkpoints) are not audio and would just fail.
desired_files = [
    x for x in os.listdir(PATH_TO_MP3_WINDOWS)
    if x.endswith(".mp3") and x.replace(".mp3",".wav") not in current_wav_windows
]
print(desired_files[:2])

convert_mp3s_to_wav(desired_files, PATH_TO_MP3_WINDOWS+"/", PATH_TO_DATA_WAV+"/")

['Eurasian_Tree_Sparrow_1692_9.mp3', 'House_Sparrow_1314_31.mp3']
Eurasian_Tree_Sparrow_1692_9.mp3 failed to convert


### Denoise all wav files

In [2]:
# Run noise reduction on every wav window that has no denoised copy yet and
# write the result to data/dn_wav_windows/<name>dn.wav.
# NOTE: PATH_TO_DATA_WAV must already be defined by the mp3 -> wav cell.
WAV_PATH = PATH_TO_DATA_WAV
DEST_PATH = "data/dn_wav_windows"
all_wav_files = os.listdir(WAV_PATH)
current_denoised_files = set(os.listdir(DEST_PATH))
# Build the expected output name exactly as it is written below
# (filename[:-4] + "dn.wav") so the skip check and the writer cannot disagree,
# and ignore non-wav entries so total_files reflects real work.
desired_files = [
    x for x in all_wav_files
    if x.endswith(".wav") and x[:-4] + "dn.wav" not in current_denoised_files
]
total_files = len(desired_files)
c=0
print(f"Starting with {desired_files[:3]}")

for filename in desired_files:
    try:
        # sr=None asks librosa to keep the file's native sampling rate
        data, sr = librosa.load(WAV_PATH +"/"+ filename, sr=None)
        # perform noise reduction
        reduced_noise = nr.reduce_noise(y=data, sr=sr)
        wavfile.write(DEST_PATH +"/"+ filename[:-4] + "dn.wav", sr, reduced_noise)
    except Exception as err:
        # Best effort: report and continue. Catching Exception (not a bare
        # except) keeps Ctrl-C / kernel interrupt working.
        print(f"{filename} failed to denoise: {err}")
        continue
    c+=1
    # Report progress only when the success counter actually advances; checking
    # after a failure would reprint the same fraction for every failed file.
    if c % 100 == 0:
        print(round(c/total_files, 5))
        

### Convert denoised WAV windows to spectrograms

In [2]:
# Render a spectrogram image for each denoised wav window that does not yet
# have a matching .jpg in the spectrogram folder.
DN_WAV_PATH = "data/dn_wav_windows"
PATH_TO_SPECTRO = "data/spectrograms"

all_wav_files = os.listdir(DN_WAV_PATH)
current_spectros = os.listdir(PATH_TO_SPECTRO)

# A wav window counts as done when a file with the same name but a .jpg
# extension is already present among the spectrograms.
desired_files = [
    wav_name
    for wav_name in all_wav_files
    if wav_name.replace(".wav", ".jpg") not in current_spectros
]
total_files = len(desired_files)
print(f"Starting with {desired_files[:3]}. {total_files} wav files to convert in total.")

# The helper is given both directories with a trailing "/".
get_spectro_from_wav(
    desired_files,
    f"{DN_WAV_PATH}/",
    f"{PATH_TO_SPECTRO}/",
)



Starting with ['White_Wagtail_156_2dn.wav', 'White_Wagtail_156_3dn.wav', 'White_Wagtail_156_4dn.wav']. 4399 wav files to convert in total.
0.05683
0.11366
0.17049
0.22732
0.28416
0.34099
0.39782
0.45465
0.51148
0.56831
0.62514
0.68197
0.7388
0.79564
0.85247
0.9093
0.96613


#### Sort Spectrograms into species sub-folders

In [3]:
import shutil

# Copy each spectrogram into data/sorted_spectrograms/<species>/, where the
# species folder name is the filename with its last two "_" fields removed
# (e.g. "White_Wagtail_156_2dn.jpg" -> "White_Wagtail").
audio_clips = os.listdir("data/spectrograms")
export_path = "data/sorted_spectrograms/"

for filename in audio_clips:
    if not filename.endswith(".jpg"):
        continue
    dir_name = "_".join(filename.split("_")[:-2])
    if not dir_name:
        # Fewer than three "_" fields: there is no species prefix, and copying
        # would drop the image straight into the root of export_path.
        print(f"{filename} skipped: cannot derive a species folder from its name")
        continue
    dir_path = export_path+dir_name
    # makedirs also creates export_path itself on a first run, where a plain
    # os.mkdir would raise FileNotFoundError for the missing parent.
    os.makedirs(dir_path, exist_ok=True)
    dest_path = dir_path + "/" + filename
    shutil.copy("data/spectrograms/"+filename, dest_path)
    

#### Split data into training and validation sets. Testing data will come from real recordings.

In [4]:
import random
import shutil


# Split each species folder of spectrograms into training and validation
# copies: TRAIN_RATIO of the files go to data/train/<species>, the remainder
# to data/validation/<species>.
SPEC_PATH = "data/sorted_spectrograms"
TRAIN_PATH = "data/train"
VAL_PATH = "data/validation"
TRAIN_RATIO = .9
SPLIT_SEED = 0

# A dedicated, seeded generator plus sorted listings make the split
# reproducible: re-running this cell on the same files copies every file to the
# same side again. With an unseeded shuffle each re-run picked a new split and,
# because earlier copies are never removed, leaked files between train and
# validation.
split_rng = random.Random(SPLIT_SEED)

for directory in sorted(os.listdir(SPEC_PATH)):
    src_dir = SPEC_PATH+"/"+directory
    # Skip stray files and Jupyter checkpoint folders; only species folders count.
    if not os.path.isdir(src_dir) or directory.endswith(".ipynb_checkpoints"):
        continue
    train_dest = TRAIN_PATH+"/"+directory
    val_dest = VAL_PATH+"/"+directory
    # makedirs creates TRAIN_PATH / VAL_PATH themselves when they are missing.
    os.makedirs(train_dest, exist_ok=True)
    os.makedirs(val_dest, exist_ok=True)
    # Drop checkpoint entries BEFORE computing the split index so they do not
    # distort the train/validation proportions.
    file_list = sorted(
        f for f in os.listdir(src_dir) if not f.endswith(".ipynb_checkpoints")
    )
    split_rng.shuffle(file_list)
    t_idx = int(len(file_list)*TRAIN_RATIO)
    train_files = file_list[:t_idx]
    val_files = file_list[t_idx:]
    for filename in train_files:
        shutil.copy(src_dir+"/"+filename, train_dest)
    for filename in val_files:
        shutil.copy(src_dir+"/"+filename, val_dest)
            

### Make denoised spectrograms of testing data

In [2]:
# Turn one long test recording into spectrogram windows.
# window_size=5 is presumably a duration in seconds - confirm against
# convert_long_wav_to_spectro_windows in PreprocessingFunctions.
file_name_root = "Bird1dn"
export_path = "data/costa_rica/test/"
wav_file_path = "data/costa_rica/test_wav/Bird_1dn.wav"
convert_long_wav_to_spectro_windows(
    wav_file_path,
    export_path,
    file_name_root,
    window_size=5,
)

### Make Resized Copies of Data for Transfer Learning

In [4]:
# Make resized copies of the training images for transfer learning.
# NOTE(review): ./data/train_refined is not created anywhere in the visible
# cells - confirm where it comes from before a Restart & Run All.
downsample_images(
    "./data/train_refined",
    "./data/small_train_179",
    min_num_files=30,
)

In [5]:
# Make resized copies of the validation images for transfer learning.
# NOTE(review): ./data/validation_refined is not created anywhere in the
# visible cells - confirm where it comes from before a Restart & Run All.
downsample_images(
    "./data/validation_refined",
    "./data/small_validation_179",
    min_num_files=30,
)