In [1]:
import numpy as np
import math
import pandas as pd
import librosa as lb
import librosa.display
import soundfile as sf
import matplotlib.patches as patch
import matplotlib.pyplot as plt
import IPython.display as ipd
from pathlib import Path
from tqdm.notebook import tqdm
from scipy.io import wavfile
import time
import os
import shutil
from skimage import io
from skimage.color import rgb2gray,rgba2rgb
#https://github.com/iver56/audiomentations ?
%matplotlib inline
# --- Project layout (cookiecutter data-science style) ------------------------
CC_ROOT = Path("D:\\KidElectric\\rfcx_kaggle")

DATA_ROOT = CC_ROOT / 'data'
CC_DATA_OUT = DATA_ROOT / 'interim'            # derived artefacts are written here
TRAIN_AUDIO_ROOT = DATA_ROOT / 'raw' / 'train'  # raw training recordings (*.flac)
TEST_AUDIO_ROOT = DATA_ROOT / 'raw' / 'test'    # raw test recordings (*.flac)

# --- Recording tables ---------------------------------------------------------
# One row per .flac file; recording_id is the file name without its extension.
df_train = pd.DataFrame(
    {"recording_id": [flac.stem for flac in TRAIN_AUDIO_ROOT.glob("*.flac")]}
)
df_test = pd.DataFrame(
    {"recording_id": [flac.stem for flac in TEST_AUDIO_ROOT.glob("*.flac")]}
)

# --- Label tables, indexed by recording_id -------------------------------------
# A recording_id may appear on several rows, so the index is not assumed unique.
df_tp = pd.read_csv(CC_ROOT / 'references' / 'train_tp.csv').set_index('recording_id')
df_fp = pd.read_csv(CC_ROOT / 'references' / 'train_fp.csv').set_index('recording_id')


In [2]:
class params:
    """
    Shared settings for turning audio into mel-spectrograms.

    Used as a plain namespace: the attributes are read straight off the class
    (see compute_melspec), and other cells in this notebook overwrite or add
    attributes on the class itself.
    """
    # Audio
    sr = 48000          # sampling rate handed to the loader and to librosa

    # Short-time Fourier transform framing
    n_fft = 2048        # FFT window size
    hop_length = 512    # step between successive frames

    # Mel filter bank
    n_mels = 128        # number of mel bands
    fmin = 80           # lowest frequency covered by the filter bank
    fmax = 14000        # highest frequency covered by the filter bank

def load_audio(pnfn="", sr=48000):
    """
    Load an audio file through librosa and return only the samples.
    Arguments:
        pnfn -- path (including file name) of the audio file to read
        sr -- sampling rate passed to librosa's loader (default 48000)
    Returns:
        The first element of the (samples, rate) pair returned by lb.load;
        the rate is discarded.
    """
    samples, _rate = lb.load(pnfn, sr=sr)
    return samples

def compute_melspec(y, params):
    """
    Computes a mel-spectrogram and puts it at decibel scale
    Arguments:
        y {np array} -- signal
        params {AudioParams} -- Parameters to use for the spectrogram. Expected to have
            the attributes sr, n_mels, fmin, fmax, hop_length and n_fft
    Returns:
        np array -- Mel-spectrogram converted with power_to_db and cast to float32
    """
    # The signal must be passed as `y=`: recent librosa releases accept the
    # arguments of melspectrogram by keyword only and reject a positional signal.
    melspec = lb.feature.melspectrogram(
        y=y, sr=params.sr, n_mels=params.n_mels,
        fmin=params.fmin, fmax=params.fmax, center=True, pad_mode='reflect',
        hop_length=params.hop_length, n_fft=params.n_fft
    )
    melspec = lb.power_to_db(melspec).astype(np.float32)
    return melspec

def has_tp_fp(rec,df_tp,df_fp):
    """
    Report whether a recording appears in the TP and/or FP label tables.
    Arguments:
        rec -- recording id to look up
        df_tp -- true-positive label table, indexed by recording id
        df_fp -- false-positive label table, indexed by recording id
    Returns:
        (has_tp, has_fp) -- two booleans; both can be True because many
        recordings carry both kinds of label
    """
    return (rec in df_tp.index), (rec in df_fp.index)

def clip_info(rec,df):
    """
    Collect the label fields of one recording as plain lists.
    Arguments:
        rec -- recording id; must be present in df's index
        df -- label table indexed by recording id
    Returns:
        dict with keys 'species_id', 't_min' and 't_max', each a list holding
        one entry per row of `rec`. A key whose column is absent from df keeps
        an empty list.
    """
    wanted = ('species_id', 't_min', 't_max')
    keep = {name: [] for name in wanted}
    for column in df.keys():
        if column not in keep:
            continue
        picked = df[column][rec]
        if isinstance(picked, pd.Series):
            # Several rows share this recording id: take all their values.
            keep[column] = list(picked.values)
        else:
            # Exactly one row: the lookup returned a scalar.
            keep[column].append(picked)
    return keep

def _species_overlapping(info, start, stop):
    """Return the species ids in `info` whose [t_min, t_max] span intersects [start, stop)."""
    hits = []
    for spec, t_min, t_max in zip(info['species_id'], info['t_min'], info['t_max']):
        # Interval-intersection test. Unlike checking only whether an endpoint
        # falls inside the clip, this also catches a label that starts before
        # the clip and ends after it. The boundary handling is unchanged:
        # t_max == start counts as overlap, t_min == stop does not.
        if t_min < stop and t_max >= start:
            hits.append(spec)
    return hits


def clip_identity(rec,df_tp,df_fp,start,stop):
    """
    List the TP and FP labels of a recording that overlap a time window.
    Arguments:
        rec -- recording id
        df_tp -- true-positive label table, indexed by recording id
        df_fp -- false-positive label table, indexed by recording id
        start -- window start, in the same time unit as t_min / t_max
        stop -- window end, in the same time unit as t_min / t_max
    Returns:
        dict with
            'tp_spec' -- species ids of the overlapping TP labels
            'tp' -- number of overlapping TP labels
            'fp_spec' -- species ids of the overlapping FP labels, each shifted by +24
            'fp' -- number of overlapping FP labels
        A recording missing from a table contributes nothing for that table.
    """
    out={'tp_spec':[],'tp':0,
         'fp_spec':[],'fp':0}

    if rec in df_tp.index:
        tp_hits = _species_overlapping(clip_info(rec,df_tp), start, stop)
        out['tp_spec'] = tp_hits
        out['tp'] = len(tp_hits)

    if rec in df_fp.index:
        fp_hits = _species_overlapping(clip_info(rec,df_fp), start, stop)
        # FP ids are offset by 24 so they cannot collide with TP ids
        # (presumably 24 is the number of species -- TODO confirm).
        out['fp_spec'] = [spec + 24 for spec in fp_hits]
        out['fp'] = len(fp_hits)
    return out


        


# thoughts
* turn each training clip into 1000 spectra (using large hop_length)
* take 100 from (FP or TP) vs. NP for each training clip (if available)
* Fit a model that separates (FP or TP) segments from (NP) segments; basically: does this clip contain something of interest?
* Run same analysis pipeline on test data, but then predict usefulness of all segments in clip.
* Train/test by taking top 256 clips from each
* OR: dimensionality reduction of some other kind
* OR: reduce size + create labels of all types of audio in clip for one-hot encoding
* either way, reduce training and testing set size to maximally informative spectra
    

In [4]:
# Scratch arithmetic: 60000 / 200 evaluates to 300.0 (what the two numbers stand for is not stated here).
60000/200

300.0

In [None]:
# Settings for exploring a single recording and for the down-sampled spectrogram output.
fn=df_tp.index[157]      # recording id taken from row 157 of the TP table (arbitrary example -- TODO confirm)
take_med=False           # not read anywhere in this cell
clip_to_singer=False     # when True, the audio is later cropped to the labelled call
num_ds=1000              # divisor used further down to derive hop_length; also names the output folder
src=TRAIN_AUDIO_ROOT.joinpath('0_raw').joinpath(fn).with_suffix(".flac")
dest=CC_DATA_OUT.joinpath('spec_ds%d' % num_ds)
# Create the output folder together with any missing parents. This does not
# fail when the folder already exists, and it cannot hit the gap between a
# separate exists() check and the mkdir call.
dest.mkdir(parents=True, exist_ok=True)
    
# Batch pass: for every mode, pick the recording table, source folder and output
# folder, then load each recording and compute its mel-spectrogram.
# NOTE(review): `modes` and `use_root` are not defined in the visible cells of this
# notebook, so this loop raises NameError on a fresh kernel -- confirm where they come from.
# NOTE(review): a mode other than 'TEST' or 'TRAIN' leaves resume / use_df / src /
# params.dest unset or stale from the previous iteration.
for mode in modes:
    if mode == 'TEST':
        resume=0
        use_df=df_test
        src=TEST_AUDIO_ROOT 
        params.dest=dest.joinpath('test')
    elif mode == 'TRAIN':
        resume=0
        use_df=df_train
        src=TRAIN_AUDIO_ROOT
        params.dest=dest.joinpath('train')
    params.use_root=use_root

    if params.dest.exists() == False:
        os.makedirs(params.dest)
    # `resume` skips that many recordings from the front of the table (always 0 here).
    for ii,fn in enumerate(use_df['recording_id'][resume:]):
        pnfn=src.joinpath(fn).with_suffix(".flac")
        # Only the sample rate is kept from this read; the samples go to `_`
        # and the file is read again by load_audio below.
        _,fs=sf.read(pnfn)

        params.sr=fs
        params.mode=mode

        print('\n%d of %d. Loading...' % (resume+ii, len(use_df['recording_id'])))        
        params.fn=fn
        y = load_audio(pnfn, params.sr)
        # Number of samples in 60 s divided by num_ds; presumably so that a
        # 60 s recording yields about num_ds frames -- TODO confirm.
        params.hop_length=math.floor((fs*60)/num_ds)
        print('\tLoaded.')
        # Recording length in whole seconds.
        params.rec_length=int(y.shape[0]/params.sr)

        # Create the mel-spectrogram of the whole recording
        melspec=compute_melspec(y,params)

        # Label each of these segments
        # NOTE(review): `img`, `params.clip_dur` and `params.min_dur` are not defined in
        # the visible cells; this section looks carried over from another cell -- confirm.
        # `img` is parsed as an underscore-separated name: field 1 is the species,
        # field 2 the recording id, field 3 (up to the first '.') the clip index.
        spec=img.split('_')[1]
        rec=img.split('_')[2]
        rec_part=img.split('_')[3].split('.')[0]
        clip_start=float(rec_part)*params.clip_dur
        clip_stop= clip_start + params.clip_dur
        info = clip_info(rec,df_tp)
        for i,s in enumerate(info['species_id']):
            if s == int(spec):
                t_min=info['t_min'][i]
                t_max=info['t_max'][i]
                if (t_max > clip_start) and (t_min < clip_stop):
                    #This call is within this clip.
                    if ((t_max - clip_start) < params.min_dur) or ((clip_stop - t_min) < params.min_dur):
                        #But only the very end or beginning of the call is in the clip
                        # NOTE(review): `fn` is a recording_id value from use_df, not a file
                        # path, so this move probably targets the wrong thing -- verify.
                        shutil.move(str(fn),str(params.dest)) #Move to a separate folder
                        print('%d) %s is %1.3f s == too short!' % (ii,img,np.min([(clip_stop - t_min),(t_max - clip_start)])))

                        
# Single-recording exploration: load recording `fn`, optionally crop it to the
# labelled call, and compute mel-spectrograms of the full and cropped signal.

# An unfinished `for i` statement stood here. It was a syntax error that stopped
# the whole cell from parsing, so it is kept only as a comment.
# for i

# Read the sample rate and the audio from the same file: recording `fn` inside
# the '0_raw' sub-folder of the training audio.
pnfn=TRAIN_AUDIO_ROOT.joinpath('0_raw').joinpath(fn).with_suffix(".flac")
_,fs=sf.read(pnfn)
singer=df_tp['species_id'][fn]
params.sr=fs

# load_audio accepts (path, sr) only; the folder is already part of `pnfn`.
y = load_audio(pnfn, params.sr)
params.hop_length=round(len(y)/1200)
print(y.shape[0]/fs)

# Default crop: the first 0.2 s of the recording.
# Use end_t=round(60 * fs) instead to keep a full 60 s.
start_t=0
end_t=round(0.2 * fs)
if clip_to_singer:
    if isinstance(singer, pd.Series):
        # Several labelled rows for this recording: crop to the first one.
        # .iloc[0] selects by position explicitly; plain [0] on a label-indexed
        # Series relies on a positional fallback that pandas has deprecated.
        start_t=round(df_tp['t_min'][fn].iloc[0]*fs)
        end_t=round(df_tp['t_max'][fn].iloc[0]*fs)
    else:
        # Exactly one labelled row: the lookups return scalars.
        start_t=round(df_tp['t_min'][fn]*fs)
        end_t=round(df_tp['t_max'][fn]*fs)

print((singer,fs,y.shape[0]/fs))
keep = compute_melspec(y, params)    # spectrogram of the full recording
y=y[start_t:end_t]
melspec=compute_melspec(y,params)    # spectrogram of the cropped segment