# Create training, validation, and testing sets for classifier

In [1]:
import sys
import os
import time

sys.path.insert(0,'../../../')
sys.path.insert(0,'../../../py')

import parameters
import utilities
import spectrogram_utilities
import output_utilities
import spectrogram_output_visualiser
import spectrogram_cuts_db_creation

import numpy as np
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# matplotlib.get_backend()
%matplotlib inline

awd_event = 1
site = parameters.sites[0]

# Training and testing sets for k-fold cross validation

In [3]:
# For every site: split its files into training/testing lists, persist the
# split as a pickle, and build one spectrogram-cuts database per set.
for site in parameters.sites:
    # Get all files from the site and hold out 33% of them for testing.
    # random_state is fixed so the split is identical on every run.
    files = utilities.all_files(awd_event, site)
    training_files, test_files = train_test_split(files, test_size=0.33, random_state=42)

    # Dictionary recording which files belong to which set.
    sets_name = 'training_test_data_'+site+'.pickle'
    sets_path = os.path.join(parameters.hyp5_location, sets_name)
    sets = {}
    sets['training']=training_files
    sets['testing']=test_files

    # Save the split. The context manager closes (and therefore flushes) the
    # file before it is read back below, instead of leaking the handle.
    with open(sets_path, 'wb') as sets_file:
        pickle.dump(sets, sets_file)

    # Load the training and testing sets back from disk, so the databases are
    # built from exactly what was persisted.
    with open(sets_path, 'rb') as sets_file:
        sets = pickle.load(sets_file)

    # Training set database.
    database_name='training_set_'+site+'.h5'
    spectrogram_cuts_db_creation.spectrogram_cuts_db(awd_event, site, sets['training'], database_name, threshold=0, verbose=True, force=True)

    # Testing set database.
    database_name='testing_set_'+site+'.h5'
    spectrogram_cuts_db_creation.spectrogram_cuts_db(awd_event, site, sets['testing'], database_name, threshold=0, verbose=True, force=True)


Generating whistler and noise cuts database for awdEvent1/marion
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........
Runtime: 140.27 seconds

Generating whistler and noise cuts database for awdEvent1/marion
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........
Runtime: 71.79 seconds

Generating whistler and noise cuts database for awdEvent1/sanae
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........
Runtime: 9.98 seconds

Generating whistler and noise cuts database for awdEvent1/sanae
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........
Runtime: 5.06 seconds


In [4]:
# Names of the per-site databases to inspect ('<name>_set_<site>.h5').
sets = ['training','testing']
# Load the dataset of every site and report its size and class balance.
for site in parameters.sites:
    print('Site: ', site)
    for s in sets:
        data, pb, evt, f_cut_length, t_cut_length = spectrogram_cuts_db_creation.load_spectrogram_cuts_db(awd_event, site, s+'_set_'+site+'.h5', verbose=True, noise=True)
        # Count samples per label; the report below reads index 0 as noise and
        # index 1 as event. minlength=2 keeps both bins present even when a set
        # contains no event cuts, which would otherwise make pct[1] raise
        # IndexError.
        # NOTE(review): assumes evt holds non-negative integer labels - confirm.
        freq = np.bincount(evt, minlength=2)
        total = np.sum(freq)
        print('Number of sample in %s set for %s is %d.' % (s, site, total))
        # Convert the counts to percentages of the whole set.
        pct = freq*100/total
        print('Noise: %s, Event: %s' % (pct[0],pct[1]))


Site:  marion

Loading spectrogram cuts from database for awdEvent1/marion
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........
Runtime: 22.15 seconds
Number of sample in training set for marion is 19804.
Noise: 57.99838416481519, Event: 42.00161583518481

Loading spectrogram cuts from database for awdEvent1/marion
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........
Runtime: 10.49 seconds
Number of sample in testing set for marion is 9716.
Noise: 57.4207492795389, Event: 42.5792507204611
Site:  sanae

Loading spectrogram cuts from database for awdEvent1/sanae
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........
Runtime: 2.59 seconds
Number of sample in training set for sanae is 2289.
Noise: 49.84709480122324, Event: 50.15290519877676

Loading spectrogram cuts from database for awdEvent1/

# Per Site

## Training, Validation, and Testing set

In [None]:
# Get all files from the site.
# NOTE(review): `site` is whatever the previously executed cells left behind.
# After the `for site in parameters.sites` loops above it is the last site,
# not parameters.sites[0] from the configuration cell - confirm this is the
# intended site before running.
files = utilities.all_files(awd_event, site)

# Shuffle in place with a fixed seed so the split is reproducible across runs
# (same seed value as the train_test_split call used for the per-site sets).
np.random.RandomState(42).shuffle(files)

# Split files into training, validation, and test files using the ratio of
# 70%-15%-15%. The cut points are truncated to whole file counts.
n_files = len(files)
training_end = int(0.7*n_files)
validation_end = int(0.85*n_files)
training_files = files[0:training_end]
validation_files = files[training_end:validation_end]
test_files = files[validation_end:]

# Dictionary recording which files belong to which set.
sets_name = 'sets.pickle'
sets = {}
sets['training']=training_files
sets['validation']=validation_files
sets['testing']=test_files

# Save the split; the context manager closes the file instead of leaking the
# handle.
with open(os.path.join(parameters.hyp5_location,sets_name), 'wb') as sets_file:
    pickle.dump(sets, sets_file)

# print(len(files), len(training_files), len(validation_files), len(test_files))

## Database

### Creating database of whistlers and noise

In [None]:
# Load the training, validation, and testing file lists.
# `sets_name` must already be defined by an earlier cell. It has to point at a
# pickle that contains a 'validation' entry ('sets.pickle'); the
# 'training_test_data_<site>.pickle' files only hold 'training' and 'testing'
# and would raise KeyError below.
with open(os.path.join(parameters.hyp5_location,sets_name), 'rb') as sets_file:
    sets = pickle.load(sets_file)

# Build one cuts database per set:
# training_cuts.h5, validation_cuts.h5 and testing_cuts.h5.
for set_name in ('training', 'validation', 'testing'):
    database_name = set_name+'_cuts.h5'
    spectrogram_cuts_db_creation.spectrogram_cuts_db(awd_event, site, sets[set_name], database_name, verbose=True)

### Loading database

In [None]:
# data, probs, events, f_cut_length, t_cut_length = spectrogram_cuts_db_creation.load_spectrogram_cuts_db(awd_event, site, database_name='spectrogram_cuts.h5', verbose=True)