In [1]:
import os
import sys
import h5py
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
from scipy import signal
from scipy.stats import norm, pearsonr
from itertools import combinations
%matplotlib inline

In [2]:
# Database location.
# The data directory is resolved as <prefix>/<user>/wdml/Data, where <prefix>
# is the part of the current working directory that precedes the first
# occurrence of the user name.
database_name = 'whistlers.h5'
# USER is not defined on every platform (e.g. Windows exposes USERNAME, and
# some cron/container environments define neither), so fall back gracefully
# instead of passing None to os.path.join, which raises TypeError.
user_name = (os.environ.get('USER')
             or os.environ.get('USERNAME')
             or os.path.basename(os.path.expanduser('~')))
if user_name:
    # NOTE(review): str.split cuts at the FIRST occurrence of the user name,
    # so a short name that also appears earlier in the path truncates there.
    home_prefix = os.path.join(os.getcwd().split(user_name)[0], user_name)
else:
    # No user name could be determined: anchor the data directory at the cwd.
    home_prefix = os.getcwd()
database_location = os.path.join(home_prefix, 'wdml', 'Data')
database_path = os.path.join(database_location, database_name)

# Data variables.
# The processing loop iterates over range(1, awd_events), i.e. with the value
# 2 only 'awdEvents1' is visited.
awd_events = 2
sites = ['marion', 'sanae']

In [3]:
def extract_spectrogram_info(spectrogram_data):
    '''Unpack a combined spectrogram array into its axes and its values.

    The combined layout stores the time axis in the first row and the
    frequency axis in the first column; the corner element [0, 0] belongs
    to neither axis and is not returned.
    inputs:
        spectrogram_data: 2-D array holding the axes and the spectrogram
        values in the layout described above.
    outputs:
        time: first row, corner element excluded
        frequency: first column, corner element excluded
        spectrogram: everything below the first row and right of the
        first column
    '''
    # Everything except the leading row/column, which hold the axes.
    payload = slice(1, None)
    return (
        spectrogram_data[0, payload],
        spectrogram_data[payload, 0],
        spectrogram_data[payload, payload],
    )

def reshape_spectrogram(f, t, s):
    '''Pack frequency axis, time axis and spectrogram into a single array.

    Produces the layout that extract_spectrogram_info unpacks: the first
    row is the time axis, the first column is the frequency axis and the
    corner element [0, 0] is set to 0.
    inputs:
        f: frequency values (one per spectrogram row)
        t: time values (one per spectrogram column)
        s: spectrogram values, rows matching f and columns matching t
    outputs:
        sft: array of shape (len(f) + 1, len(t) + 1)
    '''
    freq_axis, time_axis, values = (np.asarray(part) for part in (f, t, s))
    # First row: a 0 placeholder for the corner followed by the time axis.
    header_row = np.concatenate(([0], time_axis))
    # Remaining rows: the frequency value followed by that row's spectrum.
    labelled_rows = np.concatenate((freq_axis[np.newaxis].T, values), axis=1)
    return np.vstack((header_row, labelled_rows))

In [None]:
# Split every stored spectrogram into segments and save them under
# <awdEvents>/<site>/split_dataset:
#   * <name>_0.vr2      : the part before the first detected event
#                         (attribute event=False)
#   * <name>_<k>.vr2    : one segment per pair of event boundaries
#                         (attribute event=True)
# NOTE(review): create_dataset fails when the target name already exists, so
# re-running this cell against an already-split file will raise - confirm
# whether overwriting is wanted before changing that.

# One counter slot per site (sized from `sites` instead of a hard-coded 2).
nb_no_event = np.zeros(len(sites), dtype=int)
nb_event = np.zeros(len(sites), dtype=int)
nb_file = np.zeros(len(sites), dtype=int)
for awd_event in range(1,awd_events):
    for site, site_name in enumerate(sites):
        # HDF5 object paths always use '/', independent of the host OS
        # separator that os.path.join would insert.
        site_path = '/'.join(('awdEvents'+str(awd_event), site_name))
        # The context manager closes the file even if a sample fails.
        with h5py.File(database_path, 'r+') as f:
            grp_wh = f[site_path+'/spectrograms']
            grp_split = f.require_group(site_path+'/split_dataset')
            files = list(grp_wh.keys())
            nb_file[site] = len(files)
            # NOTE(review): the message says 'awdEvent' while the group is
            # named 'awdEvents' - left untouched, confirm which is intended.
            print('\nGenerating split dataset for %s/%s' %('awdEvent'+str(awd_event),site_name))
            last_percent = None
            for num_file, file in enumerate(files):
                file_index = 0
                base_name = file.split('.vr2')[0]
                # read the sample into a float array
                data = np.empty(grp_wh[file].shape)
                grp_wh[file].read_direct(data)
                outputs = grp_wh[file].attrs['output']
                # extract data info
                time, freq, Sxx = extract_spectrogram_info(data)
                # np.round replaces np.round_, which was removed in NumPy 2.0
                _t = np.round(time, decimals=3)
                indices = []
                for output in outputs:
                    # presumably output[0] is in milliseconds and the time
                    # axis in seconds - TODO confirm
                    event_time = np.round(output[0]/1000,3)
                    # index of the time sample closest to the event
                    # (first one on ties, like the previous min() search)
                    index = int(np.argmin(np.abs(_t-event_time)))
                    # only keep the index if it is new and not the first sample
                    if index!=0 and index not in indices:
                        indices.append(index)

                # probabilities of the events, highest first
                # TODO: not used below yet - map each probability to its event
                events = sorted(outputs[:,1], reverse=True)[:len(indices)]

                # add the end of the time axis as the last boundary and build
                # every pair of boundaries (in list order).
                # NOTE(review): combinations() yields ALL pairs, not only
                # adjacent ones, and `indices` is not sorted - confirm this
                # is the intended segmentation.
                indices.append(time.shape[0])
                event_indices = list(combinations(indices,2))

                # eventless section: everything before the first boundary
                Sxx_no_event = Sxx[:, :indices[0]]
                t_no_event = time[:indices[0]]
                # add to dataset
                spec_data = reshape_spectrogram(freq, t_no_event, Sxx_no_event)
                file_name = base_name+'_'+ str(file_index)+'.vr2'
                file_dataset = grp_split.create_dataset(file_name,spec_data.shape,np.float32, compression="gzip", data=spec_data)
                file_dataset.attrs['event'] = False
                nb_no_event[site] += 1

                # one dataset per pair of boundaries
                for i in event_indices:
                    file_index += 1
                    # extract the segment between the two boundaries
                    Sxx_event = Sxx[:,i[0]:i[1]]
                    t_event = time[i[0]:i[1]]
                    spec_data = reshape_spectrogram(freq, t_event, Sxx_event)
                    # create dataset
                    file_name = base_name+'_'+ str(file_index)+'.vr2'
                    file_dataset = grp_split.create_dataset(file_name,spec_data.shape,np.float32, compression="gzip", data=spec_data)
                    file_dataset.attrs['event'] = True
                    nb_event[site] += 1

                # print progress: the value at every multiple of 10 %,
                # a dot for every other new percent
                percent = int(num_file*100/len(files))
                if last_percent != percent:
                    if percent%10==0:
                        sys.stdout.write("%s%%" % percent)
                    else:
                        sys.stdout.write(".")
                    sys.stdout.flush()
                    last_percent = percent

print('Number of datasets without event:', nb_no_event.sum())
print('Number of datasets with event:', nb_event.sum())
print('Number of datasets :', nb_file.sum())