# Store outputs in a database
Each output database contains the following:
* events attribute: the number of events in the data file
* a 2D array with one row per event and two columns:
    * column 1: the time difference (in ms) between the event and the start of the recording (found in the file's name)
    * column 2: the probability associated with the event

In [1]:
import os
import sys
import h5py
import numpy as np
import pandas as pd
import datetime

In [2]:
# database full path
database_name = 'whistlers.h5'
# The data directory is <prefix>/<user>/wdml/Data, where <prefix> is whatever
# precedes the user name in the current working directory.
_user = os.environ.get('USER')
if _user:
    database_location = os.path.join(os.getcwd().split(_user)[0], _user, 'wdml', 'Data')
else:
    # USER is unset or empty (e.g. Windows, cron jobs, some containers):
    # os.path.join() would raise TypeError on None and str.split('') would
    # raise ValueError, so fall back to the user's home directory.
    database_location = os.path.join(os.path.expanduser('~'), 'wdml', 'Data')
database_path = os.path.join(database_location, database_name)

# data variables
# NOTE(review): the population loop uses range(1, awd_events), so with
# awd_events = 2 only awdEvents1 is processed -- confirm this is intended.
awd_events = 2
sites = ['marion', 'sanae']

In [3]:
def extract_output(data_root, awd_event, site):
    """Extract the output information for each file

    The first file ending in '.out' found in data_root/site is parsed. Each
    line is whitespace separated: the first field is ignored, the second field
    is the data file name and the remaining fields are (event, probability)
    pairs. Lines with fewer than two fields are skipped, and a trailing event
    without a probability is dropped.

    inputs
        data_root   location of the data
        awd_event   AWD event number (only used in the progress message)
        site        site where data was collected
    outputs
        dataset     dictionary mapping each file name to a dictionary
                    {event: probability} (both kept as strings); empty when
                    no output file could be found or read
    """
    output_path = os.path.join(data_root, site)
    output_file = next(
        (name for name in os.listdir(output_path) if name.endswith('.out')),
        None)
    # Defined up front so that every exit path returns a dictionary.
    dataset = {}
    if output_file is None:
        print('Error:', 'no .out file found in %s' % output_path)
        return dataset
    try:
        with open(os.path.join(output_path, output_file), 'r') as f:
            lines = f.readlines()
    except (OSError, UnicodeError) as e:
        print('Error:', e)
        return dataset

    print('\nGenerating outputs for %s/%s' % ('awdEvent' + str(awd_event), site))
    last_percent = None
    for num_line, line in enumerate(lines):
        # split() drops the trailing newline and any empty fields
        tokens = line.split()
        if len(tokens) >= 2:
            name = tokens[1]
            # pair every event with the probability that follows it
            event = dict(zip(tokens[2::2], tokens[3::2]))
            # when a file appears on several lines, merge the events; values
            # recorded by earlier lines take precedence
            event.update(dataset.get(name, {}))
            dataset[name] = event
        # print progression
        percent = int(num_line * 100 / len(lines))
        if last_percent != percent:
            if percent % 5 == 0:
                sys.stdout.write("%s%%" % percent)
            else:
                sys.stdout.write(".")
            sys.stdout.flush()
            last_percent = percent
    return dataset

In [4]:
def datetime_to_unit(datatime):
    """Split the time of day of a timestamp into its components.

    inputs
        datatime    string whose time of day follows the last 'UT' marker,
                    formatted as HH:MM:SS or HH:MM:SS.fraction
    outputs
        [h, m, s, u] list of strings: hours, minutes, seconds and the digits
                    of the fractional second ('0' when there is no fraction)
    """
    times = datatime.split('UT')
    h, m, ss = times[-1].split(':')
    # partition() tolerates timestamps without a fractional part
    s, _, u = ss.partition('.')
    return [h, m, s, u or '0']


def datetime_to_ms(datetime):
    """Convert the time of day of a timestamp to milliseconds since midnight.

    inputs
        datetime    timestamp string accepted by datetime_to_unit
    outputs
        number of milliseconds (float) elapsed since 00:00:00
    """
    h, m, s, u = datetime_to_unit(datetime)
    # u holds the digits after the decimal point, so u / 10**len(u) is the
    # fraction of a second; it is scaled to milliseconds with the seconds.
    fraction = float(u) / 10 ** len(u)
    return (float(h) * 60 * 60 * 1000
            + float(m) * 60 * 1000
            + (float(s) + fraction) * 1000)


def datetime_diff(datetime2, datetime1):
    """Return datetime2 - datetime1 in milliseconds.

    Only the time of day is compared (the date part is discarded by
    datetime_to_unit), so a difference that spans midnight is not handled.
    """
    return datetime_to_ms(datetime2) - datetime_to_ms(datetime1)

In [9]:
# Populate the database: for every AWD event and site, store one dataset per
# data file under awdEvents<N>/<site>/outputs. Each dataset is an (events, 2)
# float32 array of [time since start of file, probability] sorted by time,
# with an 'events' attribute holding the number of rows.
for awd_event in range(1, awd_events):
    for site in sites:
        # The context manager guarantees the HDF5 file is flushed and closed,
        # even when parsing fails part-way through.
        with h5py.File(database_path, 'a') as h5_file:
            # HDF5 paths always use '/', independently of the OS separator
            grp = h5_file.require_group('awdEvents%s/%s/outputs' % (str(awd_event), site))
            # find output file
            output_path = os.path.join(database_location, 'awdEvents' + str(awd_event), site)
            output_file = next(
                (name for name in os.listdir(output_path)
                 if name.endswith('_no_duplicate.out')),
                None)
            if output_file is None:
                print('Error:', 'no _no_duplicate.out file found in %s' % output_path)
                continue
            # extract output
            # Best effort per site: h5py and the parsing can raise several
            # unrelated exception types, so report the error and move on.
            try:
                with open(os.path.join(output_path, output_file), 'r') as out_file:
                    lines = out_file.readlines()
                last_percent = None
                print('\nGenerating outputs for %s/%s' % ('awdEvent' + str(awd_event), site))
                for num_file, line in enumerate(lines):
                    # split() drops the trailing newline and any empty fields
                    tokens = line.split()
                    if len(tokens) >= 2:  # skip blank or truncated lines
                        file = tokens[1]
                        # the first 27 characters of the file name are used
                        # as the start time of the recording
                        file_time = file[:27]
                        # [time difference, probability] for every event
                        output = [
                            [round(datetime_diff(tokens[index], file_time), 5),
                             float(tokens[index + 1])]
                            for index in range(2, len(tokens), 2)
                        ]
                        # sort output based on time
                        output.sort(key=lambda pair: pair[0])
                        output = np.asarray(output)
                        # save the array
                        file_dataset = grp.create_dataset(
                            file, output.shape, np.float32,
                            compression="gzip", data=output)
                        file_dataset.attrs['events'] = len(output)

                    percent = int(num_file * 100 / len(lines))
                    if last_percent != percent:
                        if percent % 10 == 0:
                            sys.stdout.write("%s%%" % percent)
                        else:
                            sys.stdout.write(".")
                        sys.stdout.flush()
                        last_percent = percent
            except Exception as e:
                print('Error:', e)


Generating outputs for awdEvent1/marion
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........
Generating outputs for awdEvent1/sanae
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........