# Whistler trace extraction
Extract the whistler traces from the .vr2 files and store them in the HDF5 database.

In [1]:
import os
import sys
import h5py
import numpy as np
import pandas as pd
import datetime

In [2]:
# database full path
database_name = 'whistlers.h5'
# The data directory is "<prefix of the current working directory up to the
# user name>/<user name>/Data". The USER environment variable is not always
# defined (e.g. on Windows or in minimal containers); joining a path with
# None would raise TypeError, so fall back to "<home directory>/Data" then.
_user = os.environ.get('USER')
if _user:
    database_location = os.path.join(os.getcwd().split(_user)[0], _user, 'Data')
else:
    database_location = os.path.join(os.path.expanduser('~'), 'Data')
database_path = os.path.join(database_location, database_name)

# data variables
# Exclusive upper bound of the event numbers: the extraction loop iterates
# range(1, awd_events), so a value of 2 processes only 'awdEvents1'.
awd_events = 2
# Site names; each one is also used as a directory name and an HDF5 group name.
sites = ['marion', 'sanae']

In [3]:
def frread(fname=None):
    """Read a stereo .vr2 file and return its two de-interleaved traces.

    This is a rough translation of frread.m from J. Lichtenberger for the
    stereo=True case, i.e. we assume orthogonal loop antenna.

    The file is read as a flat stream of 16-bit signed integers grouped in
    frames of 4103 words. In every frame the first 7 words are skipped
    (presumably a frame header -- confirm against the file format) and the
    remaining 4096 words are taken as interleaved samples: first component,
    second component, first component, ... giving 2048 samples per component
    (4103 = 7 + 2 * 2048). Words after the last complete frame are ignored.

    inputs
        fname (string): File name path to the .vr2 file to load
    outputs
        wh (ndarray): Nx2 float array with the two traces in the first and
            second columns, where N = 2048 * (number of complete frames).
    """
    # length of one frame, in 16-bit words
    frLen = 4103
    # number of leading words skipped in each frame
    headLen = 7
    # number of samples per component in one frame
    adatlen = 2048

    # get data from file - 16-bit signed integers; the context manager makes
    # sure the handle is closed even if reading fails
    with open(fname, 'rb') as fid:
        dat = np.fromfile(fid, dtype=np.int16)

    # number of complete frames; integer division drops a trailing partial
    # frame instead of indexing past the end of the data
    nFrameRead = len(dat) // frLen

    # one row per frame, then drop the leading words of each frame
    frames = dat[:nFrameRead * frLen].reshape(nFrameRead, frLen)
    payload = frames[:, headLen:]

    # consecutive (first, second) pairs become the rows of the result
    wh = payload.reshape(nFrameRead * adatlen, 2).astype(float)
    return wh

In [4]:
def vr2_to_panda(dir_name,fname, site):
    """Extract the data from a .vr2 file and store it as a pandas DataFrame.

    inputs
        dir_name directory containing the file
        fname    file name; its first 27 characters must be a timestamp of
                 the form 'YYYY-MM-DDUTHH:MM:SS.ffffff'
        site     name of the site where data was collected
    outputs
        whdf     dataframe containing the signal received by the NS and EW
                 pointing orthogonal loop antennas (columns 'X' and 'Y'),
                 indexed by sample time
        fs       sampling frequency in Hz
    """
    # read vr2 file
    wh = frread(os.path.join(dir_name, fname))

    # CONSTANTS
    # Sampling frequency (20kHz for SANAE, 40kHz for MARION )
    fs = 2e4 if site == "sanae" else 4e4
    # time step in microseconds (for dataframe index)
    dt = 1e6 / fs

    # Date/time format of the filename prefix. Files whose time fields are
    # separated by '_' instead of ':' would need '%Y-%m-%dUT%H_%M_%S.%f'.
    dtFormat = '%Y-%m-%dUT%H:%M:%S.%f'

    # Start time. pd.datetime was removed in pandas 2.0, so use the standard
    # library class it used to alias.
    t0 = datetime.datetime.strptime(fname[0:27], dtFormat)
    # Number of samples
    Nsamples = len(wh[:, 0])
    # Index spacing follows the site's sampling frequency (50us at 20kHz,
    # 25us at 40kHz) instead of a fixed 50us step.
    tindex = pd.date_range(start=t0, periods=Nsamples,
                           freq=pd.Timedelta(microseconds=dt))

    # The 'X' and 'Y' columns are the signal received by the North/South and
    # East/West pointing orthogonal loop antennas used at Marion and SANAE
    whdf = pd.DataFrame(wh, index=tindex, columns=['X', 'Y'])

    return whdf, fs

In [5]:
# Store every .vr2 file of every event/site as a gzip-compressed float32
# dataset in the matching 'whistler traces' group of the database.
# NOTE: the loop variables `awd_event` and `site` keep their last value after
# the loop and are read by the inspection cell further down.
for awd_event in range(1,awd_events):
    for site in sites:
        event_name = 'awdEvents'+str(awd_event)
        data_location = os.path.join(database_location, event_name, site, site+'_data')
        if not os.path.exists(data_location):
            continue

        # only select .vr2 files; sorted so the processing order is deterministic
        files = sorted(name for name in os.listdir(data_location) if name.endswith('.vr2'))
        print('\nGenerating whistler traces for %s/%s' %('awdEvent'+str(awd_event),site))
        if not files:
            continue

        # The context manager closes the database even if a file fails to convert.
        with h5py.File(database_path, 'r+') as f:
            # HDF5 paths always use '/', independent of the OS path separator.
            whistler_grp = f['/'.join((event_name, site, 'whistler traces'))]
            last_percent = None
            for num_file, file in enumerate(files):
                whdf, fs = vr2_to_panda(data_location, file, site)
                # Replace a dataset left by a previous run so the cell can be
                # re-executed; create_dataset fails on an existing name.
                if file in whistler_grp:
                    del whistler_grp[file]
                file_dataset = whistler_grp.create_dataset(file, whdf.shape, np.float32,
                                                           compression="gzip", data=whdf)
                file_dataset.attrs['sample frequency'] = fs

                # Progress: "N%" at each multiple of ten, "." for every
                # other percent step.
                percent = int(num_file*100/len(files))
                if last_percent != percent:
                    if percent%10==0:
                        sys.stdout.write("%s%%" % percent)
                    else:
                        sys.stdout.write(".")
                    sys.stdout.flush()
                    last_percent = percent


Generating whistler traces for awdEvent1/marion
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........
Generating whistler traces for awdEvent1/sanae
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........

In [9]:
# List the datasets stored for the last event/site handled by the extraction
# loop (`awd_event` and `site` are the leftover loop variables).
# The database is only read here, so open it read-only and let the context
# manager close it even if the group lookup fails.
with h5py.File(database_path, 'r') as f:
    # HDF5 paths always use '/', independent of the OS path separator.
    whistler_grp = f['/'.join(('awdEvents'+str(awd_event), site, 'whistler traces'))]
    print(whistler_grp.keys())


<KeysViewHDF5 ['2012-02-29UT20:27:45.81215195.sanae.vr2', '2012-03-03UT04:49:14.82335188.sanae.vr2', '2012-03-15UT05:14:54.89958109.sanae.vr2', '2012-03-15UT05:25:45.44678109.sanae.vr2', '2012-03-16UT05:04:06.29798109.sanae.vr2', '2012-03-16UT07:55:28.28198109.sanae.vr2', '2012-03-16UT08:14:48.98598109.sanae.vr2', '2012-03-22UT16:17:18.79718109.sanae.vr2', '2012-03-23UT21:25:51.38598109.sanae.vr2', '2012-03-23UT23:56:19.99398109.sanae.vr2', '2012-03-28UT05:09:21.28038109.sanae.vr2', '2012-03-31UT05:46:52.33958117.sanae.vr2', '2012-04-11UT08:15:57.69638109.sanae.vr2', '2012-04-23UT04:09:57.96518125.sanae.vr2', '2012-04-27UT02:35:32.78758109.sanae.vr2', '2012-04-27UT02:36:15.18118109.sanae.vr2', '2012-04-29UT05:11:40.64678117.sanae.vr2', '2012-05-06UT02:32:15.15558133.sanae.vr2', '2012-05-06UT03:08:19.78918125.sanae.vr2', '2012-05-06UT04:25:08.60838125.sanae.vr2', '2012-05-06UT04:25:08.81318125.sanae.vr2', '2012-05-06UT07:39:17.12038133.sanae.vr2', '2012-05-06UT07:39:24.69798125.sanae.vr