# Data extraction
Extract spectrogram information from .vr2 files.

This program should be in the same directory as the root of the data. 

The data should be in a folder named ```<site_name>```, in which the .vr2 files are located in a ```<site_name>_data``` subfolder and the output is saved as a .out file.


In [120]:
# Make sure that these modules are installed
import os
import sys
import numpy as np
import pandas as pd
import datetime
import argparse
import matplotlib.pyplot as plt
from scipy import signal as signal
from six.moves import cPickle as pickle

In [None]:
# global variables

# Root directory that contains one folder per site.
data_root = '.'
# Name of the site folder to process (also used to build '<site>_data').
site = 'marion'

# Spectrogram settings.
# detrend, NFFT and noverlap are forwarded to scipy.signal.spectrogram by
# spectrogram() and stored alongside the data by build_dataset().
detrend='linear'
NFFT=512
noverlap=64
# scale, scale_by_freq and cmap are only referenced by commented-out
# matplotlib calls in the code shown here.
scale='dB'
scale_by_freq=False
cmap='jet'

In [None]:
def frread(fname=None):
    """Read a stereo .vr2 recording into a two-column sample array.

    This is a rough translation of frread.m from J. Lichtenberger for the
    stereo=True case, i.e. we assume orthogonal loop antenna.

    The file is read as a flat stream of 16-bit signed integers organised in
    frames of 4103 words. In every frame the first 7 words are skipped
    (presumably a frame header -- not confirmed here) and the remaining 4096
    words interleave the two components sample by sample.

    inputs
        fname (string): File name path to the .vr2 file to load
    outputs
        wh (ndarray): Nx2 float array; column 0 holds the first component and
                      column 1 the second one. N is 2048 times the number of
                      complete frames found in the file.
    """
    # length of one frame, in 16-bit words
    frLen = 4103  ## not sure how this is determined
    # number of leading words skipped in each frame
    skipLen = 7
    # data length of frame (samples per component)
    adatlen = 2048

    # get data from file - 16-bit signed integers; the context manager makes
    # sure the handle is closed even if reading fails
    with open(fname, 'rb') as fid:
        dat = np.fromfile(fid, dtype=np.int16)

    # Only complete frames are decoded. Integer division drops a trailing
    # partial frame instead of indexing past the end of the data.
    nFrameRead = len(dat) // frLen
    frames = dat[:nFrameRead * frLen].reshape(nFrameRead, frLen)

    # De-interleave the payload of every frame in one step.
    payload = frames[:, skipLen:]
    wh = np.empty((nFrameRead * adatlen, 2), dtype=float)
    wh[:, 0] = payload[:, 0::2].ravel()  # words 7, 9, 11, ... of each frame
    wh[:, 1] = payload[:, 1::2].ravel()  # words 8, 10, 12, ... of each frame
    return wh

In [None]:
def vr2_to_panda(dir_name,fname, site):
    """Extract the data from a file and store it as a Pandas DataFrame
    inputs
        dir_name directory that contains the file
        fname    file name; its first 27 characters must be the start time
                 formatted as '%Y-%m-%dUT%H:%M:%S.%f'
        site     name of the site where data was collected
    outputs
        whdf     dataframe containing the signal received by the NS and EW pointing
                    orthogonal loop antennas, indexed by sample time
        fs       sampling frequency
        t0       start time
        t1       end time
    """
    # read vr2 file
    wh = frread(os.path.join(dir_name, fname))

    # CONSTANTS
    # Sampling frequency (20kHz for SANAE, 40kHz for MARION )
    fs = 2e4 if site == "sanae" else 4e4
    # time step in microseconds (for dataframe index)
    dt = 1e6 / fs

    # Set the date/time format in the filename
    # dtFormat = '%Y-%m-%dUT%H_%M_%S.%f'
    dtFormat = '%Y-%m-%dUT%H:%M:%S.%f'

    # Set up pandas dataframe
    # Start time (pd.datetime no longer exists in recent pandas; the standard
    # library class it aliased is used directly)
    t0 = datetime.datetime.strptime(fname[0:27], dtFormat)
    # Number of samples
    Nsamples = len(wh[:, 0])
    # End time
    t1 = t0 + datetime.timedelta(microseconds=Nsamples * dt)
    # Create index. The step is derived from the sampling frequency so that
    # it matches dt for every site instead of being fixed at 50 microseconds.
    tindex = pd.date_range(start=t0, periods=Nsamples,
                           freq=pd.Timedelta(microseconds=dt))

    # Create pandas data frame from wh
    # The 'X' and 'Y' columns are the signal received by the North/South and
    # East/West pointing orthogonal loop antennas used at Marion and SANAE
    whdf = pd.DataFrame({'X': wh[:, 0], 'Y': wh[:, 1]}, index=tindex)

    return whdf, fs, t0, t1

In [None]:
def spectrogram(data, fs):
    """Compute the spectrogram of the 'X' column of a vr2 DataFrame
    inputs
        data       Pandas DataFrame of the vr2 data (only data.X is used)
        fs         Sampling frequency
    outputs
        data_info  dictionary with the keys 'frequencies', 'times' and
                   'spectrogram' as returned by scipy.signal.spectrogram

    The detrend, NFFT and noverlap module-level settings are forwarded to
    SciPy. NFFT is passed as the FFT length (nfft); the segment length
    (nperseg) is not passed and therefore keeps SciPy's default.
    """
    freq_axis, time_axis, power = signal.spectrogram(
        data.X.values,
        fs=fs,
        detrend=detrend,
        nfft=NFFT,
        noverlap=noverlap,
        scaling='spectrum',
    )
    return {
        'frequencies': freq_axis,
        'times': time_axis,
        'spectrogram': power,
    }

In [None]:
def extract_data(data_root, site):
    """Compute the spectrogram of every .vr2 file of a site
    inputs
        data_root   location of data
        site        site where the data was collected
    outputs
        dataset     dictionary mapping each successfully processed file name
                    to the dictionary returned by spectrogram()

    Files are read from <data_root>/<site>/<site>_data. A file that fails to
    load or transform is reported on stdout and left out of the result.
    A coarse progress indicator is written to stdout while looping.
    """
    folder = os.path.join(data_root, site, site + '_data')
    entries = os.listdir(folder)
    total = len(entries)
    extracted = {}
    done = 0
    shown = None
    for name in entries:
        _, ext = os.path.splitext(name)
        if ext == '.vr2':
            try:
                frame, rate, _start, _end = vr2_to_panda(folder, name, site)
                extracted[name] = spectrogram(frame, rate)
                done += 1
            except Exception as e:
                print('Error:', e)
        # progress is measured against every directory entry, not only .vr2 files
        progress = int(done * 100 / total)
        if progress == shown:
            continue
        # even percentages are printed as numbers, odd ones as a dot
        sys.stdout.write("." if progress % 2 else "%s%%" % progress)
        sys.stdout.flush()
        if progress >= 98:
            print()
        shown = progress

    return extracted

In [None]:
def _parse_output_line(line):
    """Split one line of the .out file into (file name, event dictionary).

    The line is split on single spaces and empty pieces are discarded. The
    first token is ignored, the second one is the file name and the remaining
    tokens are read as alternating key/value pairs (a dangling key without a
    value is dropped). Returns None when the line has fewer than two tokens.
    """
    tokens = [tok for tok in line.rstrip('\n').split(' ') if tok]
    if len(tokens) < 2:
        return None
    return tokens[1], dict(zip(tokens[2::2], tokens[3::2]))


def extract_output(data_root, site):
    """Extract the output information for each file
    inputs
        data_root   location of the data
        site        site where data was collected
    outputs
        dataset     dictionary mapping each file name found in the .out file
                    to a dictionary of its events; empty when no .out file
                    exists or it cannot be read

    The first entry ending in '.out' inside <data_root>/<site> is used. When a
    file name appears on several lines its events are merged; for a key seen
    more than once the value stored first is kept. Blank or too-short lines
    are skipped. A coarse progress indicator is written to stdout.
    """
    output_path = os.path.join(data_root, site)
    dataset = {}

    output_file = next(
        (name for name in os.listdir(output_path) if name.endswith('.out')),
        None,
    )
    if output_file is None:
        print('Error: no .out file found in', output_path)
        return dataset

    try:
        with open(os.path.join(output_path, output_file), 'r') as f:
            lines = f.readlines()
    except (OSError, UnicodeError) as e:
        print('Error:', e)
        return dataset

    last_percent = None
    total = len(lines)
    for num_line, line in enumerate(lines):
        parsed = _parse_output_line(line)
        if parsed is not None:
            name, event = parsed
            # values already stored for this file take precedence
            event.update(dataset.get(name, {}))
            dataset[name] = event
        # print progression
        percent = int(num_line * 100 / total)
        if last_percent != percent:
            if percent % 2 == 0:
                sys.stdout.write("%s%%" % percent)
            else:
                sys.stdout.write(".")
            sys.stdout.flush()
            if percent >= 98:
                print()
            last_percent = percent
    return dataset

In [None]:
def build_dataset(data_root, site):
    """Merge the spectrograms of the vr2 files with the content of the
    output file
    inputs
        data_root   location of the data
        site        site where data was collected
    output
        dataset     dictionary with the keys 'detrend', 'nfft', 'noverlap'
                    (the spectrogram settings) and 'data'. 'data' maps each
                    file name to its own dictionary holding 'frequencies',
                    'times', 'spectrogram' and, when the file is listed in
                    the output file, 'output'.
    """
    print('Start building dataset')
    print('Extracting data from site ', site)
    data = extract_data(data_root, site)
    print("%s data extracted" % site)
    print('Extracting output from site ', site)
    output = extract_output(data_root, site)
    print("%s output extracted" % site)
    print('Merging datasets')
    dataset_file = {}
    for file, info in data.items():
        # A fresh dictionary per file: reusing one dictionary would make
        # every entry point at the data of the last file processed.
        entry = {
            'frequencies': info['frequencies'],
            'times': info['times'],
            'spectrogram': info['spectrogram'],
        }
        if file in output:
            entry['output'] = output[file]
        else:
            print(file, 'has no entry in the output file')
        dataset_file[file] = entry
    dataset = {
        'detrend': detrend,
        'nfft': NFFT,
        'noverlap': noverlap,
        'data': dataset_file,
    }
    print('Merge completed')
    return dataset

In [135]:
def save_dataset(data_root,site, dataset, extension='.dat'):
    """Save dataset to a binary (pickle) file named <site><extension>
    inputs
        data_root   location of the data
        site        site where data was collected
        dataset     dataset
        extension   suffix appended to the site name (default '.dat')
    raises
        Any exception raised while opening or writing the file is reported on
        stdout and re-raised unchanged.
    """
    dataset_file = os.path.join(data_root, site + extension)
    try:
        # the context manager closes the file even when pickling fails
        with open(dataset_file, 'wb') as f:
            pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        print('Unable to save data to', dataset_file, ':', e)
        raise
    print('Dataset saved to', dataset_file)
    print('Dataset size: ', os.stat(dataset_file).st_size)
        
def load_dataset(data_root, site, extension='.dat'):
    """Load a dataset previously written by save_dataset
    inputs
        data_root   location of the data
        site        site where data was collected
        extension   suffix appended to the site name (default '.dat')
    outputs
        dataset     the unpickled object, or None when the file does not
                    exist or cannot be read (the reason is printed)
    """
    dataset_file = os.path.join(data_root, site + extension)
    if not os.path.exists(dataset_file):
        print('Unable to find ', dataset_file)
        return None
    try:
        # NOTE: unpickling can execute arbitrary code; only load files that
        # come from a trusted source such as save_dataset.
        with open(dataset_file, 'rb') as f:
            dataset = pickle.load(f)
    except Exception as e:
        print('Error: ', e)
        return None
    print('Loaded %s in dataset' % dataset_file)
    return dataset

In [136]:
# dataset = build_dataset(data_root, site)
# save_dataset(data_root, site, dataset)
# dataset = load_dataset(data_root, site)
# dataset

In [145]:
def randomized_and_split_dataset(data_root, site, dataset ,save=False):
    """Return a copy of the dataset whose 'data' entries are in random order
    inputs
        data_root   location of the data (only used when save is True)
        site        site where data was collected (only used when save is True)
        dataset     dictionary with the keys 'detrend', 'nfft', 'noverlap'
                    and 'data'
        save        when True, write the result with the '.rand_dat' extension
    outputs
        new_dataset new dictionary with the same settings and the same
                    per-file entries, inserted in shuffled order

    The order comes from np.random.shuffle. No splitting is performed here.
    """
    shuffled_keys = np.asarray(list(dataset['data'].keys()))
    np.random.shuffle(shuffled_keys)
    print('Shuffling dataset ...')
    new_dataset = {
        setting: dataset[setting]
        for setting in ('detrend', 'nfft', 'noverlap')
    }
    new_dataset['data'] = {key: dataset['data'][key] for key in shuffled_keys}
    if save:
        ext = '.rand_dat'
        save_dataset(data_root, site, new_dataset, extension=ext)
        print('Saved shuffled dataset to ', os.path.join(data_root, site+ext))
    return new_dataset

def split_dataset(data_root, site, dataset ,save=False):
    """Placeholder for splitting a dataset into subsets (not implemented)
    inputs
        data_root   location of the data
        site        site where data was collected
        dataset     dataset to split
        save        whether the result should be written to disk
    raises
        NotImplementedError  always; the body has not been written yet
    """
    raise NotImplementedError('split_dataset is not implemented yet')

In [146]:
# Reload the saved dataset and write a shuffled copy ('.rand_dat') next to it.
# The shuffling function defined in this notebook is
# randomized_and_split_dataset; calling it by any other name raises NameError.
dataset = load_dataset(data_root, site)
rand_dataset = randomized_and_split_dataset(data_root, site, dataset, save=True)

Loaded ./marion.dat in dataset
Shuffling dataset ...
Dataset saved to ./marion.rand_dat
Dataset size:  3377332
Saved shuffled dataset to  ./marion.rand_dat


In [147]:
# Display the per-file entries of the shuffled dataset (notebook cell output).
rand_dataset['data']

{'2013-07-02UT16:09:06.07971914.marion.vr2': {'frequencies': array([    0.   ,    78.125,   156.25 ,   234.375,   312.5  ,   390.625,
           468.75 ,   546.875,   625.   ,   703.125,   781.25 ,   859.375,
           937.5  ,  1015.625,  1093.75 ,  1171.875,  1250.   ,  1328.125,
          1406.25 ,  1484.375,  1562.5  ,  1640.625,  1718.75 ,  1796.875,
          1875.   ,  1953.125,  2031.25 ,  2109.375,  2187.5  ,  2265.625,
          2343.75 ,  2421.875,  2500.   ,  2578.125,  2656.25 ,  2734.375,
          2812.5  ,  2890.625,  2968.75 ,  3046.875,  3125.   ,  3203.125,
          3281.25 ,  3359.375,  3437.5  ,  3515.625,  3593.75 ,  3671.875,
          3750.   ,  3828.125,  3906.25 ,  3984.375,  4062.5  ,  4140.625,
          4218.75 ,  4296.875,  4375.   ,  4453.125,  4531.25 ,  4609.375,
          4687.5  ,  4765.625,  4843.75 ,  4921.875,  5000.   ,  5078.125,
          5156.25 ,  5234.375,  5312.5  ,  5390.625,  5468.75 ,  5546.875,
          5625.   ,  5703.125,  5781.25 ,

In [None]:
# Data parameters
# These assignments replace the data_root/site values set in the
# global-variables cell for everything that runs afterwards.
data_root = '.'
site = 'sanae'

# load all data file
data_location = os.path.join(data_root,site)
# Every entry of <site>/<site>_data is listed, whatever its extension.
data_capture = os.listdir(os.path.join(data_root,site,site+'_data'))

# Pick one entry at random and convert it to a DataFrame.
# NOTE(review): the listing is not filtered, so a non-.vr2 entry could be
# picked here -- confirm the folder only holds .vr2 files.
data, sample, t0, t1 = vr2_to_panda(os.path.join(data_location, site+'_data'), 
                                    data_capture[np.random.randint(len(data_capture))], 
                                    site)
# 'sample' holds the sampling frequency returned by vr2_to_panda.
data_info = spectrogram(data, sample)
spectrum = data_info['spectrogram']
frequencies = data_info['frequencies']
times = data_info['times']
# Show the shapes of the spectrogram and of its two axes.
print(spectrum.shape, frequencies.shape, times.shape)
# plt.pcolormesh(times, frequencies, np.log10(spectrum), cmap=plt.get_cmap(cmap))