## DATASET CLASS
_Represents the samples and the AWDA output of one site in a dataset._

In [3]:
import os
import numpy as np

In [50]:
class Dataset(object):
    """Samples and AWDA output of one site inside a dataset directory.

    Layout read by the methods below::

        <dataset_location>/<site>/<site>_data/*.vr2    sample files
        <dataset_location>/<site>/*.out                AWDA output file
    """
    # Attributes
    # Mapping built by the last awda_output() call; None until it has run.
    __awda_output = None

    # Initializer
    def __init__(self, dataset_location, site):
        """Remember where the dataset lives and which site to read.

        params:
            dataset_location    root directory of the dataset
            site                name of the site sub-directory
        """
        self.__dataset_location = dataset_location
        self.__site = site

    # Getters
    def get_site(self):
        """Return the site name given to the initializer."""
        return self.__site

    def get_dataset_location(self):
        """Return the dataset root directory given to the initializer."""
        return self.__dataset_location

    # Access samples
    def get_samples_name(self):
        """List the sample data files of the site.

        returns:
            names of the '.vr2' files found in
            <dataset_location>/<site>/<site>_data (in os.listdir order),
            or None when that directory does not exist
        """
        data_location = os.path.join(
            self.__dataset_location, self.__site, self.__site + '_data')
        if not os.path.exists(data_location):
            return None
        # only select .vr2 files
        return [name for name in os.listdir(data_location) if name.endswith('.vr2')]

    def get_random_sample(self):
        """Select a random sample data file.

        returns:
            one file name from get_samples_name(), or None when there is
            nothing to choose from (missing directory or no '.vr2' file)
        """
        files = self.get_samples_name()
        if not files:
            # randint(0) / len(None) would raise; report "no sample" instead.
            return None
        return files[np.random.randint(len(files))]

    # Extract output based on AWDA method

    def _datetime_to_unit(self, datatime):
        """Split the time of day of a timestamp into its components.

        params:
            datatime    timestamp such as 2013-01-27UT05:36:17.48387602
        return:
            [h, m, s, u] as strings, u being the digits after the decimal
            point of the seconds ('0' when there is no fractional part)
        """
        time_of_day = datatime.split('UT')[-1]
        h, m, ss = time_of_day.split(':')
        s, _, u = ss.partition('.')
        return [h, m, s, u or '0']

    def _datetime_to_ms(self, datetime):
        """Convert the time of day of a timestamp to a single number.

        NOTE: despite the method name, the value is expressed in seconds
        (hours*3600 + minutes*60 + seconds + fraction of a second).

        params:
            datetime    timestamp such as 2013-01-27UT05:36:17.48387602
        return:
            seconds elapsed since midnight, as a float
        """
        h, m, s, u = self._datetime_to_unit(datetime)
        return sum([
            float(h) * 60 * 60,
            float(m) * 60,
            float(s),
            float(u) / 10 ** len(u),  # digits after the point -> fraction
        ])

    def _datetime_diff(self, datetime2, datetime1):
        """Return datetime2 - datetime1, in the unit of _datetime_to_ms.

        Only the time of day is compared; the date part is ignored.
        """
        return self._datetime_to_ms(datetime2) - self._datetime_to_ms(datetime1)

    @staticmethod
    def _parse_awda_line(line):
        """Split one output line into (file name, {key: value}) or None.

        The first field is ignored, the second is the file name and the
        remaining fields are read as consecutive key/value pairs; a trailing
        field without a partner is dropped. Lines with fewer than two fields
        (e.g. blank lines) yield None.
        """
        fields = line.split()
        if len(fields) < 2:
            return None
        return fields[1], dict(zip(fields[2::2], fields[3::2]))

    def awda_output(self, verbose=False):
        """Extract the output information for each file.

        Reads the first '.out' file found in <dataset_location>/<site> and
        groups its key/value pairs by file name. When a file name appears on
        several lines the pairs are merged; for a repeated key the value that
        was stored first is kept.

        inputs
            verbose     print a header and a progress trace on stdout
        outputs
            dataset     dictionary mapping each file name to a dictionary of
                        the pairs listed for it; empty when no '.out' file
                        exists or it cannot be read (an 'Error:' line is
                        printed in that case)
        """
        # Imported here because the file's import cell does not provide sys.
        import sys

        output_path = os.path.join(self.__dataset_location, self.__site)
        output_file = next(
            (name for name in os.listdir(output_path) if name.endswith('.out')),
            None)

        dataset = {}
        lines = []
        if output_file is None:
            print('Error: no .out file found in %s' % output_path)
        else:
            try:
                with open(os.path.join(output_path, output_file), 'r') as f:
                    lines = f.readlines()
            except (OSError, UnicodeError) as e:
                print('Error:', e)
            else:
                if verbose:
                    print('\nGenerating outputs for %s' % self.__site)

        last_percent = None
        for num_line, line in enumerate(lines):
            parsed = self._parse_awda_line(line)
            if parsed is not None:
                name, event = parsed
                if name in dataset:
                    # entries already stored for this file take precedence
                    event.update(dataset[name])
                dataset[name] = event
            # print progression: the value every 5%, a dot otherwise
            if verbose:
                percent = int(num_line * 100 / len(lines))
                if percent != last_percent:
                    sys.stdout.write("%s%%" % percent if percent % 5 == 0 else ".")
                    sys.stdout.flush()
                    last_percent = percent

        self.__awda_output = dataset
        return dataset

    def get_awda_output(self):
        """Return the mapping built by the last awda_output() call, or None."""
        return self.__awda_output
