<h1>Import Libraries</h1>

In [1]:
import numpy as np
import pandas as pd
import os
import datetime
import time
import re

<h1>Define functions for pipeline</h1>
<h2>Find start timestamp</h2>

In [42]:
def find_start_date(filename: str) -> datetime.datetime:
    """Extract the recording start time encoded in a file name.

    The file name must contain a ``YYYYMMDD_HHMMSS`` stamp, e.g.
    ``data_inside_20211012_000102.pkl``.

    Parameters
    ----------
    filename : str
        File name (or path) that contains the timestamp.

    Returns
    -------
    datetime.datetime
        Naive datetime parsed from the first timestamp found.

    Raises
    ------
    ValueError
        If no timestamp is present, or the digits do not form a valid date.
    """
    match = re.search(r'\d{8}_\d{6}', filename)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            'no YYYYMMDD_HHMMSS timestamp found in file name %r' % (filename,)
        )
    return datetime.datetime.strptime(match.group(), '%Y%m%d_%H%M%S')

<h2>Define cutting points</h2>

In [43]:
def cutting_points(unpickled):
    """Compute where to cut the raw records into 50 ms windows.

    Parameters
    ----------
    unpickled : dict
        Recording data; only ``unpickled['timer']`` is read here.
        NOTE(review): the timer is assumed to hold ascending millisecond
        offsets, which ``searchsorted`` requires; confirm against the
        recording format.

    Returns
    -------
    i_split_arr : numpy.ndarray
        Indices into the sensor arrays at which a new 50 ms window starts,
        suitable as the ``indices`` argument of ``numpy.split``.
    step : int
        Number of audio samples per window (``int(2100 / 20)``).
    """
    timer = np.asarray(unpickled['timer'])

    # Last millisecond of every window: 49, 99, 149, ... (below the final
    # timer value). Every sample with timer <= boundary belongs to the
    # windows up to and including that boundary.
    window_ends = np.arange(49, int(timer.max()), 50)

    # split-steps of sensor records: one searchsorted call for all boundaries
    i_split_arr = np.searchsorted(timer, window_ends, side='right')

    # split-step of audio records
    # NOTE(review): 2100 is presumably the audio sampling rate in Hz and
    # 20 the target rate in Hz; confirm.
    step = int(2100 / 20)

    return i_split_arr, step

<h2>Normalized accelerometer/magnetometer</h2>

In [4]:
def add_normalized_sensors(unpickled):
    """Add the vector length of each three-axis sensor to the data.

    For the accelerometer and the magnetometer the Euclidean norm of the
    x/y/z components is stored under the sensor's base name. The input
    mapping is modified in place and also returned.
    """
    axis_keys_by_sensor = (
        ('accelerometer', ('accelerometer_x', 'accelerometer_y', 'accelerometer_z')),
        ('magnetometer', ('magnetometer_x', 'magnetometer_y', 'magnetometer_z')),
    )

    for sensor, (key_x, key_y, key_z) in axis_keys_by_sensor:
        # sum of squared components, then square root -> vector length
        squared_length = unpickled[key_x]**2 + unpickled[key_y]**2 + unpickled[key_z]**2
        unpickled[sensor] = squared_length**0.5

    return unpickled

<h2>Split and calculate values for 20 Hz</h2>

In [52]:
def _split_and_pad(values, i_split_arr):
    """Cut ``values`` at ``i_split_arr`` and stack the chunks as NaN-padded rows.

    Produces the same rows as ``np.split(values, i_split_arr)`` followed by
    padding every chunk with NaN to the longest chunk, but fills the matrix
    with one masked assignment instead of a Python loop over chunks.
    ``i_split_arr`` must be sorted ascending and lie within ``values``.
    """
    values = np.asarray(values)
    bounds = np.concatenate(([0], np.asarray(i_split_arr, dtype=np.intp), [values.size]))
    lengths = np.diff(bounds)
    width = int(lengths.max())
    # True where row r, column c holds a real sample (c < length of chunk r)
    in_chunk = np.arange(width) < lengths[:, None]
    padded = np.full(in_chunk.shape, np.nan)
    # row-major boolean assignment keeps the samples in their original order
    padded[in_chunk] = values
    return padded


def _audio_window_stats(audio, step):
    """Return (max, 95% quantile) of consecutive ``step``-sample audio windows.

    A trailing window shorter than ``step`` is summarised on its own instead
    of making the stacking fail.
    """
    audio = np.asarray(audio)
    n_full = audio.size // step
    full = audio[:n_full * step].reshape(n_full, step)
    window_max = full.max(axis=1)
    window_q95 = np.quantile(full, 0.95, axis=1) if n_full else np.empty(0)

    tail = audio[n_full * step:]
    if tail.size:
        window_max = np.append(window_max, tail.max())
        window_q95 = np.append(window_q95, np.quantile(tail, 0.95))
    return window_max, window_q95


def smoothing(unpickled, i_split_arr, step):
    """Reduce the raw records to one summary value per 50 ms window (20 Hz).

    Parameters
    ----------
    unpickled : dict
        Maps a variable name to a 1-D array. Keys are dispatched by
        substring: ``accelerometer``, ``magnetometer``, ``audio``, ``timer``;
        other keys are ignored.
    i_split_arr : array-like of int
        Ascending split indices into the sensor arrays (window boundaries).
    step : int
        Number of audio samples per window.

    Returns
    -------
    dict
        ``<name>_max/_95q/_min/_05q`` for accelerometer keys,
        ``<name>_mean/_med`` for magnetometer keys,
        ``<name>_max/_95q`` for audio keys, and the window start times in
        milliseconds under the timer key. Key insertion order follows the
        order of ``unpickled`` so the resulting CSV column order is stable.

    Notes
    -----
    Windows without any sensor sample yield NaN (numpy emits an
    "All-NaN slice" RuntimeWarning for them).
    """
    window_ms = 50

    smt_dict = {}

    for variable in unpickled.keys():
        if "accelerometer" in variable:
            # split at the window indices, pad with NaN to equal length, stack
            stacked = _split_and_pad(unpickled[variable], i_split_arr)

            # summarise per window; NaN padding is ignored
            smt_dict[variable+'_max'] = np.nanmax(stacked, axis = 1)
            smt_dict[variable+'_95q'] = np.nanquantile(stacked, 0.95, axis = 1)
            smt_dict[variable+'_min'] = np.nanmin(stacked, axis = 1)
            smt_dict[variable+'_05q'] = np.nanquantile(stacked, 0.05, axis = 1)

        elif "magnetometer" in variable:
            # split at the window indices, pad with NaN to equal length, stack
            stacked = _split_and_pad(unpickled[variable], i_split_arr)

            # summarise per window; NaN padding is ignored
            smt_dict[variable+'_mean'] = np.nanmean(stacked, axis = 1)
            smt_dict[variable+'_med'] = np.nanmedian(stacked, axis = 1)

        elif "audio" in variable:
            audio_max, audio_q95 = _audio_window_stats(unpickled[variable], step)
            smt_dict[variable+'_max'] = audio_max
            smt_dict[variable+'_95q'] = audio_q95

        elif "timer" in variable:
            # Start time of every window: 0, 50, 100, ... The "+ 1" makes the
            # stop inclusive, so a final timer value that is an exact multiple
            # of 50 gets its own window start (not "previous start + 1").
            smt_dict[variable] = np.arange(0, int(unpickled['timer'].max()) + 1, window_ms)

    return smt_dict

<b>Create DataFrame</b>

In [45]:
# perform equality of sensor & audio records
def perform_equality(smt_dict):
    """Trim surplus audio windows so audio and sensor series have equal length.

    The lengths of ``smt_dict['audio_max']`` and
    ``smt_dict['accelerometer_x_max']`` are compared. Surplus audio windows
    at the end are cut from ``audio_max`` and ``audio_95q``.

    Parameters
    ----------
    smt_dict : dict
        Smoothed series; modified in place.

    Returns
    -------
    dict
        The same ``smt_dict`` object.

    Raises
    ------
    ValueError
        If there are more sensor windows than audio windows. The function
        previously printed a message and returned ``None`` in that case,
        which only failed later when the DataFrame was built.
    """
    n_audio = smt_dict['audio_max'].size
    n_sensor = smt_dict['accelerometer_x_max'].size
    diff = n_audio - n_sensor

    if diff < 0:
        raise ValueError(
            'equality error - more sensor data than audio records '
            '(%d sensor windows vs. %d audio windows)' % (n_sensor, n_audio)
        )

    # more audio records: trim the surplus windows at the end
    if diff > 0:
        smt_dict['audio_max'] = smt_dict['audio_max'][:-diff]
        smt_dict['audio_95q'] = smt_dict['audio_95q'][:-diff]

    return smt_dict
    



In [46]:
def save_to_csv(smt_dict, start_time, inside: bool = True, out_dir: str = 'data'):
    """Write the smoothed series of one recording to the collecting CSV file.

    Parameters
    ----------
    smt_dict : dict
        Smoothed series; must contain a ``'timer'`` entry with millisecond
        offsets relative to ``start_time``.
    start_time : datetime.datetime
        Start of the recording; added to every timer offset.
    inside : bool, default True
        ``True`` writes to ``data_inside.csv``, ``False`` to
        ``data_outside.csv``.
    out_dir : str, default 'data'
        Directory that holds the CSV files.

    Notes
    -----
    If the target file already exists, the rows are appended without a
    header, so calling this twice for the same recording duplicates its rows.
    """
    # cut arrays to the same length and create the DataFrame
    smt_dict = perform_equality(smt_dict)
    df = pd.DataFrame.from_dict(smt_dict)

    # absolute timestamp per record, computed for the whole column at once
    # instead of a row-wise apply
    timestamps = pd.Timestamp(start_time) + pd.to_timedelta(df['timer'].to_numpy(), unit='ms')
    df = df.drop(columns=['timer'])
    df.index = pd.DatetimeIndex(timestamps, name='datetime')

    # write DataFrame to csv: create with header, or append without header
    csv_name = 'data_inside.csv' if inside else 'data_outside.csv'
    target = os.path.join(out_dir, csv_name)
    already_exists = os.path.isfile(target)
    df.to_csv(target, mode='a' if already_exists else 'w', header=not already_exists)
    

<h1>Find files in directory and start smoothing pipeline</h1>

In [None]:
# NOTE(review): these two lists are never filled anywhere in this cell; they
# are kept only in case later cells refer to them.
data_inside = []
data_outside = []

# (file-name prefix, written to the "inside" CSV?, add vector-length columns?)
# The vector-length step is optional; as before it is applied to outside
# recordings only.
recording_kinds = (
    ("data_inside_", True, False),
    ("data_outside_", False, True),
)

for path, currentDirectory, files in os.walk("data/"):
    # Visit directories and files in sorted order: results are appended to
    # one CSV, so the traversal order decides the row order. The timestamp in
    # the file name makes sorted order chronological.
    currentDirectory.sort()
    for file in sorted(files):
        if file.endswith('_one_hour_example.pkl'):
            continue
        kind = next((k for k in recording_kinds if file.startswith(k[0])), None)
        if kind is None:
            continue
        _, is_inside, with_vector_length = kind

        time_s = time.time()

        # os.path.join also works for files found in sub-directories, where
        # plain string concatenation would drop the path separator
        file_path = os.path.join(path, file)

        # NOTE(review): unpickling executes code from the file; only load
        # recordings from a trusted source.
        unpickled = pd.read_pickle(file_path)
        # find starting time of recording
        start_time = find_start_date(file)
        # find indices/step size for smoothing
        i_split_arr, step = cutting_points(unpickled)
        # optional: add normalized vectors for sensors to data
        if with_vector_length:
            unpickled = add_normalized_sensors(unpickled)
        # split and calculate values for 20 Hz
        smt_dict = smoothing(unpickled, i_split_arr, step)
        # export to csv
        save_to_csv(smt_dict, start_time, inside=is_inside)

        print('executed "', file_path, '" in %s seconds' % (time.time() - time_s))

  smt_dict[variable+'_max'] = np.nanmax(stack_ac_split_arr, axis = 1)
  result = np.apply_along_axis(_nanquantile_1d, axis, a, q,
  smt_dict[variable+'_min'] = np.nanmin(stack_ac_split_arr, axis = 1)
  smt_dict[variable+'_mean'] = np.nanmean(stack_ma_split_arr, axis = 1)
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,


executed " data/data_inside_20211012_000102.pkl " in 1534.4019498825073 seconds


  smt_dict[variable+'_max'] = np.nanmax(stack_ac_split_arr, axis = 1)
  result = np.apply_along_axis(_nanquantile_1d, axis, a, q,
  smt_dict[variable+'_min'] = np.nanmin(stack_ac_split_arr, axis = 1)
  smt_dict[variable+'_mean'] = np.nanmean(stack_ma_split_arr, axis = 1)
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,


executed " data/data_inside_20211013_000100.pkl " in 1566.260300397873 seconds


  smt_dict[variable+'_max'] = np.nanmax(stack_ac_split_arr, axis = 1)
  result = np.apply_along_axis(_nanquantile_1d, axis, a, q,
  smt_dict[variable+'_min'] = np.nanmin(stack_ac_split_arr, axis = 1)
  smt_dict[variable+'_mean'] = np.nanmean(stack_ma_split_arr, axis = 1)
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,


executed " data/data_inside_20211014_000058.pkl " in 1518.2803301811218 seconds


  smt_dict[variable+'_max'] = np.nanmax(stack_ac_split_arr, axis = 1)
  result = np.apply_along_axis(_nanquantile_1d, axis, a, q,
  smt_dict[variable+'_min'] = np.nanmin(stack_ac_split_arr, axis = 1)
  smt_dict[variable+'_mean'] = np.nanmean(stack_ma_split_arr, axis = 1)
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,


executed " data/data_inside_20211015_000106.pkl " in 1548.3978290557861 seconds


  smt_dict[variable+'_max'] = np.nanmax(stack_ac_split_arr, axis = 1)
  result = np.apply_along_axis(_nanquantile_1d, axis, a, q,
  smt_dict[variable+'_min'] = np.nanmin(stack_ac_split_arr, axis = 1)


In [51]:
# Report how many values each smoothed series holds
for name, series in smt_dict.items():
    print (name, ': ', series.size)

# Largest raw timer value of the currently loaded `unpickled` data
print(unpickled['timer'].max())

accelerometer_x_max :  1725554
accelerometer_x_95q :  1725554
accelerometer_x_min :  1725554
accelerometer_x_05q :  1725554
accelerometer_y_max :  1725554
accelerometer_y_95q :  1725554
accelerometer_y_min :  1725554
accelerometer_y_05q :  1725554
accelerometer_z_max :  1725554
accelerometer_z_95q :  1725554
accelerometer_z_min :  1725554
accelerometer_z_05q :  1725554
magnetometer_x_mean :  1725554
magnetometer_x_med :  1725554
magnetometer_y_mean :  1725554
magnetometer_y_med :  1725554
magnetometer_z_mean :  1725554
magnetometer_z_med :  1725554
audio_max :  1725554
audio_95q :  1725554
timer :  1725553
accelerometer_max :  1725554
accelerometer_95q :  1725554
accelerometer_min :  1725554
accelerometer_05q :  1725554
magnetometer_mean :  1725554
magnetometer_med :  1725554
86277650
