<h1>Import Libraries</h1>

In [18]:
import numpy as np
import pandas as pd
import os
import datetime
import time
import re

<h1>Define functions for pipeline</h1>
<h2>Find start timestamp</h2>

In [2]:
def find_start_date(filename=str):
    match = re.search(r'\d{4}\d{2}\d{2}_\d{2}\d{2}\d{2}', filename)
    date = datetime.datetime.strptime(match.group(), '%Y%m%d_%H%M%S')
    return date

2021-10-12 00:01:02.001000


<h2>Define cutting points</h2>

In [20]:
def cutting_points(unpickled):
    # create dataframe without audio records for split-steps von
    dict_no_audio = {}
    for k in unpickled.keys():
        if k != 'audio':
            dict_no_audio[k] = unpickled[k]

    #################### für 1h example ########################
    dict_no_audio['timer'] = np.append(dict_no_audio['timer'], 3599906)
    ############################################################
    
    # define range for split-steps of sensor records
    val_range = np.array(range(49,unpickled['timer'].max(),50))
    
    # split-steps of sensor records
    i_split_arr = []

    for ms in val_range:
        i_split_arr.append(dict_no_audio['timer'].searchsorted(ms, side='right'))
    
    # split-step of audio records
    step = int(2100 / 20)

    return i_split_arr, step

<h2>Normalized accelerometer/magnetometer</h2>

In [4]:
def add_normalized_sensors(unpickled):
    # calculate vektor length of sensors
    unpickled['accelerometer'] = (unpickled['accelerometer_x']**2 + unpickled['accelerometer_y']**2 + unpickled['accelerometer_z']**2)**0.5
    unpickled['magnetometer'] = (unpickled['magnetometer_x']**2 + unpickled['magnetometer_y']**2 + unpickled['magnetometer_z']**2)**0.5
    return unpickled

<h2>Split and claculate values for 20 Hz</h2>

In [24]:
def smoothing(unpickled, i_split_arr, step):
    # create dictionary with records (run split) & calculate min/max/quantil

    smt_dict = {}

    for variable in unpickled.keys():
        if "accelerometer" in variable:
            # Array wird anhand der Indizes der MS gesplittet, mit NaN values auf die selbe lenge gebracht und gestacked
            ac_split_arr = np.split(unpickled[variable], i_split_arr)
            pad = len(max(ac_split_arr, key=len))
            pad_ac_split_arr = np.array([np.append(i, [[np.nan]*(pad-len(i))]) for i in ac_split_arr])
            stack_ac_split_arr = np.vstack(pad_ac_split_arr)

            # Zusammenfassen und in dictionary schreiben, NaN values werden ignoriert
            # maximum
            smt_dict[variable+'_max'] = np.nanmax(stack_ac_split_arr, axis = 1)
            # 95% quantile
            smt_dict[variable+'_95q'] = np.nanquantile(stack_ac_split_arr, 0.95, axis = 1)
            # minimum
            smt_dict[variable+'_min'] = np.nanmin(stack_ac_split_arr, axis = 1)
            # 5% quantile
            smt_dict[variable+'_05q'] = np.nanquantile(stack_ac_split_arr, 0.05, axis = 1)

        elif "magnetometer" in variable:
            # Array wird anhand der Indizes der MS gesplittet, mit NaN values auf die selbe lenge gebracht und gestacked
            ma_split_arr = np.split(unpickled[variable], i_split_arr)
            pad = len(max(ma_split_arr, key=len))
            pad_ma_split_arr = np.array([np.append(i, [[np.nan]*(pad-len(i))]) for i in ma_split_arr])
            stack_ma_split_arr = np.vstack(pad_ma_split_arr)

            # Zusammenfassen und in dictionary schreiben, NaN values werden ignoriert
            # mean
            smt_dict[variable+'_mean'] = np.nanmean(stack_ma_split_arr, axis = 1)
            # median
            smt_dict[variable+'_med'] = np.nanmedian(stack_ma_split_arr, axis = 1)

        elif "audio" in variable:
            au_split_arr = np.vstack(np.split(unpickled[variable], range(step, unpickled['audio'].size, step)))

            # Zusammenfassen und in DataFrame schreiben, NaN values werden ignoriert
            # maximum
            smt_dict[variable+'_max'] = np.amax(au_split_arr, axis = 1)

            # 95% quantile
            smt_dict[variable+'_95q'] = np.quantile(au_split_arr, 0.95, axis = 1)
        
        elif "timer" in variable:
            smt_dict[variable] = np.array(range(0,unpickled['timer'].max(),50))
            if unpickled['timer'].max()%50:
                smt_dict[variable] = np.append(smt_dict[variable],(smt_dict[variable].max()+1))
            
    return smt_dict

<b>DataFrame erstellen</b>

In [9]:
# perform equality of sensor & audio records
def perform_equality():
    
    # determine difference
    diff = (smt_dict['audio_max'].size - smt_dict['accelerometer_x_max'].size)
    
    # more audio records
    if diff > 0: 
        
        # trim records under consideration of difference
        smt_dict['audio_max'] = smt_dict['audio_max'][0:smt_dict['audio_max'].size-diff]
        smt_dict['audio_95q'] = smt_dict['audio_95q'][0:smt_dict['audio_95q'].size-diff]
        
        # determine number of sensor records
        for key in smt_dict.keys():
            print (key, ': ', smt_dict[key].size)
        
    # more sensor records
    elif diff < 0: 
        return "NotImplementedError :D"
    
    # equal number of audio & sensor records
    elif diff == 0:
        return "Equality is given"


perform_equality()

NameError: name 'smt_dict' is not defined

In [None]:
# create dataframe 
df = pd.DataFrame.from_dict(smt_dict)
df

<b>Export als Pickle-File {temporary}</b>

In [None]:
# export dataframe as pickle-file
now = datetime.now()
date = now.strftime("%Y%m%d_%H%M")

df.to_pickle("./export_" + str(date) + ".pkl")

<h1>Find files in directory and start smoothing pipeline</h1>

In [25]:
data_inside = []
data_outside = []

for path, currentDirectory, files in os.walk("../WI3_BusinessIntelligence_Data/"):
    for file in files:
        if file.startswith("data_inside_") and file.endswith('_one_hour_example.pkl'):
            unpickled = pd.read_pickle(path+file)
            # find starting time of recording
            start_time = find_start_date(file)
            # find indices/step size for smoothing
            i_split_arr, step = cutting_points(unpickled)
            # optional: add normalized vectors for sensors to data
            unpickled = add_normalized_sensors(unpickled)
            # split and calculate values for 20 Hz
            smt_dict = smoothing(unpickled, i_split_arr, step)
            print(smt_dict)
        #elif file.startswith("data_outside_") and file.endswith('_one_hour_example.pkl'):
            #unpickled = pd.read_pickle(path+file)
            #start_time = find_start_date(file)
            #print(start_time)

  smt_dict[variable+'_max'] = np.nanmax(stack_ac_split_arr, axis = 1)
  result = np.apply_along_axis(_nanquantile_1d, axis, a, q,
  smt_dict[variable+'_min'] = np.nanmin(stack_ac_split_arr, axis = 1)
  smt_dict[variable+'_mean'] = np.nanmean(stack_ma_split_arr, axis = 1)
  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,


{'accelerometer_x_max': array([-7.01660156, -7.03125   , -7.02636719, ..., -6.93847656,
       -6.95800781, -7.06054688]), 'accelerometer_x_95q': array([-7.01733398, -7.04589844, -7.04101562, ..., -6.99291992,
       -6.98730469, -7.06054688]), 'accelerometer_x_min': array([-7.13378906, -7.14355469, -7.17285156, ..., -7.15820312,
       -7.20214844, -7.15820312]), 'accelerometer_x_05q': array([-7.11914062, -7.13378906, -7.14355469, ..., -7.15332031,
       -7.17138672, -7.15820312]), 'accelerometer_y_max': array([7.00683594, 7.00195312, 6.99707031, ..., 7.05078125, 7.09472656,
       6.94824219]), 'accelerometer_y_95q': array([7.0065918 , 6.99194336, 6.9921875 , ..., 7.01660156, 7.03540039,
       6.94824219]), 'accelerometer_y_min': array([6.90429688, 6.90917969, 6.92382812, ..., 6.74804688, 6.79199219,
       6.81640625]), 'accelerometer_y_05q': array([6.92871094, 6.92407227, 6.92944336, ..., 6.84570312, 6.81713867,
       6.85302734]), 'accelerometer_z_max': array([0.81054688, 0.849