<h1>Import Libraries</h1>

In [1]:
import numpy as np
import pandas as pd
import os
import datetime
import time
import re

<h1>Define functions for pipeline</h1>
<h2>Find start timestamp</h2>

In [2]:
def find_start_date(filename: str):
    """Extract the recording start timestamp embedded in a file name.

    The name is searched for the first ``YYYYMMDD_HHMMSS`` stamp (eight
    digits, an underscore, six digits), e.g.
    ``data_inside_20211017_160006.pkl`` -> ``2021-10-17 16:00:06``.

    Parameters
    ----------
    filename : str
        File name or path containing the stamp.

    Returns
    -------
    datetime.datetime
        Naive datetime parsed from the stamp.

    Raises
    ------
    ValueError
        If no stamp is present, or if the digits found do not form a valid
        date/time (the latter is raised by ``strptime``).
    """
    # `filename` used to default to the built-in type `str` (a mistyped
    # annotation); calling without an argument raised TypeError then and
    # still does now.
    match = re.search(r'\d{8}_\d{6}', filename)
    if match is None:
        # Fail with a clear message instead of AttributeError on None.group().
        raise ValueError(
            'no YYYYMMDD_HHMMSS timestamp found in file name: %r' % (filename,)
        )
    return datetime.datetime.strptime(match.group(), '%Y%m%d_%H%M%S')

<h2>Separate audio and create dataframes</h2>

In [3]:
def create_dataframes(unpickled, start_time):
    """Split an unpickled recording into a sensor frame and an audio frame.

    Parameters
    ----------
    unpickled : mapping
        Must provide ``keys()`` and item access. The ``'audio'`` entry holds
        the audio samples; every other entry becomes a sensor column. A
        ``'timer'`` entry is required and is interpreted as milliseconds.
    start_time : datetime-like
        Absolute time assigned to the first audio sample.

    Returns
    -------
    (df_sensor, df_audio)
        ``df_sensor`` is indexed by ``timer`` (as timedeltas) and carries a
        copy of those timedeltas in the ``ms_delta`` column.
        ``df_audio`` has a single ``audio`` column and a ``datetime`` index
        spread evenly from ``start_time`` to ``start_time`` plus the largest
        timer value.
    """
    # Every entry except the audio track goes into the sensor table.
    sensor_columns = {
        key: unpickled[key] for key in unpickled.keys() if key != 'audio'
    }
    # Note: a disabled one-off workaround for the "1h example" appended a final
    # timer value of 3599906 to the timer entry at this point.

    df_sensor = pd.DataFrame(sensor_columns)

    # Timer values become timedeltas; keep a second copy as a regular column
    # because 'timer' itself is turned into the index below.
    elapsed = pd.to_timedelta(df_sensor.timer, unit='ms')
    df_sensor['timer'] = elapsed
    df_sensor['ms_delta'] = elapsed

    # The audio samples arrive as a single unnamed column (label 0).
    df_audio = pd.DataFrame(unpickled['audio']).rename(columns={0: "audio"})

    # The audio samples carry no timestamps of their own: distribute them
    # evenly over the span covered by the sensor timer.
    recording_end = start_time + elapsed.max()
    df_audio['datetime'] = pd.date_range(
        start_time, recording_end, periods=len(df_audio.index)
    )

    return df_sensor.set_index('timer'), df_audio.set_index('datetime')

<h2>Normalized accelerometer/magnetometer (vector magnitude)</h2>

In [4]:
def add_normalized_sensors(df_sensor):
    """Add the vector length of the accelerometer and magnetometer readings.

    For each of the two sensors the ``<sensor>_x``, ``<sensor>_y`` and
    ``<sensor>_z`` columns are combined into a new ``<sensor>`` column holding
    ``sqrt(x**2 + y**2 + z**2)``.

    The frame is modified in place and the same object is returned.
    """
    for sensor in ('accelerometer', 'magnetometer'):
        x, y, z = (df_sensor[sensor + '_' + axis] for axis in 'xyz')
        # Euclidean length of the 3-axis reading.
        df_sensor[sensor] = (x**2 + y**2 + z**2)**0.5
    return df_sensor

<h2>Split and calculate values for 20 Hz</h2>

In [5]:
def resampling(df_sensor, df_audio, start_time, freq='50ms'):
    """Aggregate sensor and audio data into fixed-width time bins.

    Both inputs are binned on the same absolute time grid, anchored at
    ``start_time`` with a bin width of ``freq`` (50 ms, i.e. 20 Hz, by
    default).

    Parameters
    ----------
    df_sensor : pandas.DataFrame
        Sensor readings indexed by a ``TimedeltaIndex`` holding the time
        elapsed since ``start_time``.
    df_audio : pandas.DataFrame
        Audio samples indexed by a ``DatetimeIndex`` of absolute times.
    start_time : datetime-like
        Absolute start of the recording; origin of the bin grid.
    freq : str, optional
        Bin width passed to ``resample``. Defaults to ``'50ms'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``datetime`` (left edge of each bin). Columns:

        * for every column whose name contains ``"accelerometer"``:
          ``_max``, ``_95q``, ``_min``, ``_05q``
        * for every column whose name contains ``"magnetometer"``:
          ``_mean``, ``_med``
        * for every audio column whose name contains ``"audio"``:
          ``_max``, ``_95q``

        The rows are the sensor bins; audio aggregates are aligned onto them.

    Notes
    -----
    The sensor data used to be binned on the relative timer, with an
    absolute ``date_range`` assigned positionally afterwards. Timedelta bins
    start at the first timer value, so a recording whose timer did not start
    at 0 either failed with a length mismatch or got shifted labels that no
    longer matched the audio bins. Resampling both streams against the same
    origin removes that dependency; for timers starting at 0 the result is
    unchanged.
    """
    origin = pd.Timestamp(start_time)

    # Put the sensor samples on the absolute time axis so that sensor and
    # audio bins share the same edges.
    sensor_abs = df_sensor.set_index(origin + df_sensor.index)
    r_sensor = sensor_abs.resample(freq, origin=origin)
    r_audio = df_audio.resample(freq, origin=origin)

    aggregates = {}
    for variable in df_sensor.columns:
        if "accelerometer" in variable:
            # Extremes and robust extremes of each bin.
            binned = r_sensor[variable]
            aggregates[variable+'_max'] = binned.max()
            aggregates[variable+'_95q'] = binned.quantile(0.95)
            aggregates[variable+'_min'] = binned.min()
            aggregates[variable+'_05q'] = binned.quantile(0.05)

        elif "magnetometer" in variable:
            # Central tendency of each bin.
            binned = r_sensor[variable]
            aggregates[variable+'_mean'] = binned.mean()
            aggregates[variable+'_med'] = binned.median()

    # Build the frame in one go instead of growing it column by column.
    df_agg = pd.DataFrame(aggregates)
    df_agg.index.name = 'datetime'

    for variable in df_audio.columns:
        if "audio" in variable:
            # Column assignment aligns the audio bins onto the sensor bins.
            binned = r_audio[variable]
            df_agg[variable+'_max'] = binned.max()
            df_agg[variable+'_95q'] = binned.quantile(0.95)

    return df_agg

<h1>Find files in directory and start smoothing pipeline</h1>

In [8]:
path = '../Data/'
filename = 'data_inside_20211017_160006.pkl'

# Trial run of the whole pipeline on a single recording. Every stage prints
# the cumulative wall-clock time since `time_s`, not the duration of the stage.
time_s = time.time()


def _seconds_since_start():
    """Return the wall-clock seconds elapsed since `time_s` was taken."""
    return time.time() - time_s


# Stage 1 - load the pickled recording.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
unpickled = pd.read_pickle(path + filename)
print('read file, ', '%s seconds' % _seconds_since_start())

# Stage 2 - the recording start time is parsed from the file name.
start_time = find_start_date(filename)
print('found starttime, ', '%s seconds' % _seconds_since_start())

# Stage 3 - split into a timer-indexed sensor frame and a datetime-indexed
# audio frame.
df_sensor, df_audio = create_dataframes(unpickled, start_time)
print('dataframes created, ', '%s seconds' % _seconds_since_start())

# Stage 4 - add the vector length of the accelerometer/magnetometer readings.
df_sensor = add_normalized_sensors(df_sensor)
print('added normalized sensors, ', '%s seconds' % _seconds_since_start())

# Stage 5 - aggregate both streams into 50 ms bins (20 Hz).
df_agg = resampling(df_sensor, df_audio, start_time)
print('aggregation, ', '%s seconds' % _seconds_since_start())

# CSV export is disabled; save_to_csv is not defined in the cells shown here.
#save_to_csv(df_agg, inside=True)

print('executed "', path + filename, '" in %s seconds' % _seconds_since_start())


read file,  0.45586228370666504 seconds
found starttime,  0.45676684379577637 seconds
dataframes created,  9.474449872970581 seconds
added normalized sensors,  10.244431257247925 seconds
aggregation,  137.38002467155457 seconds
executed " ../Data/data_inside_20211017_160006.pkl " in 137.38017439842224 seconds


In [None]:
# Display the aggregated frame returned by resampling().
df_agg

In [None]:
# Display the datetime-indexed audio frame returned by create_dataframes().
df_audio