In [1]:
import numpy as np
import matplotlib.pyplot as plt

# You may change the mhealth_activity module but your algorithm must support the original version
from mhealth_activity import Recording, Trace, Activity, WatchLocation, Path

# For interactive plots, uncomment the following line
# %matplotlib widget
import os
import pandas as pd
import pickle
from tqdm import tqdm
from scipy.fft import fft, fftfreq
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from scipy.signal import find_peaks
from scipy.signal import peak_prominences
from sklearn.metrics import mean_absolute_error,accuracy_score,precision_score,recall_score,confusion_matrix,classification_report,f1_score
from multiprocessing import Pool

Load feature extractor for accelerometer, gyroscope and magnetometer

In [2]:
def features_extraction_common(inp): 
    FEATURES = ['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f','avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']

    
    print(FEATURES)
    Median=[];Numneg=[];Numpos=[];Numabovmed=[];
    Min=[];Max=[];Mean=[];Mad=[];Sma=[];Eng=[];Iqr=[];Entr=[];Std=[];Var=[];Kurt=[];Skew=[];Npeaks=[];Avgprom=[]
    Min_d=[];Max_d=[];Mean_d=[];Mad_d=[];Sma_d=[];Eng_d=[];Iqr_d=[];Entr_d=[];Std_d=[];Var_d=[];
    Max_f=[];NPeak_f=[];Avgprom_f=[];Mean_f=[];Skew_f=[];Kurtosis_f=[];Sum_f=[]; Avgpeakdist=[]; Avgpeakdist_f=[];
    
    #X = df.values
    ## TIME DOMAIN ##
    #list of lists of lists, ugly as fuck but it works 
    Median.append(np.median(inp))
    Numneg.append(np.sum(np.array(inp) < 0, axis=0))
    Numpos.append(np.sum(np.array(inp) > 0, axis=0))
    Numabovmed.append(np.sum(np.array(inp) > np.median(inp), axis=0))

    Mean.append(np.mean(inp))
    Std.append(np.std(inp))
    #median absolute deviation
    Mad.append(stats.median_abs_deviation(inp, scale=1))
    Var.append(np.var(inp))
    Min.append(np.min(inp))
    Max.append(np.max(inp))
    #Signal Magnitude Area
    Sma.append(np.sum(inp))
    #energy measure
    Eng.append(np.sum(inp**2)/len(inp))
    Iqr.append(stats.iqr(inp))
    Entr.append(stats.entropy(inp))

    npeaks, _ = find_peaks(inp, distance=5)
    Npeaks.append(len(npeaks))
    prom = peak_prominences(inp, npeaks)
    Avgprom.append(np.mean(prom))

    Apeakdist = 0
    for i in range(len(npeaks)-1):
        Apeakdist += abs(npeaks[i] - npeaks[i+1])
    Avgpeakdist.append(Apeakdist/(len(npeaks)-1))

    ## FREQ DOMAIN ##
    ft = np.abs(fft(inp))
    Sum_f.append(np.sum(ft))
    Max_f.append(np.max(ft))

    npeaks, _ = find_peaks(ft, distance=5)
    NPeak_f.append(len(npeaks))
    prom = peak_prominences(ft, npeaks)
    Avgprom_f.append(np.mean(prom))

    Apeakdist = 0
    for i in range(len(npeaks)-1):
        Apeakdist += abs(npeaks[i] - npeaks[i+1])
    Avgpeakdist_f.append(Apeakdist/(len(npeaks)-1))

    Mean_f.append(np.mean(ft))
    Skew_f.append(stats.skew(ft))
    Kurtosis_f.append(stats.kurtosis(ft))

    #derivative
    # f = np.gradient(inp)
    # Mean_d.append(np.mean(inp))
    # Std_d.append(np.std(recording[0]))
    # #median absolute deviation
    # Mad_d.append(stats.median_abs_deviation(recording[0], scale=1))
    # Var_d.append(np.var(recording[0]))
    # Min_d.append(np.min(recording[0]))
    # Max_d.append(np.max(recording[0]))
    # #Signal Magnitude Area
    # Sma_d.append(np.sum(recording[0]))
    # #energy measure
    # Eng_d.append(np.sum(recording[0]**2)/len(recording[0]))
    # Iqr_d.append(stats.iqr(recording[0]))
    # Entr_d.append(stats.entropy(recording[0]))

    #Create dataframe from features
    return np.array([Median, Numneg, Numpos, Numabovmed, Mean,Std, Mad, Var, Min, Max, Sma, Eng, Iqr, Entr, Npeaks, Avgprom, Avgpeakdist, Sum_f, Max_f, NPeak_f, Avgprom_f, Avgpeakdist_f, Mean_f, Skew_f, Kurtosis_f]).reshape(-1) 



In [3]:
#load pickled training 3d norm accelerometer data
file = open('doruks_data/accel_mag_train.pkl', 'rb')
pickled = pickle.load(file)
accel_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['accel'])
print(accel_mag_train.shape)
file.close()

file = open('doruks_data/magneto_mag_train.pkl', 'rb')
pickled = pickle.load(file)
magneto_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['magneto'])
print(magneto_mag_train.shape)
file.close()

file = open('doruks_data/gyro_mag_train.pkl', 'rb')
pickled = pickle.load(file)
gyro_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['gyro'])
print(gyro_mag_train.shape)
file.close()
""" #data = pd.read_pickle('doruks_data/pickled_and_sorted_training_data.pkl.zst')

pos_labels  = []
path_labels = []
for label in data["labels"]:
    path_labels.extend([label["path_idx"]])
    pos_labels.extend([label["watch_loc"]]) """




(396, 1)
(396, 1)
(396, 1)


' #data = pd.read_pickle(\'doruks_data/pickled_and_sorted_training_data.pkl.zst\')\n\npos_labels  = []\npath_labels = []\nfor label in data["labels"]:\n    path_labels.extend([label["path_idx"]])\n    pos_labels.extend([label["watch_loc"]]) '

In [4]:
file = open('doruks_data/accel_gyro_magneto_timestamps.pkl', 'rb')


timestamps = pickle.load(file)

In [5]:
timestamps.shape

(396, 3)

In [6]:
# Seemingly, even though accel/gyro_time and magneto_time is different, the total recording time remains the same

for i in range(0, timestamps.shape[0]):

    print("Difference: ", timestamps["gyro_time"][i][-1] - timestamps["magneto_time"][i][-1])




Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0

In [7]:
unique_one = pd.read_pickle('unique_activity_ONE.pkl')
unique_two = pd.read_pickle('unique_activity_TWO.pkl')
unique_three = pd.read_pickle('unique_activity_THREE.pkl')

unique_all = pd.concat([unique_one, unique_two, unique_three])

In [None]:
unique_indices = unique_all.index
gyro_mag_train_unique = gyro_mag_train.loc[unique_indices]
magneto_mag_train_unique = magneto_mag_train.loc[unique_indices]
accel_mag_train_unique = accel_mag_train.loc[unique_indices]

timestamps_unique = timestamps.loc[unique_indices]

In [None]:
unique_all = pd.concat([unique_all, timestamps_unique, gyro_mag_train_unique, magneto_mag_train_unique, accel_mag_train_unique], axis = 1)

In [None]:
unique_all

Unnamed: 0,path_idx,activities,longitude,latitude,speed,altitude,step_count,phone_steps,accel_time,gyro_time,magneto_time,gyro,magneto,accel
0,2,[1],"Trace(title='longitude', total_time=584.87, sa...","Trace(title='latitude', total_time=584.87, sam...","Trace(title='speed', total_time=584.87, sample...","Trace(title='altitude', total_time=584.87, sam...",,"Trace(title='phone_steps', total_time=584.87, ...","[0.0, 0.004992479791038916, 0.0099849595820778...","[0.0, 0.004992479791038916, 0.0099849595820778...","[0.0, 0.07988990575058053, 0.15977981150116105...","[53.384678630069224, 55.44401057024837, 54.294...","[221.89445350659278, 221.10276820131642, 221.3...","[0.9480597873558152, 0.8834795162112591, 0.826..."
3,2,[1],"Trace(title='longitude', total_time=519.63, sa...","Trace(title='latitude', total_time=519.63, sam...","Trace(title='speed', total_time=519.63, sample...","Trace(title='altitude', total_time=519.63, sam...",,,"[0.0, 0.004998807130282537, 0.0099976142605650...","[0.0, 0.004998807130282537, 0.0099976142605650...","[0.0, 0.07999245689655173, 0.15998491379310345...","[2.1428685815022215, 2.3109384227933973, 2.486...","[248.5280522182176, 248.5280522182176, 248.528...","[0.9713710479087602, 0.9788491672371056, 0.979..."
4,1,[1],"Trace(title='longitude', total_time=625.44, sa...","Trace(title='latitude', total_time=625.44, sam...","Trace(title='speed', total_time=625.44, sample...","Trace(title='altitude', total_time=625.44, sam...",,,"[0.0, 0.005001351405381632, 0.0100027028107632...","[0.0, 0.005001351405381632, 0.0100027028107632...","[0.0, 0.08003122200895713, 0.16006244401791425...","[3.500952797321439, 3.553201645342359, 3.49266...","[271.8695491537745, 271.8695491537745, 269.464...","[0.9858525450565528, 0.9902738944131763, 0.984..."
5,1,[1],"Trace(title='longitude', total_time=499.81, sa...","Trace(title='latitude', total_time=499.81, sam...","Trace(title='speed', total_time=499.81, sample...","Trace(title='altitude', total_time=499.81, sam...",,"Trace(title='phone_steps', total_time=499.81, ...","[0.0, 0.005002932844859514, 0.0100058656897190...","[0.0, 0.005002932844859514, 0.0100058656897190...","[0.0, 0.08005894601954189, 0.16011789203908378...","[1.546642179982124, 1.4506286076538246, 1.6252...","[68.80874601448878, 68.80874601448878, 68.8087...","[1.0085702753031478, 1.0100548807662528, 1.008..."
6,3,[1],"Trace(title='longitude', total_time=473.97, sa...","Trace(title='latitude', total_time=473.97, sam...","Trace(title='speed', total_time=473.97, sample...","Trace(title='altitude', total_time=473.97, sam...",,"Trace(title='phone_steps', total_time=473.97, ...","[0.0, 0.00500227965931758, 0.01000455931863516...","[0.0, 0.00500227965931758, 0.01000455931863516...","[0.0, 0.08004914710352981, 0.16009829420705962...","[13.978758128803513, 11.405993305218088, 7.896...","[61.56632930124062, 62.093556897044365, 62.093...","[0.9529625801153142, 0.9204687610859984, 0.940..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,4,[2],"Trace(title='longitude', total_time=237.11, sa...","Trace(title='latitude', total_time=237.11, sam...","Trace(title='speed', total_time=237.11, sample...","Trace(title='altitude', total_time=237.11, sam...",,"Trace(title='phone_steps', total_time=237.11, ...","[0.0, 0.00500162423270825, 0.0100032484654165,...","[0.0, 0.00500162423270825, 0.0100032484654165,...","[0.0, 0.08005131667792033, 0.16010263335584066...","[57.52947564292359, 53.113549940974494, 48.797...","[179.98295915558447, 179.98295915558447, 179.9...","[1.1580717066912989, 1.1404327596370545, 1.127..."
320,1,[2],"Trace(title='longitude', total_time=321.26, sa...","Trace(title='latitude', total_time=321.26, sam...","Trace(title='speed', total_time=321.26, sample...","Trace(title='altitude', total_time=321.26, sam...",,"Trace(title='phone_steps', total_time=321.26, ...","[0.0, 0.005000965145783714, 0.0100019302915674...","[0.0, 0.005000965145783714, 0.0100019302915674...","[0.0, 0.08003413054309916, 0.16006826108619832...","[7.029947849072784, 6.943235275511577, 7.10389...","[184.6015820688411, 184.6015820688411, 184.601...","[1.0403482040369971, 1.0409539581985816, 1.041..."
379,2,[2],"Trace(title='longitude', total_time=278.21, sa...","Trace(title='latitude', total_time=278.21, sam...","Trace(title='speed', total_time=278.21, sample...","Trace(title='altitude', total_time=278.21, sam...",,"Trace(title='phone_steps', total_time=278.21, ...","[0.0, 0.005000916755046647, 0.0100018335100932...","[0.0, 0.005000916755046647, 0.0100018335100932...","[0.0, 0.08003624856156502, 0.16007249712313004...","[5.855300599343808, 5.383044731988208, 5.00855...","[179.33851951368842, 179.33851951368842, 179.3...","[0.9916801356888864, 0.9891451651933957, 0.993..."
150,3,"[2, 3]","Trace(title='longitude', total_time=464.79, sa...","Trace(title='latitude', total_time=464.79, sam...","Trace(title='speed', total_time=464.79, sample...","Trace(title='altitude', total_time=464.79, sam...",,"Trace(title='phone_steps', total_time=464.79, ...","[0.0, 0.005004242078403083, 0.0100084841568061...","[0.0, 0.005004242078403083, 0.0100084841568061...","[0.0, 0.08008080634045486, 0.16016161268090973...","[32.98797039249145, 31.148705374649655, 28.903...","[157.39305172536064, 158.76545395559904, 156.9...","[1.1274485432963524, 1.146372515080797, 1.1592..."


In [None]:
""" index = unique_indices

features_of_interest = np.array(["gyro", "magneto", "accel"])



extracted_data_dictionary = {}

for sample_index, sample in unique_all.iterrows():

    print(sample_index)
    feature_dictionary = {}

    for feature_name in features_of_interest:

        sample = unique_all.loc[sample_index][feature_name]


        print("Feature Name:", feature_name)
        print(" Time           Value")

        # Initialize a dictionary to store steps for each second
        values_per_second = {}


        if sample is not None:

            # Iterate over the timestamps and values
            for t, x in zip(sample.timestamps, sample.values):
                # Get the second part of the timestamp as the key
                #if int(t) % 10 == 0 AND #no other 10er in dict:  
                 #   print("10er:")

                second = int(t) #put in if statement for other intervals of pooling
                #print("second:", second, "real: ", t)
                # If the second is not in the dictionary, initialize it with an empty list
                if second not in values_per_second:
                    values_per_second[second] = np.array([])
                # Add the steps to the list for the current second
                values_per_second[second] = np.append(values_per_second[second], x)

            seconds_array = np.array([])
            averages_array = np.array([])

            # Calculate the average steps for each second

            
            for second, values in values_per_second.items():
                average_values = features_extraction_common(values) #np.average(values) #sum(values) / len(values)
                #print(f"{second}s \t{average_values:.5f} steps")

                seconds_array = np.append(seconds_array, second)
                averages_array = np.append(averages_array, average_values)
                

            #das pooled alles in eine sekunde jeweils, eine überlegung wert
            
            feature_dictionary["seconds"] = seconds_array
            feature_dictionary[feature_name]  = averages_array

        else:
            feature_dictionary[feature_name]  = np.array([])            

    #print(feature_dictionary)

    extracted_data_dictionary[sample_index] = feature_dictionary """

' index = unique_indices\n\nfeatures_of_interest = np.array(["gyro", "magneto", "accel"])\n\n\n\nextracted_data_dictionary = {}\n\nfor sample_index, sample in unique_all.iterrows():\n\n    print(sample_index)\n    feature_dictionary = {}\n\n    for feature_name in features_of_interest:\n\n        sample = unique_all.loc[sample_index][feature_name]\n\n\n        print("Feature Name:", feature_name)\n        print(" Time           Value")\n\n        # Initialize a dictionary to store steps for each second\n        values_per_second = {}\n\n\n        if sample is not None:\n\n            # Iterate over the timestamps and values\n            for t, x in zip(sample.timestamps, sample.values):\n                # Get the second part of the timestamp as the key\n                #if int(t) % 10 == 0 AND #no other 10er in dict:  \n                 #   print("10er:")\n\n                second = int(t) #put in if statement for other intervals of pooling\n                #print("second:", second, "r

In [None]:
unique_all.loc[0]["gyro_time"]


array([0.00000000e+00, 4.99247979e-03, 9.98495958e-03, ...,
       5.84864015e+02, 5.84869008e+02, 5.84874000e+02])

In [None]:
unique_all["gyro_time"][0]


array([0.00000000e+00, 4.99247979e-03, 9.98495958e-03, ...,
       5.84864015e+02, 5.84869008e+02, 5.84874000e+02])

In [None]:
unique_all.shape[0]

225

Without Pooling but with 10s Chunks

In [None]:
index = unique_indices

features_of_interest = np.array(["gyro", "magneto", "accel"])

chunk_size = 10

extracted_data_dictionary = {}

for sample_index, sample in unique_all.iterrows():

    print("Sample_index: ", sample_index)
    feature_dictionary = {}

    print("Progress: ", sample_index/unique_all.shape[0], "%")

    #print("Sample: ", sample)

    for feature_name in features_of_interest:

        sample_values = unique_all.loc[sample_index][feature_name]

        #print(feature_name)

        time_name = f'{feature_name}_time'

        #print(time_name)
        #time_column = unique_all[sample_index].filter(regex=f'{feature_name}.*_time$').columns


        sample_timestamps = unique_all[time_name][sample_index]

        #print("sample: ", sample)

        print("Feature Name:", feature_name, time_name)
        #print(" Time           Value")

        # Initialize a dictionary to store steps for each second
        values_per_interval = {}

        if sample is not None:

            # Iterate over the timestamps and values
            for t, x in zip(sample_timestamps, sample_values):
                # Get the second part of the timestamp as the key

                second = int(t) #put in if statement for other intervals of pooling

                if second % chunk_size == 0 & second not in values_per_interval:
                    values_per_interval[second] = np.array([[], []])

                    #update the second
                    last_second = second

                    print("Chunk:", last_second)                


                # Add the timestamps and values to the list for the current interval
                
                print("Timestamp:", t)
                print("Value:", x)

                new_value = np.array([[t], [x]])



                values_per_interval[last_second] = np.concatenate((values_per_interval[last_second], new_value), axis = 1)

            # Calculate the average steps for each second

            
            intervals_array = np.array([])
            embeddings_array = np.array([])

            for interval, values in values_per_interval.items():
                average_values = features_extraction_common(values[1]) #np.average(values) #sum(values) / len(values)
                #print(f"{second}s \t{average_values:.5f} steps")

                print(average_values)

                intervals_array = np.append(intervals_array, interval)
                embeddings_array = np.append(embeddings_array, average_values)
                

            #das pooled alles in eine sekunde jeweils, eine überlegung wert
            
            feature_dictionary["intervals"] = intervals_array
            feature_dictionary[feature_name]  = embeddings_array

        else:
            feature_dictionary[feature_name]  = np.array([])            

    #print(feature_dictionary)

    extracted_data_dictionary[sample_index] = feature_dictionary

Sample_index:  0
Progress:  0.0 %
Feature Name: gyro gyro_time
Chunk: 0
['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f', 'avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']
Feature Name: magneto magneto_time
Chunk: 0
['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f', 'avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']
Feature Name: accel accel_time
Chunk: 0


KeyboardInterrupt: 