In [1]:
import numpy as np
import matplotlib.pyplot as plt

# You may change the mhealth_activity module but your algorithm must support the original version
from mhealth_activity import Recording, Trace, Activity, WatchLocation, Path

# For interactive plots, uncomment the following line
# %matplotlib widget
import os
import pandas as pd
import pickle
from tqdm import tqdm
from scipy.fft import fft, fftfreq
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from scipy.signal import find_peaks
from scipy.signal import peak_prominences
from sklearn.metrics import mean_absolute_error,accuracy_score,precision_score,recall_score,confusion_matrix,classification_report,f1_score
from multiprocessing import Pool

Load feature extractor for accelerometer, gyroscope and magnetometer

In [2]:
def features_extraction_common(inp): 
    FEATURES = ['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f','avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']

    
    print(FEATURES)
    Median=[];Numneg=[];Numpos=[];Numabovmed=[];
    Min=[];Max=[];Mean=[];Mad=[];Sma=[];Eng=[];Iqr=[];Entr=[];Std=[];Var=[];Kurt=[];Skew=[];Npeaks=[];Avgprom=[]
    Min_d=[];Max_d=[];Mean_d=[];Mad_d=[];Sma_d=[];Eng_d=[];Iqr_d=[];Entr_d=[];Std_d=[];Var_d=[];
    Max_f=[];NPeak_f=[];Avgprom_f=[];Mean_f=[];Skew_f=[];Kurtosis_f=[];Sum_f=[]; Avgpeakdist=[]; Avgpeakdist_f=[];
    
    #X = df.values
    ## TIME DOMAIN ##
    #list of lists of lists, ugly as fuck but it works 
    Median.append(np.median(inp))
    Numneg.append(np.sum(np.array(inp) < 0, axis=0))
    Numpos.append(np.sum(np.array(inp) > 0, axis=0))
    Numabovmed.append(np.sum(np.array(inp) > np.median(inp), axis=0))

    Mean.append(np.mean(inp))
    Std.append(np.std(inp))
    #median absolute deviation
    Mad.append(stats.median_abs_deviation(inp, scale=1))
    Var.append(np.var(inp))
    Min.append(np.min(inp))
    Max.append(np.max(inp))
    #Signal Magnitude Area
    Sma.append(np.sum(inp))
    #energy measure
    Eng.append(np.sum(inp**2)/len(inp))
    Iqr.append(stats.iqr(inp))
    Entr.append(stats.entropy(inp))

    npeaks, _ = find_peaks(inp, distance=5)
    Npeaks.append(len(npeaks))
    prom = peak_prominences(inp, npeaks)
    Avgprom.append(np.mean(prom))

    Apeakdist = 0
    for i in range(len(npeaks)-1):
        Apeakdist += abs(npeaks[i] - npeaks[i+1])

    if len(npeaks) == 1:
        Avgpeakdist.append(0)
    else:
        Avgpeakdist.append(Apeakdist/(len(npeaks)-1))

    ## FREQ DOMAIN ##
    ft = np.abs(fft(inp))
    Sum_f.append(np.sum(ft))
    Max_f.append(np.max(ft))

    npeaks, _ = find_peaks(ft, distance=5)
    NPeak_f.append(len(npeaks))
    prom = peak_prominences(ft, npeaks)
    Avgprom_f.append(np.mean(prom))

    Apeakdist = 0
    for i in range(len(npeaks)-1):
        Apeakdist += abs(npeaks[i] - npeaks[i+1])

    if len(npeaks) == 1:
        Avgpeakdist_f.append(0)
    else:
        Avgpeakdist_f.append(Apeakdist/(len(npeaks)-1))

    Mean_f.append(np.mean(ft))
    Skew_f.append(stats.skew(ft))
    Kurtosis_f.append(stats.kurtosis(ft))

    #derivative
    # f = np.gradient(inp)
    # Mean_d.append(np.mean(inp))
    # Std_d.append(np.std(recording[0]))
    # #median absolute deviation
    # Mad_d.append(stats.median_abs_deviation(recording[0], scale=1))
    # Var_d.append(np.var(recording[0]))
    # Min_d.append(np.min(recording[0]))
    # Max_d.append(np.max(recording[0]))
    # #Signal Magnitude Area
    # Sma_d.append(np.sum(recording[0]))
    # #energy measure
    # Eng_d.append(np.sum(recording[0]**2)/len(recording[0]))
    # Iqr_d.append(stats.iqr(recording[0]))
    # Entr_d.append(stats.entropy(recording[0]))

    #Create dataframe from features
    return np.array([Median, Numneg, Numpos, Numabovmed, Mean,Std, Mad, Var, Min, Max, Sma, Eng, Iqr, Entr, Npeaks, Avgprom, Avgpeakdist, Sum_f, Max_f, NPeak_f, Avgprom_f, Avgpeakdist_f, Mean_f, Skew_f, Kurtosis_f]).reshape(-1) 



In [3]:
#load pickled training 3d norm accelerometer data
file = open('doruks_data/accel_mag_train.pkl', 'rb')
pickled = pickle.load(file)
accel_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['accel'])
print(accel_mag_train.shape)
file.close()

file = open('doruks_data/magneto_mag_train.pkl', 'rb')
pickled = pickle.load(file)
magneto_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['magneto'])
print(magneto_mag_train.shape)
file.close()

file = open('doruks_data/gyro_mag_train.pkl', 'rb')
pickled = pickle.load(file)
gyro_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['gyro'])
print(gyro_mag_train.shape)
file.close()
""" #data = pd.read_pickle('doruks_data/pickled_and_sorted_training_data.pkl.zst')

pos_labels  = []
path_labels = []
for label in data["labels"]:
    path_labels.extend([label["path_idx"]])
    pos_labels.extend([label["watch_loc"]]) """




(396, 1)
(396, 1)
(396, 1)


' #data = pd.read_pickle(\'doruks_data/pickled_and_sorted_training_data.pkl.zst\')\n\npos_labels  = []\npath_labels = []\nfor label in data["labels"]:\n    path_labels.extend([label["path_idx"]])\n    pos_labels.extend([label["watch_loc"]]) '

In [4]:
file = open('doruks_data/accel_gyro_magneto_timestamps.pkl', 'rb')


timestamps = pickle.load(file)

In [5]:
timestamps.shape

(396, 3)

In [6]:
# Seemingly, even though accel/gyro_time and magneto_time is different, the total recording time remains the same

for i in range(0, timestamps.shape[0]):

    print("Difference: ", timestamps["gyro_time"][i][-1] - timestamps["magneto_time"][i][-1])




Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0.0
Difference:  0

In [7]:
unique_one = pd.read_pickle('unique_activity_ONE.pkl')
unique_two = pd.read_pickle('unique_activity_TWO.pkl')
unique_three = pd.read_pickle('unique_activity_THREE.pkl')

unique_all = pd.concat([unique_one, unique_two, unique_three])

In [8]:
unique_three

Unnamed: 0,path_idx,activities,longitude,latitude,speed,altitude,step_count,phone_steps
27,0,[3],"Trace(title='longitude', total_time=249.13, sa...","Trace(title='latitude', total_time=249.13, sam...","Trace(title='speed', total_time=249.13, sample...","Trace(title='altitude', total_time=249.13, sam...",,"Trace(title='phone_steps', total_time=249.13, ..."
46,0,[3],"Trace(title='longitude', total_time=345.38, sa...","Trace(title='latitude', total_time=345.38, sam...","Trace(title='speed', total_time=345.38, sample...","Trace(title='altitude', total_time=345.38, sam...",0.0,
82,0,[3],"Trace(title='longitude', total_time=356.85, sa...","Trace(title='latitude', total_time=356.85, sam...","Trace(title='speed', total_time=356.85, sample...","Trace(title='altitude', total_time=356.85, sam...",0.0,
114,3,[3],"Trace(title='longitude', total_time=187.97, sa...","Trace(title='latitude', total_time=187.97, sam...","Trace(title='speed', total_time=187.97, sample...","Trace(title='altitude', total_time=187.97, sam...",,
154,0,[3],"Trace(title='longitude', total_time=373.50, sa...","Trace(title='latitude', total_time=373.50, sam...","Trace(title='speed', total_time=373.50, sample...","Trace(title='altitude', total_time=373.50, sam...",,
155,1,[3],"Trace(title='longitude', total_time=259.65, sa...","Trace(title='latitude', total_time=259.65, sam...","Trace(title='speed', total_time=259.65, sample...","Trace(title='altitude', total_time=259.65, sam...",,
179,0,[3],"Trace(title='longitude', total_time=186.34, sa...","Trace(title='latitude', total_time=186.34, sam...","Trace(title='speed', total_time=186.34, sample...","Trace(title='altitude', total_time=186.34, sam...",,
184,0,[3],"Trace(title='longitude', total_time=323.34, sa...","Trace(title='latitude', total_time=323.34, sam...","Trace(title='speed', total_time=323.34, sample...","Trace(title='altitude', total_time=323.34, sam...",0.0,"Trace(title='phone_steps', total_time=323.34, ..."
238,0,[3],"Trace(title='longitude', total_time=258.12, sa...","Trace(title='latitude', total_time=258.12, sam...","Trace(title='speed', total_time=258.12, sample...","Trace(title='altitude', total_time=258.12, sam...",,"Trace(title='phone_steps', total_time=258.12, ..."
287,0,[3],"Trace(title='longitude', total_time=205.29, sa...","Trace(title='latitude', total_time=205.29, sam...","Trace(title='speed', total_time=205.29, sample...","Trace(title='altitude', total_time=205.29, sam...",,


In [9]:
unique_indices = unique_all.index
gyro_mag_train_unique = gyro_mag_train.loc[unique_indices]
magneto_mag_train_unique = magneto_mag_train.loc[unique_indices]
accel_mag_train_unique = accel_mag_train.loc[unique_indices]

timestamps_unique = timestamps.loc[unique_indices]

In [10]:
unique_all = pd.concat([unique_all, timestamps_unique, gyro_mag_train_unique, magneto_mag_train_unique, accel_mag_train_unique], axis = 1)

In [11]:
unique_all

Unnamed: 0,path_idx,activities,longitude,latitude,speed,altitude,step_count,phone_steps,accel_time,gyro_time,magneto_time,gyro,magneto,accel
0,2,[1],"Trace(title='longitude', total_time=584.87, sa...","Trace(title='latitude', total_time=584.87, sam...","Trace(title='speed', total_time=584.87, sample...","Trace(title='altitude', total_time=584.87, sam...",,"Trace(title='phone_steps', total_time=584.87, ...","[0.0, 0.004992479791038916, 0.0099849595820778...","[0.0, 0.004992479791038916, 0.0099849595820778...","[0.0, 0.07988990575058053, 0.15977981150116105...","[53.384678630069224, 55.44401057024837, 54.294...","[221.89445350659278, 221.10276820131642, 221.3...","[0.9480597873558152, 0.8834795162112591, 0.826..."
3,2,[1],"Trace(title='longitude', total_time=519.63, sa...","Trace(title='latitude', total_time=519.63, sam...","Trace(title='speed', total_time=519.63, sample...","Trace(title='altitude', total_time=519.63, sam...",,,"[0.0, 0.004998807130282537, 0.0099976142605650...","[0.0, 0.004998807130282537, 0.0099976142605650...","[0.0, 0.07999245689655173, 0.15998491379310345...","[2.1428685815022215, 2.3109384227933973, 2.486...","[248.5280522182176, 248.5280522182176, 248.528...","[0.9713710479087602, 0.9788491672371056, 0.979..."
4,1,[1],"Trace(title='longitude', total_time=625.44, sa...","Trace(title='latitude', total_time=625.44, sam...","Trace(title='speed', total_time=625.44, sample...","Trace(title='altitude', total_time=625.44, sam...",,,"[0.0, 0.005001351405381632, 0.0100027028107632...","[0.0, 0.005001351405381632, 0.0100027028107632...","[0.0, 0.08003122200895713, 0.16006244401791425...","[3.500952797321439, 3.553201645342359, 3.49266...","[271.8695491537745, 271.8695491537745, 269.464...","[0.9858525450565528, 0.9902738944131763, 0.984..."
5,1,[1],"Trace(title='longitude', total_time=499.81, sa...","Trace(title='latitude', total_time=499.81, sam...","Trace(title='speed', total_time=499.81, sample...","Trace(title='altitude', total_time=499.81, sam...",,"Trace(title='phone_steps', total_time=499.81, ...","[0.0, 0.005002932844859514, 0.0100058656897190...","[0.0, 0.005002932844859514, 0.0100058656897190...","[0.0, 0.08005894601954189, 0.16011789203908378...","[1.546642179982124, 1.4506286076538246, 1.6252...","[68.80874601448878, 68.80874601448878, 68.8087...","[1.0085702753031478, 1.0100548807662528, 1.008..."
6,3,[1],"Trace(title='longitude', total_time=473.97, sa...","Trace(title='latitude', total_time=473.97, sam...","Trace(title='speed', total_time=473.97, sample...","Trace(title='altitude', total_time=473.97, sam...",,"Trace(title='phone_steps', total_time=473.97, ...","[0.0, 0.00500227965931758, 0.01000455931863516...","[0.0, 0.00500227965931758, 0.01000455931863516...","[0.0, 0.08004914710352981, 0.16009829420705962...","[13.978758128803513, 11.405993305218088, 7.896...","[61.56632930124062, 62.093556897044365, 62.093...","[0.9529625801153142, 0.9204687610859984, 0.940..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288,0,[3],"Trace(title='longitude', total_time=266.08, sa...","Trace(title='latitude', total_time=266.08, sam...","Trace(title='speed', total_time=266.08, sample...","Trace(title='altitude', total_time=266.08, sam...",,,"[0.0, 0.00499418157247696, 0.00998836314495392...","[0.0, 0.00499418157247696, 0.00998836314495392...","[0.0, 0.0799294082306999, 0.1598588164613998, ...","[76.66487316272334, 107.81290341036858, 120.54...","[282.52652840340943, 282.52652840340943, 282.5...","[1.6440202830177664, 1.762844489056376, 1.7870..."
298,0,[3],"Trace(title='longitude', total_time=225.04, sa...","Trace(title='latitude', total_time=225.04, sam...","Trace(title='speed', total_time=225.04, sample...","Trace(title='altitude', total_time=225.04, sam...",,,"[0.0, 0.005009103657043649, 0.0100182073140872...","[0.0, 0.005009103657043649, 0.0100182073140872...","[0.0, 0.08017242607766299, 0.16034485215532598...","[2.28469478182336, 2.1848327270749186, 2.28825...","[195.04746867735466, 195.04746867735466, 191.9...","[0.9995407295266949, 1.0021092268188303, 1.008..."
315,0,[3],"Trace(title='longitude', total_time=316.26, sa...","Trace(title='latitude', total_time=316.26, sam...","Trace(title='speed', total_time=316.26, sample...","Trace(title='altitude', total_time=316.26, sam...",0.0,,"[0.0, 0.0050042089273564455, 0.010008417854712...","[0.0, 0.0050042089273564455, 0.010008417854712...","[0.0, 0.08008635097493037, 0.16017270194986075...","[6.040669163332798, 5.574075555993016, 4.99652...","[268.1602639812259, 267.4256862162783, 267.425...","[1.064193419487862, 1.0668586880457207, 1.0613..."
329,1,[3],"Trace(title='longitude', total_time=238.81, sa...","Trace(title='latitude', total_time=238.81, sam...","Trace(title='speed', total_time=238.81, sample...","Trace(title='altitude', total_time=238.81, sam...",,,"[0.0, 0.005003750497621891, 0.0100075009952437...","[0.0, 0.005003750497621891, 0.0100075009952437...","[0.0, 0.08008517773306506, 0.16017035546613012...","[2.9132700882378435, 2.8139540126473555, 2.786...","[218.79273839657841, 218.79273839657841, 216.8...","[0.9941350073806781, 1.0073576174824759, 0.991..."


In [12]:
""" index = unique_indices

features_of_interest = np.array(["gyro", "magneto", "accel"])



extracted_data_dictionary = {}

for sample_index, sample in unique_all.iterrows():

    print(sample_index)
    feature_dictionary = {}

    for feature_name in features_of_interest:

        sample = unique_all.loc[sample_index][feature_name]


        print("Feature Name:", feature_name)
        print(" Time           Value")

        # Initialize a dictionary to store steps for each second
        values_per_second = {}


        if sample is not None:

            # Iterate over the timestamps and values
            for t, x in zip(sample.timestamps, sample.values):
                # Get the second part of the timestamp as the key
                #if int(t) % 10 == 0 AND #no other 10er in dict:  
                 #   print("10er:")

                second = int(t) #put in if statement for other intervals of pooling
                #print("second:", second, "real: ", t)
                # If the second is not in the dictionary, initialize it with an empty list
                if second not in values_per_second:
                    values_per_second[second] = np.array([])
                # Add the steps to the list for the current second
                values_per_second[second] = np.append(values_per_second[second], x)

            seconds_array = np.array([])
            averages_array = np.array([])

            # Calculate the average steps for each second

            
            for second, values in values_per_second.items():
                average_values = features_extraction_common(values) #np.average(values) #sum(values) / len(values)
                #print(f"{second}s \t{average_values:.5f} steps")

                seconds_array = np.append(seconds_array, second)
                averages_array = np.append(averages_array, average_values)
                

            #das pooled alles in eine sekunde jeweils, eine überlegung wert
            
            feature_dictionary["seconds"] = seconds_array
            feature_dictionary[feature_name]  = averages_array

        else:
            feature_dictionary[feature_name]  = np.array([])            

    #print(feature_dictionary)

    extracted_data_dictionary[sample_index] = feature_dictionary """

' index = unique_indices\n\nfeatures_of_interest = np.array(["gyro", "magneto", "accel"])\n\n\n\nextracted_data_dictionary = {}\n\nfor sample_index, sample in unique_all.iterrows():\n\n    print(sample_index)\n    feature_dictionary = {}\n\n    for feature_name in features_of_interest:\n\n        sample = unique_all.loc[sample_index][feature_name]\n\n\n        print("Feature Name:", feature_name)\n        print(" Time           Value")\n\n        # Initialize a dictionary to store steps for each second\n        values_per_second = {}\n\n\n        if sample is not None:\n\n            # Iterate over the timestamps and values\n            for t, x in zip(sample.timestamps, sample.values):\n                # Get the second part of the timestamp as the key\n                #if int(t) % 10 == 0 AND #no other 10er in dict:  \n                 #   print("10er:")\n\n                second = int(t) #put in if statement for other intervals of pooling\n                #print("second:", second, "r

In [13]:
unique_all.loc[0]["gyro_time"]


array([0.00000000e+00, 4.99247979e-03, 9.98495958e-03, ...,
       5.84864015e+02, 5.84869008e+02, 5.84874000e+02])

In [14]:
unique_all["gyro_time"][0]


array([0.00000000e+00, 4.99247979e-03, 9.98495958e-03, ...,
       5.84864015e+02, 5.84869008e+02, 5.84874000e+02])

In [15]:
unique_all["altitude"][0].timestamps

array([0.00000000e+00, 7.98899058e-02, 1.59779812e-01, ...,
       5.84714220e+02, 5.84794110e+02, 5.84874000e+02])

Without Pooling but with 10s Chunks

In [16]:
index = unique_indices

features_of_interest = np.array(["gyro", "magneto", "accel", "altitude"])

chunk_size = 10

skip = 1

extracted_data_dictionary = {}

counter = 0

for sample_index, sample in unique_all.iterrows():

    print("Sample_index: ", sample_index)
    feature_dictionary = {}

    counter = counter + 1

    print("Progress: ", counter/unique_all.shape[0], "%")

    #print("Sample: ", sample)

    for feature_name in features_of_interest:

        if feature_name == "altitude":
            sample_timestamps = unique_all["altitude"][sample_index].values
        
        else:
            sample_values = unique_all.loc[sample_index][feature_name]

        #print(feature_name)

        time_name = f'{feature_name}_time'

        #print(time_name)
        #time_column = unique_all[sample_index].filter(regex=f'{feature_name}.*_time$').columns

        if feature_name == "altitude":
            sample_timestamps = unique_all["altitude"][sample_index].timestamps

        else:
            sample_timestamps = unique_all[time_name][sample_index]

        #print("sample: ", sample)

        print("Feature Name:", feature_name, time_name)
        #print(" Time           Value")

        # Initialize a dictionary to store steps for each second
        values_per_interval = {}

        if sample_values is not None:

            # Iterate over the timestamps and values
            for t, x in zip(sample_timestamps[::skip], sample_values[::skip]):
                # Get the second part of the timestamp as the key

                second = int(t) #put in if statement for other intervals of pooling


                if second % chunk_size == 0 and second not in values_per_interval:
                    values_per_interval[second] = np.array([[], []])

                    #update the second
                    last_second = second

                    print("Chunk:", last_second)             


                # Add the timestamps and values to the list for the current interval
                
                #print("Timestamp:", t)
                #print("Value:", x)

                new_value = np.array([[t], [x]])

                values_per_interval[last_second] = np.concatenate((values_per_interval[last_second], new_value), axis = 1)

            # Calculate the average steps for each second

            
            intervals_array = np.array([])
            embeddings_array = np.empty((0, 25))
            #embeddings = np.empty((0,25))

            for interval, values in values_per_interval.items():
                
                

                #print("Just checking if this is actually a series", values[1])
                average_values = features_extraction_common(values[1]) #np.average(values) #sum(values) / len(values)
                #print(f"{second}s \t{average_values:.5f} steps")
                
                print(embeddings_array.shape)
                print(average_values.shape)
                #print(average_values)

                intervals_array = np.append(intervals_array, interval)
                #embeddings_array = np.append(embeddings_array, average_values) this flattens everything. we want a 2d array though
                #add "average_values" to the embeddings_array in a nicer way
                embeddings_array = np.concatenate((embeddings_array, average_values[np.newaxis, :]), axis=0)
            
                
                

            #das pooled alles in eine sekunde jeweils, eine überlegung wert
            
            feature_dictionary["intervals"] = intervals_array

            feature_dictionary[feature_name]  = embeddings_array

        else:
            feature_dictionary[feature_name]  = np.array([])            

    #print(feature_dictionary)

    extracted_data_dictionary[sample_index] = feature_dictionary

Sample_index:  0
Progress:  0.004201680672268907 %
Feature Name: gyro gyro_time
Chunk: 0
Chunk: 10
Chunk: 20
Chunk: 30
Chunk: 40
Chunk: 50
Chunk: 60
Chunk: 70
Chunk: 80
Chunk: 90
Chunk: 100
Chunk: 110
Chunk: 120
Chunk: 130
Chunk: 140
Chunk: 150
Chunk: 160
Chunk: 170
Chunk: 180
Chunk: 190
Chunk: 200
Chunk: 210
Chunk: 220
Chunk: 230
Chunk: 240
Chunk: 250
Chunk: 260
Chunk: 270
Chunk: 280
Chunk: 290
Chunk: 300
Chunk: 310
Chunk: 320
Chunk: 330
Chunk: 340
Chunk: 350
Chunk: 360
Chunk: 370
Chunk: 380
Chunk: 390
Chunk: 400
Chunk: 410
Chunk: 420
Chunk: 430
Chunk: 440
Chunk: 450
Chunk: 460
Chunk: 470
Chunk: 480
Chunk: 490
Chunk: 500
Chunk: 510
Chunk: 520
Chunk: 530
Chunk: 540
Chunk: 550
Chunk: 560
Chunk: 570
Chunk: 580
['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f', 'avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']
(0, 25)
(25,)
['Median', 'Numn

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Chunk: 210
Chunk: 220
Chunk: 230
Chunk: 240
Chunk: 250
Chunk: 260
Chunk: 270
Chunk: 280
Chunk: 290
Chunk: 300
Chunk: 310
Chunk: 320
Chunk: 330
Chunk: 340
Chunk: 350
Chunk: 360
Chunk: 370
Chunk: 380
Chunk: 390
Chunk: 400
Chunk: 410
Chunk: 420
Chunk: 430
Chunk: 440
Chunk: 450
Chunk: 460
Chunk: 470
Chunk: 480
Chunk: 490
Chunk: 500
Chunk: 510
Chunk: 520
Chunk: 530
Chunk: 540
Chunk: 550
Chunk: 560
Chunk: 570
Chunk: 580
Chunk: 590
Chunk: 600
Chunk: 610
Chunk: 620
['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f', 'avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']
(0, 25)
(25,)
['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f', 'avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']
(1, 25)
(25,)
['Medi

In [None]:
print(len(extracted_data_dictionary[0]["intervals"]))
print((extracted_data_dictionary[0]["gyro"].shape))

59
(59, 25)


Extracting Data

In [None]:
sample_incdex = 0

extracted_data_dictionary[0]["gyro"].shape

(59, 25)

In [None]:
FEATURES = ['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f','avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']

features_of_interest = np.array(["gyro", "magneto", "accel"])

sample0_gyro_df = pd.DataFrame(extracted_data_dictionary[0]["gyro"], columns=[f"{features_of_interest[0]}_{feature}" for feature in FEATURES])
sample0_magneto_df = pd.DataFrame(extracted_data_dictionary[0]["magneto"], columns=[f"{features_of_interest[1]}_{feature}" for feature in FEATURES])
sample0_accel_df = pd.DataFrame(extracted_data_dictionary[0]["accel"], columns=[f"{features_of_interest[2]}_{feature}" for feature in FEATURES])


pd.read_pickle("labels.pkl")
result_df = pd.concat([sample0_gyro_df, sample0_magneto_df, sample0_accel_df], axis=1)

print(result_df.shape)




FileNotFoundError: [Errno 2] No such file or directory: 'labels.pkl'

In [None]:
result_df["y-value"] = unique_all["activities"][0][0] #the activity is in a list but as we selected for, it only has one value of course
result_df["index"] = 0

In [None]:
watch_location["smartwatch_location"]

KeyError: 'smartwatch_location'

In [None]:
result_df.shape

(61, 78)

In [None]:
for i in unique_indices[1:]:

    sample_gyro_df = pd.DataFrame(extracted_data_dictionary[i]["gyro"], columns=[f"{features_of_interest[0]}_{feature}" for feature in FEATURES])
    sample_magneto_df = pd.DataFrame(extracted_data_dictionary[i]["magneto"], columns=[f"{features_of_interest[1]}_{feature}" for feature in FEATURES])
    sample_accel_df = pd.DataFrame(extracted_data_dictionary[i]["accel"], columns=[f"{features_of_interest[2]}_{feature}" for feature in FEATURES])

    inter_result_df = pd.concat([sample_gyro_df, sample_magneto_df, sample_accel_df], axis=1)
    

    inter_result_df["y-value"] = unique_all["activities"][i][0] #the activity is in a list but as we selected for, it only has one value of course
    inter_result_df["index"] = i

    result_df = pd.concat([result_df, inter_result_df], axis = 0)

print(unique_indices)

Index([  0,   3,   4,   5,   6,   7,   8,   9,  10,  11,
       ...
       155, 179, 184, 238, 287, 288, 298, 315, 329, 361],
      dtype='int64', length=238)


In [None]:
result_df


Unnamed: 0,gyro_Median,gyro_Numneg,gyro_Numpos,gyro_Numabovmed,gyro_Mean,gyro_STD,gyro_MAD,gyro_Var,gyro_Min,gyro_Max,...,accel_Sum_f,accel_Max_f,accel_NPeak_f,accel_Avgprom_f,accel_avgpeakdist_f,accel_Mean_f,accel_Skew_f,accel_Kurtosis_f,y-value,index
0,54.272677,0.0,2004.0,1002.0,61.632218,39.549655,19.557371,1564.175197,3.065802,314.741466,...,12958.714775,2194.462609,265.0,670.737716,7.553030,6.466425,39.065561,1647.656257,1,0
1,53.758711,0.0,2003.0,1001.0,64.425184,44.136524,21.167073,1948.032714,5.156500,327.969782,...,12691.077201,2214.583086,282.0,671.067908,7.099644,6.336035,38.663215,1606.105288,1,0
2,50.107900,0.0,2003.0,1001.0,55.453514,28.247561,15.758644,797.924724,5.572321,221.049537,...,10685.196617,2204.877673,281.0,670.673896,7.132143,5.334596,38.537057,1601.959024,1,0
3,51.758361,0.0,2003.0,1001.0,55.237357,26.413107,17.776483,697.652247,5.543751,186.132838,...,10601.356463,2215.323159,282.0,670.183337,7.092527,5.292739,38.819534,1628.081087,1,0
4,40.639799,0.0,2003.0,1001.0,44.062067,22.388116,14.303673,501.227729,3.985262,164.169329,...,8866.052087,2158.004586,278.0,669.188461,7.216606,4.426386,41.793621,1820.742813,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,38.698179,0.0,2001.0,1000.0,49.082640,38.408065,17.807631,1475.179441,3.838548,412.682475,...,16709.072114,2177.481530,295.0,669.855450,6.792517,8.350361,42.451668,1862.456847,3,361
29,35.609676,0.0,2001.0,1000.0,55.785928,56.987470,19.392247,3247.571720,1.020515,351.748068,...,14466.421297,2084.085706,301.0,669.393316,6.656667,7.229596,42.654471,1874.380984,3,361
30,44.562376,0.0,2001.0,1000.0,54.802065,60.630815,19.249553,3676.095767,2.103943,580.529834,...,14569.421761,2116.429651,285.0,669.454923,7.031690,7.281070,42.934743,1891.111021,3,361
31,56.818692,0.0,2001.0,1000.0,99.130849,120.259174,51.136478,14462.269048,0.227351,645.041674,...,14422.299118,2130.735589,268.0,669.245747,7.479401,7.207546,42.422839,1860.227647,3,361


Classification

In [None]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image


In [None]:
columns_to_exclude = ['y-value', 'index']

X = result_df.drop(columns=columns_to_exclude)
y = result_df["y-value"]

In [None]:
nones_df = X[X.isnull().any(axis=1)]

In [None]:
print(nones_df)

    gyro_Median  gyro_Numneg  gyro_Numpos  gyro_Numabovmed   gyro_Mean  \
62    36.725629          0.0       1089.0            544.0   37.418000   
49    75.584933          0.0        637.0            318.0   81.962917   
54   251.740683          0.0         32.0             16.0  245.643611   
66    93.446963          0.0         50.0             25.0   90.757087   
60    55.359648          0.0         55.0             27.0   61.757285   
6    137.406883          0.0       2003.0           1001.0  168.827007   
64   142.997604          0.0        675.0            337.0  171.741264   
40    42.320000          0.0       1999.0            999.0   43.862760   
44    38.229178          0.0       1999.0            999.0   41.134617   
50    44.880830          0.0       1999.0            999.0   46.630059   
55    37.153181          0.0       2000.0           1000.0   39.219695   
57    51.303580          0.0       1999.0            999.0   57.924840   
60    56.613509          0.0       199

In [None]:
from sklearn.impute import SimpleImputer

# Assuming X is your feature matrix with NaN values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)


In [None]:
import joblib

imputer_accelereromter_filename = "group32_model_imputer_segment.joblib"
joblib.dump(imputer, imputer_accelereromter_filename)

['group32_model_imputer_segment.joblib']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2)

In [None]:
rf = RandomForestClassifier(random_state=12)
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
print(y_pred)

[1 1 1 ... 1 1 1]


In [None]:
import joblib
joblib.dump(rf, "10sec_segments_walkrunbicycle.joblib")

['10sec_segments_walkrunbicycle.joblib']

In [None]:
from collections import defaultdict

def calculate_class_accuracy(y_test, y_predict):
    class_counts = defaultdict(int)
    correct_counts = defaultdict(int)

    for true_label, pred_label in zip(y_test, y_predict):
        class_counts[true_label] += 1
        if pred_label == true_label:
            correct_counts[true_label] += 1

    class_accuracy = {}
    for cls in range(1, 4):  # Assuming classes are 1, 2, and 3
        if class_counts[cls] == 0:
            class_accuracy[cls] = 0.0
        else:
            class_accuracy[cls] = (correct_counts[cls] / class_counts[cls]) * 100

        print(correct_counts[cls])
    return class_accuracy

# Example usage:

class_accuracy = calculate_class_accuracy(y_test, y_pred)
for cls, acc in class_accuracy.items():
    print(f"Class {cls}: Accuracy {acc:.2f}%")


2498
43
69
Class 1: Accuracy 99.96%
Class 2: Accuracy 91.49%
Class 3: Accuracy 86.25%


In [None]:
from sklearn.metrics import accuracy_score


accuracy_score(y_test, y_pred)

0.993907083015994

In [None]:
y_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

Prediction by Class