In [1]:
import numpy as np
import matplotlib.pyplot as plt

# You may change the mhealth_activity module but your algorithm must support the original version
from mhealth_activity import Recording, Trace, Activity, WatchLocation, Path

# For interactive plots, uncomment the following line
# %matplotlib widget
import os
import pandas as pd
import pickle
from tqdm import tqdm
from scipy.fft import fft, fftfreq
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error,accuracy_score,precision_score,recall_score,confusion_matrix,classification_report,f1_score


In [2]:
#read data
create_data_pickle = False
if create_data_pickle:
    files = os.listdir('data/train')
    list_of_dicts = []
    types_to_include = ['ax', 'ay', 'az', 'phone_ax', 'phone_ay', 'phone_az', 'speed', 'longitude', 'latitude', 'altitude', 'phone_steps']

    for file in tqdm(files):
        Dict = {}
        d = Recording(os.path.join('data/train',file))

        Dict['labels'] = d.labels
        for data_type in types_to_include:
            if data_type in d.data.keys():
                Dict[data_type] = d.data[data_type]
        list_of_dicts.append(Dict)

    data = pd.DataFrame(list_of_dicts)
    data.to_pickle(path='data/pickled_and_sorted_training_data.pkl.zst', compression={'method': 'zstd'})
else:
    data = pd.read_pickle('data/pickled_and_sorted_training_data.pkl.zst')


In [3]:
#load pickled training 3d norm accelerometer data
import pickle
import pandas as pd

file = open('data/accel_mag_train.pkl', 'rb')
pickled = pickle.load(file)
accel_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['lists'])
print(accel_mag_train.shape)
file.close()

labels = []
for label in data["labels"]:
    labels.extend([[label["path_idx"],label["watch_loc"]]])
labels = pd.DataFrame(labels, columns =['path_idx', "smartwatch_location"])
labels



(396, 1)


Unnamed: 0,path_idx,smartwatch_location
0,1,0
1,4,0
2,2,0
3,3,0
4,1,0
...,...,...
391,4,2
392,3,2
393,3,2
394,0,1


In [22]:
altitude_data = [x.values for x in data["altitude"]]

In [5]:
dataframe_altitude = pd.DataFrame(altitude_data)

In [14]:
def features_extraction(df): 
    
    FEATURES = ['MIN','MAX','MEAN','RMS','VAR','STD','POWER','PEAK','P2P','CREST FACTOR','SKEW','KURTOSIS',
            'MAX_f','SUM_f','MEAN_f','VAR_f','PEAK_f','SKEW_f','KURTOSIS_f']
    
    Min=[];Max=[];Mean=[];Rms=[];Var=[];Std=[];Power=[];Peak=[];Skew=[];Kurtosis=[];P2p=[];CrestFactor=[];
    FormFactor=[]; PulseIndicator=[];
    Max_f=[];Sum_f=[];Mean_f=[];Var_f=[];Peak_f=[];Skew_f=[];Kurtosis_f=[]
    
    X = df.values
    ## TIME DOMAIN ##
    #list of lists of lists, ugly as fuck but it works 
    for recording in X:
        Min.append(np.min(recording[0]))
        Max.append(np.max(recording[0]))
        Mean.append(np.mean(recording[0]))
        Rms.append(np.sqrt(np.mean(recording[0]**2)))
        Var.append(np.var(recording[0]))
        Std.append(np.std(recording[0]))
        Power.append(np.mean(recording[0]**2))
        Peak.append(np.max(np.abs(recording[0])))
        P2p.append(np.ptp(recording[0]))
        CrestFactor.append(np.max(np.abs(recording[0]))/np.sqrt(np.mean(recording[0]**2)))
        Skew.append(stats.skew(recording[0]))
        Kurtosis.append(stats.kurtosis(recording[0]))
        FormFactor.append(np.sqrt(np.mean(recording[0]**2))/np.mean(recording[0]))
        PulseIndicator.append(np.max(np.abs(recording[0]))/np.mean(recording[0]))
        ## FREQ DOMAIN ##
        ft = fft(recording[0])
        S = np.abs(ft**2)/len(df)
        Max_f.append(np.max(S))
        Sum_f.append(np.sum(S))
        Mean_f.append(np.mean(S))
        Var_f.append(np.var(S))
        
        Peak_f.append(np.max(np.abs(S)))
        Skew_f.append(stats.skew(recording[0]))
        Kurtosis_f.append(stats.kurtosis(recording[0]))

    
    #Create dataframe from features
    df_features = pd.DataFrame(index = FEATURES, 
                               data = [Min,Max,Mean,Rms,Var,Std,Power,Peak,P2p,CrestFactor,Skew,Kurtosis,
                                       Max_f,Sum_f,Mean_f,Var_f,Peak_f,Skew_f,Kurtosis_f])
    return df_features

In [7]:
#extract features from accelerometer norm and get rid of multiindex
features = pd.DataFrame.transpose(features_extraction(accel_mag_train))
features.columns = features.columns.map(''.join)
features

Unnamed: 0,MIN,MAX,MEAN,RMS,VAR,STD,POWER,PEAK,P2P,CREST FACTOR,SKEW,KURTOSIS,MAX_f,SUM_f,MEAN_f,VAR_f,PEAK_f,SKEW_f,KURTOSIS_f
0,0.153406,2.597480,1.202418,1.221131,0.045352,0.212961,1.491161,2.597480,2.444074,2.127110,0.176443,0.543339,5.469889e+07,5.641470e+07,460.904406,2.444401e+10,5.469889e+07,0.176443,0.543339
1,0.046882,2.826338,1.092686,1.149698,0.127844,0.357553,1.321806,2.826338,2.779456,2.458331,0.346138,-0.193552,4.173153e+07,4.619995e+07,392.696408,1.480367e+10,4.173153e+07,0.346138,-0.193552
2,0.083755,3.057903,1.093724,1.135764,0.093728,0.306151,1.289960,3.057903,2.974148,2.692375,1.456056,3.479315,4.175404e+07,4.502559e+07,382.974880,1.482894e+10,4.175404e+07,1.456056,3.479315
3,0.276498,2.088655,1.026057,1.083954,0.122162,0.349516,1.174955,2.088655,1.812157,1.926886,0.348455,-0.611317,3.086272e+07,3.444390e+07,319.682780,8.841351e+09,3.086272e+07,0.348455,-0.611317
4,0.080601,3.464102,1.078819,1.119106,0.088548,0.297570,1.252399,3.464102,3.383500,3.095418,2.066954,9.037060,1.734226e+07,1.866169e+07,242.940102,3.915198e+09,1.734226e+07,2.066954,9.037060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,0.093611,3.464102,1.551828,1.678641,0.409665,0.640051,2.817836,3.464102,3.370491,2.063635,0.761946,-0.359174,8.212287e+07,9.609317e+07,826.906681,5.804152e+10,8.212287e+07,0.761946,-0.359174
392,0.025811,3.464102,1.468600,1.625606,0.485809,0.697000,2.642595,3.464102,3.438291,2.130960,0.905103,-0.054596,7.030573e+07,8.614188e+07,758.184439,4.351259e+10,7.030573e+07,0.905103,-0.054596
393,0.034486,3.464102,1.294628,1.425602,0.356279,0.596891,2.032342,3.464102,3.429616,2.429922,1.162461,0.824875,1.761473e+07,2.135908e+07,331.086919,4.811941e+09,1.761473e+07,1.162461,0.824875
394,0.032321,3.464066,0.996864,1.078101,0.168563,0.410565,1.162301,3.464066,3.431746,3.213119,0.854506,1.884827,4.478016e+07,5.237601e+07,392.082963,1.501348e+10,4.478016e+07,0.854506,1.884827


In [62]:
from scipy.signal import find_peaks

from scipy.signal import peak_prominences

def altitude_features(altitude_data=altitude_data):
    features = []
    fnames = ['min', 'max', 'amp', 'sum', 'npeaks', 'prom', 'min_d1', 'max_d1','sum_d1', 'sum_d2']

    for recording in altitude_data:
        row = []
        d1 = np.diff(recording)
        d2 = np.diff(recording, n=2)

        row.append(np.min(recording))
        row.append(np.max(recording))
        row.append(np.max(recording)-np.min(recording))
        row.append(np.sum(recording))

        npeaks, _ = find_peaks(recording, distance=5)
        row.append(len(npeaks))
        row.append(np.mean(peak_prominences(recording, npeaks)))

        row.append(np.min(d1))
        row.append(np.max(d1))

        row.append(np.sum(d1))    
        row.append(np.sum(d2))
        features.append(row)

    df_features = pd.DataFrame(columns = [fnames], data = features) 
    # df_features = pd.DataFrame.transpose(df_features)
    df_features.columns = df_features.columns.map(''.join)
    # get rid of multiindex
    return df_features

In [44]:
altitude_features(altitude_data)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,min,max,amp,sum,npeaks,prom,mean_d1,mean_d2
0,363.209267,489.611902,126.402635,3.310124e+06,35,1341.988645,0.006428,0.000000e+00
1,403.254415,453.629116,50.374702,3.130887e+06,135,3008.030146,-0.005804,0.000000e+00
2,405.691786,451.882854,46.191069,3.176462e+06,130,1943.570411,0.005714,0.000000e+00
3,406.698784,459.946014,53.247230,2.915126e+06,120,2777.308659,-0.007887,0.000000e+00
4,461.610291,496.608154,34.997864,2.383271e+06,0,,0.007291,0.000000e+00
...,...,...,...,...,...,...,...,...
391,450.829590,511.399994,60.570404,3.453641e+06,30,3981.797312,-0.007344,0.000000e+00
392,405.261355,452.322450,47.061095,3.043863e+06,149,2900.536349,-0.006071,-2.098920e-09
393,491.016724,494.588318,3.571594,1.979841e+06,0,,-0.000886,0.000000e+00
394,409.290619,455.560179,46.269560,3.602354e+06,47,2034.069566,0.004804,-9.584856e-06


In [63]:
total = pd.concat([features,altitude_features(altitude_data)],axis=1)
total

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,MIN,MAX,MEAN,RMS,VAR,STD,POWER,PEAK,P2P,CREST FACTOR,...,min,max,amp,sum,npeaks,prom,min_d1,max_d1,sum_d1,sum_d2
0,0.153406,2.597480,1.202418,1.221131,0.045352,0.212961,1.491161,2.597480,2.444074,2.127110,...,363.209267,489.611902,126.402635,3.310124e+06,35,1341.988645,-38.681855,55.891466,49.170742,0.000000
1,0.046882,2.826338,1.092686,1.149698,0.127844,0.357553,1.321806,2.826338,2.779456,2.458331,...,403.254415,453.629116,50.374702,3.130887e+06,135,3008.030146,-2.806039,1.383844,-42.669175,0.000000
2,0.083755,3.057903,1.093724,1.135764,0.093728,0.306151,1.289960,3.057903,2.974148,2.692375,...,405.691786,451.882854,46.191069,3.176462e+06,130,1943.570411,-1.455307,1.298345,41.979488,0.000000
3,0.276498,2.088655,1.026057,1.083954,0.122162,0.349516,1.174955,2.088655,1.812157,1.926886,...,406.698784,459.946014,53.247230,2.915126e+06,120,2777.308659,-9.378391,3.438049,-53.100384,0.000000
4,0.080601,3.464102,1.078819,1.119106,0.088548,0.297570,1.252399,3.464102,3.383500,3.095418,...,461.610291,496.608154,34.997864,2.383271e+06,0,,0.000000,34.997864,34.997864,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,0.093611,3.464102,1.551828,1.678641,0.409665,0.640051,2.817836,3.464102,3.370491,2.063635,...,450.829590,511.399994,60.570404,3.453641e+06,30,3981.797312,-15.765167,6.300018,-53.329437,0.000000
392,0.025811,3.464102,1.468600,1.625606,0.485809,0.697000,2.642595,3.464102,3.438291,2.130960,...,405.261355,452.322450,47.061095,3.043863e+06,149,2900.536349,-1.566091,1.959657,-43.106279,-0.000015
393,0.034486,3.464102,1.294628,1.425602,0.356279,0.596891,2.032342,3.464102,3.429616,2.429922,...,491.016724,494.588318,3.571594,1.979841e+06,0,,-1.843201,0.000000,-3.571594,0.000000
394,0.032321,3.464066,0.996864,1.078101,0.168563,0.410565,1.162301,3.464066,3.431746,3.213119,...,409.290619,455.560179,46.269560,3.602354e+06,47,2034.069566,-30.223243,8.023616,40.101194,-0.080005


In [66]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

def split_and_train(X_train,y_train):

    X_train.columns = [''] * len(X_train.columns)
    X_train.fillna(0, inplace=True)
    X_train, X_test, y_train, y_test = train_test_split(X_train.to_numpy(), y_train.to_numpy(), test_size=0.2, random_state=10)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train) 
    X_test = scaler.fit_transform(X_test) 

    param_dist = {'n_estimators': randint(50,1000),
              'max_depth': randint(1,20)}

    # Instantiate model with 1000 decision trees, use all cores
    rf = RandomForestClassifier()

    # Use random search to find the best hyperparameters
    rand_search = RandomizedSearchCV(rf, 
                                 param_distributions = param_dist, 
                                 n_iter=10, 
                                 cv=5, n_jobs=-1)

    # Fit the random search object to the data
    rand_search.fit(X_train, y_train)

    y_pred = np.round(rand_search.predict(X_test),0)

    print(y_pred)
    return rand_search,y_pred,y_test,X_train,X_test

rf_model,y_pred,y_test,X_train,X_test = split_and_train(total,labels["path_idx"])

[4 1 0 1 1 0 4 2 1 0 1 0 0 1 4 0 4 3 2 1 1 3 3 1 0 0 3 1 3 1 1 2 4 1 4 3 1
 1 1 3 4 3 3 0 1 4 3 3 1 1 3 0 1 4 0 4 3 0 0 2 4 4 4 3 4 3 0 0 4 0 4 4 3 0
 1 2 3 4 0 3]


In [67]:
print(accuracy_score(y_pred,y_test))

0.375
