In [13]:
import numpy as np
import matplotlib.pyplot as plt

# You may change the mhealth_activity module but your algorithm must support the original version
from mhealth_activity import Recording, Trace, Activity, WatchLocation, Path

# For interactive plots, uncomment the following line
# %matplotlib widget
import os
import pandas as pd
import pickle
from tqdm import tqdm
from scipy.fft import fft, fftfreq
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from scipy.signal import find_peaks
from scipy.signal import peak_prominences
from sklearn import svm
from sklearn.metrics import mean_absolute_error,accuracy_score,precision_score,recall_score,confusion_matrix,classification_report,f1_score
from multiprocessing import Pool
import joblib

In [8]:
#read data

data = pd.read_pickle('data/pickled_and_sorted_training_data.pkl.zst')
data = data[['temperature','labels']]

In [3]:
def features_extraction_common(inp): 
    df, prefix = inp
    FEATURES = ['Median', 'Numneg', 'Numpos', 'Numabovmed', 'Mean', 'STD', 'MAD', 'Var', 'Min', 'Max', 'SMA', 'Energy', 'IQR', 'Entropy', 'Npeaks', 'avgprom', 'avgpeakdist', 'Sum_f', 'Max_f', 'NPeak_f', 'Avgprom_f','avgpeakdist_f', 'Mean_f', 'Skew_f', 'Kurtosis_f']

    for idx, feature in enumerate(FEATURES):
        FEATURES[idx] = FEATURES[idx] + '_' + prefix
    
    print(FEATURES)
    Median=[];Numneg=[];Numpos=[];Numabovmed=[];
    Min=[];Max=[];Mean=[];Mad=[];Sma=[];Eng=[];Iqr=[];Entr=[];Std=[];Var=[];Kurt=[];Skew=[];Npeaks=[];Avgprom=[]
    Min_d=[];Max_d=[];Mean_d=[];Mad_d=[];Sma_d=[];Eng_d=[];Iqr_d=[];Entr_d=[];Std_d=[];Var_d=[];
    Max_f=[];NPeak_f=[];Avgprom_f=[];Mean_f=[];Skew_f=[];Kurtosis_f=[];Sum_f=[]; Avgpeakdist=[]; Avgpeakdist_f=[];
    
    X = df.values
    ## TIME DOMAIN ##
    #list of lists of lists, ugly as fuck but it works 
    for recording in X:
        Median.append(np.median(recording[0]))
        Numneg.append(np.sum(np.array(recording[0]) < 0, axis=0))
        Numpos.append(np.sum(np.array(recording[0]) > 0, axis=0))
        Numabovmed.append(np.sum(np.array(recording[0]) > np.median(recording[0]), axis=0))

        Mean.append(np.mean(recording[0]))
        Std.append(np.std(recording[0]))
        #median absolute deviation
        Mad.append(stats.median_abs_deviation(recording[0], scale=1))
        Var.append(np.var(recording[0]))
        Min.append(np.min(recording[0]))
        Max.append(np.max(recording[0]))
        #Signal Magnitude Area
        Sma.append(np.sum(recording[0]))
        #energy measure
        Eng.append(np.sum(recording[0]**2)/len(recording[0]))
        Iqr.append(stats.iqr(recording[0]))
        Entr.append(stats.entropy(recording[0]))

        npeaks, _ = find_peaks(recording[0], distance=5)
        Npeaks.append(len(npeaks))
        prom = peak_prominences(recording[0], npeaks)
        Avgprom.append(np.mean(prom))

        Apeakdist = 0
        for i in range(len(npeaks)-1):
            Apeakdist += abs(npeaks[i] - npeaks[i+1])
        Avgpeakdist.append(Apeakdist/(len(npeaks)-1))

        ## FREQ DOMAIN ##
        ft = np.abs(fft(recording[0]))
        Sum_f.append(np.sum(ft))
        Max_f.append(np.max(ft))

        npeaks, _ = find_peaks(ft, distance=5)
        NPeak_f.append(len(npeaks))
        prom = peak_prominences(ft, npeaks)
        Avgprom_f.append(np.mean(prom))

        Apeakdist = 0
        for i in range(len(npeaks)-1):
            Apeakdist += abs(npeaks[i] - npeaks[i+1])
        Avgpeakdist_f.append(Apeakdist/(len(npeaks)-1))

        Mean_f.append(np.mean(ft))
        Skew_f.append(stats.skew(ft))
        Kurtosis_f.append(stats.kurtosis(ft))

    #Create dataframe from features
    df_features = pd.DataFrame(index = [FEATURES], 
                               data = [Median, Numneg, Numpos, Numabovmed, Mean,Std, Mad, Var, Min, Max, Sma, Eng, Iqr, Entr, Npeaks, Avgprom, Avgpeakdist, Sum_f, Max_f, NPeak_f, Avgprom_f, Avgpeakdist_f, Mean_f, Skew_f, Kurtosis_f]) 
    df_features = pd.DataFrame.transpose(df_features)
    df_features.columns = df_features.columns.map(''.join)
    # get rid of multiindex
    return df_features

# def features_extraction_path(df, prefix): 



In [4]:
#load pickled training 3d norm accelerometer data
file = open('data/accel_mag_train.pkl', 'rb')
pickled = pickle.load(file)
accel_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['lists'])
print(accel_mag_train.shape)
file.close()

file = open('data/magneto_mag_train.pkl', 'rb')
pickled = pickle.load(file)
magneto_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['lists'])
print(magneto_mag_train.shape)
file.close()

file = open('data/gyro_mag_train.pkl', 'rb')
pickled = pickle.load(file)
gyro_mag_train =  pd.DataFrame(((x,) for x in pickled), columns=['lists'])
print(gyro_mag_train.shape)
file.close()

pos_labels  = []
path_labels = []
for label in data["labels"]:
    path_labels.extend([label["path_idx"]])
    pos_labels.extend([label["watch_loc"]])


temp_train =  pd.DataFrame(((x.values,) for x in data['temperature']), columns=['temp'])

(396, 1)
(396, 1)
(396, 1)


In [29]:
print(len(gyro_mag_train.loc[20][0]))

34928


In [5]:
#extract features from accelerometer norm

# features_acc = (features_extraction_common((accel_mag_train, "acc")))
# features_mag = (features_extraction_common((magneto_mag_train, "mag")))
# features_gyro = (features_extraction_common((gyro_mag_train, "gyro")))
# features_temp = (features_extraction_common((temp_train, "temp")))

features_to_calculate = ((accel_mag_train, "acc"), (magneto_mag_train, "mag"), (gyro_mag_train, "gyro"), (temp_train, "temp"))

with Pool(4) as p:
    feat = p.map(features_extraction_common, features_to_calculate)

features_acc = feat[0]
features_mag = feat[1]
features_gyro = feat[2]
features_temp = feat[3]

features_acc 



['Median_acc', 'Numneg_acc', 'Numpos_acc', 'Numabovmed_acc', 'Mean_acc', 'STD_acc', 'MAD_acc', 'Var_acc', 'Min_acc', 'Max_acc', 'SMA_acc', 'Energy_acc', 'IQR_acc', 'Entropy_acc', 'Npeaks_acc', 'avgprom_acc', 'avgpeakdist_acc', 'Sum_f_acc', 'Max_f_acc', 'NPeak_f_acc', 'Avgprom_f_acc', 'avgpeakdist_f_acc', 'Mean_f_acc', 'Skew_f_acc', 'Kurtosis_f_acc']
['Median_mag', 'Numneg_mag', 'Numpos_mag', 'Numabovmed_mag', 'Mean_mag', 'STD_mag', 'MAD_mag', 'Var_mag', 'Min_mag', 'Max_mag', 'SMA_mag', 'Energy_mag', 'IQR_mag', 'Entropy_mag', 'Npeaks_mag', 'avgprom_mag', 'avgpeakdist_mag', 'Sum_f_mag', 'Max_f_mag', 'NPeak_f_mag', 'Avgprom_f_mag', 'avgpeakdist_f_mag', 'Mean_f_mag', 'Skew_f_mag', 'Kurtosis_f_mag']
['Median_gyro', 'Numneg_gyro', 'Numpos_gyro', 'Numabovmed_gyro', 'Mean_gyro', 'STD_gyro', 'MAD_gyro', 'Var_gyro', 'Min_gyro', 'Max_gyro', 'SMA_gyro', 'Energy_gyro', 'IQR_gyro', 'Entropy_gyro', 'Npeaks_gyro', 'avgprom_gyro', 'avgpeakdist_gyro', 'Sum_f_gyro', 'Max_f_gyro', 'NPeak_f_gyro', 'Avgprom

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,Median_acc,Numneg_acc,Numpos_acc,Numabovmed_acc,Mean_acc,STD_acc,MAD_acc,Var_acc,Min_acc,Max_acc,...,avgprom_acc,avgpeakdist_acc,Sum_f_acc,Max_f_acc,NPeak_f_acc,Avgprom_f_acc,avgpeakdist_f_acc,Mean_f_acc,Skew_f_acc,Kurtosis_f_acc
0,1.052621,0.0,117152.0,58575.0,1.094093,0.382390,0.253216,0.146222,0.181915,3.232912,...,39056.076798,12.137499,4.602765e+06,128175.176216,16825.0,39066.406059,6.962910,39.288826,292.528482,94772.449862
1,1.205261,0.0,103936.0,51968.0,1.188240,0.267160,0.176998,0.071375,0.030153,3.011230,...,36714.188174,16.744722,2.545490e+06,123500.945199,13948.0,34654.738882,7.451065,24.490941,301.110678,94833.454855
2,1.146574,0.0,118832.0,59416.0,1.234145,0.402890,0.158981,0.162321,0.057638,3.376399,...,40649.994438,14.374955,3.677856e+06,146655.877095,16737.0,39621.042934,7.100143,30.950048,298.807231,97905.405703
3,0.950258,0.0,103952.0,51976.0,1.034236,0.367639,0.258083,0.135158,0.054114,2.743708,...,34554.861599,11.767803,3.843132e+06,107510.902619,14802.0,34665.645775,7.022634,36.970258,274.435146,83641.366923
4,0.937741,0.0,125056.0,62528.0,1.009909,0.250131,0.128438,0.062566,0.373958,3.464102,...,41637.319668,17.564405,2.953011e+06,126295.184375,17913.0,41694.652480,6.981465,23.613507,325.385486,111851.867728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,1.684715,0.0,65120.0,32560.0,1.769343,0.827926,0.670524,0.685462,0.065090,3.464102,...,21533.262501,9.474320,4.963677e+06,115219.604826,9255.0,21735.331015,7.036309,76.223545,198.740420,45916.611855
392,1.050918,0.0,123824.0,61912.0,1.068815,0.256774,0.193767,0.065933,0.405239,2.162159,...,41979.426154,19.547679,2.780419e+06,132345.007735,17661.0,41283.289073,7.010759,22.454606,325.138525,111351.485185
393,0.993620,0.0,149552.0,74776.0,1.018351,0.243723,0.184148,0.059401,0.080493,3.202325,...,50793.548405,8.410372,5.951878e+06,152296.500853,21272.0,49866.918941,7.030511,39.798048,360.812282,136264.452909
394,1.063966,0.0,91200.0,45600.0,1.151451,0.434702,0.327359,0.188966,0.073827,2.870264,...,30794.274670,24.756992,2.291088e+06,105012.354689,12945.0,30409.774023,7.045117,25.121576,250.014576,70458.090515


In [9]:
#acc + mag yield the best results(87%), acc + mag + gyro 85%, mag + gyro 85%, acc + gyro  82.5%, 

features = pd.concat([features_acc,  features_gyro, features_temp], axis=1)


In [16]:
#split the training data because test data doesnt have labels
#why help the students even slightly?

pos_train_features, pos_test_features, pos_train_labels, pos_test_labels = train_test_split(
features, pos_labels, test_size=0.2, random_state=10)


# Instantiate model with 1000 decision trees, use all cores
rf_pos = RandomForestClassifier(n_estimators = 1000, random_state = 42, n_jobs=-1)
# Train the model on training data
rf_pos.fit(pos_train_features, pos_train_labels)



In [19]:
def evaluate_pos(model, test_features, test_labels):
    
    y_pred = model.predict(test_features)

    #cast to int because indices cant be floats
    watch_pos_pred = np.rint(y_pred)

    num_tests = len(watch_pos_pred)

    watch_pos_mismatches = np.count_nonzero(watch_pos_pred != test_labels)

    print(f"{num_tests} tests, {watch_pos_mismatches} watch pos mismatches {(1-(watch_pos_mismatches/num_tests))*100} accuracy")
    return (1-(watch_pos_mismatches/num_tests))*100

# scaler = StandardScaler()
# X_val = scaler.fit_transform(pos_test_features) 


accuracy = evaluate_pos(rf_pos,pos_test_features,pos_test_labels)
joblib.dump(rf_pos, f"group32_model_watchpos_acc{accuracy}.pkl") 



80 tests, 2 watch pos mismatches 97.5 accuracy


['submission_models/group32_model_watchpos_acc97.5.pkl']

In [None]:
def test_sensor_combinations():
    for i in range(1,16):
        feat = []
        print(i)
        print("[", end='')
        if(i&1):
            feat.append(features_acc)
            print("acc, ", end='')
        if(i&(1<<1)):
            feat.append(features_mag)
            print("mag, ", end='')
        if(i&(1<<2)):
            feat.append(features_gyro)
            print("gyro, ", end='')
        if(i&(1<<3)):
            feat.append(features_temp)
            print("temp, ", end='')
        print("]")
        features = pd.concat(feat, axis=1)

        pos_train_features, pos_test_features, pos_train_labels, pos_test_labels = train_test_split(
        features, pos_labels, test_size=0.2, random_state=10)

        scaler = StandardScaler()
        X_train_pos = scaler.fit_transform(pos_train_features) 

        # Instantiate model with 1000 decision trees, use all cores
        rf_pos = RandomForestRegressor(n_estimators = 1000, random_state = 42, n_jobs=-1)
        # Train the model on training data
        rf_pos.fit(X_train_pos, pos_train_labels)

        X_val = scaler.fit_transform(pos_test_features) 
        evaluate_pos(rf_pos,X_val,pos_test_labels)

test_sensor_combinations()