# PAMAP2 dataset

In [108]:
import numpy as np
import pandas as pd
from os import listdir
from numpy import genfromtxt
import matplotlib.pyplot as plt

In [109]:
%matplotlib inline

In [110]:
#import urllib.request
#url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00231/PAMAP2_Dataset.zip'
#local_fn, headers = urllib.request.urlretrieve(url)

In [111]:
datadir = "/media/sf_VBox_Shared/timeseries/PAMAP2_Dataset/Protocol"
filenames = listdir(datadir)

axes = ['x', 'y', 'z']
IMUsensor_columns = ['temperature'] + \
                    ['acc_16g_' + i for i in axes] + \
                    ['acc_6g_' + i for i in axes] + \
                    ['gyroscope_'+ i for i in axes] + \
                    ['magnometer_'+ i for i in axes] + \
                    ['orientation_' + str(i) for i in range(4)]
header = ["timestamp", "activityID", "heartrate"] + ["hand_"+s for s in IMUsensor_columns]\
        + ["chest_"+s for s in IMUsensor_columns]+ ["ankle_"+s for s in IMUsensor_columns]
datasets = [pd.read_csv(datadir+'/'+fn, header=None, sep=' ') for fn in filenames]
for data in datasets:
    data.columns = header

data = datasets[0]    
print(data.shape)
print(header)

(376417, 54)
['timestamp', 'activityID', 'heartrate', 'hand_temperature', 'hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z', 'hand_acc_6g_x', 'hand_acc_6g_y', 'hand_acc_6g_z', 'hand_gyroscope_x', 'hand_gyroscope_y', 'hand_gyroscope_z', 'hand_magnometer_x', 'hand_magnometer_y', 'hand_magnometer_z', 'hand_orientation_0', 'hand_orientation_1', 'hand_orientation_2', 'hand_orientation_3', 'chest_temperature', 'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z', 'chest_acc_6g_x', 'chest_acc_6g_y', 'chest_acc_6g_z', 'chest_gyroscope_x', 'chest_gyroscope_y', 'chest_gyroscope_z', 'chest_magnometer_x', 'chest_magnometer_y', 'chest_magnometer_z', 'chest_orientation_0', 'chest_orientation_1', 'chest_orientation_2', 'chest_orientation_3', 'ankle_temperature', 'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z', 'ankle_acc_6g_x', 'ankle_acc_6g_y', 'ankle_acc_6g_z', 'ankle_gyroscope_x', 'ankle_gyroscope_y', 'ankle_gyroscope_z', 'ankle_magnometer_x', 'ankle_magnometer_y', 'ankle_magnometer_

In [112]:
# we need to exclude activity=0

# Peprocess data

In [113]:
#Fill data
datasets_filled = [d.interpolate() for d in datasets]

In [114]:
# Select columns
columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',
                 'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z',
                 'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z']
columns = data.columns[2:]

In [115]:
# Create mapping for class labels
from keras.utils.np_utils import to_categorical
y_set_all = [set(np.array(data.activityID)) - set([0]) for data in datasets_filled]
classlabels = list(set.union(*[set(y) for y in y_set_all]))
nr_classes = len(classlabels)
mapclasses = {classlabels[i] : i for i in range(len(classlabels))}
def transform_y(y):
    y_mapped = np.array([mapclasses[c] for c in y], dtype='int')
    y_binary = to_categorical(y_mapped, nr_classes)
    return y_binary

## Isolate activities

In [116]:

def split_activities(labels, X, borders=10*100):
    """
    Splits up the data per activity and exclude activity=0.
    Also remove borders for each activity.
    Returns lists with subdatasets
    """
    tot_len = len(labels)
    startpoints = np.where([1] + [labels[i]!=labels[i-1] for i in range(1, tot_len)])[0]
    endpoints = np.append(startpoints[1:]-1, tot_len-1)
    acts = [labels[s] for s,e in zip(startpoints, endpoints)]
    #Also split up the data, and only keep the non-zero activities
    Xy_split = [(X[s+borders:e-borders+1,:], a) for s,e,a in zip(startpoints, endpoints, acts) if a != 0]
    Xy_split = [(X, y) for X,y in Xy_split if len(X)>0]
    X_list = [X for X,y in Xy_split]
    y_list = [y for X,y in Xy_split]
    return X_list, y_list

In [117]:
#Create input (X) and output (y) sets
X_all = [np.array(data[columns_to_use]) for data in datasets_filled]
y_all = [np.array(data.activityID) for data in datasets_filled]
Xy_lists = [split_activities(y, X) for X,y in zip(X_all, y_all)]
X_lists, y_lists = zip(*Xy_lists)
y_binary_lists = [transform_y(y) for y in y_lists]
print(len(X_lists[0]), len(y_lists[0]), len(y_binary_lists[0]))

14 14 14


In [153]:
X_all[0].shape

(376417, 9)

In [152]:
for X in X_lists:
    print([Xs.shape[0] for Xs in X])

[25187, 21480, 19717, 21573, 20941, 6120, 5480, 5770, 5419, 20253, 18265, 21575, 19265, 10912]
[21430, 20345, 23576, 26880, 18683, 6725, 5791, 6617, 5422, 30533, 27739, 23108, 7238, 11262]
[20044, 26761, 18533, 25975, 18325, 3764, 3435, 2867, 2954, 2644, 27036]
[21047, 23492, 22706, 22995, 18037, 6535, 5328, 6159, 4957, 29932, 25533, 20699]
[21699, 24864, 20132, 31034, 22445, 4110, 4982, 6171, 3745, 30033, 24271, 22577, 22646, 5733]
[21340, 21041, 22356, 35744, 19078, 4232, 4173, 5059, 3099, 23721, 24686, 18486, 20825]
[23611, 10282, 23751, 27499, 19552, 7102, 4290, 6544, 3328, 31720, 26725, 20680, 1692]
[22165, 20923, 23160, 30990, 22292, 3901, 2846, 3782, 2809, 29533, 23475, 26888, 14532, 6806]
[4391]


## Split in train, test and val

In [154]:
print(len(datasets_filled))
train_range = slice(0, 6)
val_range = 6
test_range = slice(7,len(datasets_filled))

9


In [161]:
X_train_list = [X for X_list in X_lists[train_range] for X in X_list]
X_val_list = [X for X in X_lists[val_range]]
X_test_list = [X for X_list in X_lists[test_range] for X in X_list]

y_train_list = [y for y_list in y_binary_lists[train_range] for y in y_list]
y_val_list = [y for y in y_binary_lists[val_range]]
y_test_list = [y for y_list in y_binary_lists[test_range] for y in y_list]

In [162]:
print(len(X_train_list), X_train_list[0].shape)
print(len(y_train_list), y_train_list[0].shape)
print(len(X_val_list), X_val_list[0].shape)
print(len(y_val_list), y_val_list[0].shape)

78 (25187, 9)
78 (12,)
13 (23611, 9)
13 (12,)


## Construct batches

In [163]:
# Take sliding-window frames. Target is label of last time step
# Data is 100 Hz
frame_length = int(5.12 * 100)
step = 1 * 100

def sliding_window(X, y_binary, frame_length, step, X_samples, y_samples):
    for i in range(0, X.shape[0]-frame_length, step):
        X_sub = X[i:i+frame_length,:]
        y_sub = y_binary
        X_samples.append(X_sub)
        y_samples.append(y_sub)

X_train = []
y_train = []
X_val = []
y_val = []
X_test = []
y_test = []
for j in range(len(X_train_list)):
    X = X_train_list[j]
    y_binary = y_train_list[j]
    sliding_window(X, y_binary, frame_length, step, X_train, y_train)
for j in range(len(X_val_list)):
    X = X_val_list[j]
    y_binary = y_val_list[j]
    sliding_window(X, y_binary, frame_length, step, X_val, y_val)
for j in range(len(X_test_list)):
    X = X_test_list[j]
    y_binary = y_test_list[j]
    sliding_window(X, y_binary, frame_length, step, X_test, y_test)
    
        
X_train = np.array(X_train)
y_train = np.array(y_train)
X_val = np.array(X_val)
y_val = np.array(y_val)
X_test = np.array(X_test)
y_test = np.array(y_test)


#Shuffle around the train set
np.random.seed(123)
neworder = np.random.permutation(X_train.shape[0])
X_train = X_train[neworder,:,:]
y_train = y_train[neworder,:]

In [164]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)
#Test and validation 

(12497, 512, 9) (12497, 12)
(2007, 512, 9) (2007, 12)
(2314, 512, 9) (2314, 12)


## Save the results

In [165]:
# Save binary file
outdatapath = '/media/sf_VBox_Shared/timeseries/PAMAP2_Dataset/slidingwindow512cleaned/'
np.save(outdatapath+'X_train', X_train)
np.save(outdatapath+'y_train_binary', y_train)
np.save(outdatapath+'X_val', X_val)
np.save(outdatapath+'y_val_binary', y_val)
np.save(outdatapath+'X_test', X_test)
np.save(outdatapath+'y_test_binary', y_test)

In [166]:
np.where(X_train == np.nan)

(array([], dtype=int64), array([], dtype=int64), array([], dtype=int64))

In [167]:
X_train.max()

156.19

In [168]:
for X in X_all:
    print(X.max())

155.96
155.996
156.19
86.7261
156.73
156.949
157.232
158.872
160.516


In [169]:
pd.DataFrame.interpolate?