In [104]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.insert(0, os.path.abspath('../..'))
from utils import tutorial_pamap2

Using TensorFlow backend.


# WISDM actitracker data set
This notebook preprocesses the raw WISDM Actitracker (v2.0) accelerometer data, which was downloaded from http://www.cis.fordham.edu/wisdm/dataset.php

In [5]:
# Directory holding the raw WISDM v2.0 download. Set the WISDM_DATAPATH
# environment variable to point at a local copy; when it is unset the
# original author's location is used, so existing setups keep working.
datapath = os.environ.get(
    'WISDM_DATAPATH',
    '/media/sf_VBox_Shared/timeseries/actitiracker/WISDM_at_v2.0/')
# Raw data file that the loading cell reads with pandas.
dfile = os.path.join(datapath, 'WISDM_at_v2.0_raw.txt')

In [74]:
column_names = ['user','activity','timestamp', 'x-acc', 'y-acc', 'z-acc']


def _leading_float(cell):
    """Return the number in front of the first ';' of a raw z-acc cell."""
    return float(str(cell).split(';')[0])


# Load and clean in a single chain so the cell always starts from the file:
#   1. read the headerless CSV, treating a lone ';' as missing;
#   2. strip the ';' that is attached to the last field of a line;
#   3. drop rows with any missing value;
#   4. keep only timestamps within [0, 2e18];
#   5. add a 'datetime' column, interpreting the timestamp as milliseconds.
df_full = (
    pd.read_csv(dfile, header=None, sep=',', names=column_names, na_values=';')
    .assign(**{'z-acc': lambda frame: frame['z-acc'].map(_leading_float)})
    .dropna()
    .loc[lambda frame: frame['timestamp'].between(0, 2e18)]
    .assign(datetime=lambda frame: pd.to_datetime(frame.timestamp, unit='ms'))
)

In [75]:
# Preview the first rows to confirm the columns parsed as expected.
df_full.head()

Unnamed: 0,user,activity,timestamp,x-acc,y-acc,z-acc,datetime
0,1679,Walking,1370520000000.0,0.294132,-0.635605,-0.226936,2013-06-06 12:07:49.556
1,1679,Walking,1370520000000.0,-0.499688,-0.604451,-0.22602,2013-06-06 12:07:49.606
2,1679,Walking,1370520000000.0,-2.178345,0.713491,0.372017,2013-06-06 12:07:49.656
3,1679,Walking,1370520000000.0,-2.797763,1.354899,-0.277638,2013-06-06 12:07:49.706
4,1679,Walking,1370520000000.0,-2.167961,-1.327716,-0.554971,2013-06-06 12:07:49.756


In [69]:
# Rows x columns remaining after cleaning.
# NOTE(review): the recorded output reports 6 columns, but the cleaning cell
# adds a seventh ('datetime'); this cell's execution count is older than the
# cleaning cell's, so re-run the notebook top to bottom to refresh it.
df_full.shape

(2980763, 6)

In [112]:
# Activity names present in the data; the same call is used further down to
# build the class labels.
df_full['activity'].unique()

array(['Walking', 'LyingDown', 'Standing', 'Sitting', 'Jogging', 'Stairs'], dtype=object)

In [76]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows extreme values in x-acc
# (max ~1.4e17) and y-acc (min ~-4.1e11) that the cleaning cell does not
# remove; consider filtering implausible readings before windowing.
df_full.describe()

Unnamed: 0,user,timestamp,x-acc,y-acc,z-acc
count,2980763.0,2980763.0,2980763.0,2980763.0,2980763.0
mean,1025.268,1287680000000.0,48448560000.0,-137368.1,0.6093973
std,490.1956,81690660000.0,83645890000000.0,237169800.0,4.564471
min,194.0,289407000000.0,-46.28739,-409471000000.0,-48.20949
25%,634.0,1200001000000.0,-2.230362,-0.27,-1.225831
50%,705.0,1317926000000.0,-0.002130529,1.65,0.01160644
75%,1603.0,1372744000000.0,2.508921,8.85,2.47641
max,1802.0,1379603000000.0,1.444137e+17,80.13994,65.90069


In [77]:
# Number of distinct users in the cleaned data. Users that yield no window
# are skipped when sample_dict is built, so fewer users can reach the split.
df_full['user'].nunique()

225

In [94]:
# A new block of data starts with a new user, or a leap in the time step
df_full['newblock'] = False
df_full.loc[df_full['datetime'].diff() != pd.Timedelta('50ms'), 'newblock'] = True
df_full.loc[df_full['user'].diff()!=0, 'newblock'] = True

In [95]:
# Count the rows flagged as the start of a new block.
# NOTE(review): the recorded count (557298 of ~2.98M rows) is very high, which
# may mean the exact 50 ms comparison is too strict; also, 'newblock' is not
# used by any later cell visible in this notebook - confirm whether the
# segmentation below was meant to respect these block boundaries.
df_full['newblock'].sum()

557298

In [117]:
# Build X_dict: user id -> (Xlist, ylist) as returned by
# tutorial_pamap2.split_activities for that user's samples.
#
# DataFrame.as_matrix() was removed in pandas 1.0; `.values` returns the same
# NumPy array and exists in old and new pandas alike.
X_dict = {}
# sort=False keeps users in order of first appearance (the same order as
# df_full['user'].unique()); a single grouping pass replaces one full-frame
# boolean mask per user.
for user, user_df in df_full.groupby('user', sort=False):
    user_X = user_df[['x-acc', 'y-acc', 'z-acc']].values
    user_labels = user_df['activity'].values
    # NOTE(review): the empty list and `borders=5 * 50` (250 samples) are
    # passed through unchanged; their exact meaning is defined by
    # tutorial_pamap2.split_activities - confirm there.
    Xlist, ylist = tutorial_pamap2.split_activities(user_labels,
                                                    user_X,
                                                    [],
                                                    borders=5 * 50)
    X_dict[user] = (Xlist, ylist)

In [145]:
# Window parameters, expressed in SAMPLES.
# The block-detection cell expects consecutive rows to be 50 ms apart, i.e.
# 20 samples per second, so these values correspond to 12.5 s windows and a
# 2.5 s step - not 5 s / 1 s as the previous comments stated.
# NOTE(review): if 5 s windows with a 1 s step were intended, the values
# should be 5 * 20 and 1 * 20. They are left unchanged here so that the
# stored arrays (window length 250) and anything trained on them stay valid.
frame_length = 5 * 50  # 250 samples per window
step = 1 * 50  # 50 samples passed as the step to sliding_window

# Build sample_dict: user id -> (X, y) arrays of windowed samples.
# Users for which sliding_window returns no window are left out.
sample_dict = {}
for user, (Xlist, ylist) in X_dict.items():
    X_sample_list, y_sample_list = tutorial_pamap2.sliding_window(frame_length, step, Xlist, ylist)
    if len(X_sample_list) > 0:
        X = np.array(X_sample_list)
        y = np.array(y_sample_list)
        sample_dict[user] = X, y

In [192]:
# Users that produced at least one window, and how many of them to hold out.
# int() truncates, so each hold-out set gets floor(10%) of the users.
holdout_fraction = 0.1
userids = np.array(list(sample_dict))
nr_users = userids.shape[0]
nr_users_test = int(nr_users * holdout_fraction)
nr_users_val = int(nr_users * holdout_fraction)

In [193]:
# Split the users (not the individual windows) into train / test / validation,
# so no user contributes samples to more than one set.
#
# A dedicated, seeded generator makes the split reproducible without touching
# NumPy's global random state. Sorting before permuting makes the cell
# idempotent: re-running it yields the same split instead of shuffling an
# already shuffled array.
SPLIT_SEED = 0
neworder = np.random.RandomState(SPLIT_SEED).permutation(nr_users)
userids = np.sort(userids)[neworder]

# Slice with explicit, non-negative bounds. Negative-index slicing breaks when
# a hold-out size is 0: `a[-0:]` selects everything and `a[:-0]` nothing.
nr_users_train = nr_users - nr_users_test - nr_users_val
train_userids = userids[:nr_users_train]
test_userids = userids[nr_users_train:nr_users_train + nr_users_test]
val_userids = userids[nr_users_train + nr_users_test:]
print('train: {}, test: {}, val: {}'.format(len(train_userids), len(test_userids), len(val_userids)))

train: 163, test: 20, val: 20


In [194]:
def _stack_samples(ids, part):
    """Concatenate one element of each user's sample_dict entry.

    `part` is 0 for the window array (X) and 1 for the label array (y);
    the arrays are joined along the first axis in the order given by `ids`.
    """
    return np.concatenate([sample_dict[uid][part] for uid in ids])


# Pool the per-user windows and labels of each split into single arrays.
X_train, y_train = _stack_samples(train_userids, 0), _stack_samples(train_userids, 1)
X_test, y_test = _stack_samples(test_userids, 0), _stack_samples(test_userids, 1)
X_val, y_val = _stack_samples(val_userids, 0), _stack_samples(val_userids, 1)

In [195]:
# Class names in order of first appearance in the data; a label's position in
# this list is its class index.
activity_names = df_full['activity'].unique()
labels = list(activity_names.astype('unicode'))
nr_classes = len(labels)
mapclasses = dict(zip(labels, range(nr_classes)))

# Convert the string labels of each split with tutorial_pamap2.transform_y,
# using the name -> index mapping and the number of classes.
y_train_binary = tutorial_pamap2.transform_y(y_train, mapclasses, nr_classes)
y_test_binary = tutorial_pamap2.transform_y(y_test, mapclasses, nr_classes)
y_val_binary = tutorial_pamap2.transform_y(y_val, mapclasses, nr_classes)

In [196]:
# Sanity check: within each split, X and y must have the same number of
# samples (first dimension).
X_train.shape, y_train_binary.shape, X_test.shape, y_test_binary.shape, X_val.shape, y_val_binary.shape

((46386, 250, 3),
 (46386, 6),
 (1641, 250, 3),
 (1641, 6),
 (3828, 250, 3),
 (3828, 6))

In [199]:
import json

# Store the preprocessed arrays in a 'preprocessed' folder inside the data
# directory. The folder is derived from `datapath` rather than repeating the
# literal, so input and output locations cannot drift apart; with the
# original datapath this is the same '.../WISDM_at_v2.0/preprocessed'.
outdatapath = os.path.join(datapath, 'preprocessed')

# One call per split; the string arguments are the X / y names handed to
# numpify_and_store.
for X_split, y_split, X_name, y_name in [
        (X_train, y_train_binary, 'X_train', 'y_train'),
        (X_test, y_test_binary, 'X_test', 'y_test'),
        (X_val, y_val_binary, 'X_val', 'y_val')]:
    tutorial_pamap2.numpify_and_store(X_split, y_split, X_name, y_name, outdatapath, shuffle=True)

Stored /media/sf_VBox_Shared/timeseries/actitiracker/WISDM_at_v2.0/preprocessed/X_train y_train
Stored /media/sf_VBox_Shared/timeseries/actitiracker/WISDM_at_v2.0/preprocessed/X_test y_test
Stored /media/sf_VBox_Shared/timeseries/actitiracker/WISDM_at_v2.0/preprocessed/X_val y_val


In [198]:
# Save the class names alongside the arrays; a label's position in the list
# is its class index.
labels_path = os.path.join(outdatapath, 'labels.json')
with open(labels_path, 'w') as fp:
    fp.write(json.dumps(labels))