In [15]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
import sys
sys.path.insert(0, os.path.abspath('../..'))
from utils import tutorial_pamap2

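# WISDM actitracker data set
This data set was downloaded from http://www.cis.fordham.edu/wisdm/dataset.php. Note that the cells below load the `WISDM_ar_v1.1` raw file; the path to the Actitracker `WISDM_at_v2.0` file is present but commented out.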

In [2]:
# Location of the raw WISDM data.
# The directory can be overridden with the WISDM_DATAPATH environment variable,
# so the notebook also runs on machines that do not have the original
# VirtualBox shared folder; without the variable the original path is used.
# Alternative (Actitracker v2.0) input, kept for reference:
# datapath = '/media/sf_VBox_Shared/timeseries/actitiracker/WISDM_at_v2.0/'
# dfile = os.path.join(datapath, 'WISDM_at_v2.0_raw.txt')
datapath = os.environ.get(
    'WISDM_DATAPATH',
    '/media/sf_VBox_Shared/timeseries/actitiracker/WISDM_ar_v1.1/')
dfile = os.path.join(datapath, 'WISDM_ar_v1.1_raw_1.txt')

In [3]:
# The raw file has no header row; the six comma-separated fields are named here.
# A field consisting only of ';' is read as missing.
column_names = ['user','activity','timestamp', 'x-acc', 'y-acc', 'z-acc']
df_full = pd.read_csv(dfile, header=None, sep=',', names=column_names, na_values=';')

# Keep only the text before the first ';' in the last field and parse it as a
# float. Missing values pass through as NaN; any other unparsable value raises.
z_text = df_full['z-acc'].astype(str).str.split(';').str[0]
df_full['z-acc'] = z_text.astype(float)

# Discard every row that has a missing value in any column.
df_full = df_full.dropna()

#df_full['timestamp'] = df_full['timestamp'].astype('int')
#df_full = df_full[df_full['timestamp']>=0]
#df_full = df_full[df_full['timestamp']<=9e12]

In [4]:
# Interpret the raw timestamps as nanosecond offsets from the epoch;
# values that cannot be converted become NaT instead of raising.
df_full['datetime'] = pd.to_datetime(df_full['timestamp'], unit='ns', errors='coerce')

In [5]:
# Order the rows per user and, within each user, by raw timestamp.
df_full = df_full.sort_values(by=['user', 'timestamp'])

In [6]:
# Preview the first rows of the sorted frame.
df_full.head()

Unnamed: 0,user,activity,timestamp,x-acc,y-acc,z-acc,datetime
941960,1,Walking,4991922345000,0.69,10.8,-2.03,1970-01-01 01:23:11.922345
941961,1,Walking,4991972333000,6.85,7.44,-0.5,1970-01-01 01:23:11.972333
941962,1,Walking,4992022351000,0.93,5.63,-0.5,1970-01-01 01:23:12.022351
941963,1,Walking,4992072339000,-2.11,5.01,-0.69,1970-01-01 01:23:12.072339
941964,1,Walking,4992122358000,-4.59,4.29,-1.95,1970-01-01 01:23:12.122358


In [7]:
# Number of rows and columns left after dropping incomplete rows.
df_full.shape

(1098203, 7)

In [8]:
# Distinct activity labels, in order of first appearance.
df_full['activity'].unique()

array(['Walking', 'Jogging', 'Upstairs', 'Downstairs', 'Sitting',
       'Standing'], dtype=object)

In [9]:
# Summary statistics of the numeric columns (sanity check on value ranges).
df_full.describe()

Unnamed: 0,user,timestamp,x-acc,y-acc,z-acc
count,1098203.0,1098203.0,1098203.0,1098203.0,1098203.0
mean,18.86067,33409100000000.0,0.6628645,7.255642,0.4110616
std,10.21423,49449680000000.0,6.84906,6.746207,4.754109
min,1.0,0.0,-19.61,-19.61,-19.8
25%,10.0,2019128000000.0,-2.87,3.17,-2.22
50%,19.0,9722802000000.0,0.27,7.93,0.0
75%,28.0,49965720000000.0,4.44,11.56,2.72
max,36.0,209397400000000.0,19.95,20.04,19.61


In [10]:
# Number of distinct users in the data.
df_full['user'].nunique()

36

In [11]:
# Flag the rows where a new block of data starts: either the user changes or
# the time step to the previous row is irregular.
# The time step is taken between consecutive rows of the whole (sorted) frame,
# so the first row has no time step (NaT) and is never flagged as 'newblock'.
time_step = df_full['datetime'].diff()
df_full['timediff'] = time_step

# Irregular step: a gap of more than 1 s, or a step shorter than 20 ms
# (which includes zero and negative steps).
too_long = time_step > pd.Timedelta('1s')
too_short = time_step < pd.Timedelta('20ms')
df_full['newblock'] = too_long | too_short

# A change of user id relative to the previous row; the very first row
# counts as a new user because it has no predecessor.
df_full['newuser'] = df_full['user'].diff().ne(0)
# NOTE(review): in the cells visible here, neither flag is used when the data
# is later split into windows - confirm whether that is intended.

In [12]:
# How many strange leaps do we have?
# Counts the rows flagged as 'newblock', i.e. rows whose time step to the
# previous row is above 1 s or below 20 ms.
df_full['newblock'].sum()

67630

In [13]:
# Examples of leaps inside a single user's recording: rows flagged as
# 'newblock' that are not also the first row of a new user.
df_full[df_full['newblock'] & ~df_full['newuser']].head()

Unnamed: 0,user,activity,timestamp,x-acc,y-acc,z-acc,datetime,timediff,newblock,newuser
942094,1,Walking,4998592298000,-0.99,15.75,-1.23,1970-01-01 01:23:18.592298,0 days,True,False
942143,1,Walking,5001062177000,9.19,10.08,-0.08,1970-01-01 01:23:21.062177,0 days,True,False
942243,1,Walking,5006032331000,3.95,5.52,-1.57,1970-01-01 01:23:26.032331,0 days,True,False
942293,1,Walking,5008562146000,1.08,1.08,-1.33,1970-01-01 01:23:28.562146,0 days,True,False
942527,1,Walking,5020242292000,8.28,11.41,0.0,1970-01-01 01:23:40.242292,0 days,True,False


In [16]:
# Build, per user, the inputs for windowing: X_dict maps a user id to the
# (Xlist, ylist) pair returned by tutorial_pamap2.split_activities.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated and later
# removed from pandas; it yields the same NumPy arrays.
X_dict = {}
for user in df_full['user'].unique():
    user_df = df_full[df_full['user'] == user]
    # Three accelerometer channels as a 2-D array, one row per sample.
    signals = user_df[['x-acc', 'y-acc', 'z-acc']].values
    # Activity label of every sample, aligned with the rows of `signals`.
    activities = user_df['activity'].values
    # NOTE(review): split_activities is a project helper; judging by its name
    # it presumably cuts the signal wherever the label changes - confirm,
    # including the meaning of the empty list and of borders=0.
    Xlist, ylist = tutorial_pamap2.split_activities(activities,
                                                    signals,
                                                    [],
                                                    borders=0)
    X_dict[user] = (Xlist, ylist)

In [17]:
# Window length and stride, both expressed in samples (500 each). Because the
# stride equals the window length, consecutive windows do not overlap.
# NOTE(review): the sample rows shown earlier in this notebook are ~50 ms
# apart (about 20 Hz); at that rate 500 samples span roughly 25 s rather than
# 10 s - confirm the intended window duration.
frame_length = 10 * 50
step = 10 * 50

# sample_dict maps a user id to a pair (X, y) of arrays built from that user's
# windows. Users for which no window could be produced are left out.
sample_dict = {}
for user, (Xlist, ylist) in X_dict.items():
    X_sample_list, y_sample_list = tutorial_pamap2.sliding_window(
        frame_length, step, Xlist, ylist)
    if len(X_sample_list) == 0:
        continue
    X = np.array(X_sample_list)
    y = np.array(y_sample_list)
    sample_dict[user] = (X, y)

In [18]:
# Users that contributed at least one window, and the number of users to hold
# out: 10% (rounded down) each for the test set and the validation set.
userids = np.array(list(sample_dict))
nr_users = userids.shape[0]
nr_users_test = int(0.1 * nr_users)
nr_users_val = int(0.1 * nr_users)

In [19]:
# Split the users (not the individual windows) into train / test / validation,
# so that no user contributes samples to more than one set.
# A dedicated, seeded generator makes the split reproducible across kernel
# restarts without touching NumPy's global random state.
split_rng = np.random.RandomState(42)
neworder = split_rng.permutation(nr_users)
userids = userids[neworder]

# Slice with explicit non-negative bounds. Negative-offset slicing breaks when
# a hold-out size is 0: `userids[-0:]` is the whole array and `userids[:-0]`
# is empty, which would put every user in validation and none in training.
nr_users_train = nr_users - nr_users_test - nr_users_val
test_end = nr_users_train + nr_users_test
train_userids = userids[:nr_users_train]
test_userids = userids[nr_users_train:test_end]
val_userids = userids[test_end:]
print('train: {}, test: {}, val: {}'.format(len(train_userids), len(test_userids), len(val_userids)))

train: 30, test: 3, val: 3


In [20]:
# For every split, stack the per-user arrays stored in sample_dict:
# element 0 of each entry holds the windows (X), element 1 the labels (y).
X_train, y_train = (
    np.concatenate([sample_dict[uid][part] for uid in train_userids])
    for part in (0, 1)
)
X_test, y_test = (
    np.concatenate([sample_dict[uid][part] for uid in test_userids])
    for part in (0, 1)
)
X_val, y_val = (
    np.concatenate([sample_dict[uid][part] for uid in val_userids])
    for part in (0, 1)
)

In [21]:
# Class names in order of first appearance in the data, and the mapping from
# each class name to its integer index.
labels = list(df_full['activity'].unique().astype('unicode'))
mapclasses = {label: index for index, label in enumerate(labels)}

# Encode the labels of every split with the project helper transform_y.
# NOTE(review): the (n_samples, 6) shapes printed further down suggest a
# one-hot style encoding - confirm against tutorial_pamap2.transform_y.
nr_classes = len(labels)
y_train_binary, y_test_binary, y_val_binary = [
    tutorial_pamap2.transform_y(y_split, mapclasses, nr_classes)
    for y_split in (y_train, y_test, y_val)
]

In [22]:
# Shapes of inputs and encoded labels for each split, in the order
# train, test, validation.
tuple(arr.shape for arr in (X_train, y_train_binary, X_test, y_test_binary, X_val, y_val_binary))

((1724, 500, 3), (1724, 6), (118, 500, 3), (118, 6), (160, 500, 3), (160, 6))

In [23]:
# Total number of windows over the three splits.
sum(len(split) for split in (X_train, X_test, X_val))

2002

In [None]:
import json

# The preprocessed arrays are written to a 'preprocessed' folder next to the
# raw data.
outdatapath = os.path.join(datapath,'preprocessed')

# Store each split through the project helper, in the order train, test,
# validation, asking it to shuffle.
# NOTE(review): numpify_and_store is opaque from here; presumably it writes
# the arrays under the given names inside outdatapath - confirm, including
# whether it creates the folder when missing.
for X_split, y_split, x_name, y_name in (
        (X_train, y_train_binary, 'X_train', 'y_train'),
        (X_test, y_test_binary, 'X_test', 'y_test'),
        (X_val, y_val_binary, 'X_val', 'y_val')):
    tutorial_pamap2.numpify_and_store(X_split, y_split, x_name, y_name, outdatapath, shuffle=True)

In [None]:
# Save the class names next to the arrays; a label's position in this list is
# the class index used when the labels were encoded.
labels_path = os.path.join(outdatapath, 'labels.json')
with open(labels_path, 'w') as labels_file:
    json.dump(labels, labels_file)