## Import libraries

In [1]:
import numpy as np
import pandas as pd
import scipy as sc
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from scipy.interpolate import splrep, splev, splprep

## Read data

In [3]:
data_uschad = pd.read_table("../data/USC-HAD/USC-HAD_cleared.txt", delimiter=',', header=None)
data_uschad.columns = ['id_user', 'activity', 'timestamp', 'x', 'y', 'z']

data_wisdm = pd.read_table("../data/WISDM/WISDM_ar_v1.1_raw_cleared.txt", delimiter=',', header=None)
data_wisdm.columns = ['id_user', 'activity', 'timestamp', 'x', 'y', 'z']

## Creating object-feature matrix

So we need to construct 10 seconds time series. To do it we need to remember the following:
* each time series should be from one user and one type of activity;
* in the time series timestamp should't differ more then 0.2 second (empirical rule, in ideal all timestamp should differ on 50 ms = 0.05 second).

Now let's create **object-feature** matrix:

In [4]:
def check_candidate(candidate, data_type, threshold=2.*1e8):
    if data_type == "USCHAD":
        threshold = 0.
    tsp = np.array(candidate['timestamp'])
    diffs = tsp[1:] - tsp[:-1]
    
    return np.sum(diffs > threshold) == 0

def get_time_series(accelerations, data_type, nb=200):
    accelerations.index = [i for i in range(len(accelerations))]
    TS = []
    st = 0
    fi = st + nb
    while fi < len(accelerations):
        candidate = accelerations.loc[[st + i for i in range(nb)], :]
        if check_candidate(candidate, data_type):
            TS.append([np.array(candidate['x']), 
                       np.array(candidate['y']), 
                       np.array(candidate['z'])])
        st = fi
        fi += nb
    
    return TS

In [5]:
def get_distribution(data, df):
    classes = list(set(data['activity']))
    for activity in classes:
        nb = np.sum(df['activity'] == classes.index(activity))
        print("{:<20}{:<9d}{:<5.2f} %".format(activity, nb, 100. * nb / df.shape[0]))
    print("")
    print("Number of objects: {:d}".format(df.shape[0]))

In [6]:
def get_feature_matrix(data, data_type, get_feature_names, get_features, params=[]):
    
    classes = list(set(data['activity']))
    feature_names = get_feature_names(params)
    df = pd.DataFrame(columns=['activity']+feature_names) 

    id_range = np.unique(np.array(data['id_user']))
    for id_user in id_range:
        for activity in classes:
            mask = (data.loc[:, 'id_user'] == id_user) & (data.loc[:, 'activity'] == activity)
            accelerations = data.loc[mask, ['timestamp', 'x', 'y', 'z']].copy()
            TS = get_time_series(accelerations, data_type, nb=200)
            for ts in TS:
                features = get_features(ts, params)
                df.loc[len(df), :] = [classes.index(activity)] + features
    return df

## Feature extraction

### Expert functions

The idea is the following: we will consider 10 seconds time series (or 200 points of measurements) and calculate 40 features:
* ```[3]``` - mean acceleration of each axis;
* ```[3]``` - std of acceleration of each axis;
* ```[3]``` - mean absolute deviation of acceleration of each axis;
* ```[1]``` - mean acceleration;
* ```[30]``` - distribution of time series values of each axis. First of all we calculate min and max of each component ($X, Y, Z$) from the whole interval. Then we divide the range of values of each component into 10 equal intervals and calculate on each each interval the percent of values that are in it (in the corresponding interval).  

And apply LogisticRegression and SVM.

In [7]:
def get_expert_names(params):
    feature_names = ['avg_x', 'avg_y', 'avg_z', 
                     'std_x', 'std_y', 'std_z', 
                     'abs_x', 'abs_y', 'abs_z', 'mean']
    for i in range(10):
        name = str(i) + '_'
        feature_names += [name + 'x', name + 'y', name + 'z']
        
    return feature_names

def get_expert_features(ts, params):
    x = ts[0]
    y = ts[1]
    z = ts[2]
    n = x.shape[0]
    features = []
    features.append(x.mean())
    features.append(y.mean())
    features.append(z.mean())
    features.append(x.std())
    features.append(y.std())
    features.append(z.std())
    features.append(np.abs(x - x.mean()).mean())
    features.append(np.abs(y - y.mean()).mean())
    features.append(np.abs(z - z.mean()).mean())
    features.append((x+y+z).mean() / 3.)
    x_range = np.linspace(x.min(), x.max(), 11)
    y_range = np.linspace(y.min(), y.max(), 11)
    z_range = np.linspace(z.min(), z.max(), 11)
    for i in range(10):
        features.append(1. * np.sum((x_range[i] <= x) & (x < x_range[i+1])) / n)
        features.append(1. * np.sum((y_range[i] <= y) & (y < y_range[i+1])) / n)
        features.append(1. * np.sum((z_range[i] <= z) & (z < z_range[i+1])) / n)
    
    return features

Create and save:

In [12]:
df_expert_wisdm = get_feature_matrix(data_wisdm, 'WISDM', get_expert_names, get_expert_features)
df_expert_wisdm.to_csv("../data/features/expert_wisdm.csv", index=False)

In [17]:
df_expert_uschad = get_feature_matrix(data_uschad, 'USCHAD', get_expert_names, get_expert_features)
df_expert_uschad.to_csv("../data/features/expert_uschad.csv", index=False)

### Autoregression model

In [18]:
def get_autoregressive_names(params):
    n = params[0]
    feature_names = []
    for ax in ['x', 'y', 'z']:
        feature_names += ['intercept_' + ax]
        for i in range(n):
            feature_names += ['coef_' + str(i) + '_' + ax]
            
    return feature_names

def get_autoregressive_features(ts, params):
    n = params[0]
    x = ts[0]
    y = ts[1]
    z = ts[2]
    m = x.shape[0]
    features = []
    X = np.zeros([m-n, n])
    Y = np.zeros(m-n)
    for axis in [x, y, z]:
        for i in range(m-n):
            X[i, :] = axis[i:i+n]
            Y[i] = axis[i+n]
        lr = LinearRegression()
        lr.fit(X, Y)
        features.append(lr.intercept_)
        features.extend(lr.coef_)
    
    return features

Create and save:

In [20]:
params = [20]
df_ar_wisdm = get_feature_matrix(data_wisdm, 'WISDM', get_autoregressive_names,
                                 get_autoregressive_features, params)
df_ar_wisdm.to_csv("../data/features/ar_wisdm.csv", index=False)

In [21]:
params = [20]
df_ar_uschad = get_feature_matrix(data_uschad, 'USCHAD', get_autoregressive_names,
                                  get_autoregressive_features, params)
df_ar_uschad.to_csv("../data/features/ar_uschad.csv", index=False)

### Spectrum analysis

In [22]:
def get_spectrum_names(params):
    n = params[0]
    feature_names = []
    for ax in ['x', 'y', 'z']:
        for i in range(n):
            feature_names += ['eigv_' + str(i) + '_' + ax]
            
    return feature_names

def get_spectrum_features(ts, params):
    n = params[0]
    x = ts[0]
    y = ts[1]
    z = ts[2]
    m = x.shape[0]
    features = []
    X = np.zeros([m-n, n])
    Y = np.zeros(m-n)
    for axis in [x, y, z]:
        for i in range(m-n):
            X[i, :] = axis[i:i+n]
        h = sc.linalg.svd(X.T.dot(X), compute_uv=False, overwrite_a=True)
        features.extend(h)
    
    return features

Create and save:

In [23]:
params = [20]
df_ssa_wisdm = get_feature_matrix(data_wisdm, 'WISDM', get_spectrum_names,
                                  get_spectrum_features, params)
df_ssa_wisdm.to_csv("../data/features/ssa_wisdm.csv", index=False)

In [24]:
params = [20]
df_ssa_uschad = get_feature_matrix(data_uschad, 'USCHAD', get_spectrum_names,
                                   get_spectrum_features, params)
df_ssa_uschad.to_csv("../data/features/ssa_uschad.csv", index=False)

### Splines

In [7]:
def get_spline_names(params):
    n = params[0]
    feature_names = []
    for ax in ['x', 'y', 'z']:
        for i in range(n):
            feature_names += ['coef_' + str(i) + '_' + ax]
            
    return feature_names

def get_spline(t, ts, n):
    s_down = 1e-6
    s_up = 1000.
    spl = splrep(t, ts, s=s_up)
    while len(spl[1]) >= n:
        spl = splrep(t, ts, s=s_up)
        s_up *= 2.
    max_iter = int(np.floor(np.log2(s_up * 1e4)))
    num_iter = 0
    while (len(spl[1]) != n) and (num_iter <= max_iter):
        s = (s_up + s_down) / 2.
        spl = splrep(t, ts, s=s)
        if len(spl[1]) < n:
            s_up = s
        else:
            s_down = s
        num_iter += 1
        if num_iter > max_iter:
            spl = splrep(t, ts, s=s_down)
            
    return spl[1][:n]

def get_spline_features(ts, params):
    n = params[0]
    x = ts[0]
    y = ts[1]
    z = ts[2]
    m = x.shape[0]
    features = []
    t = np.arange(0, m, 1)
    spl_x = get_spline(t, x, n)
    spl_y = get_spline(t, y, n)
    spl_z = get_spline(t, z, n)
    features = list(np.concatenate((spl_x, spl_y, spl_z), axis=0))
    
    return features

Create and save:

In [8]:
params = [11]
df_ssa_wisdm = get_feature_matrix(data_wisdm, 'WISDM', get_spline_names,
                                      get_spline_features, params)
df_ssa_wisdm.to_csv("../data/features/spl_wisdm_11.csv", index=False)

spline with fp=s has been reached. Probable cause: s too small.
(abs(fp-s)/s>0.001)


In [9]:
params = [11]
df_ssa_uschad = get_feature_matrix(data_uschad, 'USCHAD', get_spline_names,
                                   get_spline_features, params)
df_ssa_uschad.to_csv("../data/features/spl_uschad_11.csv", index=False)

## How to read: 

In [17]:
spl_wisdm = pd.read_csv("../data/features/spl_wisdm.csv")

In [18]:
spl_wisdm.head()

Unnamed: 0,activity,coef_0_x,coef_1_x,coef_2_x,coef_3_x,coef_4_x,coef_5_x,coef_6_x,coef_7_x,coef_8_x,...,coef_40_z,coef_41_z,coef_42_z,coef_43_z,coef_44_z,coef_45_z,coef_46_z,coef_47_z,coef_48_z,coef_49_z
0,1,2.845102,-2.852912,9.180071,-1.401187,11.73803,-2.519118,6.574491,-4.031136,17.467649,...,3.589595,-3.117073,-1.235503,2.469804,-4.359638,-1.30658,0.0,0.0,0.0,0.0
1,1,3.09467,14.439397,-2.141248,3.83286,3.776352,-0.98007,13.237937,1.720668,-1.356833,...,3.857017,-3.336601,-0.896229,5.927022,-0.50525,-6.617887,0.0,0.0,0.0,0.0
2,1,1.436514,5.276655,9.185416,-0.509384,0.480894,11.206549,-1.580749,4.972559,-1.647947,...,-2.265185,-0.292136,2.400569,-8.215157,7.105995,-3.799797,0.0,0.0,0.0,0.0
3,1,-1.996452,15.720589,-3.774008,5.366599,-3.722388,14.630739,-2.882463,3.645236,0.05724,...,0.387069,-1.874479,1.041068,-1.94978,0.403705,-3.741833,0.0,0.0,0.0,0.0
4,1,1.107837,14.744332,-5.974299,5.787027,-3.764102,11.344725,3.18756,-1.79995,4.77789,...,-5.546385,3.404281,-0.832806,-3.615928,1.860831,-0.302996,0.0,0.0,0.0,0.0
