## Import libraries

In [17]:
import numpy as np
import pandas as pd
import scipy as sc
import matplotlib.pyplot as plt
%matplotlib inline

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.pipeline import Pipeline

## Clean Data

In [19]:
# Rewrite the raw WISDM dump as a strict 6-column, comma-separated file.
#
# Each physical line is split on ';' and every resulting piece is treated as
# one candidate record.  A record is kept only if it has at least six
# comma-separated fields and the sixth field is non-empty; only the first six
# fields are written out.
#
# Unlike the earlier version, *every* record found on a line is kept (it used
# to keep at most the first two, and only the first one when the line had no
# trailing ';').  The `with` statement guarantees both files are closed even
# if an exception interrupts the loop.
with open("../data/WISDM/WISDM_ar_v1.1_raw.txt", "r") as f_read, \
        open("../data/WISDM/WISDM_ar_v1.1_raw_cleared.txt", "w") as f_write:
    # Iterate lazily instead of loading the whole file with readlines().
    for string in f_read:
        string = string.rstrip('\n')
        for row in string.split(';'):
            words = row.split(',')
            # Skip empty pieces and truncated records (missing last value).
            if len(words) > 5 and len(words[5]) > 0:
                f_write.write(",".join(words[:6]) + "\n")

## Read Data

In [20]:
# Load the cleaned, header-less, comma-separated file and name its six columns.
data_wisdm = pd.read_csv("../data/WISDM/WISDM_ar_v1.1_raw_cleared.txt", sep=',', header=None)
data_wisdm.columns = ['id_user', 'activity', 'timestamp', 'x', 'y', 'z']

## Segmentation

Let us construct 10-second time series segments.
* each time series should come from one user and one type of activity;
* within a time series, consecutive timestamps shouldn't differ by more than 0.2 seconds (an empirical rule; ideally all consecutive timestamps differ by 50 ms = 0.05 seconds).

In [21]:
def check_candidate(candidate, data_type, threshold=2.*1e8):
    """Tell whether a candidate segment is free of oversized timestamp gaps.

    Parameters
    ----------
    candidate : DataFrame-like
        Must expose a 'timestamp' column/key holding numeric values.
    data_type : str
        When equal to "USCHAD" the threshold is forced to 0.
    threshold : float
        Largest allowed difference between consecutive timestamps, in the
        units of the 'timestamp' column (presumably nanoseconds for WISDM,
        i.e. 0.2 s -- confirm against the data description).

    Returns
    -------
    bool-like
        True when no consecutive difference is strictly greater than the
        effective threshold (also True when there are fewer than two stamps).
    """
    max_gap = 0. if data_type == "USCHAD" else threshold
    stamps = np.array(candidate['timestamp'])
    gaps = np.diff(stamps)

    return (gaps > max_gap).sum() == 0

def get_time_series(accelerations, data_type, nb=200):
    """Cut a recording into consecutive, non-overlapping segments of `nb` rows.

    Parameters
    ----------
    accelerations : pandas.DataFrame
        Rows in recording order with columns 'timestamp', 'x', 'y', 'z'.
        The frame is only read; its index is left untouched.
    data_type : str
        Forwarded to `check_candidate` to select the gap rule.
    nb : int
        Number of rows per segment.  `nb == 0` raises ValueError (from
        `range`) instead of looping forever.

    Returns
    -------
    list
        One `[x, y, z]` entry (three numpy arrays of length `nb`) for every
        window that `check_candidate` accepts.  Rejected windows are skipped;
        the next window always starts `nb` rows later.
    """
    total = len(accelerations)
    TS = []
    # `total - nb + 1` makes the final full window eligible: previously a
    # recording whose length was an exact multiple of `nb` lost its last one.
    for st in range(0, total - nb + 1, nb):
        # Positional slicing works for any index, so no re-indexing is needed.
        candidate = accelerations.iloc[st:st + nb]
        if check_candidate(candidate, data_type):
            TS.append([np.array(candidate['x']),
                       np.array(candidate['y']),
                       np.array(candidate['z'])])

    return TS

## Feature Generation

In [22]:
def get_distribution(data, df):
    """Print how many rows of `df` belong to each activity class.

    Parameters
    ----------
    data : DataFrame-like
        Source whose 'activity' values define the classes.  The classes are
        enumerated in the iteration order of `set(data['activity'])`, and the
        position in that order is the integer code looked up in `df`.
    df : pandas.DataFrame
        Feature table whose 'activity' column holds those integer codes.

    Prints one line per class (name, count, share in percent), a blank line
    and the total number of rows.  Returns None.
    """
    labels = list(set(data['activity']))
    total = df.shape[0]
    row_template = "{:<20}{:<9d}{:<5.2f} %"
    for code, label in enumerate(labels):
        count = np.sum(df['activity'] == code)
        print(row_template.format(label, count, 100. * count / total))
    print("")
    print("Number of objects: {:d}".format(total))

In [23]:
def get_feature_matrix(data, data_type, get_feature_names, get_features, params=None):
    """Build one feature row per accepted 200-sample segment.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns 'id_user', 'activity', 'timestamp', 'x', 'y', 'z'.
    data_type : str
        Forwarded to `get_time_series`.
    get_feature_names : callable
        `get_feature_names(params)` -> list of column names.
    get_features : callable
        `get_features(ts, params)` -> sequence of values for one segment
        `ts = [x, y, z]`, in the same order as the names.
    params : list, optional
        Passed through to both callables; defaults to an empty list.

    Returns
    -------
    pandas.DataFrame
        Columns `['activity'] + feature names`.  'activity' holds the integer
        code of the class.  The frame is assembled in one go, so numeric
        columns get numeric dtypes.

    Notes
    -----
    Class codes are positions in `list(set(data['activity']))`, which is the
    same construction `get_distribution` uses to decode them.  For string
    labels that order is not guaranteed to be the same in another interpreter
    session, so the codes in a saved file should not be assumed stable.
    """
    if params is None:
        # Fresh list per call instead of a shared mutable default.
        params = []

    classes = list(set(data['activity']))
    columns = ['activity'] + get_feature_names(params)

    # Collect plain rows and build the frame once: appending with
    # df.loc[len(df)] re-allocates the whole frame for every segment.
    rows = []
    for id_user in np.unique(np.array(data['id_user'])):
        user_mask = data.loc[:, 'id_user'] == id_user
        for label, activity in enumerate(classes):
            mask = user_mask & (data.loc[:, 'activity'] == activity)
            accelerations = data.loc[mask, ['timestamp', 'x', 'y', 'z']].copy()
            for ts in get_time_series(accelerations, data_type, nb=200):
                rows.append([label] + list(get_features(ts, params)))

    return pd.DataFrame(rows, columns=columns)

## Feature extraction

### Expert functions

The idea is the following: we consider 10-second time series (200 measurement points) and calculate 40 features:
* ```[3]``` - mean acceleration of each axis;
* ```[3]``` - std of acceleration of each axis;
* ```[3]``` - mean absolute deviation of acceleration of each axis;
* ```[1]``` - mean acceleration over all three axes;
* ```[30]``` - distribution of time series values of each axis. First we calculate the min and max of each component ($X, Y, Z$) over the whole interval. Then we divide the range of values of each component into 10 equal intervals and calculate, for each interval, the fraction of values that fall into it.  

And apply LogisticRegression and SVM.

In [24]:
def get_expert_names(params):
    """Return the 40 column names of the expert feature set.

    `params` is accepted for interface compatibility and is not used.
    Order: per-axis mean, std and mean absolute deviation (x, y, z each),
    the overall 'mean', then for every histogram bin 0..9 the x, y, z shares.
    """
    axes = ('x', 'y', 'z')
    names = [stat + '_' + ax for stat in ('avg', 'std', 'abs') for ax in axes]
    names.append('mean')
    names.extend(str(bin_idx) + '_' + ax for bin_idx in range(10) for ax in axes)

    return names

def _bin_fractions(values, n, n_bins=10):
    """Share of `values` in each of `n_bins` equal-width bins over [min, max].

    Bins are left-closed and right-open, except the last one, which also
    includes its right edge so that the maximum is counted.  Counts are
    divided by `n`.
    """
    edges = np.linspace(values.min(), values.max(), n_bins + 1)
    fractions = []
    for i in range(n_bins):
        if i == n_bins - 1:
            in_bin = (edges[i] <= values) & (values <= edges[i + 1])
        else:
            in_bin = (edges[i] <= values) & (values < edges[i + 1])
        fractions.append(1. * np.sum(in_bin) / n)
    return fractions


def get_expert_features(ts, params):
    """Compute the 40 expert features of one segment.

    Parameters
    ----------
    ts : sequence
        `[x, y, z]`, three 1-D numpy arrays of one segment.
    params : any
        Unused; present for interface compatibility.

    Returns
    -------
    list
        In the order of `get_expert_names`: per-axis mean, per-axis std,
        per-axis mean absolute deviation, the mean of x+y+z divided by 3, and
        for each of 10 bins the share of x, y and z samples falling into it.

    The histogram shares of every axis now sum to 1: the last bin includes the
    axis maximum, which was previously left out (and a constant signal used to
    produce ten zeros).
    """
    x = ts[0]
    y = ts[1]
    z = ts[2]
    n = x.shape[0]  # all shares are normalised by the length of x

    features = [axis.mean() for axis in (x, y, z)]
    features.extend(axis.std() for axis in (x, y, z))
    features.extend(np.abs(axis - axis.mean()).mean() for axis in (x, y, z))
    features.append((x + y + z).mean() / 3.)

    shares = [_bin_fractions(axis, n) for axis in (x, y, z)]
    # Interleave as bin0_x, bin0_y, bin0_z, bin1_x, ... to match the names.
    for i in range(10):
        features.extend(axis_shares[i] for axis_shares in shares)

    return features

Create and save:

In [25]:
# Expert features for WISDM: build the table, then store it without the index.
df_expert_wisdm = get_feature_matrix(data_wisdm, 'WISDM',
                                     get_feature_names=get_expert_names,
                                     get_features=get_expert_features)
df_expert_wisdm.to_csv(path_or_buf="../data/features/expert_wisdm.csv", index=False)

In [26]:
#df_expert_uschad = get_feature_matrix(data_uschad, 'USCHAD', get_expert_names, get_expert_features)
#df_expert_uschad.to_csv("../data/features/expert_uschad.csv", index=False)

### Autoregression model

### Explanation

In [27]:
def get_autoregressive_names(params):
    """Return column names for the autoregressive features.

    `params[0]` is the number of coefficients per axis.  For each axis in
    x, y, z the names are 'intercept_<axis>' followed by
    'coef_<i>_<axis>' for i = 0 .. params[0] - 1.
    """
    order = params[0]
    names = []
    for ax in 'xyz':
        names.append('intercept_' + ax)
        names.extend('coef_' + str(lag) + '_' + ax for lag in range(order))

    return names

def get_autoregressive_features(ts, params):
    """Fit a linear autoregression per axis and return its parameters.

    Parameters
    ----------
    ts : sequence
        `[x, y, z]`, three 1-D numpy arrays of one segment.
    params : sequence
        `params[0]` is the number of lagged samples used as regressors.

    Returns
    -------
    list
        For x, then y, then z: the fitted intercept followed by the
        `params[0]` coefficients, matching `get_autoregressive_names`.

    Every sample from position `order` onwards is regressed on the `order`
    samples immediately before it.
    """
    order = params[0]
    length = ts[0].shape[0]
    n_rows = length - order

    lagged = np.zeros([n_rows, order])
    target = np.zeros(n_rows)
    fitted = []
    for signal in (ts[0], ts[1], ts[2]):
        # Column k holds the signal shifted by k, so row i is signal[i:i+order].
        for lag in range(order):
            lagged[:, lag] = signal[lag:lag + n_rows]
        target[:] = signal[order:order + n_rows]

        model = LinearRegression()
        model.fit(lagged, target)
        fitted.append(model.intercept_)
        fitted.extend(model.coef_)

    return fitted

Create and save:

In [28]:
# Autoregressive features for WISDM with 20 coefficients per axis, stored
# without the index.
params = [20]
df_ar_wisdm = get_feature_matrix(data_wisdm, 'WISDM',
                                 get_feature_names=get_autoregressive_names,
                                 get_features=get_autoregressive_features,
                                 params=params)
df_ar_wisdm.to_csv(path_or_buf="../data/features/ar_wisdm.csv", index=False)

In [29]:
#params = [20]
#df_ar_uschad = get_feature_matrix(data_uschad, 'USCHAD', get_autoregressive_names,
#                                  get_autoregressive_features, params)
#df_ar_uschad.to_csv("../data/features/ar_uschad.csv", index=False)

### Spectrum analysis

### Explanation

In [30]:
def get_spectrum_names(params):
    """Return column names for the spectrum features.

    `params[0]` is the number of values per axis; names are
    'eigv_<i>_<axis>' for each axis in x, y, z and i = 0 .. params[0] - 1.
    """
    count = params[0]

    return ['eigv_' + str(i) + '_' + ax
            for ax in ('x', 'y', 'z')
            for i in range(count)]

def get_spectrum_features(ts, params):
    """Return, per axis, the singular values of the lag-matrix Gram matrix.

    Parameters
    ----------
    ts : sequence
        `[x, y, z]`, three 1-D numpy arrays of one segment.
    params : sequence
        `params[0]` is the window length (number of lag columns).

    Returns
    -------
    list
        For x, then y, then z: the `params[0]` singular values of `X.T.dot(X)`
        as returned by `scipy.linalg.svd`, where row i of `X` is
        `signal[i:i + window]`.
    """
    window = params[0]
    length = ts[0].shape[0]
    n_rows = length - window

    trajectory = np.zeros([n_rows, window])
    spectrum = []
    for signal in (ts[0], ts[1], ts[2]):
        # Column k holds the signal shifted by k samples.
        for lag in range(window):
            trajectory[:, lag] = signal[lag:lag + n_rows]
        gram = trajectory.T.dot(trajectory)
        # `gram` is a temporary, so letting SVD overwrite it is harmless.
        spectrum.extend(sc.linalg.svd(gram, compute_uv=False, overwrite_a=True))

    return spectrum

Create and save:

In [31]:
# Spectrum features for WISDM with a window of 20 samples, stored without
# the index.
params = [20]
df_ssa_wisdm = get_feature_matrix(data_wisdm, 'WISDM',
                                  get_feature_names=get_spectrum_names,
                                  get_features=get_spectrum_features,
                                  params=params)
df_ssa_wisdm.to_csv(path_or_buf="../data/features/ssa_wisdm.csv", index=False)

In [32]:
#params = [20]
#df_ssa_uschad = get_feature_matrix(data_uschad, 'USCHAD', get_spectrum_names,
#                                   get_spectrum_features, params)
#df_ssa_uschad.to_csv("../data/features/ssa_uschad.csv", index=False)

## Fast Fourier Transform

### Explanation