# Dependencies

In [4]:
import sys
from datetime import datetime, timedelta
import copy
from operator import itemgetter
import os
from multiprocessing import Pool, cpu_count
from pathlib import Path
import itertools
import glob
import time

from scipy.stats import pearsonr
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

from Chapter7.PrepareDatasetForLearning import PrepareDatasetForLearning
from Chapter7.Evaluation import ClassificationEvaluation
from Chapter7.Evaluation import RegressionEvaluation
from Chapter7.LearningAlgorithms import ClassificationAlgorithms
from Chapter7.LearningAlgorithms import RegressionAlgorithms

from util.VisualizeDataset import VisualizeDataset


# Feature Selection

We perform selection on 20% of the dataset. We begin by running a random forest against each target (age, activity label, height, and weight) and then apply other algorithms to whichever target turns out to be the most predictable.

In [37]:
# Read the prepared sensor dataset; the first CSV column becomes the index.
dataset = pd.read_csv("dataset_gran_250.csv", index_col=0)

# Sanity check: show every row that has no participant id.
missing_id_rows = dataset[dataset["id"].isna()]
display(missing_id_rows)

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,userAcceleration.x,userAcceleration.y,userAcceleration.z,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,act,id,weight,height,age,gender,trial,timestamp


## Random forest - Age - Regression

In [20]:
# Participant-wise hold-out for the age target: rows with id below 19 are used
# for training, rows with id above 18 for testing. A random row split is not
# used because all rows of one participant have to stay on the same side.
# NOTE(review): only "age" is removed, so "id" and every other column remain
# in the feature matrix - confirm that this is intended.
age_train_rows = dataset[dataset["id"] < 19]
age_test_rows = dataset[dataset["id"] > 18]

train_x, train_y = age_train_rows.drop(columns="age"), age_train_rows.age
test_x, test_y = age_test_rows.drop(columns="age"), age_test_rows.age

print("participants in train set: ", train_x.id.unique())
print("participants in test set: ", test_x.id.unique())

participants in train set:  [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18.]
participants in test set:  [19. 20. 21. 22. 23.]


In [22]:
# Run the project's random_forest regression helper on the age split; it hands
# back one prediction set for the training rows and one for the test rows.
age_regressor = RegressionAlgorithms()
pred_training_y, pred_test_y = age_regressor.random_forest(train_x, train_y, test_x)

In [38]:
# Raw look at the predictions next to the true test ages.
print(pred_training_y)
print(pred_test_y, len(pred_test_y))
print(test_y, len(test_y))

evaluator_age = RegressionEvaluation()

# The same four error measures are printed twice: first for the held-out test
# rows, then for the training rows as a comparison.
for age_true, age_pred in ((test_y, pred_test_y), (train_y, pred_training_y)):
    print("MSE: ", evaluator_age.mean_squared_error(age_true, age_pred))
    print("MSE and STD: ", evaluator_age.mean_squared_error_with_std(age_true, age_pred))
    print("MAE: ", evaluator_age.mean_absolute_error(age_true, age_pred))
    print("MAE and STD: ", evaluator_age.mean_absolute_error_with_std(age_true, age_pred))

[46. 46. 46. ... 28. 28. 28.]
[26.5 26.5 26.5 ... 27.8 27.8 27.8] 22884
0        25.0
250      25.0
500      25.0
750      25.0
1000     25.0
         ... 
66500    18.0
66750    18.0
67000    18.0
67250    18.0
67500    18.0
Name: age, Length: 22884, dtype: float64 22884
MSE:  35.71824244013286
MSE and STD:  (35.71824244013286, 42.2945744502565)
MAE:  4.475030589057857
MAE and STD:  (4.475030589057857, 3.961442847169949)
MSE:  0.0
MSE and STD:  (0.0, 0.0)
MAE:  0.0
MAE and STD:  (0.0, 0.0)


## Random forest - Label - Classification

In [32]:
# Participant-wise hold-out for the activity label: rows with id below 19 are
# used for training, rows with id above 18 for testing, so that the rows of one
# participant are never spread over both sets.
# NOTE(review): only "act" is removed, so "id" and every other column remain
# in the feature matrix - confirm that this is intended.
act_train_rows = dataset[dataset["id"] < 19]
act_test_rows = dataset[dataset["id"] > 18]

train_x_act, train_y_act = act_train_rows.drop(columns="act"), act_train_rows.act
test_x_act, test_y_act = act_test_rows.drop(columns="act"), act_test_rows.act

print("participants in train set: ", train_x_act.id.unique())
print("participants in test set: ", test_x_act.id.unique())

participants in train set:  [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18.]
participants in test set:  [19. 20. 21. 22. 23.]


In [33]:
# Run the project's random_forest classification helper on the activity split.
# It returns four values: predictions for the training and test rows, followed
# by what the variable names call the probability frames for both sets.
act_classifier = ClassificationAlgorithms()
(
    pred_training_y_act,
    pred_test_y_act,
    frame_prob_training_y_act,
    frame_prob_test_y_act,
) = act_classifier.random_forest(train_x_act, train_y_act, test_x_act)

In [43]:
# Show the predicted and the true activity labels of the test rows.
print(pred_test_y_act, len(pred_test_y_act))
print(test_y_act, len(test_y_act))

# Distinct labels present in the test set; reused for the confusion matrix.
act_labels = test_y_act.unique()
print(act_labels)

evaluator_label = ClassificationEvaluation()

# Score the test-set predictions, one metric per line.
for caption, metric in (
    ("Accuracy ", evaluator_label.accuracy),
    ("Precision: ", evaluator_label.precision),
    ("Recall: ", evaluator_label.recall),
    ("f1: ", evaluator_label.f1),
):
    print(caption, metric(test_y_act, pred_test_y_act))

print("Confusion Matrix \n", evaluator_label.confusion_matrix(test_y_act, pred_test_y_act, act_labels))

[0. 0. 0. ... 5. 5. 5.] 22884
0        0.0
250      0.0
500      0.0
750      0.0
1000     0.0
        ... 
66500    5.0
66750    5.0
67000    5.0
67250    5.0
67500    5.0
Name: act, Length: 22884, dtype: float64 22884
[0. 1. 2. 3. 4. 5.]
Accuracy  0.9743488900541863
Precision:  [1.         0.9759542  0.9529511  0.90468227 1.         0.99648252]
Recall:  [0.93599637 0.93938281 0.98073586 0.94497817 0.9913573  1.        ]
f1:  [0.96694021 0.95731936 0.96664386 0.92439129 0.99565989 0.99823816]
Confusion Matrix 
 [[2062   54   31   56    0    0]
 [   0 2557   97   53    0   15]
 [   0    0 5651  110    0    1]
 [   0    5  120 2164    0    1]
 [   0    4   31    9 5047    0]
 [   0    0    0    0    0 4816]]


## Random forest - Weight - Regression

In [44]:
# Participant-wise hold-out for the weight target: rows with id below 19 are
# used for training, rows with id above 18 for testing, so that the rows of one
# participant are never spread over both sets.
# NOTE(review): only "weight" is removed, so "id" and every other column remain
# in the feature matrix - confirm that this is intended.
weight_train_rows = dataset[dataset["id"] < 19]
weight_test_rows = dataset[dataset["id"] > 18]

train_x_weight, train_y_weight = weight_train_rows.drop(columns="weight"), weight_train_rows.weight
test_x_weight, test_y_weight = weight_test_rows.drop(columns="weight"), weight_test_rows.weight

print("participants in train set: ", train_x_weight.id.unique())
print("participants in test set: ", test_x_weight.id.unique())

participants in train set:  [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18.]
participants in test set:  [19. 20. 21. 22. 23.]


In [46]:
# Run the project's random_forest regression helper on the weight split; it
# hands back one prediction set for the training rows and one for the test rows.
weight_regressor = RegressionAlgorithms()
pred_training_y_weight, pred_test_y_weight = weight_regressor.random_forest(
    train_x_weight, train_y_weight, test_x_weight
)

In [53]:
# Distinct true weights that occur in the test set.
print(test_y_weight.unique())

evaluator_weight = RegressionEvaluation()

# Error measures of the weight predictions on the held-out participants.
for caption, metric in (
    ("MSE: ", evaluator_weight.mean_squared_error),
    ("MSE and STD: ", evaluator_weight.mean_squared_error_with_std),
    ("MAE: ", evaluator_weight.mean_absolute_error),
    ("MAE and STD: ", evaluator_weight.mean_absolute_error_with_std),
):
    print(caption, metric(test_y_weight, pred_test_y_weight))

[ 88.  52. 100.  68.  74.]
[76. 76. 76. ... 96. 96. 96.]
MSE:  405.3278727495194
MSE and STD:  (405.3278727495194, 307.08189015325604)
MAE:  17.832179688865583
MAE and STD:  (17.832179688865583, 9.345857753837656)


## Random forest - Gender - Classification

In [48]:
# Participant-wise hold-out for the gender target: rows with id below 19 are
# used for training, rows with id above 18 for testing. A random row split is
# not used because all rows of one participant have to stay on the same side.
# NOTE(review): only "gender" is removed, so "id" and every other column remain
# in the feature matrix - confirm that this is intended.
train_x_gender = dataset[dataset["id"] < 19].drop(columns="gender")
train_y_gender = dataset[dataset["id"] < 19].gender

test_x_gender = dataset[dataset["id"] > 18].drop(columns="gender")
test_y_gender = dataset[dataset["id"] > 18].gender

# Read the ids from the gender frames themselves, so this cell reports its own
# split and does not rely on the weight cell having been executed before.
print("participants in train set: ", train_x_gender.id.unique())
print("participants in test set: ", test_x_gender.id.unique())

participants in train set:  [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18.]
participants in test set:  [19. 20. 21. 22. 23.]


In [49]:
# Run the project's random_forest classification helper on the gender split.
# It returns four values: predictions for the training and test rows, followed
# by what the variable names call the probability frames for both sets.
gender_classifier = ClassificationAlgorithms()
(
    pred_training_y_gender,
    pred_test_y_gender,
    frame_prob_training_y_gender,
    frame_prob_test_y_gender,
) = gender_classifier.random_forest(train_x_gender, train_y_gender, test_x_gender)

In [50]:
evaluator_gender = ClassificationEvaluation()

# Distinct gender labels present in the test set; used for the confusion matrix.
gender_labels = test_y_gender.unique()

# Score the test-set predictions, one metric per line.
for caption, metric in (
    ("Accuracy ", evaluator_gender.accuracy),
    ("Precision: ", evaluator_gender.precision),
    ("Recall: ", evaluator_gender.recall),
    ("f1: ", evaluator_gender.f1),
):
    print(caption, metric(test_y_gender, pred_test_y_gender))

print("Confusion Matrix \n", evaluator_gender.confusion_matrix(test_y_gender, pred_test_y_gender, gender_labels))

Accuracy  0.7587397308162909
Precision:  [0.60502218 1.        ]
Recall:  [1.         0.61731476]
f1:  [0.7539113  0.76338233]
Confusion Matrix 
 [[8906 5521]
 [   0 8457]]


# Feature selection

#### Initial Train split for feature selection

In [None]:
# Draw a reproducible 20% sample of all rows (seed 200) and separate the
# activity label from the remaining columns.
# NOTE(review): this rebinds train_x / train_y, which the age section above
# also assigns - re-running the age evaluation afterwards would use these.
train = dataset.sample(frac=0.2, random_state=200)
train_y = train["act"]
train_x = train.drop(columns=["act"])

#### Forward Selection

In [None]:
 # Forward selection of features for the activity label on the 20% sample
 # (train_x / train_y) created by the preceding cell.
 # NOTE(review): the import path is assumed from the other Chapter7 imports of
 # this notebook - confirm that the module exists under this name.
 from Chapter7.FeatureSelection import FeatureSelectionClassification

 # TODO(review): max_features was not defined anywhere before this cell; 10 is
 # a placeholder for the number of features to select - set the intended value.
 max_features = 10

 # The feature frame defined by the preceding cell is called train_x (lower-case x).
 selected_features, ordered_features, ordered_scores = FeatureSelectionClassification().forward_selection(max_features, train_x, train_y)

# Experiments - By Trial

In [None]:
# One sub-frame per distinct value of the "trial" column.
trial_ids = dataset["trial"].unique()
trials = [dataset[dataset["trial"] == trial_id] for trial_id in trial_ids]

In [None]:
# Single ClassificationAlgorithms instance; the by-trial loop further down
# passes it to train_classification for every subset.
class_alg = ClassificationAlgorithms()

def generate_sets(data_set, random_state=None):
    """Split one data subset into random train and test parts for the 'act' label.

    30% of the rows of ``data_set`` become the training part; of the rows that
    are left, another 30% become the test part. The rows are separated by
    position, so the two parts never share a row even when index labels repeat.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame to split; must contain an 'act' column. It is not modified.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample``. ``None`` (the default) gives a
        different split on every call.

    Returns
    -------
    tuple
        ``(train_y, train_X, test_y, test_X)`` where the ``*_y`` items are the
        'act' column and the ``*_X`` items are all remaining columns.
    """
    # Shuffle the subset that was passed in (not the notebook-global frame) and
    # cut it by position; this avoids dropping rows by index label.
    shuffled = data_set.sample(frac=1.0, random_state=random_state)
    n_train = round(0.3 * len(shuffled))

    train = shuffled.iloc[:n_train]
    remainder = shuffled.iloc[n_train:]
    test = remainder.sample(frac=0.3, random_state=random_state)

    train_y = train['act']
    train_X = train.drop(columns=['act'])
    test_y = test['act']
    test_X = test.drop(columns=['act'])

    return train_y, train_X, test_y, test_X


def train_classification(train_y, train_X, test_y, test_X, class_alg):
    """Run ``class_alg.random_forest`` on one train/test split.

    ``test_y`` is accepted for a uniform call signature but is not used.
    Returns the four values produced by ``random_forest`` as a tuple, in the
    order they were produced.
    """
    outputs = class_alg.random_forest(train_X, train_y, test_X)
    # Unpack explicitly so that anything but exactly four results raises here.
    train_pred, test_pred, train_prob, test_prob = outputs
    return train_pred, test_pred, train_prob, test_prob

def evaluate_classifcation(y_true, y_pred):
    """Score predictions with a fresh ClassificationEvaluation.

    Returns the tuple ``(accuracy, precision, recall, f1)`` as computed by the
    evaluator's methods of the same names for ``y_true`` against ``y_pred``.
    """
    scorer = ClassificationEvaluation()
    accuracy_value = scorer.accuracy(y_true, y_pred)
    precision_value = scorer.precision(y_true, y_pred)
    recall_value = scorer.recall(y_true, y_pred)
    f1_value = scorer.f1(y_true, y_pred)
    return accuracy_value, precision_value, recall_value, f1_value



Random forest for the labels by trial

In [None]:
# Per-trial score collections; each gets one entry per trial subset.
acc = []
prec = []
recall = []
f1 = []

# NOTE(review): the loop rebinds train_y, test_y, pred_training_y and
# pred_test_y, which the age section above also uses.
for subset in trials:
    train_y, train_X, test_y, test_X = generate_sets(subset)
    pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y = train_classification(train_y, train_X, test_y, test_X, class_alg)
    # The recall score is unpacked into recall_ (like f1_) so that it does not
    # replace the `recall` list it is appended to below.
    accuracy, precision, recall_, f1_ = evaluate_classifcation(test_y, pred_test_y)
    acc.append(accuracy)
    prec.append(precision)
    recall.append(recall_)
    f1.append(f1_)