# ACC (accelerometer) and GYR (gyroscope) - physical activity detection

## Joni Rajamaki and Usairim Isani



In [56]:
%xmode Minimal
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import glob
import tsfel

# Machine learning packages - scikit-learn
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeClassifier

Exception reporting mode: Minimal


In [57]:
# Root folder of the sample CSV files and its Train / Test sub-folders.
data_path = "./Data"
train_data_path = data_path + "/Train"
test_data_path = data_path + "/Test"
# Folder that holds the extracted feature CSV files.
extracted_features_path = "./Features"

In [58]:
# Names of the entries in the training folder.
# NOTE(review): train_dir is not referenced again in the visible notebook.
train_dir = os.listdir(train_data_path)

In [59]:
def read_data(data_path):
    """Load every sample CSV below ``data_path`` into nested dictionaries.

    ``data_path`` is expected to contain one sub-folder per label, each
    holding one CSV file per sample.

    Parameters
    ----------
    data_path : str
        Folder whose sub-folders contain the sample CSV files.

    Returns
    -------
    dict
        ``{folder_name: {file_name: pandas.DataFrame}}``; folders and files
        are visited in sorted order.
    """
    folders = {}
    # sorted(): os.listdir returns entries in arbitrary order, which would
    # make the order of the loaded samples differ between machines.
    for folder in sorted(os.listdir(data_path)):
        folder_path = os.path.join(data_path, folder)
        # Hidden entries (.ipynb_checkpoints, .DS_Store) and stray files are
        # not label folders; listing or parsing them would raise.
        if folder.startswith('.') or not os.path.isdir(folder_path):
            continue
        samples = {}
        for file in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file)
            if file.startswith('.') or not os.path.isfile(file_path):
                continue
            # read_csv opens and closes the file itself.
            samples[file] = pd.read_csv(file_path)
        folders[folder] = samples
    return folders


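`read_data` returns a nested dictionary: each folder name (the activity label) maps to a dictionary of its sample files, and each sample file maps to its time series:

{
    walking: {
        subject: TimeSeries
        ...
    }
}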

In [60]:
# Load both splits as {folder name: {file name: DataFrame}} dictionaries.
train_data = read_data(train_data_path)
test_data = read_data(test_data_path)

In [61]:
# TSFEL feature configuration that extract_features passes to the extractor.
# NOTE(review): called without a domain argument - presumably this selects
# the features of every domain; confirm in the TSFEL documentation.
cfg_file = tsfel.get_features_by_domain()  

In [62]:
def extract_features(data, out):
    """Extract TSFEL features for every sample and write one CSV per sample.

    Parameters
    ----------
    data : dict
        ``{label: {sample_name: DataFrame}}`` as returned by ``read_data``.
    out : str
        Name of the sub-folder of ``extracted_features_path`` that receives
        the files (created if it does not exist).

    Each output file holds the extracted feature row with two leading
    columns, ``Subject`` (the sample name) and ``Label``.
    """
    out_dir = f'{extracted_features_path}/{out}'
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)
    for label, samples in data.items():
        for subject, subject_df in samples.items():
            # fs=50 is handed to TSFEL as the sampling frequency
            # (assumes the recordings are sampled at 50 Hz - TODO confirm).
            extracted_features = tsfel.time_series_features_extractor(cfg_file, subject_df, fs=50, verbose=0)
            extracted_features.insert(0, "Label", label)
            extracted_features.insert(0, "Subject", subject)
            # The row index is still written to the file: the cell that
            # separates the labels drops the resulting "Unnamed: ..." columns
            # by name, so the file layout must stay as it is.
            extracted_features.to_csv(f'{out_dir}/{subject}.csv')

In [63]:
# One-off step, disabled by default: uncomment to (re)compute the TSFEL
# features of every sample and write them below extracted_features_path.
# extract_features(train_data, "Train")
# extract_features(test_data, "Test")

In [64]:
def data_to_csv(path, name):
    """Merge the per-sample feature CSV files of one split into a single CSV.

    Parameters
    ----------
    path : str
        Sub-folder of ``extracted_features_path`` holding the per-sample files.
    name : str
        Base name (without ``.csv``) of the merged file, which is written
        directly into ``extracted_features_path``.

    Raises
    ------
    ValueError
        If the sub-folder contains no CSV files.
    """
    # sorted(): glob returns files in arbitrary order; sorting keeps the row
    # order of the merged table reproducible.
    all_files = sorted(glob.glob(f'{extracted_features_path}/{path}/*.csv'))
    if not all_files:
        raise ValueError(f'No feature CSV files found in {extracted_features_path}/{path}')
    extracted_features_df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
    extracted_features_df.to_csv(f'{extracted_features_path}/{name}.csv')

In [65]:
# One-off step, disabled by default: uncomment to merge the per-sample
# feature files into training.csv and testing.csv, which the next cell reads.
# data_to_csv("Train", "training")
# data_to_csv("Test", "testing")

In [66]:
# Load the merged feature tables (one row per sample) written by data_to_csv.
training_features = pd.read_csv(f'{extracted_features_path}/training.csv')
testing_features = pd.read_csv(f'{extracted_features_path}/testing.csv')

In [67]:
training_features.sample(10)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Subject,Label,0_Absolute energy,0_Area under the curve,0_Autocorrelation,0_Centroid,0_ECDF Percentile Count_0,0_ECDF Percentile Count_1,...,5_Wavelet variance_0,5_Wavelet variance_1,5_Wavelet variance_2,5_Wavelet variance_3,5_Wavelet variance_4,5_Wavelet variance_5,5_Wavelet variance_6,5_Wavelet variance_7,5_Wavelet variance_8,5_Zero crossing rate
720,720,0,sample_ID9_exp17_119.csv,walking,277.727581,5.097597,277.727581,2.512193,50.0,200.0,...,0.035425,0.250115,0.598492,0.846733,0.781093,0.554708,0.424361,0.441524,0.537452,29.0
567,567,0,sample_ID19_exp38_256.csv,walking,274.854768,5.006514,274.854768,2.494784,50.0,200.0,...,0.054129,0.288014,0.676733,1.011803,1.126566,1.044649,0.926329,0.856284,0.83426,34.0
1009,1009,0,sample_ID4_exp7_36.csv,walking_downstairs,282.393774,5.069653,282.393774,2.475411,50.0,200.0,...,0.056566,0.081338,0.104977,0.143814,0.180355,0.200952,0.207762,0.208853,0.208879,64.0
302,302,0,sample_ID19_exp39_287.csv,standing,256.348806,5.042611,256.348806,2.496129,50.0,200.0,...,6.7e-05,0.000309,0.000869,0.001984,0.003932,0.007005,0.011386,0.017163,0.02434,11.0
724,724,0,sample_ID19_exp38_211.csv,walking_downstairs,316.880126,4.881695,316.880126,2.615064,50.0,200.0,...,0.139305,0.512759,0.7354,0.882023,0.842572,0.731451,0.632131,0.548005,0.47657,52.0
374,374,0,sample_ID10_exp21_138.csv,walking,279.300107,5.084056,279.300107,2.527138,50.0,200.0,...,0.037052,0.187619,0.419136,0.725356,0.999243,1.139016,1.172455,1.151671,1.113139,40.0
520,520,0,sample_ID17_exp35_242.csv,standing,246.754259,4.947611,246.754259,2.492343,50.0,200.0,...,2e-05,2.7e-05,3.1e-05,3.8e-05,5e-05,6.7e-05,8.7e-05,0.000112,0.00014,60.0
141,141,0,sample_ID6_exp11_71.csv,standing,256.937184,5.047778,256.937184,2.485787,1.009722,1.009722,...,0.002415,0.006094,0.008117,0.007891,0.006907,0.006052,0.005316,0.00496,0.005122,49.0
581,581,0,sample_ID2_exp3_12.csv,sitting,256.644461,5.045695,256.644461,2.490506,1.013889,1.013889,...,8.8e-05,0.000197,0.000386,0.000685,0.0012,0.001834,0.002404,0.002754,0.00282,57.0
761,761,0,sample_ID14_exp29_169.csv,sitting,258.555573,5.064306,258.555573,2.48361,50.0,200.0,...,0.000222,0.001232,0.002924,0.005148,0.007727,0.010611,0.013812,0.017334,0.021061,35.0


In [68]:
testing_features.sample(10)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Subject,Label,0_Absolute energy,0_Area under the curve,0_Autocorrelation,0_Centroid,0_ECDF Percentile Count_0,0_ECDF Percentile Count_1,...,5_Wavelet variance_0,5_Wavelet variance_1,5_Wavelet variance_2,5_Wavelet variance_3,5_Wavelet variance_4,5_Wavelet variance_5,5_Wavelet variance_6,5_Wavelet variance_7,5_Wavelet variance_8,5_Zero crossing rate
731,731,0,sample_ID27_exp55_402.csv,sitting,258.800811,5.066861,258.800811,2.488525,50.0,200.0,...,2e-05,4.9e-05,7e-05,9.3e-05,0.000125,0.000172,0.00023,0.000292,0.00035,60.0
226,226,0,sample_ID27_exp54_427.csv,standing,260.38832,5.082403,260.38832,2.490089,50.0,200.0,...,2.7e-05,3.8e-05,4.2e-05,4.5e-05,5.2e-05,6.9e-05,9.9e-05,0.000141,0.000196,67.0
279,279,0,sample_ID28_exp57_459.csv,standing,250.818743,4.987972,250.818743,2.491907,50.0,200.0,...,4e-05,0.000195,0.000561,0.000972,0.001207,0.001238,0.001158,0.001055,0.000966,51.0
812,812,0,sample_ID29_exp59_472.csv,standing,262.712943,5.104959,262.712943,2.490175,50.0,200.0,...,0.000108,0.000919,0.002791,0.005372,0.00784,0.009464,0.010093,0.010059,0.009768,33.0
460,460,0,sample_ID27_exp54_428.csv,standing,260.532794,5.083861,260.532794,2.489557,50.0,200.0,...,2.1e-05,0.000113,0.000352,0.00061,0.000695,0.0006,0.000445,0.000322,0.000261,43.0
573,573,0,sample_ID23_exp47_352.csv,lying,0.758978,0.268458,0.758978,2.936169,50.0,200.0,...,2e-05,2.7e-05,3.2e-05,3e-05,2.4e-05,2.1e-05,2e-05,1.9e-05,1.8e-05,112.0
56,56,0,sample_ID29_exp58_426.csv,sitting,232.812139,4.805806,232.812139,2.489061,50.0,200.0,...,6.3e-05,0.000166,0.000255,0.000431,0.000751,0.0012,0.001701,0.002173,0.002559,63.0
486,486,0,sample_ID25_exp51_284.csv,walking_downstairs,269.524743,5.021486,269.524743,2.519985,50.0,200.0,...,0.018761,0.115886,0.192409,0.246449,0.287009,0.320943,0.347895,0.359716,0.35267,34.0
146,146,0,sample_ID23_exp47_278.csv,walking_upstairs,236.410188,4.632347,236.410188,2.50088,50.0,200.0,...,0.017872,0.105347,0.246207,0.390922,0.544993,0.749284,1.002004,1.274539,1.543646,30.0
240,240,0,sample_ID30_exp61_368.csv,walking_upstairs,254.798506,4.88057,254.798506,2.533841,50.0,200.0,...,0.015898,0.092573,0.204074,0.39226,0.65863,0.950929,1.213191,1.411889,1.542301,24.0


In [69]:
# Separate the labels into their own Series
testing_labels = testing_features["Label"]
training_labels = training_features["Label"]


def drop_non_feature_columns(features):
    """Return ``features`` without the label, sample name and saved-index columns.

    Every column whose name starts with "Unnamed" is a row index that was
    written by an earlier ``to_csv`` call; it is bookkeeping, not a feature.
    Matching by prefix keeps the cell working no matter how many such columns
    the CSV round trips produced.
    """
    feature_columns = [
        column for column in features.columns
        if column not in ("Label", "Subject") and not str(column).startswith("Unnamed")
    ]
    return features[feature_columns]


# Drop labels and bookkeeping columns from the rest of the data
testing_features_no_labels = drop_non_feature_columns(testing_features)
training_features_no_labels = drop_non_feature_columns(training_features)

# Both splits must expose the same features in the same order, otherwise a
# scaler or model fitted on one cannot be applied to the other.
assert list(testing_features_no_labels.columns) == list(training_features_no_labels.columns)

# Check data
training_features_no_labels.head(10)

Unnamed: 0,0_Absolute energy,0_Area under the curve,0_Autocorrelation,0_Centroid,0_ECDF Percentile Count_0,0_ECDF Percentile Count_1,0_ECDF Percentile_0,0_ECDF Percentile_1,0_ECDF_0,0_ECDF_1,...,5_Wavelet variance_0,5_Wavelet variance_1,5_Wavelet variance_2,5_Wavelet variance_3,5_Wavelet variance_4,5_Wavelet variance_5,5_Wavelet variance_6,5_Wavelet variance_7,5_Wavelet variance_8,5_Zero crossing rate
0,258.546808,5.064347,258.546808,2.488045,50.0,200.0,1.0125,1.019445,0.004,0.008,...,9.6e-05,0.000322,0.000491,0.001015,0.001961,0.003295,0.004961,0.00684,0.008765,35.0
1,248.411002,4.845931,248.411002,2.516621,50.0,200.0,0.775,1.236111,0.004,0.008,...,0.025688,0.121122,0.395748,0.716097,0.805651,0.652813,0.459236,0.363558,0.369806,27.0
2,308.46397,4.970222,308.46397,2.583284,50.0,200.0,0.572222,1.509722,0.004,0.008,...,0.051682,0.23025,0.42902,0.624145,0.828155,0.945226,0.90747,0.76403,0.604243,38.0
3,262.718817,4.963459,262.718817,2.50343,50.0,200.0,0.797222,1.206945,0.004,0.008,...,0.039753,0.120835,0.171568,0.28574,0.36108,0.359866,0.32608,0.302461,0.298562,43.0
4,175.813698,4.175834,175.813698,2.486844,50.0,200.0,0.834722,0.841667,0.004,0.008,...,0.000105,0.000285,0.000408,0.000628,0.00086,0.001013,0.001091,0.001117,0.001107,70.0
5,260.968683,5.087459,260.968683,2.487137,50.0,200.0,1.009722,1.033333,0.004,0.008,...,0.001112,0.00614,0.012684,0.018878,0.025162,0.030615,0.034739,0.038041,0.0413,41.0
6,279.114259,4.888556,279.114259,2.490672,50.0,200.0,0.648611,1.359722,0.004,0.008,...,0.028888,0.16482,0.327814,0.50115,0.654962,0.749854,0.750384,0.662547,0.533853,29.0
7,266.083682,4.967153,266.083682,2.51487,50.0,200.0,0.773611,1.1625,0.004,0.008,...,0.036333,0.195023,0.386463,0.601361,0.755011,0.797205,0.768983,0.7117,0.648024,38.0
8,0.114606,0.072583,0.114606,3.819868,50.0,200.0,0.002778,0.020833,0.004,0.008,...,0.000255,0.00201,0.003693,0.004204,0.004528,0.004711,0.004758,0.004714,0.004597,58.0
9,1.527849,0.384806,1.527849,2.440198,50.0,200.0,-0.0875,-0.065278,0.004,0.008,...,2.4e-05,3.4e-05,5.2e-05,5.5e-05,4.5e-05,3.4e-05,3.2e-05,3.6e-05,4.5e-05,85.0


In [70]:
# Standardize the features (StandardScaler: zero mean, unit variance).
# The scaler is fitted on the training set only and then applied to both splits.
scaler = StandardScaler().fit(training_features_no_labels)
training_features_scaled = scaler.transform(training_features_no_labels)
testing_features_scaled = scaler.transform(testing_features_no_labels)

In [71]:
training_features.shape

(1170, 1564)

In [72]:
training_labels.shape

(1170,)

In [73]:
# Train KNN model: 3 nearest neighbours, fitted on the scaled training features
train_knn = KNeighborsClassifier(n_neighbors=3).fit(training_features_scaled, training_labels)
# Predicted label for every test sample
predictions_knn = train_knn.predict(testing_features_scaled)

# Confusion matrix of the true test labels against the KNN predictions
print(metrics.confusion_matrix(y_true=testing_labels, y_pred=predictions_knn))
# Share of test samples that were classified correctly
acc = metrics.accuracy_score(y_true=testing_labels, y_pred=predictions_knn)
print("accuracy:",acc)

[[ 81   3   0   0   0   0]
 [  0 155  14   0   0   0]
 [  0  20 168   0   0   0]
 [  0   0   0 143   1   0]
 [  0   0   0   2 115   2]
 [  0   0   0   0   2 126]]
accuracy: 0.9471153846153846


In [None]:
# Trying different k-values (1-20)
# k starts at 1: KNeighborsClassifier needs at least one neighbour, so k=0 is invalid.
k_values = list(range(1, 21))
accuracies = []
for k in k_values:
    # Separate names so the k=3 model and its predictions from the previous
    # cell are not overwritten (later cells still use predictions_knn).
    knn_k = KNeighborsClassifier(n_neighbors=k) #define the model
    knn_k.fit(training_features_scaled, training_labels) #train/fit model and data
    predictions_k = knn_k.predict(testing_features_scaled) #predictions
    accuracies.append(metrics.accuracy_score(testing_labels, predictions_k)) #accuracy for this k

# Accuracy as a function of k (k on the x-axis, one point per tested value)
plt.plot(k_values, accuracies, marker='o')
plt.xlabel('k-value')
plt.ylabel('acc score')
plt.xticks(k_values);

In [74]:
# Train RidgeClassifier (default settings) on the scaled training features
ridge = RidgeClassifier().fit(training_features_scaled, training_labels)
# Predicted label for every test sample
prediction_ridge = ridge.predict(testing_features_scaled)
# Confusion matrix of the true test labels against the ridge predictions
print(metrics.confusion_matrix(y_true=testing_labels, y_pred=prediction_ridge))
# Share of test samples that were classified correctly
acc = metrics.accuracy_score(y_true=testing_labels, y_pred=prediction_ridge)
print("accuracy:",acc)

[[ 81   3   0   0   0   0]
 [  0 167   2   0   0   0]
 [  0  10 178   0   0   0]
 [  0  18   4 122   0   0]
 [  0  18  17   0  81   3]
 [  0   7  12   0   0 109]]
accuracy: 0.8870192307692307


In [None]:
# AUROC and ROC Curves
#https://www.w3schools.com/python/python_ml_auc_roc.asp used this as an example.
def plot_roc_curve(true_y, predictions_knn, title=None):
    """Plot one-vs-rest ROC curves and print the AUROC of every class.

    ``metrics.roc_curve`` only handles binary problems, so each class is
    scored separately as "this class vs. all others".

    Parameters
    ----------
    true_y : array-like
        True class labels.
    predictions_knn : array-like
        Predicted class labels. Despite the name, the predictions of any
        classifier can be passed.
    title : str, optional
        Title of the figure.

    Notes
    -----
    The inputs are hard class labels rather than scores, so every curve has a
    single operating point between (0, 0) and (1, 1).
    """
    true_y = np.asarray(true_y)
    predictions = np.asarray(predictions_knn)

    fig, ax = plt.subplots()
    class_aucs = []
    for label in np.unique(true_y):
        # Binarize: 1 where the sample belongs to / was predicted as this class.
        is_label = (true_y == label).astype(int)
        predicted_label = (predictions == label).astype(float)
        fpr, tpr, thresholds = metrics.roc_curve(is_label, predicted_label) #fpr = false positive rate, tpr = true positive rate
        roc_auc = metrics.auc(fpr, tpr) #area under curve
        class_aucs.append(roc_auc)
        ax.plot(fpr, tpr, label=f'{label} (AUROC = {roc_auc:.3f})') # false positive vs true positive line
        print("AUROC =", roc_auc, f'({label})') #print auroc score of this class
    ax.plot([0,1], [0,1], color='r', linestyle="--") # random guess line
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    if title is not None:
        ax.set_title(title)
    if class_aucs:
        ax.legend()
        print("macro-average AUROC =", np.mean(class_aucs))

plot_roc_curve(testing_labels, predictions_knn, title="KNN") #call function with knn
plot_roc_curve(testing_labels, prediction_ridge, title="RidgeClassifier") #call function with ridge

# Discussion 
- Features 
  - Study the features we have in more detail
  - Use a smaller subset of the features
  - Read more of the TSFEL documentation
- Accuracy?
  - Is the model overfitting?
  - Is it simply the large number of features that makes the model accurate?
- Evaluation of the model
  - AUC-ROC curve
  - F1 score
  - Confidence score

