#### Imports Block

In [1]:
# -- IMPORTS START --
import pandas as pd
import glob
import re
import os
import sys
import pickle
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from sklearn import tree, metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from scipy.signal import butter, filtfilt, find_peaks
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.model_selection import train_test_split
# -- IMPORTS END --

# enable zooming into graphs
%matplotlib widget
plt.rcParams['figure.figsize'] = [9, 6] # width, height in inches

### Helper Function: viz_tree (do not modify)

In [2]:
# Helper function to visualize model - Do not modify
def viz_tree(dt_model,features_frames,cnames):
    # Fix feature names as list
    feature_names = features_frames.columns.tolist()

    fig, ax = plt.subplots(figsize=(9,4))
    tree.plot_tree(dt_model,  
                   feature_names=feature_names,
                   fontsize=7,
                   class_names=cnames,
                   filled=True,
                   ax=ax)

    plt.title('Decision Tree')
    plt.savefig('dt.png')

### Helper Function: calc_magnitude (do not modify)

In [3]:
#Do not modify
def calc_magnitude(data):

    # Calculate magnitude  
    data['accel_mag'] = np.sqrt(data['x']**2 + data['y']**2 + data['z']**2) # absolute accel magnitude
    data['accel_mag'] = data['accel_mag'] - data['accel_mag'].mean() # detrend: "remove gravity"

    return data

### Helper Function: remove noise (do not modify)

In [4]:
#Do not modify
def remove_noise(data,sampling_rate):
    from scipy.signal import butter, filtfilt, find_peaks

    # Low pass filter
    cutoff = 5 # Hz
    order = 2
    b, a = butter(order, cutoff/(sampling_rate/2), btype='lowpass')
    data['filtered_accel_mag'] = filtfilt(b, a, data['accel_mag'])

    return data

### Helper Function: add_features (do not modify)

In [5]:
#Do not modify
def add_features(window: pd.DataFrame):
    features = {}
    features['avg'] = window['accel_mag'].mean()
    features['max'] = window['accel_mag'].quantile(1)
    features['med'] = window['accel_mag'].quantile(0.5)
    features['min'] = window['accel_mag'].quantile(0)
    features['q25'] = window['accel_mag'].quantile(0.25)
    features['q75'] = window['accel_mag'].quantile(0.75)
    features['std'] = window['accel_mag'].std()
    # df = pd.DataFrame()
    # df = df.append(features,ignore_index=True)
    df = pd.DataFrame([features])
    
    return df

### Helper Function: train_decision_tree (do not modify)

In [6]:
def train_decision_tree(frames):
    # Extract feature columns 
    X = frames[['avg', 'max', 'med', 'min', 'q25', 'q75', 'std']]

    # Extract target column
    y = frames['activity']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 

    # Create model
    dt_model = DecisionTreeClassifier(criterion='entropy',max_depth=5).fit(X_train, y_train)
    dt_pred = dt_model.predict(X_test)

    # Evaluate on test set
    acc = dt_model.score(X_test, y_test)
    dt_cm = confusion_matrix(y_test, dt_pred, labels=dt_model.classes_)
    print(classification_report(y_test, dt_pred))
    print("Accuracy on test set:", acc)

    return dt_model,dt_cm,acc

### Helper Function: classify_live_window (do not modify)

In [7]:
def classify_live_window(df):
    
    # Filter accelerometer data 
    df_accel = df[df['accel_x'].notna() & df['accel_y'].notna() & df['accel_z'].notna()]
    df_valid = df_accel[['accel_x', 'accel_y', 'accel_z']].rename(columns={
      'accel_x': 'x',
      'accel_y': 'y',
      'accel_z': 'z'  
    })

    # Calculate accel_mag
    df_valid = calc_magnitude(df_valid) 

    # Add features
    df_valid = add_features(df_valid) 
    X = df_valid[['avg', 'max', 'med', 'min', 'q25', 'q75',  'std']] 

    # Load model
    with open('dt_model.pkl', 'rb') as f:
        model = pickle.load(f)
        
    # Make prediction
    y_pred = model.predict(df_valid)

    return(y_pred)

### Testing the live model (do not modify)

In [8]:
def test_live_classification(): # Testing the live model
    # Generate sample DataFrame
    data = {'accel_x': [0.011531], 
            'accel_y': [0.002931],
            'accel_z': [0.019604],
            'time': ['2023-08-01 18:40:43.344408']}

    df = pd.DataFrame(data)

    # Repeat rows to get 1000 rows
    df = pd.concat([df]*1000, ignore_index=True) 

    # Call function
    y_pred = classify_live_window(df)

    print(y_pred)


In [9]:
# Function to extract windows and features
def extract_features(data, window_sec, sample_rate, activity):
    # TODO - see instructions above
    window_data = data.resample(str(window_sec) + "S")

    dataFrameToReturn = pd.DataFrame(
        columns=["avg", "max", "med", "min", "q25", "q75", "std", "label"],
    )

    for time, window in window_data:
        features = add_features(window)
        features["label"] = [activity]
        dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")

    return dataFrameToReturn

In [10]:
def all_data_to_combined_csv():
    import os
    import sys

    folders = glob.glob("data/Activities/*")[1:]
    all_data = pd.DataFrame(
        columns=["avg", "max", "med", "min", "q25", "q75", "std", "label"],
    )

    all_data_list = []

    for folder in folders:
        activity = os.path.basename(folder)
        file_names = glob.glob("data/Activities/" + activity + "/*.csv")

        for file in file_names:
            file_data = pd.read_csv(file, parse_dates=["time"], index_col="time")

            window_sec = 10
            sampling_rate = 100

            file_data = calc_magnitude(file_data)
            file_data = remove_noise(file_data, sampling_rate)
            file_data = extract_features(file_data, window_sec, sampling_rate, activity)

            all_data_list.append(file_data)

    all_data = pd.concat(all_data_list, join="outer")

    all_data.to_csv(
        "./data/Activities/all_data.csv",
        columns=["avg", "max", "med", "min", "q25", "q75", "std", "label"],
        index=False,
    )

# Experimenting with Different Activity Combinations

| Model trained on | Accuracy |
|-|-|
| Three Types of Walking | 0.7056 |
| Stairs Activities | 0.8846 | 
| Static Activities | 0.6253 |
| Mobile Activities | 0.6026 |
| All Activities | 0.6483 |
|-|-|

In [11]:
all_data_to_combined_csv()

feature_frames = pd.read_csv("data/Activities/all_data.csv")

# ['downstairs','jogging','lying','sitting','standing','upstairs','walk_fast','walk_mod','walk_slow']
# drop_activities = ['downstairs','jogging','lying','sitting','standing','upstairs']
# drop_activities = ['jogging','lying','sitting','standing','walk_fast','walk_mod','walk_slow']
# drop_activities = ['downstairs','jogging','upstairs','walk_fast','walk_mod','walk_slow']
# drop_activities = ['lying','sitting','standing']
# drop_activities = []

drop_activities_list = [
    ["downstairs", "jogging", "lying", "sitting", "standing", "upstairs"],
    ["jogging", "lying", "sitting", "standing", "walk_fast", "walk_mod", "walk_slow"],
    ["downstairs", "jogging", "upstairs", "walk_fast", "walk_mod", "walk_slow"],
    ["lying", "sitting", "standing"],
    []
]

all_activities = [
    "downstairs",
    "jogging",
    "lying",
    "sitting",
    "standing",
    "upstairs",
    "walk_fast",
    "walk_mod",
    "walk_slow",
]

feature_names = [
    "avg",
    "max",
    "med",
    "min",
    "q25",
    "q75",
    "std",
]

for drop_activities in drop_activities_list:
    used_activities = []
    for act in all_activities:
        if act not in drop_activities:
            used_activities.append(act)

    print(used_activities)

    masked_feature_frames = pd.read_csv("data/Activities/all_data.csv")

    for index, row in masked_feature_frames.iterrows():
        if row["label"] in drop_activities:
            masked_feature_frames.drop(index=index, inplace=True)

    # This function will print out precision/recall/accuracy
    features = masked_feature_frames[feature_names]
    labels = masked_feature_frames["label"]

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )
    dt_model = DecisionTreeClassifier(max_depth=5, criterion="entropy").fit(
        x_train, y_train
    )
    y_pred = dt_model.predict(x_test)

    acc = accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(
        y_test, y_pred, average=None, labels=used_activities
    )
    recall = metrics.recall_score(y_test, y_pred, average=None, labels=used_activities)

    print("Accuracy: ", acc, "\n", "Precision: ", precision, "\n", "Recall: ", recall)

    with open("dt_model.pkl", "wb") as f:
        pickle.dump(dt_model, f)

    dt_cm = confusion_matrix(y_test, y_pred, labels=used_activities)
    print("Confusion Matrix: \n", dt_cm)

    fig, ax = plt.subplots(figsize=(70, 7))

    tree.plot_tree(
        dt_model,
        feature_names=feature_names,
        fontsize=7,
        class_names=used_activities,
        filled=True,
        ax=ax,
    )

    plt.savefig("_".join(used_activities) + ".jpg", dpi=600)

    plt.close()  # Prevent the plot from showing in notebook

  dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")
  dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")
  dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")
  dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")
  dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")
  dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")
  dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")
  dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")
  dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")
  dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")
  dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")
  dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")
  dataFrameToReturn = pd.concat([dataFrameToReturn, features], join="inner")

['walk_fast', 'walk_mod', 'walk_slow']
Accuracy:  0.7058823529411765 
 Precision:  [0.84  0.575 0.8  ] 
 Recall:  [0.67741935 0.82142857 0.61538462]
Confusion Matrix: 
 [[21  9  1]
 [ 2 23  3]
 [ 2  8 16]]
['downstairs', 'upstairs']
Accuracy:  0.8846153846153846 
 Precision:  [0.85714286 0.91666667] 
 Recall:  [0.92307692 0.84615385]
Confusion Matrix: 
 [[12  1]
 [ 2 11]]
['lying', 'sitting', 'standing']
Accuracy:  0.6823529411764706 
 Precision:  [0.69230769 0.7        0.65517241] 
 Recall:  [0.5625     0.77777778 0.73076923]
Confusion Matrix: 
 [[18  6  8]
 [ 4 21  2]
 [ 4  3 19]]
['downstairs', 'jogging', 'upstairs', 'walk_fast', 'walk_mod', 'walk_slow']
Accuracy:  0.5960264900662252 
 Precision:  [0.39130435 0.94444444 0.         0.45833333 0.37037037 0.7027027 ] 
 Recall:  [0.52941176 0.89473684 0.         0.47826087 0.37037037 0.78787879]
Confusion Matrix: 
 [[ 9  0  3  4  1  0]
 [ 1 34  0  3  0  0]
 [ 0  0  0  3  9  1]
 [ 5  2  0 11  4  1]
 [ 6  0  1  1 10  9]
 [ 2  0  0  2  3 2

  _warn_prf(average, modifier, msg_start, len(result))


> **Interpretation of the results**: What do these accuracy scores suggest about the ability of the model to distinguish between these activities? Do some activities appear to be more distinguishable than others? How do different combinations of activities affect the accuracy? Remember to provide a brief discussion for each point. 

Classifying the three different types of walking was reasonably accurate with ~70% accuracy. Intuitively, walking faster will be more irregular. Therefore, we would expect the standard deviation of the signal to be greater. This is exactly what the decision tree is looking for in the root node.

Classification of walking up and down stairs had the highest accuracy of almost 90%. Intuitively, walking upstairs usually involves softer steps, whereas walking downstairs involves harder steps that must counteract the momentum generated from gravity. Examining the decision tree, we notice that the root decides based on the minimum acceleration. Following the intuition, the amplitude of the acceleration signal will be greater for walking downstairs rather than upstairs, so it makes sense that the greatest information gain involves the min/max of acceleration. This difference likely leads to the high accuracy of this classification.

The static activities were classified with similar accuracy to the three different types of walking. Intuitively, standing will have more motion than sitting or lying down. Therefore, the root node checks the standard deviation, like the walking classifier.

Considering the first three training sets, it seems that it is easiest to distinguish the walking up and down stairs than walking at different speeds or between stationary activities.

Classifying the mobile activities had the lowest accuracy. This classifier had the challenge of classifying six activities (second only to classifying all of the activities). Examining the leaves of the decision tree, several of them have many samples with a large entropy. It is possible that the tree was limited by the maximum depth. The accuracy may be improved by using a greater maximum depth. For this classifier, it may be preferrable to use minimum leaf nodes instead of maximum depth to prevent overfitting.

Classifying all activities leads to similar issues with classifying all of the mobile activities. Comparing this to training on all of the mobile activities, the increased in accuracy (60% vs 65%) in the former suggests that it is easier to distinguish activities with motion from stationary motions. 

If  there are too many activities to classify, then the decision tree will struggle due to its limited depth. This is evidenced by how the classification for three or less activities (walking, stairs, and static) had accuracies of >70%, whereas classifying mobile and all activities had accuracies of 60% and 65%, respectively.

In [12]:
test_live_classification()

['lying']


# Live Demonstration

link: 

https://drive.google.com/file/d/1xEKujxMv5Vq9XRNLQ2QNZ3B9cwI1UXQj/view?usp=sharing