# Human Activity Recognition Based on Smartphone Sensors
---
<b>MADS-MMS Portfolio-Exam Part 2<br>
Janosch Höfer, 938969</b>

## Table of contents

- [Introduction](#intro) <br>
- [1. Data preparation](#data-prep) <br>
- [2. Parameters](#parameters) <br>
- [3. Model setup and Cross-validation](#model-setup) <br>
   - [3.1. Principal Component Analysis](#NN) <br>
- [4. Model Evaluation](#model-eval) <br>
- [5. Results](#results)<br>
- [References](#ref)<br>

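<a id='intro'></a>

## Introduction

This notebook prepares smartphone accelerometer and gyroscope recordings, reduces the resulting feature space with PCA and clusters the samples.<br>
Using the dataset from [[1]](http://archive.ics.uci.edu/ml/datasets/Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions).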

In [None]:
# Standard libraries
import os

import numpy as np
import pandas as pd
import re
from pca import pca
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Own classes and functions
from helper_functions.data_manipulation import (
    create_activity_window,
    open_raw_data,
    setup_raw_data,
)

---
<a id='data-prep'></a>

## 1. Data preparation

bla

In [None]:
# Local data directory, download URL and sensor types used by the cells below.
path_to_data = "data"  # dir where the zip has been extracted
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00341/HAPT%20Data%20Set.zip"
experiment_types = ["acc", "gyro"]  # each entry is passed to open_raw_data as the experiment type

In [None]:
# Check for data
# NOTE(review): setup_raw_data is a project helper; presumably it downloads and
# extracts the archive from data_url into path_to_data when it is missing — confirm.
setup_raw_data(data_url, path_to_data)

In [None]:
# Read the activity label table; the file is space-separated and has no header row,
# so the column names are supplied explicitly.
label_columns = ["experiment_id", "user_id", "activity_id", "start", "end"]
labels_file = os.path.join(path_to_data, "RawData", "labels.txt")
labels = pd.read_csv(labels_file, sep=" ", header=None, names=label_columns)

In [None]:
# Preview the first rows of the label table.
labels.head()

Example data

In [None]:
# Example code used by underlying functions:
# load every sensor type for experiment 1 / user 1 and join the tables on their row index,
# prefixing each table's columns with its sensor type.
experiment_data = {
    exp_type: open_raw_data(path_to_data, exp_type, 1, 1) for exp_type in experiment_types
}
exp_types = list(experiment_data)
first_type, exp_types = exp_types[0], exp_types[1:]
_df = pd.DataFrame(experiment_data[first_type]).add_prefix(f"{first_type}_")
for sub_type in exp_types:
    prefixed = pd.DataFrame(experiment_data[sub_type]).add_prefix(f"{sub_type}_")
    _df = _df.merge(prefixed, left_index=True, right_index=True)
_df.head()

Example data single row

In [None]:
# Example code used by underlying functions:
# collapse rows 250..499 of the example frame into a single row of aggregated features.
agg_funcs = [np.mean, np.min, np.max]
window = _df.iloc[250:500, :]
df_stacked = window.agg(agg_funcs).stack().swaplevel()
# Flatten the two-level index into single labels by joining both levels with "_".
df_stacked.index = df_stacked.index.map("{0[1]}_{0[0]}".format)
df_stacked.to_frame().T

In [None]:
agg_functions = [np.mean, np.min, np.max]

# Build one feature row per labelled activity window.
# The raw data is loaded once per (experiment, user) pair and reused for all of its windows.
exp_results = []
for (exp_id, user_id), activity_df in labels.groupby(["experiment_id", "user_id"]):
    experiment_data = {
        exp_type: open_raw_data(path_to_data, exp_type, exp_id, user_id)
        for exp_type in experiment_types
    }
    # Iterate the label rows directly: one row lookup per window instead of
    # three positional .iloc lookups driven by range(len(...)).
    for _, activity in activity_df.iterrows():
        _df = create_activity_window(
            experiment_data,
            agg_functions,
            activity["start"],
            activity["end"],
        )
        # Tag the aggregated window with its origin and its activity label.
        _df["exp_id"] = exp_id
        _df["user_id"] = user_id
        _df["activity_id"] = activity["activity_id"]
        exp_results.append(_df)
df = pd.concat(exp_results)

In [None]:
# Display the concatenated per-window feature table.
df

Loading the prepared data.

In [None]:
# Read the feature names, one per line, with the line break and any trailing whitespace removed.
feature_dir = os.path.join(path_to_data, "features.txt")
with open(feature_dir) as file:
    features = [line.rstrip() for line in file]

In [None]:
# Compare the number of feature names with the number of distinct names.
len(features), len(set(features))

In [None]:
# Make repeated feature names distinct.
# Per tracked token the state is [running counter, most recently recorded feature name].
replace_dict = {"ropy": [1, ""], "Kurtosis": [1, ""], "Skewness": [1, ""]}
new_features = []
for name in features:
    # Second dash-separated token of the feature name.
    token = re.findall("[^-]*[^-]", name)[1]
    state = replace_dict.get(token)
    if state:
        if name == state[1]:
            # Same name as the one recorded for this token: bump the counter
            # and substitute it for the "-1" in the name.
            state[0] += 1
            name = name.replace("-1", f"-{state[0]}")
        else:
            # Different name for this token: restart counting from it.
            state[0], state[1] = 1, name
    new_features.append(name)

In [None]:
# Compare the number of feature names with the number of distinct adjusted names.
len(features), len(set(new_features))

In [None]:
# Load the training feature matrix; the file is space-separated without a header row,
# so the adjusted feature names (new_features) are supplied as column names.
data_path = os.path.join(path_to_data, "Train", "X_train.txt")
df_full = pd.read_csv(data_path, sep=" ", header=None, names=new_features)

In [None]:
# Feature sets keyed by a descriptive label; the PCA cells add further entries to this dict.
data_dict = {"561_features": df_full}
df_full.head()

---
<a id='parameters'></a>

## 2. Parameters

bla

In [None]:
# Colours of matplotlib's current property cycle, reused for the line plots.
prop_cycle = plt.rcParams["axes.prop_cycle"]
colors = prop_cycle.by_key()["color"]

# Seed passed to KMeans as random_state.
random_state = 42

---
<a id='model-setup'></a>

## 3. Model setup and Cross-validation
<a id='NN'></a>
### 3.1 Principal Component Analysis

bla

In [None]:
# Covariance matrix of the original features, rounded to two decimals.
df_full.cov().round(2)

In [None]:
# PCA is a class inside the sklearn.decomposition module, so it has to be brought in with
# "from ... import"; "import sklearn.decomposition.PCA" raises ModuleNotFoundError.
from sklearn.decomposition import PCA as skPCA

# Fit a PCA with default settings on the full feature set and keep the
# transformed data as a DataFrame that shares df_full's index.
pca_m = skPCA()
data_pca_transformed = pca_m.fit_transform(df_full)
df_pca = pd.DataFrame(data_pca_transformed, index=df_full.index)
df_pca

In [None]:
# Covariance matrix of the PCA-transformed data, rounded to two decimals.
df_pca.cov().round(2)

In [None]:
# Read the ratios from the fitted scikit-learn estimator (pca_m). The name `pca` is the
# class imported from the `pca` package and carries no fitted attributes.
expl_var = pca_m.explained_variance_ratio_
expl_var.shape

In [None]:
# Plot the per-component explained variance ratio together with its running total.
plt.plot(expl_var, label="expl. var.")
plt.plot(np.add.accumulate(expl_var), label="acc. expl. var.")
plt.legend()
plt.show()

In [None]:
# Create one reduced feature set per component count and register it in data_dict.
n_components = [50, 75, 100, 125]
for _n in n_components:
    model = pca(n_components=_n)
    # Fit transform
    # NOTE(review): the input is df_pca (already PCA-transformed), not df_full — confirm this is intended.
    results = model.fit_transform(df_pca)
    # NOTE(review): the "PC" entry presumably holds the transformed data — verify against the pca package.
    data_dict[f"{_n}_PCA_features"] = results["PC"]

In [None]:
# explained variance
# `model` is the last instance created by the n_components loop (n_components=125).
fig, ax = model.plot()
plt.show()

In [None]:
# Scatter plot for components 0 and 1 of the last fitted model, without legend or point labels.
fig, ax = model.scatter(legend=False, PC=[0, 1], label=None)

### 3.2 Clustering

In [None]:
# Silhouette scores per feature set; filled in by the clustering loop.
cluster_result_dict = {}

In [None]:
max_ks = 20
ks = range(2, max_ks)  # candidate cluster counts: 2 .. max_ks - 1
# For every feature set, fit KMeans for each candidate k and record the silhouette score.
for key, _df in data_dict.items():
    sscore = []
    for k in tqdm(ks):
        # n_init is pinned to 10 so the number of restarts does not depend on the
        # scikit-learn version's default (the default changed in newer releases,
        # and intermediate releases warn when it is left unset).
        kkm = KMeans(
            n_clusters=k,
            random_state=random_state,
            init="k-means++",
            n_init=10,
            max_iter=300,
            tol=0.0001,
        )
        cluster_labels = kkm.fit_predict(_df)
        sscore.append(silhouette_score(_df, cluster_labels))
    cluster_result_dict[key] = sscore

In [None]:
# One silhouette curve per feature set; the colour is picked by position from `colors`.
for idx, (key, _sscore) in enumerate(cluster_result_dict.items()):
    plt.plot(ks, _sscore, color=colors[idx], label=key)
plt.xlim(2, max_ks)
plt.legend()
plt.show()

---
<a id='model-eval'></a>

## 4. Model Evaluation

bla

In [None]:
from sklearn import cluster

# Complete-linkage agglomerative clustering into 7 clusters on the full feature set.
# The distance metric is left at its default (euclidean): the `affinity` keyword is
# deprecated and removed in newer scikit-learn releases, while the default works everywhere.
# compute_distances=True stores the merge distances needed to draw a dendrogram.
agg = cluster.AgglomerativeClustering(
    n_clusters=7, linkage="complete", compute_distances=True
)
dendo_labels = agg.fit_predict(df_full)

In [None]:
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster import hierarchy


def create_linkage(agg):
    """Build a linkage matrix from a fitted agglomerative clustering model.

    Reads ``agg.labels_``, ``agg.children_`` and ``agg.distances_``.

    Returns a float array with one row per merge: the two merged node ids,
    the merge distance, and the number of original samples under the new node.
    """
    leaf_total = len(agg.labels_)
    merges = agg.children_
    sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        # Ids below leaf_total are original samples (size 1); larger ids refer to
        # the cluster created by merge number (id - leaf_total).
        sizes[row] = sum(
            1 if node < leaf_total else sizes[node - leaf_total] for node in pair
        )
    return np.column_stack([merges, agg.distances_, sizes]).astype(float)


# Use a fixed three-colour palette for the dendrogram links.
hierarchy.set_link_color_palette(["red", "green", "blue"])
plt.figure(figsize=(18, 5))
# Draw the dendrogram from the linkage matrix derived from the fitted clustering `agg`.
dendrogram(
    create_linkage(agg),
    color_threshold=6,
    orientation="left",
    # Leaf labels are suppressed via no_labels=True.
    no_labels=True,
    leaf_rotation=0,
)
plt.show()

---
<a id='results'></a>

## 5. Results

bla

---
<a id='ref'></a>

## References

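<p>[1] <a href="http://archive.ics.uci.edu/ml/datasets/Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions">http://archive.ics.uci.edu/ml/datasets/Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions</a></p>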