# Human Activity Recognition Based on Smartphone Sensors
---
<b>MADS-MMS Portfolio-Exam Part 2<br>
Janosch Höfer, 938969</b>

## Table of contents

- [Introduction](#intro) <br>
- [1. Data preparation](#data-prep) <br>
- [2. Parameters](#parameters) <br>
- [3. Model setup](#model-setup) <br>
   - [3.1. K-Means](#kmean) <br>
   - [3.2. EM-Algorithm](#em)<br>
   - [3.3. OPTICS](#optics) <br>
- [4. Model Evaluation](#model-eval) <br>
- [5. Results](#results)<br>
- [References](#ref)<br>

<a id='intro'></a>

## Introduction

bla<br>
Using [[1]](http://archive.ics.uci.edu/ml/datasets/Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions)

In [None]:
# Standard libraries
import os
import itertools

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, OPTICS
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from tqdm.notebook import tqdm
from yellowbrick.cluster import KElbowVisualizer

# Own classes and functions
from helper_functions.data_manipulation import setup_raw_data
from helper_functions.plot_clusters import draw_plot, OPTICSResults

In [None]:
# Tell NumPy to ignore floating-point "divide by zero" and "invalid operation"
# conditions instead of emitting warnings for them.
np.seterr(divide="ignore", invalid="ignore")

---
<a id='data-prep'></a>

## 1. Data preparation

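The raw data set is fetched and loaded into a DataFrame, then inspected. Categorical columns are one-hot encoded and numerical columns are min-max scaled before both parts are combined into the feature matrix used for clustering.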

In [None]:
# Where the raw data lives: local directory, remote archive and CSV file name.
path_to_data = "data"
data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "00544/ObesityDataSet_raw_and_data_sinthetic%20(2).zip"
)
filename = "ObesityDataSet_raw_and_data_sinthetic.csv"

In [None]:
# Check for data
# NOTE(review): presumably fetches `filename` from `data_url` into
# `path_to_data` when it is not available locally - confirm in
# helper_functions.data_manipulation.
setup_raw_data(data_url, path_to_data, filename)

In [None]:
# Load the comma-separated raw data from the local data directory.
data_full = pd.read_csv(os.path.join(path_to_data, filename), sep=",")

In [None]:
# Display the loaded DataFrame.
data_full

In [None]:
# Summary statistics, transposed so that every described column becomes a row.
data_full.describe().T

In [None]:
# Per column: does it contain at least one missing value?
data_full.isna().any()

### Encode

In [None]:
# Preview the first rows of the raw data.
data_full.head()

In [None]:
# Columns that are handed to the one-hot encoder.
encoder_cols = [
    "Gender",
    "family_history_with_overweight",
    "FAVC",
    "CAEC",
    "SMOKE",
    "SCC",
    "CALC",
    "MTRANS",
]

In [None]:
# One-hot encode the selected columns; drop="first" removes the first category
# of every feature so that the indicator columns are not redundant.
# The dense array is obtained with ``toarray()`` rather than the ``sparse=False``
# constructor flag: that flag was deprecated in scikit-learn 1.2 and removed
# later, whereas converting the default sparse result works on all versions.
encoder = OneHotEncoder(drop="first")
data_encoded = encoder.fit_transform(data_full[encoder_cols]).toarray()

In [None]:
# Wrap the encoded array in a DataFrame, using the feature names generated by
# the fitted encoder as column labels, and preview it.
df_encoded = pd.DataFrame(data_encoded, columns=encoder.get_feature_names_out())
df_encoded.head()

### Scale

In [None]:
# Preview the first rows of the raw data.
data_full.head()

In [None]:
# Columns that are rescaled with the MinMaxScaler.
scale_cols = ["Age", "Height", "Weight", "FCVC", "NCP", "CH2O", "FAF", "TUE"]

In [None]:
# Min-max scale the selected columns and wrap the result in a DataFrame that
# keeps the original column names, then preview it.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_full[scale_cols])
df_scaled = pd.DataFrame(data_scaled, columns=scale_cols)
df_scaled.head()

In [None]:
# Combine scaled and encoded features row by row (inner join on the index).
df = df_scaled.merge(df_encoded, how="inner", left_index=True, right_index=True)

In [None]:
# Number of rows and columns of the combined feature matrix.
df.shape

In [None]:
# Preview the first rows of the combined feature matrix.
df.head()

### Control Target

In [None]:
# Column kept aside as the reference labels; it is not part of the feature
# matrix `df`, which is built from scale_cols and encoder_cols only.
data_targets = data_full["NObeyesdad"]
data_targets.shape

In [None]:
# Encode every distinct class label as an integer code, numbered in order of
# first appearance. One ``map`` call with an explicit lookup table replaces the
# repeated ``replace`` calls and yields an integer Series directly, without
# depending on pandas implicitly downcasting an object column after ``replace``.
class_codes = {label: code for code, label in enumerate(data_targets.unique())}
targets = data_targets.map(class_codes)

In [None]:
# Number of samples per encoded class.
targets.value_counts()

---
<a id='parameters'></a>

## 2. Parameters

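Global settings shared by the models below. A fixed random seed keeps the clustering results reproducible.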

In [None]:
# Seed passed to the models and plots below so that results are reproducible.
random_state = 42

---
<a id='model-setup'></a>

## 3. Model setup
<a id='kmean'></a>
### 3.1. K-Means

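K-Means is fitted for a range of cluster counts. The elbow method and the silhouette coefficient are then used to choose the number of clusters.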

In [None]:
# Candidate cluster counts: 2 up to, but not including, max_ks.
max_ks = 20
ks = range(2, max_ks)

In [None]:
# K-Means estimator handed to the elbow visualizer, which varies the number of
# clusters according to k=(2, max_ks).
model = KMeans(random_state=random_state, init="k-means++", max_iter=300, tol=0.0001)
visualizer = KElbowVisualizer(model, k=(2, max_ks))

In [None]:
# Fit the visualizer on the feature matrix and render the elbow plot.
visualizer.fit(df)
visualizer.show()

In [None]:
# Cluster count picked by the elbow visualizer, followed by a silhouette plot
# for exactly that count.
# NOTE(review): elbow_value_ may be unset when no elbow is detected - confirm
# against the yellowbrick documentation before reusing it downstream.
best_k_elbow = visualizer.elbow_value_
draw_plot(
    df,
    plot_type="silhouette",
    ks=best_k_elbow,
    random_state=random_state,
    no_zero=True,  # increases cluster labels by 1
    labels=["The silhouette coefficient values", "Cluster label"],
    title=f"Silhouette analysis for KMeans clustering with n_clusters = {best_k_elbow}",
)

In [None]:
# Plot the silhouette score for every candidate cluster count in `ks`.
draw_plot(
    df,
    ks=ks,
    plot_type="ksscore",
    random_state=random_state,
    labels=["K", "Silhouette Coefficient"],
    title="Silhouette Score for different Ks",
)

In [None]:
# Hard-coded cluster count, followed by a silhouette plot for that count.
# NOTE(review): 14 was presumably read off the silhouette-score plot above;
# re-check it whenever the data, the seed or the range of Ks changes.
best_k_silhouette = 14
draw_plot(
    df,
    plot_type="silhouette",
    ks=best_k_silhouette,
    random_state=random_state,
    no_zero=True,  # increases cluster labels by 1
    labels=["The silhouette coefficient values", "Cluster label"],
    title=f"Silhouette analysis for KMeans clustering with n_clusters = {best_k_silhouette}",
)

<a id='em'></a>

### 3.2. EM-Algorithm (Expectation-Maximization)

In [None]:
# Fit a Gaussian mixture with as many components as the elbow method suggested
# and assign every sample to its most likely component.
# random_state is passed explicitly: the EM initialisation is randomised, so
# without a seed the cluster assignments differ between runs, unlike the
# seeded K-Means experiments above.
gm = GaussianMixture(n_components=best_k_elbow, random_state=random_state)
cluster_assignments = gm.fit_predict(df)

In [None]:
# Distinct cluster labels that were actually assigned.
set(cluster_assignments)

In [None]:
# Cross-tabulate the encoded reference classes against the cluster assignments.
# NOTE(review): cluster ids are arbitrary, so the diagonal is presumably not
# meaningful as "correct predictions" - read the matrix as a contingency table.
conf_matrix = confusion_matrix(y_true=targets, y_pred=cluster_assignments)
ConfusionMatrixDisplay(conf_matrix).plot()

In [None]:
# Repeat the Gaussian mixture experiment with the cluster count chosen from the
# silhouette analysis and cross-tabulate the result against the reference
# classes. random_state is passed explicitly so that the randomised EM
# initialisation gives the same assignments on every run.
gm = GaussianMixture(n_components=best_k_silhouette, random_state=random_state)
cluster_assignments = gm.fit_predict(df)
conf_matrix = confusion_matrix(y_true=targets, y_pred=cluster_assignments)
ConfusionMatrixDisplay(conf_matrix).plot()

<a id='optics'></a>

### 3.3. OPTICS (Ordering Points To Identify the Clustering Structure)

In [None]:
def optics_experiment(df, parameters: dict[str, list]) -> list:
    """Fit OPTICS once for every combination of the given parameter values.

    Args:
        df: Feature matrix that is passed unchanged to ``OPTICS.fit``.
        parameters: Maps OPTICS keyword-argument names (for example
            ``min_samples``, ``metric``, ``xi``, ``min_cluster_size``) to the
            list of values to try. Values are matched to arguments by key
            name, so the order of the keys does not matter.

    Returns:
        One ``OPTICSResults`` per parameter combination, in the order produced
        by ``itertools.product``. Reachability distances and labels are stored
        re-indexed by the fitted model's ``ordering_``.
    """
    names = list(parameters)
    space = np.arange(len(df))

    # itertools.product is lazy and has no length, so the total number of
    # combinations is computed up front for the progress bar.
    n_combinations = int(np.prod([len(values) for values in parameters.values()]))

    results = []
    for combo in tqdm(itertools.product(*parameters.values()), total=n_combinations):
        # Pass each value under its own key instead of relying on position.
        fitted = OPTICS(**dict(zip(names, combo))).fit(df)
        order = fitted.ordering_
        results.append(
            OPTICSResults(
                space=space,
                reachability=fitted.reachability_[order],
                targets=fitted.labels_[order],
                params=fitted.get_params(),
            )
        )
    return results

In [None]:
# Values to try per OPTICS argument; every combination of them is evaluated.
parameters = {
    "min_samples": [15, 20],
    "metric": ["minkowski"],
    "xi": [0.2],
    "min_cluster_size": [5, 10, 20],
}

In [None]:
# Run OPTICS for every parameter combination and collect the results.
optics_res = optics_experiment(df, parameters)

In [None]:
# Draw one reachability diagram per OPTICS run in a two-column grid.
# The row count is the ceiling of len/2 via integer arithmetic: round() uses
# banker's rounding (round(0.5) == 0, round(2.5) == 2) and would allocate too
# few rows for some odd numbers of results.
draw_plot(
    optics_res,
    figsize=(16, 18),
    grid_size=((len(optics_res) + 1) // 2, 2),
    plot_type="reachability",
    labels=["", "Reachability distance"],
    title="Reachability Diagram",
)

---
<a id='model-eval'></a>

## 4. Model Evaluation

bla

---
<a id='results'></a>

## 5. Results

bla

---
<a id='ref'></a>

## References

<p>[1] http://archive.ics.uci.edu/ml/datasets/Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions</p>