# Human Activity Recognition based on Smartphones sensors.
---
<b>MADS-MMS Portfolio-Exam Part 2<br>
Janosch Höfer, 938969</b>

## Table of contents

- [Introduction](#intro) <br>
- [1. Data Exploration](#data-prep) <br>
    - [1.1. Data Engineering](#dataeng) <br>
    - [1.2. Data Visualization](#datavis) <br>
- [2. Parameters](#parameters) <br>
- [3. Model setup](#model-setup) <br>
   - [3.1. K-Means](#kmean) <br>
   - [3.2. HAC](#hac)<br>
   - [3.3. OPTICS](#optics) <br>
- [4. Model Evaluation](#model-eval) <br>
- [5. Results](#results)<br>
- [References](#ref)<br>

<a id='intro'></a>

## Introduction

bla<br>
Using [[1]](http://archive.ics.uci.edu/ml/datasets/Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions)

In [None]:
# Standard libraries
import os
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
from sklearn.cluster import KMeans, OPTICS, AgglomerativeClustering
from tqdm.notebook import tqdm
from yellowbrick.cluster import KElbowVisualizer

# Own classes and functions
from helper_functions.data_manipulation import setup_raw_data
from helper_functions.plot_clusters import draw_plot, OPTICSResults

In [None]:
# Display up to 25 columns when a DataFrame is rendered
pd.set_option("display.max_columns", 25)

---
<a id='data-prep'></a>

## 1. Data Exploration
<a id='dataeng'></a>
### 1.1. Data Engineering

bla

In [None]:
# Location of the raw data: remote source file and local target folder
filename = "google_review_ratings.csv"
path_to_data = "data"
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00485/" + filename

In [None]:
# Check for data: hand the source URL, target folder and file name to the
# project helper (presumably it fetches the file when it is not available
# locally — confirm in helper_functions.data_manipulation)
setup_raw_data(data_url, path_to_data, filename)

In [None]:
# Column names that are assigned to the loaded table
features = [
    "churches",
    "resorts",
    "beaches",
    "parks",
    "theatres",
    "museums",
    "malls",
    "zoo",
    "restaurants",
    "pubs_bars",
    "local_services",
    "burger_pizza_shops",
    "hotels_other_lodgings",
    "juice_bars",
    "art_galleries",
    "dance_clubs",
    "swimming_pools",
    "gyms",
    "bakeries",
    "beauty_spas",
    "cafes",
    "view_points",
    "monuments",
    "gardens",
    "c25",  # placeholder name for the trailing column, which is dropped after loading
]
# header=0 replaces the file's own header line with `names`;
# index_col=0 uses the first column of the file as the index
data_full = pd.read_csv(
    os.path.join(path_to_data, filename), sep=",", index_col=0, names=features, header=0
)

In [None]:
# Preview the first rows of the raw table
data_full.head()

Remove the empty last column.

In [None]:
# Keep every column except the last one (positional slice)
data_full = data_full.iloc[:, :-1]

In [None]:
# Summary statistics per column
data_full.describe()

Ratings range from 1 to 5. A value of 0 means that the user has not rated the category.

In [None]:
# Number of missing values per column
data_full.isna().sum()

In [None]:
# Show the rows that contain at least one missing value
data_full[data_full.isna().any(axis=1)]

In [None]:
# Continue with an independent copy that excludes the incomplete rows
df = data_full.dropna().copy()

In [None]:
# Inspect the data type of every column
df.dtypes

Because of the invalid value for User 2713 in the `local_services` column, pandas did not parse this column as float.

In [None]:
# Convert the column to a numeric dtype (to_numeric raises by default if a value cannot be parsed)
df["local_services"] = pd.to_numeric(df["local_services"])

In [None]:
# Check the data types again after the conversion
df.dtypes

<a id='datavis'></a>
### 1.2. Data Visualization 

In [None]:
# Histograms for the features at positions 0-7
start, end = 0, 8
selected = features[start:end]
draw_plot(
    df[selected],
    plot_type="histplot",
    grid_size=(2, 4),
    figsize=(16, 8),
    title=f"Feature distribution for the features: {', '.join(selected)}.",
)

In [None]:
# Histograms for the features at positions 8-15
start, end = 8, 16
selected = features[start:end]
draw_plot(
    df[selected],
    plot_type="histplot",
    grid_size=(2, 4),
    figsize=(16, 8),
    title=f"Feature distribution for the features: {', '.join(selected)}.",
)

In [None]:
# Histograms for the features at positions 16-23
start, end = 16, 24
selected = features[start:end]
draw_plot(
    df[selected],
    plot_type="histplot",
    grid_size=(2, 4),
    figsize=(16, 8),
    title=f"Feature distribution for the features: {', '.join(selected)}.",
)

In [None]:
# Share of zero entries (no rating given) per feature, in percent
norating = (df == 0).sum(axis=0) / len(df) * 100

In [None]:
# Tabular form of the percentages, features with the most missing ratings first
df_norating = norating.to_frame(name="perc_norating").reset_index()
df_norating = df_norating.sort_values(by="perc_norating", ascending=False)

In [None]:
# Horizontal bar chart: percentage of users without a rating, per feature
ax = sns.barplot(df_norating, x="perc_norating", y="index")
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_xlabel("User that left no rating [%]")
ax.set_ylabel("Feature")
ax.set_title("Percentage of users that have not rated the feature.")
plt.show()

In [None]:
# Mean rating per feature. Zeros mark "no rating" and are masked as NaN so that
# they are skipped by mean() instead of pulling the averages down.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_average = (
    pd.DataFrame(df.replace(0, np.nan).mean(), columns=["Average"])
    .reset_index()
    .rename(columns={"index": "Feature"})
    .sort_values(by="Average", ascending=False)
)

In [None]:
def _rating_colour(val):
    """Return the bar colour for the rating band that `val` falls into."""
    if 0 < val <= 1:
        return "red"
    if 1 < val <= 2:
        return "orange"
    if 2 < val <= 3:
        return "blue"
    return "green"


# One colour per bar, in the (sorted) row order of df_average
palette = [_rating_colour(val) for val in df_average["Average"].tolist()]

sns.barplot(df_average, y="Feature", x="Average", palette=palette)
plt.title("Average Ratings excluding zero values.")
plt.xlabel("Rating")
plt.show()

In [None]:
# sns.pairplot(df)
# plt.show()

### Control Target

---
<a id='parameters'></a>

## 2. Parameters

bla

In [None]:
# Fixed seed that is handed to KMeans and to the plotting helper as random_state
random_state = 42

---
<a id='model-setup'></a>

## 3. Model setup
<a id='kmean'></a>
### 3.1. K-Means

bla

In [None]:
# Candidate cluster counts: 2 up to max_ks - 1 (the end of range() is exclusive)
max_ks = 20
ks = range(2, max_ks)

In [None]:
# K-Means estimator wrapped in yellowbrick's elbow visualizer; the visualizer
# receives the same bounds (2, max_ks) that define `ks`
model = KMeans(random_state=random_state, init="k-means++", max_iter=300, tol=0.0001)
visualizer = KElbowVisualizer(model, k=(2, max_ks))

In [None]:
# Fit the visualizer on the data and render the elbow plot
visualizer.fit(df)
visualizer.show()

In [None]:
# Silhouette plot for the number of clusters picked by the elbow visualizer
# NOTE(review): elbow_value_ is presumably None when no elbow is detected — confirm and guard if needed
best_k_elbow = visualizer.elbow_value_
draw_plot(
    df,
    plot_type="silhouette",
    ks=best_k_elbow,
    random_state=random_state,
    labels=["The silhouette coefficient values", "Cluster label"],
    title=f"Silhouette analysis for KMeans clustering with n_clusters = {best_k_elbow}",
)

In [None]:
# Plot the silhouette score for every candidate k and keep the returned scores
# (presumably one score per entry of `ks`, in the same order — confirm in the helper)
sscores = draw_plot(
    df,
    ks=ks,
    plot_type="ksscore",
    random_state=random_state,
    labels=["K", "Silhouette Coefficient"],
    title="Silhouette Score for different Ks",
)

In [None]:
# Pick the k with the highest silhouette score (the first one in case of ties)
best_k_silhouette = max(zip(sscores, ks), key=lambda pair: pair[0])[1]
draw_plot(
    df,
    ks=best_k_silhouette,
    plot_type="silhouette",
    random_state=random_state,
    title=f"Silhouette analysis for KMeans clustering with n_clusters = {best_k_silhouette}",
    labels=["The silhouette coefficient values", "Cluster label"],
)

<a id='hac'></a>

### 3.2. HAC (Hierarchical Agglomerative Clustering)

In [None]:
# Dendrogram for single linkage. The model is fitted with a distance threshold
# instead of a fixed number of clusters; compute_distances=True makes the
# fitted model expose the merge distances.
dendo_distance = "single"
dendo_cut = 200
dendo_model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=1,
    # Euclidean distance is the default. The explicit affinity="euclidean"
    # keyword is deprecated since scikit-learn 1.2 and was removed in 1.4.
    linkage=dendo_distance,
    compute_distances=True,
)
dendo_model.fit(df)  # only the fitted model is needed here, not the labels
draw_plot(
    dendo_model,
    plot_type="dendo",
    dendo_cut=dendo_cut,
    dendo_distance=dendo_distance,
    labels=["Samples", "Distance"],
    title=f"Dendogram using {dendo_distance}-link.",
)

In [None]:
# Dendrogram for complete linkage. The model is fitted with a distance threshold
# instead of a fixed number of clusters; compute_distances=True makes the
# fitted model expose the merge distances.
dendo_distance = "complete"
dendo_cut = 6800  # named like in the other dendrogram cells
dendo_model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=1,
    # Euclidean distance is the default. The explicit affinity="euclidean"
    # keyword is deprecated since scikit-learn 1.2 and was removed in 1.4.
    linkage=dendo_distance,
    compute_distances=True,
)
dendo_model.fit(df)  # only the fitted model is needed here, not the labels
draw_plot(
    dendo_model,
    plot_type="dendo",
    dendo_cut=dendo_cut,
    dendo_distance=dendo_distance,
    labels=["Samples", "Distance"],
    title=f"Dendogram using {dendo_distance}-link.",
)

In [None]:
# Dendrogram for average linkage. The model is fitted with a distance threshold
# instead of a fixed number of clusters; compute_distances=True makes the
# fitted model expose the merge distances.
dendo_distance = "average"
dendo_cut = 3500
dendo_model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=1,
    # Euclidean distance is the default. The explicit affinity="euclidean"
    # keyword is deprecated since scikit-learn 1.2 and was removed in 1.4.
    linkage=dendo_distance,
    compute_distances=True,
)
dendo_model.fit(df)  # only the fitted model is needed here, not the labels
draw_plot(
    dendo_model,
    plot_type="dendo",
    dendo_cut=dendo_cut,
    dendo_distance=dendo_distance,
    labels=["Samples", "Distance"],
    title=f"Dendogram using {dendo_distance}-link.",
)

In [None]:
# Final HAC model: average linkage with a fixed number of five clusters
# (presumably chosen from the dendrograms — confirm)
dendo_model = AgglomerativeClustering(distance_threshold=None, n_clusters=5, linkage="average")
labels_dendo = dendo_model.fit_predict(df)

In [None]:
# Distinct cluster labels that were assigned
set(labels_dendo)

In [None]:
# Store the HAC labels next to the ratings.
# NOTE: this adds a non-feature column to df; exclude it before df is passed
# to any further clustering step.
df["dendo_labels"] = labels_dendo

In [None]:
# sns.pairplot(df, hue="dendo_labels")
# plt.show()

<a id='optics'></a>

### 3.3. OPTICS (Ordering Points To Identify the Clustering Structure)

In [None]:
def optics_experiment(df, parameters: dict[str, list]):
    """Fit one OPTICS model for every combination of the given parameter values.

    Args:
        df: Data to cluster; passed unchanged to ``OPTICS.fit``.
        parameters: Maps OPTICS keyword-argument names to the list of values
            to try, e.g. ``{"min_samples": [40, 50], "xi": [0.05, 0.025]}``.
            Values are bound to the estimator by name, so neither the order
            nor the number of keys matters.

    Returns:
        A list with one ``OPTICSResults`` per combination, in the order
        produced by ``itertools.product``. Reachability values and labels are
        stored in cluster order (indexed by ``ordering_``), not in the row
        order of ``df``.
    """
    names = list(parameters)
    space = np.arange(len(df))

    # itertools.product has no length, so the progress-bar total is computed up front
    n_combinations = int(np.prod([len(values) for values in parameters.values()]))

    results = []
    for combination in tqdm(itertools.product(*parameters.values()), total=n_combinations):
        # Pair every value with its parameter name instead of relying on its position
        optics_clustering = OPTICS(**dict(zip(names, combination))).fit(df)
        order = optics_clustering.ordering_
        results.append(
            OPTICSResults(
                space=space,
                reachability=optics_clustering.reachability_[order],
                targets=optics_clustering.labels_[order],
                params=optics_clustering.get_params(),
            )
        )
    return results

In [None]:
# Grid of OPTICS settings; each key is the name of an OPTICS keyword argument
# and each list holds the values to try (2 * 2 * 2 * 2 = 16 combinations).
# NOTE(review): "minkowski" with the default p=2 is presumably identical to
# "euclidean", which would make half of the runs duplicates — confirm.
parameters = {
    "min_samples": [40, 50],
    "metric": ["minkowski", "euclidean"],
    "xi": [0.05, 0.025],
    "min_cluster_size": [0.1, 0.2],
}

In [None]:
# Cluster on the rating features only: the "dendo_labels" column added during
# the HAC step is not a feature and would otherwise enter the distance
# computation. errors="ignore" keeps the cell runnable when the column is absent.
optics_res = optics_experiment(df.drop(columns=["dendo_labels"], errors="ignore"), parameters)

In [None]:
# Reachability diagrams for all OPTICS runs on a grid that is two wide
# (grid_size is presumably (rows, columns) — confirm in the helper).
# The first grid dimension is rounded up with integer arithmetic: round()
# rounds halves to the nearest even number (round(2.5) == 2), which would
# leave some odd result counts one grid slot short.
draw_plot(
    optics_res,
    figsize=(16, 18),
    grid_size=((len(optics_res) + 1) // 2, 2),
    plot_type="reachability",
    labels=["", "Reachability distance"],
    # top_cut_off=3,
    title="Reachability Diagram",
)

In [None]:
# Cluster labels of the fourth-to-last parameter combination
optics_res[-4].targets

In [None]:
# Plot the data coloured by the labels of the fourth-to-last OPTICS run.
# NOTE(review): optics_experiment stores the labels in OPTICS cluster order
# (labels_[ordering_]), while the rows of df keep their original order — if
# the helper matches hue to rows by position, the colours are misaligned; confirm.
draw_plot(df, hue=optics_res[-4].targets)

---
<a id='model-eval'></a>

## 4. Model Evaluation

bla

---
<a id='results'></a>

## 5. Results

bla

---
<a id='ref'></a>

## References

<p>[1] http://archive.ics.uci.edu/ml/datasets/Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions</p>