# Распознавание активности (Human Activity Recognition - HAR)

С.Ю. Папулин (papulin.study@yandex.ru)

### Содержание

- [Анализ исходных данных](#Анализ-исходных-данных)
- [Построение модели распознавания активности](#Построение-модели-распознавания-активности)
- [Выбор модели](#Выбор-модели)

Подключение модулей

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Seed passed as `random_state` to the shuffling and model-building steps below
RANDOM_STATE: int = 1234

## Анализ исходных данных

Описание: [Human Activity Recognition Using Smartphones Data Set](https://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones)

Ссылка: [UCI HAR Dataset.zip](https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip)

Загрузка исходных данных

In [None]:
# Placeholder: replace with the local path of the unpacked "UCI HAR Dataset" directory
YOUR_PATH: str = "/YOUR_PATH/UCI HAR Dataset"

In [None]:
# Sub-directories holding the training and the test part of the dataset
_TRAIN_DIR = f"{YOUR_PATH}/train"
_TEST_DIR = f"{YOUR_PATH}/test"

# Activity names
LABEL_NAMES_FILE = f"{YOUR_PATH}/activity_labels.txt"

# Names of the feature columns
FEATURE_NAMES_FILE = f"{YOUR_PATH}/features.txt"

# Subject identifiers (training part)
X_TRAIN_SUBJECT_FILE = f"{_TRAIN_DIR}/subject_train.txt"


# Features (time-domain, frequency-domain) and target values (activities)

# Training part
X_TRAIN_FILE = f"{_TRAIN_DIR}/X_train.txt"
Y_TRAIN_FILE = f"{_TRAIN_DIR}/y_train.txt"

# Test part
X_TEST_FILE = f"{_TEST_DIR}/X_test.txt"
Y_TEST_FILE = f"{_TEST_DIR}/y_test.txt"

In [None]:
# Load the activity names: the second whitespace-separated token of every line
with open(LABEL_NAMES_FILE, "r") as labels_file:
    LABEL_NAMES = [row.split()[1] for row in labels_file]
LABEL_NAMES

In [None]:
# Load the feature column names: the second whitespace-separated token of every line
with open(FEATURE_NAMES_FILE, "r") as features_file:
    CLMS = [row.split()[1] for row in features_file]
len(CLMS)

In [None]:
# Prefix every feature name with its 1-based position, e.g. "1.tBodyAcc-mean()-X".
# The guard keeps this cell idempotent: running it a second time must not turn
# "1.name" into "1.1.name", which would break the column lookups further down.
# NOTE(review): assumes no raw feature name already starts with "<its position>." — confirm against features.txt
CLMS = [
    name if name.startswith(f"{position}.") else f"{position}.{name}"
    for position, name in enumerate(CLMS, start=1)
]

In [None]:
# Show the first few column names
CLMS[:5]

In [None]:
# Load the subject identifiers into a single "subject" column.
# The separator is written as a raw string: "\s" inside a plain literal is an
# invalid escape sequence, which newer Python versions warn about.
df_subjects = pd.read_csv(X_TRAIN_SUBJECT_FILE, header=None, sep=r"\s+", names=["subject"])
df_subjects.head()

In [None]:
# Load the feature matrix (whitespace-delimited, no header row); columns are named after CLMS.
# The separator is written as a raw string: "\s" inside a plain literal is an
# invalid escape sequence, which newer Python versions warn about.
df_features = pd.read_csv(X_TRAIN_FILE, header=None, sep=r"\s+", names=CLMS)
df_features.head()

In [None]:
# Load the target values into a single "activity" column (the file has no header row)
df_labels = pd.read_csv(Y_TRAIN_FILE, names=["activity"], header=None)
df_labels.head(5)

In [None]:
# Combine subjects, features and labels side by side into one dataframe
training_parts = [df_subjects, df_features, df_labels]
df = pd.concat(training_parts, axis="columns")
df.head()

Отображение количества различных активностей

In [None]:
# 6 x 4 inch figure with a single axes
fig, ax = plt.subplots(1, 1, figsize=(6, 4))

# Number of rows per activity id, drawn as bars
activity_counts = df.groupby("activity").size()
ax = activity_counts.plot.bar(ax=ax)

ax.set_xticklabels(LABEL_NAMES, rotation=60)
ax.set_ylabel("window count")
ax.grid(True)

Распределение активностей 5-го испытуемого

In [None]:
# Identifier of the subject whose rows are inspected in the cells below
SUBJECT_ID: int = 5

In [None]:
# 6 x 4 inch figure with a single axes
fig, ax = plt.subplots(1, 1, figsize=(6, 4))

# Number of rows per activity id for the selected subject only
subject_counts = df[df["subject"] == SUBJECT_ID]\
    .groupby("activity")\
    .size()
ax = subject_counts.plot.bar(ax=ax)

# Label each bar by the activity id it actually represents (ids are 1-based)
# instead of positionally: if the subject has no rows for some activity, the
# full LABEL_NAMES list would no longer line up with the bars.
ax.set_xticklabels(
    [LABEL_NAMES[activity_id - 1] for activity_id in subject_counts.index],
    rotation=60
)
ax.set_ylabel("window count")
ax.grid(True)

Отображение данных от акселерометра по координатам

In [None]:
# Per-axis accelerometer feature columns plus the activity label
ACC_CLMS = [
    "1.tBodyAcc-mean()-X",
    "2.tBodyAcc-mean()-Y",
    "3.tBodyAcc-mean()-Z",
    "activity",
]

# Keep only the selected subject's rows and renumber them from 0
subject_mask = df["subject"] == SUBJECT_ID
df_acc = df.loc[subject_mask, ACC_CLMS].reset_index(drop=True)
df_acc.head()

In [None]:
# Three stacked panels on a 12 x 12 inch figure
fig, axes = plt.subplots(3, 1, figsize=(12, 12))

# One panel per feature column; zip stops after the three axes, so the
# trailing "activity" entry of ACC_CLMS is never plotted
for ax, column in zip(axes, ACC_CLMS):
    values = df_acc[column]

    # Grey line: the whole sequence for this column
    values.plot(ax=ax, color="grey")

    ax.set_title("Subject {}: {}".format(SUBJECT_ID, column))
    ax.set_xlabel("window index")
    ax.set_ylabel("acc")

    # Markers: the same values, one labelled group per activity (ids start at 1)
    for activity_id, activity_name in enumerate(LABEL_NAMES, start=1):
        values[df_acc["activity"] == activity_id].plot(
            marker="o", linestyle="", ax=ax,
            label=activity_name
        )

    ax.grid(True)
    ax.legend()

plt.tight_layout()
plt.show()

## Построение модели распознавания активности

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
import sys
sys.path.insert(0, "../lib/")
from plot_confusion_matrix import plot_confusion_matrix

In [None]:
from sklearn.utils import shuffle

In [None]:
# Shuffled copy of the training dataframe; the fixed seed keeps the row order reproducible
df_ = shuffle(df, random_state=RANDOM_STATE)
df_.head()

In [None]:
# Build the model: L2-regularised logistic regression fitted with the newton-cg solver.
# `multi_class` is intentionally not passed: the argument is deprecated in recent
# scikit-learn releases, and for a multi-class target with this solver the default
# already fits a multinomial model.
model = LogisticRegression(penalty="l2",
                           max_iter=100,
                           solver="newton-cg",
                           random_state=RANDOM_STATE)

# Train on the shuffled training set
model.fit(df_[CLMS], df_["activity"])

Базовая отметка

In [None]:
# TODO: not implemented yet — presumably a baseline score to compare the model against; confirm the intent

Проверка на тестовом множестве

In [None]:
# Load the test set: features (whitespace-delimited, no header row) and target values.
# The separator is written as a raw string: "\s" inside a plain literal is an
# invalid escape sequence, which newer Python versions warn about.
df_test_features = pd.read_csv(X_TEST_FILE, header=None, sep=r"\s+", names=CLMS)
df_test_labels = pd.read_csv(Y_TEST_FILE, header=None, names=["activity"])

# Features and labels side by side in one dataframe
df_test = pd.concat([df_test_features, df_test_labels], axis=1)
df_test.head()

In [None]:
# Predicted values for the test set, stored next to the true labels
test_predictions = model.predict(df_test[CLMS])
df_test["pred"] = test_predictions
df_test[["pred", "activity"]].head()

In [None]:
# Predicted values for the test set
df_test["pred"] = model.predict(df_test[CLMS])

# Share of correctly classified samples
accuracy = model.score(df_test[CLMS], df_test["activity"])
print("Accuracy = {}\n".format(accuracy))

# Remaining metrics, reported per activity
report = classification_report(
    df_test["activity"],
    df_test["pred"],
    target_names=LABEL_NAMES,
)
print(report)

In [None]:
# Вывод матрицы ошибок
ax = plot_confusion_matrix(df_test["activity"]-1,
                           df_test["pred"]-1,
                           classes=np.array(LABEL_NAMES),
                           figsize=[8,8])

## Выбор модели

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# 3-fold stratified splitter, passed as `cv` to the grid search below
kf = StratifiedKFold(n_splits=3)

In [None]:
# Candidate models: short name -> (estimator, hyper-parameter grid)
models = dict()

# k-nearest neighbours
models["knn"] = (
    KNeighborsClassifier(), {
        "n_neighbors": [5, 11]
    })

# Logistic regression.
# `multi_class` is intentionally not passed: the argument is deprecated in recent
# scikit-learn releases, and for a multi-class target with this solver the default
# already fits a multinomial model. The seed comes from the shared RANDOM_STATE
# constant rather than a repeated literal.
models["logreg"] = (
    LogisticRegression(
        penalty="l2",
        solver="newton-cg",
        random_state=RANDOM_STATE), {
        "C": [0.1, 1]
    })

# Random forest
models["rforest"] = (
    RandomForestClassifier(
        criterion="gini",
        random_state=RANDOM_STATE), {
        "n_estimators": [50, 100]
    })

In [None]:
# Run a cross-validated grid search for every candidate on the training set and
# record the best parameters together with the test-set score of the refitted
# best estimator.
result_index = []
result_rows = []

# The loop variable is deliberately not called `model`: that name is bound to the
# classifier fitted earlier in the notebook and must not be replaced by an
# unfitted estimator, otherwise re-running the prediction cells fails.
for name, (estimator, params) in models.items():
    grid = GridSearchCV(estimator=estimator,
                        param_grid=params,
                        cv=kf,
                        verbose=2)
    grid.fit(df_[CLMS], df_["activity"])
    result_index.append(estimator.__class__.__name__)
    result_rows.append({
        "params": grid.best_params_,
        "accuracy": grid.score(df_test[CLMS], df_test["activity"]),
    })

# Build the summary once instead of enlarging a dataframe row by row;
# rows are indexed by the estimator class name
df_result = pd.DataFrame(result_rows,
                         index=result_index,
                         columns=["params", "accuracy"])

In [None]:
# Show the best configuration of each model: its parameters and its accuracy
df_result.head()