<a href="https://colab.research.google.com/github/Ivan-Nebogatikov/HumanActivityRecognition/blob/master/Activity_recognition_with_3_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

features = pd.read_csv('/final_ecg_data.csv')
features = features[features['ECG'] > 0.]
activities = list(sorted(set(features['activity'])))
print("Activities:", activities)
#features['activity'] = list(map(lambda x: activities.index(x), features['activity']))
features = features[features['activity'] != 'down the stairs']
features = features[features['activity'] != 'up the stairs']
#print(features.head(5)) 
print('The shape of our features is:', features.shape)

features.describe()

Activities: ['down the stairs', 'inactive', 'run', 'up the stairs', 'walk']
The shape of our features is: (87235, 2)


Unnamed: 0,ECG
count,87235.0
mean,0.056951
std,0.04065
min,3.1e-05
25%,0.032104
50%,0.04361
75%,0.085968
max,0.124969


In [0]:
import numpy as np
labels = np.array(features['activity']) # значения, которые надо будет предсказывать
features= features.drop('activity', axis = 1)
feature_list = list(features.columns)
features = np.array(features)

Разделяем на обучающую и тестовую выборки

In [0]:
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.15, random_state = 543)
print('Training Features Shape:', train_features.shape)
print('Testing Features Shape:', test_features.shape)

Training Features Shape: (74149, 1)
Testing Features Shape: (13086, 1)


Строим случайный лес

In [0]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 100, random_state = 1) # Классификатор с 100 деревьями
rf.fit(train_features, train_labels);


Вспомогательная функция для построения таблицы с AUC

In [0]:
import pandas as pd
import numpy as np
from scipy import interp

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer

def class_report(y_true, y_pred, y_score=None, average='micro'):
    if y_true.shape != y_pred.shape:
        print("Error! y_true %s is not the same shape as y_pred %s" % (
              y_true.shape,
              y_pred.shape)
        )
        return

    print("Accuracy:", accuracy_score(y_true, y_pred))

    lb = LabelBinarizer()

    if len(y_true.shape) == 1:
        lb.fit(y_true)

    #Value counts of predictions
    labels, cnt = np.unique(
        y_pred,
        return_counts=True)
    n_classes = 5
    pred_cnt = pd.Series(cnt, index=labels)

    metrics_summary = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            labels=labels)

    avg = list(precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred,
            average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index,
        columns=labels)

    support = class_report_df.loc['support']
    total = support.sum() 
    class_report_df['avg / total'] = avg[:-1] + [total]

    class_report_df = class_report_df.T
    class_report_df['pred'] = pred_cnt
    class_report_df['pred'].iloc[-1] = total

    if not (y_score is None):
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for label_it, label in enumerate(labels):
            fpr[label], tpr[label], _ = roc_curve(
                (y_true == label).astype(int), 
                y_score[:, label_it])

            roc_auc[label] = auc(fpr[label], tpr[label])

        if average == 'micro':
            if n_classes <= 2:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                    lb.transform(y_true).ravel(), 
                    y_score[:, 1].ravel())
            else:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                        lb.transform(y_true).ravel(), 
                        y_score.ravel())

            roc_auc["avg / total"] = auc(
                fpr["avg / total"], 
                tpr["avg / total"])

        elif average == 'macro':
            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([
                fpr[i] for i in labels]
            ))

            # Then interpolate all ROC curves at this points
            mean_tpr = np.zeros_like(all_fpr)
            for i in labels:
                mean_tpr += interp(all_fpr, fpr[i], tpr[i])

            # Finally average it and compute AUC
            mean_tpr /= n_classes

            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr

            roc_auc["avg / total"] = auc(fpr["macro"], tpr["macro"])

        class_report_df['AUC'] = pd.Series(roc_auc)

    print(class_report_df)

Проверка на тестовых данных. 

In [0]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn import metrics

predictions = list(rf.predict(test_features))
pred_prob = rf.predict_proba(test_features)

class_report(
    y_true=test_labels, 
    y_pred=np.asarray(predictions), 
    y_score=pred_prob, average='micro')

Accuracy: 0.6665902491211982
             precision    recall  f1-score  support     pred       AUC
inactive      0.616322  0.702295  0.656506   1656.0   1887.0  0.923518
run           0.754862  0.641971  0.693855   5804.0   4936.0  0.802214
walk          0.612167  0.681479  0.644966   5626.0   6263.0  0.724526
avg / total   0.675982  0.666590  0.668110  13086.0  13086.0  0.845300



---

Сравним с методом опорных векторов

In [0]:
from sklearn import svm
clf = svm.SVC(probability=True, class_weight='balanced')
clf.fit(train_features, train_labels)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [0]:
predictions_svm = list(clf.predict(test_features))
pred_prob_svm = clf.predict_proba(test_features)

class_report(
    y_true=test_labels, 
    y_pred=np.asarray(predictions_svm), 
    y_score=pred_prob_svm, average='micro')

Accuracy: 0.6144734831117225
             precision    recall  f1-score  support     pred       AUC
inactive      0.386822  0.960749  0.551569   1656.0   4113.0  0.931720
run           0.805437  0.617677  0.699171   5804.0   4451.0  0.816909
walk          0.633569  0.509243  0.564643   5626.0   4522.0  0.730616
avg / total   0.678572  0.614473  0.622656  13086.0  13086.0  0.851574


---

Accuracy ниже, как и в случае с 5 классами.

Точность для каждого класса выше, чем в методе SVM с 5 классами, поскольку классы больше различаются.