# Mini Project #2
##### 스마트폰 수집 신호를 이용한 **인간 행위** 인식


In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) numerically. A plain string comparison is lexicographic,
# so "0.9" >= "0.20" would wrongly pass and "0.100" >= "0.20" would wrongly fail.
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version and tuple(map(int, _sklearn_version.groups())) >= (0, 20)

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the output directory up front; exist_ok=True keeps this cell safe to re-run.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH.

    Args:
        fig_id: Base file name (without extension); also echoed in the log line.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as ``format``.
        resolution: Passed to ``savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")


from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from datetime import datetime

# Data Analysis

In [2]:

import pandas as pd
# Default location of the training CSV, relative to the notebook.
ADL_DATA_PATH = "../datasets/train.csv"


def load_ADL_data(adl_path=ADL_DATA_PATH):
    """Read the CSV at ``adl_path`` and return it as a pandas DataFrame.

    Args:
        adl_path: Path to the CSV file; defaults to ADL_DATA_PATH.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default settings.
    """
    adl_frame = pd.read_csv(adl_path)
    return adl_frame

adl = load_ADL_data()
adl.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,STANDING
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,STANDING
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,STANDING
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,STANDING
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,STANDING


In [3]:
adl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7352 entries, 0 to 7351
Columns: 563 entries, tBodyAcc-mean()-X to Activity
dtypes: float64(561), int64(1), object(1)
memory usage: 31.6+ MB


In [4]:
adl.describe()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject
count,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,...,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0,7352.0
mean,0.274488,-0.017695,-0.109141,-0.605438,-0.510938,-0.604754,-0.630512,-0.526907,-0.60615,-0.468604,...,-0.307009,-0.625294,0.008684,0.002186,0.008726,-0.005981,-0.489547,0.058593,-0.056515,17.413085
std,0.070261,0.040811,0.056635,0.448734,0.502645,0.418687,0.424073,0.485942,0.414122,0.544547,...,0.321011,0.307584,0.336787,0.448306,0.608303,0.477975,0.511807,0.29748,0.279122,8.975143
min,-1.0,-1.0,-1.0,-1.0,-0.999873,-1.0,-1.0,-1.0,-1.0,-1.0,...,-0.995357,-0.999765,-0.97658,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
25%,0.262975,-0.024863,-0.120993,-0.992754,-0.978129,-0.980233,-0.993591,-0.978162,-0.980251,-0.936219,...,-0.542602,-0.845573,-0.121527,-0.289549,-0.482273,-0.376341,-0.812065,-0.017885,-0.143414,8.0
50%,0.277193,-0.017219,-0.108676,-0.946196,-0.851897,-0.859365,-0.950709,-0.857328,-0.857143,-0.881637,...,-0.343685,-0.711692,0.009509,0.008943,0.008735,-0.000368,-0.709417,0.182071,0.003181,19.0
75%,0.288461,-0.010783,-0.097794,-0.242813,-0.034231,-0.262415,-0.29268,-0.066701,-0.265671,-0.017129,...,-0.126979,-0.503878,0.150865,0.292861,0.506187,0.359368,-0.509079,0.248353,0.107659,26.0
max,1.0,1.0,1.0,1.0,0.916238,1.0,1.0,0.967664,1.0,1.0,...,0.989538,0.956845,1.0,1.0,0.998702,0.996078,1.0,0.478157,1.0,30.0


In [5]:
adl_activity = adl[["Activity"]]
adl_activity.head()

Unnamed: 0,Activity
0,STANDING
1,STANDING
2,STANDING
3,STANDING
4,STANDING


#### Category 형 Label 을 LabelEncoder() 를 통해 예측 가능한 형태로 인코딩

In [6]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder expects a 1-D array of labels. `adl_activity` is a one-column
# DataFrame, so select the column itself; passing the 2-D frame makes sklearn
# emit a column-vector conversion warning.
cat_encoder = LabelEncoder()
adl_label = cat_encoder.fit_transform(adl_activity["Activity"])
adl_label

  return f(**kwargs)


array([2, 2, 2, ..., 5, 5, 5])

In [7]:
# Replace the string Activity column with its integer codes (adl_label).
# NOTE: this mutates `adl` in place; any later read of adl["Activity"]
# (including re-running the cell that builds `adl_activity`) sees integers, not names.
adl["Activity"] = adl_label

In [8]:
# Pairwise correlations between all columns, then the Activity column
# listed from most negative to most positive.
corr_matrix = adl.corr()
corr_matrix.loc[:, "Activity"].sort_values()

tBodyAcc-min()-X            -0.802259
tBodyAcc-min()-Y            -0.781408
tBodyAccJerk-min()-X        -0.778826
tBodyGyro-min()-Z           -0.776684
tGravityAcc-min()-Y         -0.749414
                               ...   
tBodyAcc-sma()               0.835621
tBodyAccJerk-entropy()-Y     0.837034
tBodyGyroJerk-entropy()-Z    0.844754
fBodyAccJerk-entropy()-X     0.845190
Activity                     1.000000
Name: Activity, Length: 563, dtype: float64

In [9]:
# Split into feature matrix and target without touching `adl`:
# drop() returns a new frame, so the original DataFrame is left unmodified.
temp = adl.drop(columns=["subject"])
adl_prepared = temp.drop(columns=["Activity"])
adl_labels = temp["Activity"]

# Evaluating Models

In [10]:
def display_scores(scores):
    """Print the raw scores, then their mean and standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` (e.g. the array
            returned by ``cross_val_score``).
    """
    # Each value is computed lazily so that lines are printed in order,
    # one at a time, exactly as three consecutive print calls would do.
    report = (
        ("Scores:", lambda: scores),
        ("Mean:", lambda: scores.mean()),
        ("Standard deviation:", lambda: scores.std()),
    )
    for label, compute in report:
        print(label, compute())

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [12]:
# Shared evaluation settings used by the cross-validation and grid-search cells.
seed = 42  # random_state for the KFold shuffle and the seeded tree ensembles
scoring = 'accuracy'
# Despite its name, num_folds is the KFold splitter object (10 shuffled folds),
# not a fold count; it is passed as `cv=`.
# NOTE(review): rows are shuffled without regard to `subject`, so the same subject
# can appear in both train and validation folds — confirm this is intended
# (GroupKFold on `subject` would give a subject-independent estimate).
num_folds = KFold(n_splits=10, shuffle=True, random_state=seed)

#### 대표적인 예측(분류) 모델들의 성능 평가

In [37]:
# Candidate classifiers as (short name, estimator) pairs, evaluated in this order.
models = [
    ('SVM', SVC(kernel='linear')),
    ('RF', RandomForestClassifier(random_state=seed, n_estimators=100)),
    ('ET', ExtraTreesClassifier(random_state=seed, n_estimators=100)),
    ('AB', AdaBoostClassifier(random_state=seed)),
]
# Accumulators filled by the evaluation loop: per-fold scores, names, mean accuracies.
results, names, metrics = [], [], []

In [38]:
# Reset the accumulators in this cell so that re-running it does not append
# duplicate entries to lists created in an earlier cell.
results, names, metrics = [], [], []
for name, model in models:
    # 10-fold cross-validated accuracy for this estimator.
    cv_results = cross_val_score(model, adl_prepared, adl_labels, cv=num_folds, scoring=scoring)
    mean_accuracy = cv_results.mean()
    results.append(cv_results)
    names.append(name)
    metrics.append(mean_accuracy)
    # Per-model line is a percentage; the final average below is a fraction in [0, 1].
    msg = "%s: %f%%" % (name, mean_accuracy * 100)
    print(msg)
print('Average metrics (Accuracy) from all models:', np.mean(metrics))

SVM: 98.585422%
RF: 98.095737%
ET: 98.680549%
AB: 54.488539%
Average metrics (Accuracy) from all models: 0.8746256192694468


# Hyperparameter Tuning

#### ExtraTreesClassifier 의 성능이 가장 좋은 것으로 보여지므로, 하이퍼 파라미터 튜닝 시도 (기본 : 98.68%)

In [40]:
from sklearn.model_selection import GridSearchCV

et_clf = ExtraTreesClassifier(random_state=seed, n_estimators=100)

# min_samples_split must be an int >= 2 (or a float fraction), so the former
# candidates None and 1 were invalid: every fit using them failed, which only
# wasted grid slots (or aborted the search, depending on the sklearn version).
param_grid = {
    'n_estimators': [130, 150, 170, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 4, 8, 16],
}

# Exhaustive search over the grid using the shared CV splitter and metric.
gcv = GridSearchCV(et_clf, param_grid=param_grid, cv=num_folds, scoring=scoring)

gcv.fit(adl_prepared, adl_labels)
print('Best Parmas', gcv.best_params_)
print('Best Score', gcv.best_score_)

Best Parmas {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 150}
Best Score 0.9878935595977522


#### 하이퍼 파라미터 튜닝 결과 정확도 98.68% -> 98.79% 로 향상

In [42]:
# Re-evaluate ExtraTrees with the best grid-search settings on the same folds.
et_clf = ExtraTreesClassifier(n_estimators=150, min_samples_split=2, random_state=seed)
et_scores = cross_val_score(
    estimator=et_clf,
    X=adl_prepared,
    y=adl_labels,
    cv=num_folds,
    scoring=scoring,
)

display_scores(et_scores)

Scores: [0.99048913 0.99184783 0.99047619 0.98095238 0.98367347 0.98503401
 0.99319728 0.98095238 0.99183673 0.99047619]
Mean: 0.9878935595977522
Standard deviation: 0.00449151790579966


#### ExtraTree 이외에도 Gradient Boosting 등의 시도를 위해 XGBClassifier 사용

In [14]:
from xgboost import XGBClassifier

# Baseline XGBoost classifier scored with the same folds and metric as the other models.
xgb_clf = XGBClassifier(learning_rate=0.08, gamma=0, n_estimators=100)
cv_results = cross_val_score(
    estimator=xgb_clf,
    X=adl_prepared,
    y=adl_labels,
    scoring=scoring,
    cv=num_folds,
)

display_scores(cv_results)

Scores: [0.99048913 0.99048913 0.98503401 0.9877551  0.98911565 0.99183673
 0.99319728 0.98639456 0.99047619 0.99183673]
Mean: 0.9896624519372968
Standard deviation: 0.0024498475717152058


#### XGBClassifier 가 ExtraTree 보다 더 나은 정확도를 보이므로 하이퍼 파라미터 튜닝 시도

In [17]:
from sklearn.model_selection import GridSearchCV

xgb_clf = XGBClassifier()

# Search over ensemble size and tree depth; a max_depth of None leaves the
# estimator's own default in place.
param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[None, 10, 30, 50],
)

gcv = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, scoring=scoring, cv=num_folds)
gcv.fit(adl_prepared, adl_labels)

print('Best Parmas', gcv.best_params_)
print('Best Score', gcv.best_score_)

Best Parmas {'max_depth': None, 'n_estimators': 300}
Best Score 0.9931991274770778


In [18]:
# Final model: XGBoost with the n_estimators value selected by the grid search.
xgb_clf = XGBClassifier(n_estimators=300)
cv_results = cross_val_score(
    estimator=xgb_clf,
    X=adl_prepared,
    y=adl_labels,
    scoring=scoring,
    cv=num_folds,
)

display_scores(cv_results)

Scores: [0.99456522 0.99184783 0.99319728 0.98639456 0.99047619 0.99591837
 0.99455782 0.99319728 0.99727891 0.99455782]
Mean: 0.9931991274770778
Standard deviation: 0.0029178723317281547


### 최종 Accuracy : 99.32%