In [22]:
import pandas as pd
import matplotlib.pyplot as plt

Load the feature names and the corresponding data

In [23]:
# Location of features.txt (inside the UCI_HAR_Smartphone_dataset folder of the repo).
features_url = "https://raw.githubusercontent.com/KnightChaser/ML-challenge/main/EX_challenging_problems/08_HAR/UCI_HAR_Smartphone_dataset/features.txt"

In [24]:
# features.txt is read without a header row; each line is split on whitespace into
# an index column and a feature-name column.
features_name_df = pd.read_csv(features_url,
                               sep = r'\s+',    # regex: one or more whitespace characters act as the delimiter
                               header = None,
                               names = ['column_index', 'column_name'])

In [25]:
# Display the parsed (column_index, column_name) table.
features_name_df

Unnamed: 0,column_index,column_name
0,1,tBodyAcc-mean()-X
1,2,tBodyAcc-mean()-Y
2,3,tBodyAcc-mean()-Z
3,4,tBodyAcc-std()-X
4,5,tBodyAcc-std()-Y
...,...,...
556,557,"angle(tBodyGyroMean,gravityMean)"
557,558,"angle(tBodyGyroJerkMean,gravityMean)"
558,559,"angle(X,gravityMean)"
559,560,"angle(Y,gravityMean)"


In [34]:
# Plain Python list of the feature names (the second column of features_name_df),
# later used to label the columns of X_train / X_test.
features_name = features_name_df['column_name'].tolist()
features_name

['tBodyAcc-mean()-X',
 'tBodyAcc-mean()-Y',
 'tBodyAcc-mean()-Z',
 'tBodyAcc-std()-X',
 'tBodyAcc-std()-Y',
 'tBodyAcc-std()-Z',
 'tBodyAcc-mad()-X',
 'tBodyAcc-mad()-Y',
 'tBodyAcc-mad()-Z',
 'tBodyAcc-max()-X',
 'tBodyAcc-max()-Y',
 'tBodyAcc-max()-Z',
 'tBodyAcc-min()-X',
 'tBodyAcc-min()-Y',
 'tBodyAcc-min()-Z',
 'tBodyAcc-sma()',
 'tBodyAcc-energy()-X',
 'tBodyAcc-energy()-Y',
 'tBodyAcc-energy()-Z',
 'tBodyAcc-iqr()-X',
 'tBodyAcc-iqr()-Y',
 'tBodyAcc-iqr()-Z',
 'tBodyAcc-entropy()-X',
 'tBodyAcc-entropy()-Y',
 'tBodyAcc-entropy()-Z',
 'tBodyAcc-arCoeff()-X,1',
 'tBodyAcc-arCoeff()-X,2',
 'tBodyAcc-arCoeff()-X,3',
 'tBodyAcc-arCoeff()-X,4',
 'tBodyAcc-arCoeff()-Y,1',
 'tBodyAcc-arCoeff()-Y,2',
 'tBodyAcc-arCoeff()-Y,3',
 'tBodyAcc-arCoeff()-Y,4',
 'tBodyAcc-arCoeff()-Z,1',
 'tBodyAcc-arCoeff()-Z,2',
 'tBodyAcc-arCoeff()-Z,3',
 'tBodyAcc-arCoeff()-Z,4',
 'tBodyAcc-correlation()-X,Y',
 'tBodyAcc-correlation()-X,Z',
 'tBodyAcc-correlation()-Y,Z',
 'tGravityAcc-mean()-X',
 'tGravityA

In [80]:
# Locations of the feature matrices for the train and test splits.
X_train_url = 'https://raw.githubusercontent.com/KnightChaser/ML-challenge/main/EX_challenging_problems/08_HAR/UCI_HAR_Smartphone_dataset/train/X_train.txt'
X_test_url  = 'https://raw.githubusercontent.com/KnightChaser/ML-challenge/main/EX_challenging_problems/08_HAR/UCI_HAR_Smartphone_dataset/test/X_test.txt'

In [81]:
# features_name contains repeated entries, and read_csv's `names=` argument does not
# accept duplicate labels. Make every repeated name unique by appending a running
# number (1, 2, 3, ...) to each of its occurrences; names that occur only once are
# left untouched.

from collections import Counter
from itertools import count


# How many times each feature name occurs.
c = Counter(features_name)

# One independent counter (starting at 1) per name that occurs more than once.
iters = {name: count(1) for name in c if c[name] > 1}

features_name_processed = [
    f"{x}{next(iters[x])}" if x in iters else x
    for x in features_name
]

In [86]:
# Read the train/test feature matrices: no header row, whitespace-separated values,
# columns labelled with the de-duplicated feature names.
# `sep = r'\s+'` replaces the deprecated `delim_whitespace = True` (same parsing).
X_train = pd.read_csv(X_train_url,
                      header = None,
                      sep = r'\s+',
                      names = features_name_processed)

X_test = pd.read_csv(X_test_url,
                     header = None,
                     sep = r'\s+',
                     names = features_name_processed)

In [89]:
# Locations of the label files for the train and test splits.
y_train_url = 'https://raw.githubusercontent.com/KnightChaser/ML-challenge/main/EX_challenging_problems/08_HAR/UCI_HAR_Smartphone_dataset/train/y_train.txt'
y_test_url  = 'https://raw.githubusercontent.com/KnightChaser/ML-challenge/main/EX_challenging_problems/08_HAR/UCI_HAR_Smartphone_dataset/test/y_test.txt'

In [101]:
# Read the train/test labels: no header row, whitespace-separated values.
# The result is a DataFrame whose only column is labelled 0.
# `sep = r'\s+'` replaces the deprecated `delim_whitespace = True` (same parsing).
y_train = pd.read_csv(y_train_url,
                      header = None,
                      sep = r'\s+')

y_test = pd.read_csv(y_test_url,
                     header = None,
                     sep = r'\s+')

In [102]:
# Sanity check: each X/y pair should have the same number of rows.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7352, 561), (2947, 561), (7352, 1), (2947, 1))

In [104]:
# Peek at the first rows of the training features.
X_train.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.074323,-0.298676,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,0.158075,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,0.414503,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,0.404573,-0.11729,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,0.087753,-0.351471,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892


In [105]:
# Peek at the first rows of the training labels.
y_train.head()

Unnamed: 0,0
0,5
1,5
2,5
3,5
4,5


### Decision Tree Classifier (DTC)

In [109]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# max_depth is intentionally not set on the base estimator: GridSearchCV overrides it
# with every candidate from `params`, so a constructor value would never be used.
dt_classifier = DecisionTreeClassifier(random_state = 0xCAFE)

# Candidate tree depths: 2, 4, ..., 30.
params = {
    'max_depth' : list(range(2, 31, 2))
}

grid_cv = GridSearchCV(dt_classifier,
                       param_grid = params,
                       scoring = 'accuracy',            # rank candidates by classification accuracy
                       cv = 5,                          # 5-fold cross-validation
                       return_train_score = True,       # also record train scores to compare against test scores
                       n_jobs = -1)                     # use all available CPU cores

grid_cv.fit(X_train , y_train)

In [111]:
# Tabulate the grid-search results and show, per candidate depth, the mean
# cross-validated test and train accuracy.
cv_results_df = pd.DataFrame(grid_cv.cv_results_)
cv_results_df.loc[:, ['param_max_depth', 'mean_test_score', 'mean_train_score']]

# Judging by mean_test_score, param_max_depth == 8 looks like the most appropriate choice.

Unnamed: 0,param_max_depth,mean_test_score,mean_train_score
0,2,0.544886,0.545022
1,4,0.842634,0.899449
2,6,0.84603,0.944913
3,8,0.851477,0.982692
4,10,0.849031,0.993369
5,12,0.851073,0.997212
6,14,0.845496,0.998776
7,16,0.850393,0.999626
8,18,0.8508,0.999898
9,20,0.849169,0.999966


In [113]:
# Verification with test data: refit one decision tree per candidate depth on the
# full training set and report its accuracy on the test set.

max_depths = list(range(2, 31, 2))   # 2, 4, ..., 30

for depth in max_depths:
    dt_classifier = DecisionTreeClassifier(random_state = 0xCAFE,
                                           max_depth = depth)
    # fit() returns the fitted estimator, so prediction can be chained onto it.
    pred = dt_classifier.fit(X_train , y_train).predict(X_test)
    accuracy = accuracy_score(y_test , pred)
    print(f"MAX DEPTH: {depth} | ACC: {accuracy}")

MAX DEPTH: 2 | ACC: 0.5310485239226331
MAX DEPTH: 4 | ACC: 0.8096369189005769
MAX DEPTH: 6 | ACC: 0.8561248727519511
MAX DEPTH: 8 | ACC: 0.8751272480488632
MAX DEPTH: 10 | ACC: 0.8646080760095012
MAX DEPTH: 12 | ACC: 0.8669833729216152
MAX DEPTH: 14 | ACC: 0.8625721072276892
MAX DEPTH: 16 | ACC: 0.8646080760095012
MAX DEPTH: 18 | ACC: 0.8598574821852731
MAX DEPTH: 20 | ACC: 0.8598574821852731
MAX DEPTH: 22 | ACC: 0.8598574821852731
MAX DEPTH: 24 | ACC: 0.8598574821852731
MAX DEPTH: 26 | ACC: 0.8598574821852731
MAX DEPTH: 28 | ACC: 0.8598574821852731
MAX DEPTH: 30 | ACC: 0.8598574821852731


Test accuracy is highest when `max_depth` is `8` (about 0.875), which is also the depth with the best mean cross-validation score above.

### Random Forest Classifier

In [117]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid for the random forest (3 * 4 * 2 * 2 = 48 candidates).
params = {
    'max_depth' : [6, 8, 10],
    'n_estimators' : [25, 50, 100, 200],
    'min_samples_leaf' : [8, 12],
    'min_samples_split' : [8, 12]
}

rf_classifier = RandomForestClassifier(random_state = 0xCAFE,
                                       n_jobs=-1)

grid_cv = GridSearchCV(rf_classifier,
                       param_grid = params,
                       cv = 3,          # 3-fold cross-validation
                       verbose = 10,    # print progress for every fit
                       n_jobs=-1)       # run the fits on all available CPU cores

# y_train is a single-column DataFrame; flatten it to a 1-D array so the forest gets
# targets of shape (n_samples,) instead of a column vector (which makes scikit-learn
# emit a DataConversionWarning while fitting).
grid_cv.fit(X_train, y_train.values.ravel())

Fitting 3 folds for each of 48 candidates, totalling 144 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [122]:
# Tabulate the random-forest grid-search results and show the five best-ranked
# candidates with their score, number of trees and depth.
cv_results_df = pd.DataFrame(grid_cv.cv_results_)

target_col = ['rank_test_score', 'mean_test_score', 'param_n_estimators', 'param_max_depth']
cv_results_df.loc[:, target_col].sort_values(by = 'rank_test_score').head()

Unnamed: 0,rank_test_score,mean_test_score,param_n_estimators,param_max_depth
35,1,0.918934,200,10
39,1,0.918934,200,10
46,3,0.91771,100,10
42,3,0.91771,100,10
26,5,0.917438,100,8


It looks like the model performs best when `n_estimators` is `200` and `max_depth` is `10`. Additionally, the Random Forest Classifier performs better than the single Decision Tree (mean cross-validation accuracy of about 0.919 versus about 0.851), which is the generally accepted and expected result.


It's also anticipated that other boosting algo