In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import joblib

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Relative directory (with trailing slash) that the dataset file name is appended to when loading.
base = "../data/Deployment_2/"

## 1. Load Data

In [2]:
# Load the preprocessed table from `base` and display summary statistics of its numeric columns.
df = pd.read_csv(f"{base}dataset_2_preprocessing_with_mean_value.csv")
df.describe()

Unnamed: 0,Hour,Minute,Label,sensor1,sensor2,sensor3,sensor4,sensor5,sensor6,sensor20,sensor21,sensor22,sensor24,sensor30,sensor31,sensor32,sensor40,sensor41,sensor42
count,89128.0,89128.0,89128.0,58350.0,58350.0,8926.0,61291.0,61291.0,6260.0,69669.0,69670.0,69671.0,69669.0,69673.0,69673.0,69674.0,69673.0,69673.0,69672.0
mean,11.488096,29.499686,5.143053,213.429097,75.913267,5.141496,185.655136,27.708987,3.17404,0.116427,2.141378,0.313635,0.00274,13.694658,4.545994,0.208845,0.471602,1.498302,0.50589
std,6.922033,17.31608,4.170102,23.567194,131.605657,7.261928,21.355528,61.790032,1.774513,0.62744,57.777008,11.863136,0.265854,25.971177,5.845601,1.048652,4.421275,0.224572,4.44653
min,0.0,0.0,1.0,156.0,0.0,0.0,-341.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,15.0,1.0,196.166667,0.0,2.0,177.0,0.0,2.0,0.0,0.0,0.0,0.0,0.64,0.0,0.0,0.0,1.416667,0.0
50%,11.0,29.0,2.0,209.0,18.0,2.833333,187.666667,6.0,2.666667,0.0,0.0,0.0,0.0,0.88,0.0,0.0,0.0,1.5,0.0
75%,17.0,44.0,10.0,227.5,117.2,5.333333,196.666667,30.0,3.666667,0.0,0.0,0.0,0.0,1.391304,12.0,0.0,0.0,1.6,0.0
max,23.0,59.0,11.0,305.0,1154.0,172.0,279.666667,612.0,41.25,50.772727,1894.041667,1043.913043,32.913043,70.0,14.882353,6.352941,44.25,2.0,42.083333


In [3]:
# Peek at the first five raw rows (missing sensor readings still show as NaN here).
df.head(n=5)

Unnamed: 0,TimeID,Hour,Minute,Date,Label,Name,sensor1,sensor2,sensor3,sensor4,...,sensor20,sensor21,sensor22,sensor24,sensor30,sensor31,sensor32,sensor40,sensor41,sensor42
0,2013-04-14-20-15,20,15,2013-04-14,2.0,Watching TV,,,,194.0,...,0.0,0.0,0.0,0.0,5.0,12.304348,0.0,0.0,1.478261,0.0
1,2013-04-14-20-16,20,16,2013-04-14,2.0,Watching TV,,,,194.666667,...,0.0,0.0,0.0,0.0,68.333333,12.25,0.0,0.0,1.416667,0.0
2,2013-04-14-20-17,20,17,2013-04-14,2.0,Watching TV,208.0,0.0,,195.0,...,0.0,0.0,0.0,0.0,68.086957,11.956522,0.0,0.0,1.391304,0.0
3,2013-04-14-20-18,20,18,2013-04-14,2.0,Watching TV,,,,196.0,...,0.0,0.0,0.0,0.0,68.0,12.043478,2.26087,0.0,1.478261,0.0
4,2013-04-14-20-19,20,19,2013-04-14,2.0,Watching TV,,,,197.833333,...,0.0,0.0,0.0,0.0,67.913043,11.833333,5.916667,0.0,1.478261,0.0


## 2. Data processing

In [4]:
# Build the modelling table in one idempotent chain (`df` itself is never modified):
#   1. every missing value is replaced by 0 -- after this no NaN is left, so no
#      dropna() step is needed;
#   2. rows whose Label equals 10 are excluded;
#   3. the index is renumbered from 0;
#   4. "Time" is added as the minute of the day (Hour * 60 + Minute).
df_train = (
    df.fillna(0)
      .loc[lambda frame: frame["Label"] != 10]
      .reset_index(drop=True)
      .assign(Time=lambda frame: frame["Hour"] * 60 + frame["Minute"])
)

In [5]:
# First five rows after imputation and filtering; the new "Time" column is the last one.
df_train.head(n=5)

Unnamed: 0,TimeID,Hour,Minute,Date,Label,Name,sensor1,sensor2,sensor3,sensor4,...,sensor21,sensor22,sensor24,sensor30,sensor31,sensor32,sensor40,sensor41,sensor42,Time
0,2013-04-14-20-15,20,15,2013-04-14,2.0,Watching TV,0.0,0.0,0.0,194.0,...,0.0,0.0,0.0,5.0,12.304348,0.0,0.0,1.478261,0.0,1215
1,2013-04-14-20-16,20,16,2013-04-14,2.0,Watching TV,0.0,0.0,0.0,194.666667,...,0.0,0.0,0.0,68.333333,12.25,0.0,0.0,1.416667,0.0,1216
2,2013-04-14-20-17,20,17,2013-04-14,2.0,Watching TV,208.0,0.0,0.0,195.0,...,0.0,0.0,0.0,68.086957,11.956522,0.0,0.0,1.391304,0.0,1217
3,2013-04-14-20-18,20,18,2013-04-14,2.0,Watching TV,0.0,0.0,0.0,196.0,...,0.0,0.0,0.0,68.0,12.043478,2.26087,0.0,1.478261,0.0,1218
4,2013-04-14-20-19,20,19,2013-04-14,2.0,Watching TV,0.0,0.0,0.0,197.833333,...,0.0,0.0,0.0,67.913043,11.833333,5.916667,0.0,1.478261,0.0,1219


## 3. Data splitting

In [25]:
# Ordered hold-out split: the first `ratio` share of the rows (in table order)
# becomes the training set, everything after it is the test set. No shuffling.
ratio = 0.25
encoder = LabelEncoder()

# Label, TimeID, Date, Name, Hour and Minute are removed; "Time" and the sensor
# columns stay as features. Labels are re-encoded to consecutive integers.
X = np.array(
    df_train.drop(columns=["Label", "TimeID", "Date", "Name", "Hour", "Minute"])
)
y = encoder.fit_transform(df_train["Label"])

split_idx = int(ratio * len(X))

X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

print(len(X_train))
print(len(X_test))

14143
42430


## 4. Training

In [26]:
# Contiguous, unshuffled 4-fold splitter handed to every hyper-parameter search in this notebook.
kf = KFold(n_splits=4, shuffle=False, random_state=None)

In [27]:
def evaluate(model, n_fold=4, inputs=None, labels=None):
    """Cross-validate `model` with an unshuffled K-fold split.

    Parameters
    ----------
    model : estimator accepted by `cross_val_score`.
    n_fold : int, number of folds (default 4).
    inputs, labels : feature matrix and targets. When left as None, the
        notebook-level `X_train` / `y_train` are looked up at call time.

    Returns
    -------
    Whatever `cross_val_score` returns (one score per fold).
    """
    # Resolve the defaults here rather than in the signature: a signature
    # default is evaluated once, when the `def` cell runs, and would keep
    # pointing at the old arrays after the split cell is executed again.
    if inputs is None:
        inputs = X_train
    if labels is None:
        labels = y_train

    cross_validation_set = KFold(n_splits=n_fold)
    return cross_val_score(model, inputs, labels, cv=cross_validation_set)

In [28]:
def random_search(model, parameters, inputs, labels, cross_validation,
                  n_iter=50, random_state=18):
    """Run a timed `RandomizedSearchCV` scored and refitted on accuracy.

    Parameters
    ----------
    model : estimator to tune.
    parameters : dict mapping parameter names to candidate values.
    inputs, labels : training features and targets passed to `fit`.
    cross_validation : value forwarded as the search's `cv` argument.
    n_iter : int, number of sampled candidates (default 50).
    random_state : int, seed of the candidate sampler (default 18).

    Returns
    -------
    (results, seconds) : the object returned by `search.fit` and the fit
        duration in seconds, rounded to two decimals.
    """
    search = RandomizedSearchCV(model,
                                parameters,
                                scoring='accuracy',
                                refit='accuracy',
                                random_state=random_state, n_iter=n_iter,
                                verbose=3, n_jobs=-1,
                                cv=cross_validation)

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or turn negative) if the system clock is adjusted during the search.
    start = time.perf_counter()
    results = search.fit(inputs, labels)
    elapsed = time.perf_counter() - start
    return results, round(elapsed, 2)

def grid_search(model, parameters, inputs, labels, cross_validation):
    """Run a timed, exhaustive `GridSearchCV` scored and refitted on accuracy.

    Parameters
    ----------
    model : estimator to tune.
    parameters : dict mapping parameter names to the values to try.
    inputs, labels : training features and targets passed to `fit`.
    cross_validation : value forwarded as the search's `cv` argument.

    Returns
    -------
    (results, seconds) : the object returned by `search.fit` and the fit
        duration in seconds, rounded to two decimals.
    """
    search = GridSearchCV(model,
                          parameters,
                          scoring='accuracy',
                          refit='accuracy',
                          verbose=3, n_jobs=-1,
                          cv=cross_validation)

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or turn negative) if the system clock is adjusted during the search.
    start = time.perf_counter()
    results = search.fit(inputs, labels)
    elapsed = time.perf_counter() - start
    return results, round(elapsed, 2)

### Random Forest

In [29]:
def rf_summary(results):
    """Tabulate a random-forest hyper-parameter search, best candidate first.

    `results` must expose `cv_results_` with the keys 'mean_test_score',
    'rank_test_score' and 'params'. Returns a DataFrame with one row per
    candidate -- accuracy, rank and the five searched parameters -- sorted
    by rank and re-indexed from 0.
    """
    scores = results.cv_results_['mean_test_score']
    candidates = results.cv_results_['params']
    positions = range(len(candidates))

    columns = {
        "Accuracy": [scores[pos] for pos in positions],
        "Rank": [results.cv_results_["rank_test_score"][pos] for pos in positions],
    }
    # One column per searched hyper-parameter, in display order.
    for key in ("n_estimators", "max_depth", "min_samples_leaf",
                "min_samples_split", "bootstrap"):
        columns[key] = [candidate[key] for candidate in candidates]

    ranked = pd.DataFrame(data=columns).sort_values(by="Rank")
    return ranked.reset_index(drop=True)

In [30]:
# Search space sampled by the randomised search for the random forest.
rf_random_params = dict(
    n_estimators=list(range(100, 1001, 50)),
    max_depth=list(range(2, 31, 2)),
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

rf_random_model = RandomForestClassifier(random_state=18)

# Returns the fitted search together with the fit duration in seconds.
rf_random_results, time_execution = random_search(
    rf_random_model, rf_random_params, X_train, y_train, kf
)

Fitting 4 folds for each of 50 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.3min finished


In [31]:
# Rank every sampled candidate and display only the best one.
rf_random_summary = rf_summary(rf_random_results)
rf_random_summary.head(1)

Unnamed: 0,Accuracy,Rank,n_estimators,max_depth,min_samples_leaf,min_samples_split,bootstrap
0,0.840981,1,150,4,4,2,False


In [32]:
# Persist the fitted search; assumes the relative "results" directory already exists.
joblib.dump(value=rf_random_results, filename="results/rf_randomsearch.pkl")

['results/rf_randomsearch.pkl']

In [57]:
# Exhaustive grid in the neighbourhood of the best randomised-search candidate
# (150 trees, depth 4, leaf 4, split 2, no bootstrap).
rf_grid_params = dict(
    n_estimators=list(range(130, 171, 10)),
    max_depth=list(range(3, 6)),
    min_samples_split=[2, 3],
    min_samples_leaf=[3, 4, 5],
    bootstrap=[False, True],
)

rf_grid_model = RandomForestClassifier(random_state=18)

# Returns the fitted search together with the fit duration in seconds.
rf_grid_results, time_execution = grid_search(
    rf_grid_model, rf_grid_params, X_train, y_train, kf
)

Fitting 4 folds for each of 180 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:   34.9s
[Parallel(n_jobs=-1)]: Done 500 tasks      | elapsed:   59.1s
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  1.4min finished


In [58]:
# Rank every grid candidate and display only the best one.
rf_grid_summary = rf_summary(rf_grid_results)
rf_grid_summary.head(1)

Unnamed: 0,Accuracy,Rank,n_estimators,max_depth,min_samples_leaf,min_samples_split,bootstrap
0,0.844234,1,130,5,4,3,False


In [59]:
# Persist the fitted search; assumes the relative "results" directory already exists.
joblib.dump(value=rf_grid_results, filename="results/rf_gridsearch.pkl")

['results/rf_gridsearch.pkl']

### XGBoost

In [36]:
def xgb_summary(results):
    """Tabulate an XGBoost hyper-parameter search, best candidate first.

    `results` must expose `cv_results_` with the keys 'mean_test_score',
    'rank_test_score' and 'params'. Returns a DataFrame with one row per
    candidate -- accuracy, rank and the six searched parameters -- sorted
    by rank and re-indexed from 0.
    """
    scores = results.cv_results_['mean_test_score']
    candidates = results.cv_results_['params']
    positions = range(len(candidates))

    columns = {
        "Accuracy": [scores[pos] for pos in positions],
        "Rank": [results.cv_results_["rank_test_score"][pos] for pos in positions],
    }
    # (column title, search-parameter key) pairs in display order; only the
    # estimator count is shown under a different title.
    layout = (
        ("# estimators", "n_estimators"),
        ("max_depth", "max_depth"),
        ("learning_rate", "learning_rate"),
        ("colsample_bytree", "colsample_bytree"),
        ("subsample", "subsample"),
        ("gamma", "gamma"),
    )
    for title, key in layout:
        columns[title] = [candidate[key] for candidate in candidates]

    ranked = pd.DataFrame(data=columns).sort_values(by="Rank")
    return ranked.reset_index(drop=True)

In [37]:
# Search space sampled by the randomised search for XGBoost.
xgb_random_params = dict(
    n_estimators=list(range(100, 1001, 50)),
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 4, 5, 6],
    subsample=[0.8, 0.9, 1],
    colsample_bytree=[0.5, 0.6, 0.7, 0.8, 0.9, 1],
    gamma=[0, 1, 5],
)

xgb_random_model = XGBClassifier(random_state=18)

# Returns the fitted search together with the fit duration in seconds.
xgb_random_results, time_execution = random_search(
    xgb_random_model, xgb_random_params, X_train, y_train, kf
)

Fitting 4 folds for each of 50 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 14.8min finished


In [38]:
# Rank every sampled candidate and display only the best one.
xgb_random_summary = xgb_summary(xgb_random_results)
xgb_random_summary.head(1)

Unnamed: 0,Accuracy,Rank,# estimators,max_depth,learning_rate,colsample_bytree,subsample,gamma
0,0.823093,1,200,4,0.01,0.6,0.9,0


In [39]:
# Persist the fitted search; assumes the relative "results" directory already exists.
joblib.dump(value=xgb_random_results, filename="results/xgb_randomsearch.pkl")

['results/xgb_randomsearch.pkl']

In [63]:
# Only the number of estimators is varied (180..220); every other parameter is
# pinned to the value of the best randomised-search candidate.
xgb_grid_params = dict(
    n_estimators=list(range(180, 221, 10)),
    learning_rate=[0.01],
    max_depth=[4],
    subsample=[0.9],
    colsample_bytree=[0.6],
    gamma=[0],
)

xgb_grid_model = XGBClassifier(random_state=18)

# Returns the fitted search together with the fit duration in seconds.
xgb_grid_results, time_execution = grid_search(
    xgb_grid_model, xgb_grid_params, X_train, y_train, kf
)

Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  20 | elapsed:   25.0s remaining:    6.2s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   33.1s finished


In [64]:
# Rank every grid candidate and display only the best one.
xgb_grid_summary = xgb_summary(xgb_grid_results)
xgb_grid_summary.head(1)

Unnamed: 0,Accuracy,Rank,# estimators,max_depth,learning_rate,colsample_bytree,subsample,gamma
0,0.825285,1,190,4,0.01,0.6,0.9,0


In [65]:
# Persist the fitted search; assumes the relative "results" directory already exists.
joblib.dump(value=xgb_grid_results, filename="results/xgb_gridsearch.pkl")

['results/xgb_gridsearch.pkl']

### Support Vector Machine

In [43]:
def svm_summary(results):
    """Tabulate an SVM pipeline hyper-parameter search, best candidate first.

    `results` must expose `cv_results_` with the keys 'mean_test_score',
    'rank_test_score' and 'params'; each params dict is read through the
    pipeline-prefixed keys 'svc__C' and 'svc__kernel'. Returns a DataFrame
    sorted by rank and re-indexed from 0.
    """
    scores = results.cv_results_['mean_test_score']
    candidates = results.cv_results_['params']
    positions = range(len(candidates))

    columns = {
        "Accuracy": [scores[pos] for pos in positions],
        "Rank": [results.cv_results_["rank_test_score"][pos] for pos in positions],
    }
    # Column titles drop the "svc__" pipeline-step prefix.
    for title, key in (("C", "svc__C"), ("kernel", "svc__kernel")):
        columns[title] = [candidate[key] for candidate in candidates]

    ranked = pd.DataFrame(data=columns).sort_values(by="Rank")
    return ranked.reset_index(drop=True)

In [44]:
# Grid over the regularisation strength and kernel; the "svc__" prefix routes
# each parameter to the pipeline step named 'svc'.
svc_params = dict(
    svc__C=[0.01, 0.1, 1, 10],
    svc__kernel=["linear", "rbf"],
)

# Standardise the features inside the pipeline so the scaler is refitted on
# the training part of every fold.
svc = Pipeline(steps=[
    ('standardize', StandardScaler()),
    ('svc', SVC(cache_size=8000)),
])
svc_grid_search = GridSearchCV(estimator=svc, param_grid=svc_params,
                               cv=kf, n_jobs=-1, verbose=3)

svc_grid_results = svc_grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 8 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:   16.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:   16.1s finished


In [45]:
# Rank every grid candidate and display only the best one.
svc_grid_summary = svm_summary(svc_grid_results)
svc_grid_summary.head(1)

Unnamed: 0,Accuracy,Rank,C,kernel
0,0.828325,1,0.01,linear


In [46]:
# Show the full ranking: the grid has only eight candidates, so the whole table is displayed.
svc_grid_summary

Unnamed: 0,Accuracy,Rank,C,kernel
0,0.828325,1,0.01,linear
1,0.810366,2,0.1,linear
2,0.808881,3,1.0,linear
3,0.803436,4,0.1,rbf
4,0.802729,5,1.0,rbf
5,0.797143,6,10.0,rbf
6,0.790285,7,10.0,linear
7,0.756204,8,0.01,rbf


In [47]:
# Persist the fitted search; assumes the relative "results" directory already exists.
joblib.dump(value=svc_grid_results, filename="results/svc_gridsearch.pkl")

['results/svc_gridsearch.pkl']

### Multilayer Perceptron

In [52]:
def mlp_summary(results):
    """Tabulate an MLP pipeline hyper-parameter search, best candidate first.

    `results` must expose `cv_results_` with the metric-suffixed keys
    'mean_test_accuracy' and 'rank_test_accuracy' plus 'params'; each params
    dict is read through the key 'mlp__hidden_layer_sizes'. Returns a
    DataFrame sorted by rank and re-indexed from 0.
    """
    scores = results.cv_results_['mean_test_accuracy']
    candidates = results.cv_results_['params']
    positions = range(len(candidates))

    columns = {
        "Accuracy": [scores[pos] for pos in positions],
        "Rank": [results.cv_results_["rank_test_accuracy"][pos] for pos in positions],
        "hidden_layer_sizes": [
            candidate['mlp__hidden_layer_sizes'] for candidate in candidates
        ],
    }

    ranked = pd.DataFrame(data=columns).sort_values(by="Rank")
    return ranked.reset_index(drop=True)

In [53]:
# Hidden-layer layouts to compare. Every entry is a tuple of layer widths;
# single-layer layouts need the trailing comma, because `(64)` is just the
# int 64 and would put ints and tuples side by side in the grid and in the
# summary table.
mlp_grid_params = {
    "mlp__hidden_layer_sizes": [(64,), (128,), (256,), (512,),
                                (512, 256), (512, 128), (512, 64),
                                (256, 128), (256, 64), (128, 64),
                                (512, 256, 128), (256, 128, 64)]
    }

# Min-max scaling lives inside the pipeline so it is refitted on the training
# part of every fold.
mlp = MLPClassifier(max_iter=500, random_state=18)
mlp_pipeline = Pipeline(steps=[('normalize', MinMaxScaler()), ('mlp', mlp)])

# `scoring` is deliberately a list: the cv_results_ keys then carry the metric
# name (mean_test_accuracy / rank_test_accuracy), which mlp_summary reads.
mlp_grid_search = GridSearchCV(mlp_pipeline,
                               mlp_grid_params,
                               scoring=['accuracy'],
                               refit='accuracy',
                               verbose=3, n_jobs=-1, cv=kf)

mlp_grid_results = mlp_grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 10.4min finished


In [54]:
# Rank every grid candidate and display only the best one.
mlp_grid_summary = mlp_summary(mlp_grid_results)
mlp_grid_summary.head(1)

Unnamed: 0,Accuracy,Rank,hidden_layer_sizes
0,0.784063,1,128


In [56]:
# Persist the fitted search; assumes the relative "results" directory already exists.
joblib.dump(value=mlp_grid_results, filename="results/mlp_gridsearch.pkl")

['results/mlp_gridsearch.pkl']

## 5. Evaluation

### Random Forest

In [66]:
# Accuracy of the refitted best random forest on the held-out rows.
rf_grid_results.score(X=X_test, y=y_test)

0.8029695969832665

### XGBoost

In [67]:
# Accuracy of the refitted best XGBoost model on the held-out rows.
xgb_grid_results.score(X=X_test, y=y_test)

0.8817110534998822

### Support Vector Machine

In [68]:
# Score of the refitted best SVM pipeline on the held-out rows (no `scoring` was
# given to this search, so the estimator's own default score is used).
svc_grid_results.score(X=X_test, y=y_test)

0.7692198915861419

### Multilayer Perceptron

In [70]:
# Accuracy of the refitted best MLP pipeline on the held-out rows.
mlp_grid_results.score(X=X_test, y=y_test)

0.7904312986094745