In [1]:
import os
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Directory prefix (relative path) that is prepended to data file names when loading.
base = "../data/Deployment_1/"

## 1. Load Data

In [2]:
# Read the dataset CSV located under `base`, then display summary statistics.
df = pd.read_csv(base + "dataset_1_preprocessing_with_mean_value.csv")
df.describe()

Unnamed: 0,Hour,Minute,Label,sensor1,sensor2,sensor3,sensor4,sensor5,sensor6,sensor8,...,sensor22,sensor23,sensor24,sensor25,sensor30,sensor31,sensor32,sensor34,sensor35,sensor40
count,140844.0,140844.0,140844.0,114551.0,114551.0,15.0,126546.0,126546.0,19180.0,108092.0,...,118019.0,118021.0,117919.0,118012.0,114125.0,9203.0,114395.0,112537.0,52274.0,115056.0
mean,11.486936,29.497444,5.413628,196.925903,92.30893,8791.283333,228.623352,87.14011,7.367738,37.363547,...,3.984462,3.784127,0.766854,4.168792,13.934131,30.26845,1.014784,66.035078,20.72838,3.350168
std,6.922467,17.318842,1.986418,21.376121,202.17439,2839.84448,7.823923,160.519101,3.686028,105.363077,...,83.331683,22.440201,5.950108,62.327809,43.849795,1.01914,7.874184,74.446204,15.121987,7.98733
min,0.0,0.0,1.0,-495.0,0.0,4924.0,205.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.5,0.0,0.0,0.0,0.0
25%,5.0,14.0,4.0,186.0,0.0,6178.25,224.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.8,29.904762,0.0,2.222222,2.0,2.0
50%,11.0,29.0,7.0,199.0,10.0,8224.5,228.833333,16.666667,6.333333,0.666667,...,0.0,0.0,0.0,0.0,1.0,30.3,0.0,3.6,32.0,2.0
75%,17.0,44.0,7.0,209.333333,84.333333,11048.625,233.833333,108.0,8.5,32.666667,...,0.0,0.0,0.0,0.0,1.272727,30.842105,0.0,121.2,33.0,2.0
max,23.0,59.0,10.0,261.833333,1170.0,13440.0,264.0,1141.0,102.25,1124.0,...,2190.181818,277.454545,50.636364,1462.083333,177.095238,32.272727,69.818182,255.4,73.0,132.166667


In [3]:
# Preview the first rows of the raw frame (missing values are still NaN here).
df.head()

Unnamed: 0,TimeID,Hour,Minute,Date,Label,Name,sensor5,sensor6,sensor8,sensor9,...,sensor22,sensor23,sensor24,sensor25,sensor30,sensor31,sensor32,sensor34,sensor35,sensor40
0,2013-03-04-20-58,20,58,2013-03-04,4.0,Working at PC,0.0,,66.0,,...,0.0,0.0,0.0,0.0,1.090909,,0.0,197.545455,34.181818,2.181818
1,2013-03-04-20-59,20,59,2013-03-04,4.0,Working at PC,0.0,,66.0,,...,0.0,0.0,0.0,0.0,1.0,,0.0,200.166667,32.833333,2.0
2,2013-03-04-21-00,21,0,2013-03-04,4.0,Working at PC,0.0,,64.333333,,...,0.0,0.0,0.0,0.0,0.909091,,0.0,198.083333,33.272727,2.0
3,2013-03-04-21-01,21,1,2013-03-04,4.0,Working at PC,0.0,,64.0,,...,0.0,0.0,0.0,0.0,1.090909,,0.0,199.5,33.090909,2.0
4,2013-03-04-21-02,21,2,2013-03-04,4.0,Working at PC,0.0,,64.0,,...,0.0,0.0,0.0,0.0,0.909091,,0.0,200.454545,32.181818,2.0


## 2. Data processing

In [7]:
# Build the modelling frame from a copy so `df` keeps the raw values.
# Missing values are replaced with 0. The previous version of this cell also
# called `dropna()` right after `fillna(0)`; that call could never drop a row
# because no NaN is left at that point, so it has been removed.
# NOTE(review): if rows with missing values were meant to be discarded instead,
# `dropna()` has to run *before* `fillna(0)` -- confirm the intent.
df_train = (
    df.copy()
    .fillna(0)
    # Rows with Label == 7 are excluded from training.
    # NOTE(review): presumably because that class dominates the data -- confirm.
    .loc[lambda frame: frame["Label"] != 7]
    .reset_index(drop=True)
)
# Minutes since midnight, derived from the Hour and Minute columns.
df_train["Time"] = df_train["Hour"]*60 + df_train["Minute"]

In [5]:
# Preview the processed frame: NaNs are now 0 and the derived `Time` column is present.
df_train.head()

Unnamed: 0,TimeID,Hour,Minute,Date,Label,Name,sensor5,sensor6,sensor8,sensor9,...,sensor23,sensor24,sensor25,sensor30,sensor31,sensor32,sensor34,sensor35,sensor40,Time
0,2013-03-04-20-58,20,58,2013-03-04,4.0,Working at PC,0.0,0.0,66.0,0.0,...,0.0,0.0,0.0,1.090909,0.0,0.0,197.545455,34.181818,2.181818,1258
1,2013-03-04-20-59,20,59,2013-03-04,4.0,Working at PC,0.0,0.0,66.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,200.166667,32.833333,2.0,1259
2,2013-03-04-21-00,21,0,2013-03-04,4.0,Working at PC,0.0,0.0,64.333333,0.0,...,0.0,0.0,0.0,0.909091,0.0,0.0,198.083333,33.272727,2.0,1260
3,2013-03-04-21-01,21,1,2013-03-04,4.0,Working at PC,0.0,0.0,64.0,0.0,...,0.0,0.0,0.0,1.090909,0.0,0.0,199.5,33.090909,2.0,1261
4,2013-03-04-21-02,21,2,2013-03-04,4.0,Working at PC,0.0,0.0,64.0,0.0,...,0.0,0.0,0.0,0.909091,0.0,0.0,200.454545,32.181818,2.0,1262


## 3. Data splitting

In [85]:
# Ordered hold-out split: the first `ratio` share of the rows (in frame order)
# is used for training and the remainder for testing; nothing is shuffled.
ratio = 0.25
encoder = LabelEncoder()

# Features are every column except the target (Label / Name) and the raw
# timestamp fields; the derived `Time` column stays in.
X = np.array(
    df_train.drop(columns=["Label", "TimeID", "Date", "Name", "Hour", "Minute"])
)
# Encode the label values as integer class ids.
y = encoder.fit_transform(df_train["Label"])

split_idx = int(len(X)*ratio)

X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

print(len(X_train), len(X_test), sep="\n")

16967
50904


## 4. Training

In [86]:
# 4-fold splitter passed to the hyper-parameter searches in this notebook.
# Shuffling is off, so each fold is a contiguous block of rows.
kf = KFold(n_splits=4, shuffle=False, random_state=None)

In [87]:
def evaluate(model, n_fold=4, inputs=None, labels=None):
    """Cross-validate ``model`` and return the scores from ``cross_val_score``.

    Parameters
    ----------
    model : estimator
        Object passed straight to ``cross_val_score``.
    n_fold : int, default=4
        Number of ``KFold`` splits.
    inputs, labels : array-like, optional
        Data to cross-validate on. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are looked up at call time. They used to be
        bound as default arguments when this cell was executed, so re-running
        the split cell left the function scoring stale data.

    Returns
    -------
    Whatever ``cross_val_score`` returns for the given splitter.
    """
    # Resolve the notebook globals lazily instead of freezing them at def time.
    if inputs is None:
        inputs = X_train
    if labels is None:
        labels = y_train
    cross_validation_set = KFold(n_splits=n_fold)
    return cross_val_score(model, inputs, labels, cv=cross_validation_set)

In [88]:
def random_search(model, parameters, inputs, labels, cross_validation,
                  n_iter=50, random_state=18):
    """Run a randomized hyper-parameter search and time the fit.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``RandomizedSearchCV``.
    parameters : dict
        Parameter distributions / candidate lists to sample from.
    inputs, labels : array-like
        Training data passed to ``fit``.
    cross_validation : splitter or int
        Forwarded as ``cv``.
    n_iter : int, default=50
        Number of sampled candidates (previously hard-coded).
    random_state : int, default=18
        Seed for candidate sampling (previously hard-coded).

    Returns
    -------
    (results, seconds)
        ``results`` is the object returned by ``search.fit``; ``seconds`` is
        the elapsed fit time rounded to two decimals.
    """
    search = RandomizedSearchCV(model,
                                parameters,
                                scoring='accuracy',
                                refit='accuracy',
                                random_state=random_state, n_iter=n_iter,
                                verbose=3, n_jobs=-1,
                                cv=cross_validation)
    # perf_counter is monotonic, so a system clock adjustment during a long
    # search cannot distort the measured duration.
    start = time.perf_counter()
    results = search.fit(inputs, labels)
    elapsed = time.perf_counter() - start
    return results, round(elapsed, 2)

def grid_search(model, parameters, inputs, labels, cross_validation):
    """Run an exhaustive grid search scored on accuracy and time the fit.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    parameters : dict
        Parameter grid to enumerate.
    inputs, labels : array-like
        Training data passed to ``fit``.
    cross_validation : splitter or int
        Forwarded as ``cv``.

    Returns
    -------
    (results, seconds)
        ``results`` is the object returned by ``search.fit``; ``seconds`` is
        the elapsed fit time rounded to two decimals.
    """
    search = GridSearchCV(model,
                          parameters,
                          scoring='accuracy',
                          refit='accuracy',
                          verbose=3, n_jobs=-1,
                          cv=cross_validation)

    # perf_counter is monotonic, so a system clock adjustment during a long
    # search cannot distort the measured duration.
    start = time.perf_counter()
    results = search.fit(inputs, labels)
    elapsed = time.perf_counter() - start
    return results, round(elapsed, 2)

### Random forest

In [89]:
def rf_summary(results):
    """Tabulate a random-forest search, best-ranked candidate first.

    ``results`` must expose ``cv_results_`` with the keys ``mean_test_score``,
    ``rank_test_score`` and ``params``. Returns a DataFrame with the columns
    Accuracy, Rank and one column per random-forest hyper-parameter, sorted by
    Rank and re-indexed from 0.
    """
    cv = results.cv_results_
    mean_scores = cv['mean_test_score']
    candidates = cv['params']
    positions = range(len(candidates))

    columns = {
        "Accuracy": [mean_scores[pos] for pos in positions],
        "Rank": [cv["rank_test_score"][pos] for pos in positions],
    }
    # One column per tuned hyper-parameter, in display order.
    for key in ("n_estimators", "max_depth", "min_samples_leaf",
                "min_samples_split", "bootstrap"):
        columns[key] = [candidate[key] for candidate in candidates]

    table = pd.DataFrame(data=columns)
    return table.sort_values(by="Rank").reset_index(drop=True)

In [90]:
# Candidate values sampled by the randomized random-forest search.
rf_random_params = {
    "n_estimators": list(range(100, 1001, 50)),
    "max_depth": list(range(2, 31, 2)),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}

rf_random_model = RandomForestClassifier(random_state=18)

# random_search returns the fitted search and the elapsed time in seconds.
rf_random_results, time_execution = random_search(
    rf_random_model, rf_random_params, X_train, y_train, kf
)

Fitting 4 folds for each of 50 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.5min finished


In [91]:
# Rank every sampled candidate and display only the best one.
rf_random_summary = rf_summary(rf_random_results)
rf_random_summary.head(1)

Unnamed: 0,Accuracy,Rank,n_estimators,max_depth,min_samples_leaf,min_samples_split,bootstrap
0,0.863973,1,350,6,2,2,False


In [92]:
# Persist the fitted search so it can be reloaded without re-running it.
# The target directory is created first: joblib.dump does not create missing
# parent directories, so a fresh checkout would otherwise fail here.
os.makedirs("results", exist_ok=True)
joblib.dump(rf_random_results, "results/rf_randomsearch.pkl")

['results/rf_randomsearch.pkl']

In [117]:
# Narrow grid around the best candidate reported by the randomized search
# (n_estimators=350, max_depth=6, min_samples_leaf=2, min_samples_split=2).
rf_grid_params = {
    "n_estimators": list(range(340, 371, 10)),
    "max_depth": list(range(4, 7)),
    "min_samples_split": [2, 3, 4],
    "min_samples_leaf": [2, 3],
    "bootstrap": [False, True],
}

rf_grid_model = RandomForestClassifier(random_state=18)

# grid_search returns the fitted search and the elapsed time in seconds.
rf_grid_results, time_execution = grid_search(
    rf_grid_model, rf_grid_params, X_train, y_train, kf
)

Fitting 4 folds for each of 144 candidates, totalling 576 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 488 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 576 out of 576 | elapsed:  3.1min finished


In [118]:
# Rank every grid candidate and display only the best one.
rf_grid_summary = rf_summary(rf_grid_results)
rf_grid_summary.head(1)

Unnamed: 0,Accuracy,Rank,n_estimators,max_depth,min_samples_leaf,min_samples_split,bootstrap
0,0.864444,1,360,6,2,2,False


In [119]:
# Persist the fitted grid search. The directory is created first because
# joblib.dump does not create missing parent directories.
os.makedirs("results", exist_ok=True)
joblib.dump(rf_grid_results, "results/rf_gridsearch.pkl")

['results/rf_gridsearch.pkl']

### XGBoost

In [96]:
def xgb_summary(results):
    """Tabulate an XGBoost search, best-ranked candidate first.

    ``results`` must expose ``cv_results_`` with the keys ``mean_test_score``,
    ``rank_test_score`` and ``params``. Returns a DataFrame with the columns
    Accuracy, Rank and one column per tuned XGBoost hyper-parameter, sorted by
    Rank and re-indexed from 0.
    """
    cv = results.cv_results_
    mean_scores = cv['mean_test_score']
    candidates = cv['params']
    positions = range(len(candidates))

    columns = {
        "Accuracy": [mean_scores[pos] for pos in positions],
        "Rank": [cv["rank_test_score"][pos] for pos in positions],
    }
    # (output column, key in the candidate's parameter dict), in display order.
    column_sources = (
        ("# estimators", 'n_estimators'),
        ("max_depth", 'max_depth'),
        ("learning_rate", 'learning_rate'),
        ("colsample_bytree", 'colsample_bytree'),
        ("subsample", 'subsample'),
        ("gamma", 'gamma'),
    )
    for column, key in column_sources:
        columns[column] = [candidate[key] for candidate in candidates]

    table = pd.DataFrame(data=columns)
    return table.sort_values(by="Rank").reset_index(drop=True)

In [97]:
# Candidate values sampled by the randomized XGBoost search.
xgb_random_params = {
    "n_estimators": list(range(100, 1001, 50)),
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 4, 5, 6],
    "subsample": [0.8, 0.9, 1],
    "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "gamma": [0, 1, 5],
}

xgb_random_model = XGBClassifier(random_state=18)

# random_search returns the fitted search and the elapsed time in seconds.
xgb_random_results, time_execution = random_search(
    xgb_random_model, xgb_random_params, X_train, y_train, kf
)

Fitting 4 folds for each of 50 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 20.5min finished


In [98]:
# Rank every sampled candidate and display only the best one.
xgb_random_summary = xgb_summary(xgb_random_results)
xgb_random_summary.head(1)

Unnamed: 0,Accuracy,Rank,# estimators,max_depth,learning_rate,colsample_bytree,subsample,gamma
0,0.84352,1,200,4,0.01,0.6,0.9,0


In [99]:
# Persist the fitted randomized search. The directory is created first because
# joblib.dump does not create missing parent directories.
os.makedirs("results", exist_ok=True)
joblib.dump(xgb_random_results, "results/xgb_randomsearch.pkl")

['results/xgb_randomsearch.pkl']

In [123]:
# Only n_estimators varies here; every other parameter is pinned to the value
# of the best candidate reported by the randomized search.
xgb_grid_params = {
    "n_estimators": list(range(180, 221, 10)),
    "learning_rate": [0.01],
    "max_depth": [4],
    "subsample": [0.9],
    "colsample_bytree": [0.6],
    "gamma": [0],
}

xgb_grid_model = XGBClassifier(random_state=18)

# grid_search returns the fitted search and the elapsed time in seconds.
xgb_grid_results, time_execution = grid_search(
    xgb_grid_model, xgb_grid_params, X_train, y_train, kf
)

Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  20 | elapsed:   23.0s remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:   25.2s remaining:   20.6s
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:   45.0s remaining:    4.9s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   45.3s finished


In [124]:
# Rank every grid candidate and display only the best one.
xgb_grid_summary = xgb_summary(xgb_grid_results)
xgb_grid_summary.head(1)

Unnamed: 0,Accuracy,Rank,# estimators,max_depth,learning_rate,colsample_bytree,subsample,gamma
0,0.84352,1,200,4,0.01,0.6,0.9,0


In [125]:
# Persist the fitted grid search. The directory is created first because
# joblib.dump does not create missing parent directories.
os.makedirs("results", exist_ok=True)
joblib.dump(xgb_grid_results, "results/xgb_gridsearch.pkl")

['results/xgb_gridsearch.pkl']

### Support Vector Machine

In [103]:
def svm_summary(results):
    """Tabulate an SVC pipeline search, best-ranked candidate first.

    ``results`` must expose ``cv_results_`` with the keys ``mean_test_score``,
    ``rank_test_score`` and ``params``; each candidate dict is read through
    its ``svc__C`` and ``svc__kernel`` entries. Returns a DataFrame with the
    columns Accuracy, Rank, C and kernel, sorted by Rank and re-indexed from 0.
    """
    cv = results.cv_results_
    mean_scores = cv['mean_test_score']
    candidates = cv['params']
    positions = range(len(candidates))

    columns = {
        "Accuracy": [mean_scores[pos] for pos in positions],
        "Rank": [cv["rank_test_score"][pos] for pos in positions],
    }
    # (output column, pipeline-prefixed key in the candidate's parameter dict).
    for column, key in (("C", 'svc__C'), ("kernel", 'svc__kernel')):
        columns[column] = [candidate[key] for candidate in candidates]

    table = pd.DataFrame(data=columns)
    return table.sort_values(by="Rank").reset_index(drop=True)

In [104]:
# Grid over the `svc` step of the pipeline (the `svc__` prefix targets it).
svc_params = {
    "svc__C": [0.01, 0.1, 1, 10],
    "svc__kernel": ["linear", "rbf"],
}

# Standardization lives inside the pipeline, so it is part of what gets fitted
# for every cross-validation split.
svc = Pipeline(steps=[
    ('standardize', StandardScaler()),
    ('svc', SVC(cache_size=8000)),
])
svc_grid_search = GridSearchCV(estimator=svc, param_grid=svc_params,
                               cv=kf, n_jobs=-1, verbose=3)

svc_grid_results = svc_grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 8 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done  20 out of  32 | elapsed:   14.6s remaining:    8.7s
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:  1.0min finished


In [105]:
# Rank every grid candidate and display only the best one.
svc_grid_summary = svm_summary(svc_grid_results)
svc_grid_summary.head(1)

Unnamed: 0,Accuracy,Rank,C,kernel
0,0.794134,1,1.0,rbf


In [106]:
# Show the full ranking of all SVC candidates (C x kernel).
svc_grid_summary

Unnamed: 0,Accuracy,Rank,C,kernel
0,0.794134,1,1.0,rbf
1,0.794133,2,10.0,rbf
2,0.774213,3,0.1,rbf
3,0.751172,4,0.1,linear
4,0.744275,5,1.0,linear
5,0.741152,6,0.01,linear
6,0.740797,7,10.0,linear
7,0.727478,8,0.01,rbf


In [107]:
# Persist the fitted grid search. The directory is created first because
# joblib.dump does not create missing parent directories.
os.makedirs("results", exist_ok=True)
joblib.dump(svc_grid_results, "results/svc_gridsearch.pkl")

['results/svc_gridsearch.pkl']

### Multilayer Perceptron

In [112]:
def mlp_summary(results):
    """Tabulate an MLP pipeline search, best-ranked candidate first.

    ``results`` must expose ``cv_results_`` with the keys
    ``mean_test_accuracy``, ``rank_test_accuracy`` and ``params``; each
    candidate dict is read through its ``mlp__hidden_layer_sizes`` entry.
    Returns a DataFrame with the columns Accuracy, Rank and
    hidden_layer_sizes, sorted by Rank and re-indexed from 0.
    """
    cv = results.cv_results_
    mean_scores = cv['mean_test_accuracy']
    candidates = cv['params']
    positions = range(len(candidates))

    columns = {
        "Accuracy": [mean_scores[pos] for pos in positions],
        "Rank": [cv["rank_test_accuracy"][pos] for pos in positions],
        "hidden_layer_sizes": [
            candidate['mlp__hidden_layer_sizes'] for candidate in candidates
        ],
    }

    table = pd.DataFrame(data=columns)
    return table.sort_values(by="Rank").reset_index(drop=True)

In [113]:
# Candidate architectures for the `mlp` step. Every entry is a tuple with one
# element per hidden layer. The single-layer entries were previously written
# as `(64)`, which Python reads as the plain int 64 rather than a 1-tuple, so
# the grid mixed ints and tuples; they are now proper 1-tuples.
mlp_grid_params = {
    "mlp__hidden_layer_sizes": [(64,), (128,), (256,), (512,),
                                (512, 256), (512, 128), (512, 64),
                                (256, 128), (256, 64), (128, 64),
                                (512, 256, 128), (256, 128, 64)]
    }

mlp = MLPClassifier(max_iter=500, random_state=18)
# Min-max scaling lives inside the pipeline, so it is part of what gets fitted
# for every cross-validation split.
mlp_pipeline = Pipeline(steps=[('normalize', MinMaxScaler()), ('mlp', mlp)])

# `scoring` is given as a list, so cv_results_ keys carry the metric name
# (`mean_test_accuracy`, `rank_test_accuracy`) -- mlp_summary reads those keys.
mlp_grid_search = GridSearchCV(mlp_pipeline,
                               mlp_grid_params,
                               scoring=['accuracy'],
                               refit = 'accuracy',
                               verbose=3, n_jobs=-1, cv=kf)

mlp_grid_results = mlp_grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  42 out of  48 | elapsed: 14.4min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 17.1min finished


In [114]:
# Rank every grid candidate and display only the best one.
mlp_grid_summary = mlp_summary(mlp_grid_results)
mlp_grid_summary.head(1)

Unnamed: 0,Accuracy,Rank,hidden_layer_sizes
0,0.826781,1,256


In [116]:
# Persist the fitted grid search. The directory is created first because
# joblib.dump does not create missing parent directories.
os.makedirs("results", exist_ok=True)
joblib.dump(mlp_grid_results, "results/mlp_gridsearch.pkl")

['results/mlp_gridsearch.pkl']

## 5. Evaluation 

### Random forest

In [126]:
# Score the random-forest grid search on the held-out rows (X_test / y_test).
rf_grid_results.score(X_test, y_test)

0.8629380795222379

### XGBoost

In [127]:
# Score the XGBoost grid search on the held-out rows (X_test / y_test).
xgb_grid_results.score(X_test, y_test)

0.8548640578343548

### Support Vector Machine

In [128]:
# Score the SVC pipeline grid search on the held-out rows (X_test / y_test).
svc_grid_results.score(X_test, y_test)

0.7841034103410341

### Multilayer Perceptron

In [130]:
# Score the MLP pipeline grid search on the held-out rows (X_test / y_test).
mlp_grid_results.score(X_test, y_test)

0.81003457488606