## Packages Preparation

In [1]:
import os
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn import  preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,f1_score,accuracy_score
import timeit
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold,StratifiedKFold,ShuffleSplit,StratifiedShuffleSplit

## Functions Preparation

In [2]:
def win_seg(data,windowsize,overlap):#function for overlap segmentation
    """Cut a 4-D array into fixed-size, overlapping windows.

    The first two axes of ``data`` are merged into a single sample axis, and
    windows of ``windowsize`` consecutive samples are copied out of it.

    Parameters
    ----------
    data : numpy.ndarray
        4-D array; axes 2 and 3 are carried through to the output unchanged.
    windowsize : int
        Number of samples per window.
    overlap : float
        Hop between consecutive window starts, as a fraction of
        ``windowsize`` (0.5 means each window starts half a window after the
        previous one). Must be non-zero because it is used as a divisor.

    Returns
    -------
    numpy.ndarray
        Float array of shape
        ``(n_windows, windowsize, data.shape[2], data.shape[3])``.
    """
    data_dim=data.shape[2]
    layers=data.shape[3]
    total_samples=data.shape[0]*data.shape[1]
    length=int((total_samples-windowsize)/(windowsize*overlap)+1)
    # Size the buffer with the real trailing axis. It used to be hard-coded
    # to 1, which made the assignment below fail for inputs with layers > 1.
    newdata=np.empty((length,windowsize,data_dim,layers))
    data=data.reshape(-1,data_dim,layers)
    for i in range(length):
        start=int(i*windowsize*overlap)
        end=int(start+windowsize)
        newdata[i]=data[start:end]
    return newdata
def lab_vote(data,windowsize):
    """Return one-hot majority-vote labels for half-overlapping windows.

    The labels are segmented with ``win_seg`` (hop of half a window), the most
    frequent label of every window is taken, and the winners are one-hot
    encoded.

    Parameters
    ----------
    data : numpy.ndarray
        Per-sample labels; its size must be a multiple of ``windowsize``
        because it is reshaped to ``(-1, windowsize, 1, 1)``.
        NOTE(review): labels are assumed to be non-negative integers (stored
        as any numeric dtype) - confirm against the caller.
    windowsize : int
        Number of samples per window.

    Returns
    -------
    numpy.ndarray
        ``float64`` array of shape ``(n_windows, max_label + 1)`` with a
        single 1.0 per row.
    """
    # `stats` is not imported at file level, so import it where it is used.
    from scipy import stats

    y_data=data.reshape(-1,windowsize,1,1)
    y_data=win_seg(y_data,windowsize,0.5)
    y_data=y_data.reshape(y_data.shape[0],y_data.shape[1],y_data.shape[2])
    y_data=stats.mode(y_data,axis=1)
    y_data=y_data.mode
    # win_seg returns floats; cast the winning labels to ints for indexing.
    labels=np.asarray(y_data).reshape(-1).astype(int)
    # One-hot encode with NumPy. The original called keras.utils.to_categorical,
    # but keras is never imported in this file; this builds the same layout
    # (one column per integer in 0..max label).
    onehot=np.zeros((labels.size,labels.max()+1),dtype=np.float64)
    onehot[np.arange(labels.size),labels]=1.0
    return onehot
def lab_vote_cat(data,windowsize): # non one-hot coding
    """Return the majority-vote label of every half-overlapping window.

    Same windowing as ``lab_vote`` (``win_seg`` with a hop of half a window),
    but the winning label is returned as-is instead of being one-hot encoded.

    Parameters
    ----------
    data : numpy.ndarray
        Per-sample labels; its size must be a multiple of ``windowsize``
        because it is reshaped to ``(-1, windowsize, 1, 1)``.
    windowsize : int
        Number of samples per window.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, 1)`` holding the most frequent label of
        each window.
    """
    # `stats` is not imported at file level, so import it where it is used;
    # without this the function raised NameError on its first call.
    from scipy import stats

    y_data=data.reshape(-1,windowsize,1,1)
    y_data=win_seg(y_data,windowsize,0.5)
    y_data=y_data.reshape(y_data.shape[0],y_data.shape[1],y_data.shape[2])
    y_data=stats.mode(y_data,axis=1)
    y_data=y_data.mode
    y_data=y_data.reshape(-1,1)
    return y_data
def preparation(dataset):
    """Load a CSV dataset, standardise its features and make an 80/20 split.

    The first CSV column is used as the label and every remaining column as a
    feature.

    Parameters
    ----------
    dataset : str
        Path of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    tuple
        ``(X_train, X_test, x_data, y_train, y_test, y_data)`` where
        ``x_data`` / ``y_data`` are the full scaled features and labels and
        the other four are their train/test parts.

    Notes
    -----
    Scaling is applied to the whole dataset before splitting, so the test
    rows contribute to the column means and variances used for scaling.
    """
    # Parse the file once; it used to be read from disk twice.
    frame=pd.read_csv(dataset)
    y_data=frame.iloc[:,0]
    x_data=preprocessing.scale(frame.iloc[:,1:]) #Column-wise normalization
    X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.20, random_state=42)#split the data into train and test by 8:2
    return X_train, X_test, x_data,y_train, y_test, y_data
def _timed_fit_predict(model, X_train, y_train, X_test):
    """Fit ``model`` and predict on ``X_test``, timing each step separately.

    Returns ``(predictions, fit_seconds, predict_seconds)``.
    """
    start = timeit.default_timer()
    model.fit(X_train, y_train)
    fit_seconds = timeit.default_timer() - start

    start = timeit.default_timer()
    predictions = model.predict(X_test)
    predict_seconds = timeit.default_timer() - start
    return predictions, fit_seconds, predict_seconds


def TrainModels_Tuned(X_train, X_test, y_train, y_test,
                      svm_c,
                      svm_gamma,
                      svm_kernel,
                      svm_degree,
                      nb_var,
                      knn_k,
                      dt_depth,
                      dt_split,
                      dt_leaf,
                      rf_trees
                     ):
    """Train five tuned classifiers plus a majority-vote ensemble and score them.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for evaluation.
    y_train, y_test : array-like
        Labels matching ``X_train`` and ``X_test``.
    svm_c, svm_gamma, svm_kernel, svm_degree
        ``C``, ``gamma``, ``kernel`` and ``degree`` of the ``SVC``.
    nb_var
        ``var_smoothing`` of the ``GaussianNB``.
    knn_k
        ``n_neighbors`` of the ``KNeighborsClassifier``.
    dt_depth, dt_split, dt_leaf
        ``max_depth``, ``min_samples_split`` and ``min_samples_leaf`` of the
        ``DecisionTreeClassifier``.
    rf_trees
        ``n_estimators`` of the ``RandomForestClassifier``.

    Returns
    -------
    pandas.DataFrame
        One row per model (``svm_pre``, ``nb_pre``, ``knn_pre``, ``dt_pre``,
        ``rf_pre``, ``Ensemble``) with columns ``Models``, ``Accuracy``,
        ``Macro F1``, ``Micro F1``, ``Train Time`` and ``Run Time``. The two
        time columns are NaN for the ensemble, which is not trained itself.
    """
    # Order matters: it defines both the report rows and the timing lists.
    models = [
        ('svm_pre', SVC(gamma=svm_gamma, C=svm_c, kernel=svm_kernel,
                        degree=svm_degree, random_state=42)),
        ('nb_pre', GaussianNB(var_smoothing=nb_var)),
        ('knn_pre', KNeighborsClassifier(n_neighbors=knn_k)), # based on a simple grid search
        ('dt_pre', DecisionTreeClassifier(max_depth=dt_depth,
                                          min_samples_leaf=dt_leaf,
                                          min_samples_split=dt_split,
                                          random_state=42)),
        # Seeded like the SVC and the decision tree so repeated runs agree.
        ('rf_pre', RandomForestClassifier(n_estimators=rf_trees,
                                          random_state=42)),
    ]

    predictions = {}
    train_time = []
    run_time = []
    for name, model in models:
        predicted, fit_seconds, predict_seconds = _timed_fit_predict(
            model, X_train, y_train, X_test)
        predictions[name] = predicted
        train_time.append(fit_seconds)
        run_time.append(predict_seconds)

    # Ensemble: row-wise majority vote; on a tie the first mode is taken.
    result = pd.DataFrame(predictions)
    result['Ensemble'] = result.mode(axis='columns').iloc[:, 0]

    rows = []
    for position, name in enumerate(result.columns):
        predicted = result[name]
        # Only the individually trained models have timings.
        has_timing = position < len(train_time)
        rows.append({
            'Models': name,
            'Accuracy': accuracy_score(y_test, predicted),
            'Macro F1': f1_score(y_test, predicted, average='macro'),
            'Micro F1': f1_score(y_test, predicted, average='micro'),
            'Train Time': train_time[position] if has_timing else np.nan,
            'Run Time': run_time[position] if has_timing else np.nan,
        })
    report = pd.DataFrame(rows, columns=['Models','Accuracy','Macro F1','Micro F1','Train Time','Run Time'])
    return report

def svc_param_selection(X, y, nfolds):
    """Grid-search ``SVC`` hyper-parameters and return the best combination.

    Searches ``C`` and ``gamma`` over a set of powers of two, the ``poly``
    and ``rbf`` kernels, and polynomial degrees 2-5, scoring with macro F1.

    Parameters
    ----------
    X, y
        Training features and labels handed to ``GridSearchCV.fit``.
    nfolds
        Passed as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    exponents = [-10, -5, -1, 0, 1, 5, 10]
    param_grid = {
        'C': [2**e for e in exponents],
        'gamma': [2**e for e in exponents],
        'kernel': ['poly', 'rbf'],
        'degree': [2, 3, 4, 5],
    }
    # Size of the full Cartesian grid, reported before the search starts.
    n_combinations = int(np.prod([len(options) for options in param_grid.values()]))
    print('There are {} combinations'.format(n_combinations))
    searcher = GridSearchCV(SVC(), param_grid, cv=nfolds, scoring='f1_macro')
    searcher.fit(X, y)
    return searcher.best_params_
def NB_param_selection(X, y, nfolds):
    """Grid-search ``GaussianNB``'s ``var_smoothing`` and return the best value.

    Tries 100 log-spaced values from 1 down to 1e-9, scoring with macro F1.

    Parameters
    ----------
    X, y
        Training features and labels handed to ``GridSearchCV.fit``.
    nfolds
        Passed as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    smoothing_values = np.logspace(0,-9, num=100)
    param_grid = {'var_smoothing': smoothing_values}
    # Size of the full Cartesian grid, reported before the search starts.
    n_combinations = int(np.prod([len(options) for options in param_grid.values()]))
    print('There are {} combinations'.format(n_combinations))
    searcher = GridSearchCV(GaussianNB(), param_grid, cv=nfolds, scoring='f1_macro')
    searcher.fit(X, y)
    return searcher.best_params_
def KNN_param_selection(X, y, nfolds):
    """Grid-search the neighbour count of a k-NN classifier.

    Tries every ``n_neighbors`` from 1 to 100 inclusive, scoring with macro F1.

    Parameters
    ----------
    X, y
        Training features and labels handed to ``GridSearchCV.fit``.
    nfolds
        Passed as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    neighbour_counts = list(range(1, 101))
    param_grid = {'n_neighbors': neighbour_counts}
    # Size of the full Cartesian grid, reported before the search starts.
    n_combinations = int(np.prod([len(options) for options in param_grid.values()]))
    print('There are {} combinations'.format(n_combinations))
    searcher = GridSearchCV(KNeighborsClassifier(), param_grid, cv=nfolds, scoring='f1_macro')
    searcher.fit(X, y)
    return searcher.best_params_
def DT_param_selection(X, y, nfolds):
    """Grid-search decision-tree size limits and return the best combination.

    ``max_depth``, ``min_samples_split`` and ``min_samples_leaf`` are each
    tried at 5, 10, 15, 20, 25 and 30, scoring with macro F1.

    Parameters
    ----------
    X, y
        Training features and labels handed to ``GridSearchCV.fit``.
    nfolds
        Passed as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    param_grid = {
        'max_depth': list(range(5, 31, 5)),
        'min_samples_split': list(range(5, 31, 5)),
        'min_samples_leaf': list(range(5, 31, 5)),
    }
    # Size of the full Cartesian grid, reported before the search starts.
    n_combinations = int(np.prod([len(options) for options in param_grid.values()]))
    print('There are {} combinations'.format(n_combinations))
    searcher = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=nfolds, scoring='f1_macro')
    searcher.fit(X, y)
    return searcher.best_params_
def RF_param_selection(X, y, nfolds):
    """Grid-search the number of trees of a random forest.

    Tries ``n_estimators`` from 5 to 400 in steps of 5, scoring with macro F1.

    Parameters
    ----------
    X, y
        Training features and labels handed to ``GridSearchCV.fit``.
    nfolds
        Passed as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    tree_counts = list(range(5, 401, 5))
    param_grid = {'n_estimators': tree_counts}
    # Size of the full Cartesian grid, reported before the search starts.
    n_combinations = int(np.prod([len(options) for options in param_grid.values()]))
    print('There are {} combinations'.format(n_combinations))
    searcher = GridSearchCV(RandomForestClassifier(), param_grid, cv=nfolds, scoring='f1_macro')
    searcher.fit(X, y)
    return searcher.best_params_

## Fine-tuning the ML Models

In [None]:
# Tune every model on each outer split, evaluate the tuned models on that
# split's held-out part, and write all per-split reports to one CSV.
os.chdir("...") #changing working directory
X_train, X_test, X_data,y_train, y_test, y_data=preparation("...")
SRS = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=42) # Stratified Shuffle
SS_val= ShuffleSplit(n_splits=1, test_size=0.2, random_state=24) # random sample
# Best hyper-parameters found on each outer split, one dict per split.
svm_parameters=[]
nb_parameters=[]
knn_parameters=[]
dt_parameters=[]
rf_parameters=[]
# One report per outer split; joined once after the loop.
fold_reports=[]
for train_index, test_index in SRS.split(X_data, y_data):
    # train/test split
    X_train, X_test = X_data[train_index], X_data[test_index]
    # The split yields positions, so index the label Series positionally.
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]
    # fine-tune on the validation split and save the parameters
    svm_tuned = svc_param_selection(X_train, y_train, SS_val)
    nb_tuned = NB_param_selection(X_train, y_train, SS_val)
    knn_tuned = KNN_param_selection(X_train, y_train, SS_val)
    dt_tuned = DT_param_selection(X_train, y_train, SS_val)
    rf_tuned = RF_param_selection(X_train, y_train, SS_val)
    svm_parameters.append(svm_tuned)
    nb_parameters.append(nb_tuned)
    knn_parameters.append(knn_tuned)
    dt_parameters.append(dt_tuned)
    rf_parameters.append(rf_tuned)
    # get the configured model for test
    fold_reports.append(TrainModels_Tuned(X_train, X_test, y_train, y_test,
                      svm_c=svm_tuned['C'],
                      svm_gamma=svm_tuned['gamma'],
                      svm_kernel=svm_tuned['kernel'],
                      svm_degree=svm_tuned['degree'],
                      nb_var=nb_tuned['var_smoothing'],
                      knn_k=knn_tuned['n_neighbors'],
                      dt_depth=dt_tuned['max_depth'],
                      dt_split=dt_tuned['min_samples_split'],
                      dt_leaf=dt_tuned['min_samples_leaf'],
                      rf_trees=rf_tuned['n_estimators']
                     ))
# DataFrame.append was removed in pandas 2.0; pd.concat keeps each report's
# own row index, exactly as the repeated append calls did.
finalreport=pd.concat(fold_reports)
finalreport.to_csv("...")