In [1]:
#importing all necessary libraries
import sys

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTENC, RandomOverSampler #using this oversampling method, since the original data set is imbalanced
from imblearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
#silencing all warnings, unless warning options were passed to the interpreter explicitly (e.g. with -W)
if len(sys.warnoptions) == 0:
    import warnings
    warnings.simplefilter(action = "ignore", category = Warning)

Using TensorFlow backend.


In [1]:
#reading the example data set (file 'example_data.csv', fields separated by ';')
#columns in this example: 'sensor role', 'supply chain leg', 'distance to previous generated event (leg_rel)',
#'distance to expected next event (leg_abs)', 'atmosphere temperature at current location (temp_p)', 'setpoint deviation (spd)',
#'slope of two recent measurements', 'average deviation before a triggered alarm within one hour',
#'average deviation after a triggered alarm within the next hour' (estimated with the help of a random forest regressor),
#'lower threshold', 'higher threshold', 'alarm label' (target feature)
df = pd.read_csv(filepath_or_buffer = 'example_data.csv', delimiter = ';')

In [3]:
#rescaling ambient temperature in terms of setpoint deviation units:
#0 corresponds to the middle between the thresholds, +1 to the higher and -1 to the lower threshold
#column positions in df: 10 - higher threshold, 9 - lower threshold, 4 - initial unscaled ambient temperature
#NOTE: the cell overwrites column 4 of df, so running it twice rescales twice - reload the data before re-running
lower = df.iloc[:, 9]
higher = df.iloc[:, 10]
temperature = df.iloc[:, 4]
span = (higher - lower)/2
middle = span + lower
#whole-column arithmetic replaces the row-by-row loop; both of its branches applied this same formula
rescaled = (temperature - middle)/span
#rows that are neither above nor below the middle (equal to it, or not comparable because a value is missing)
#are set to 0, exactly as in the row-by-row version
rescaled = rescaled.where((temperature > middle) | (temperature < middle), 0)
df[df.columns[4]] = rescaled

Features for tree-based algorithms (do not require normalization)

In [4]:
#using only needed columns for predictor features and a target feature
#the predictors are taken as an explicit copy, so that later column assignments on X
#modify X only and are not treated as writes into a selection of df
y = df['label']
X = df[['sen_role', 'sc_leg', 'leg_rel', 'leg_abs', 'temp_p', 'spd', 'slope', 'db_1h', 'da_1h']].copy()

In [2]:
#encoding categorical features as ordinal codes (one numeric code per category; no dummy columns are created here)
#both columns are encoded with a single fit, so that enc.categories_ keeps the categories of both features
#(fitting the same encoder twice would retain the categories of the last column only)
enc = OrdinalEncoder()
codes = enc.fit_transform(X[['sen_role', 'sc_leg']])
#assign() builds a new frame instead of writing into a selection of df; the column order is unchanged
X = X.assign(sen_role = codes[:, 0], sc_leg = codes[:, 1])

Features for other algorithms (except naive Bayes) that require normalization

In [25]:
#target feature and predictor features for the algorithms that need normalized input
yo = df.loc[:, 'label']
Xo = df.loc[:, ['sen_role', 'sc_leg', 'leg_rel', 'leg_abs', 'temp_p', 'spd', 'slope', 'db_1h', 'da_1h']]

In [26]:
#replacing every categorical variable by dummy variables (one additional column per category, named '<feature>_<category>')
Xo = pd.get_dummies(data = Xo, prefix_sep = '_')

In [3]:
#inspecting the first five rows to check that the conversion went without errors
Xo.iloc[:5]

In [28]:
#removing one dummy column per categorical feature, since it carries no additional information
Xo = Xo.drop(columns = ['sen_role_AMB', 'sc_leg_e']) #the exact names may differ depending on the naming convention in a different data set

Numeric feature normalization

In [30]:
#min-max normalization of the numeric features to the range [0, 1]
features = ['leg_rel', 'leg_abs', 'temp_p', 'spd', 'slope', 'db_1h', 'da_1h'] #the exact names may differ depending on the naming convention in a different data set
numeric = Xo[features].astype(float)
#minimum and range are computed once from the unscaled data; recomputing them inside a row loop
#while the column is being overwritten mixes scaled and unscaled values and yields wrong results
minimum = numeric.min()
value_range = numeric.max() - minimum
#a constant column has a zero range; dividing by 1 instead maps it to 0 and avoids a division by zero
value_range = value_range.where(value_range != 0, 1)
#columns are addressed by name, so the result does not depend on their position in Xo
Xo[features] = (numeric - minimum)/value_range

In [4]:
#inspecting the first five rows to check the results of normalization and of the dummy encoding
Xo.iloc[:5]

Searching for the best hyperparameter combination for each classifier and collecting scores for evaluation metrics

k-NN

In [12]:
#lists for collecting the per-fold scores of the evaluation metrics over all repetitions
knn_acc = []
knn_auc = []
knn_pr = []
knn_rec = []
knn_gin = []

#classifier instance initialization
knn = KNeighborsClassifier()

#creating an instance of oversampler and incorporating it into a pipeline
#(columns 7-10 of Xo are declared to SMOTENC as categorical)
smtnc = SMOTENC(categorical_features = [7, 8, 9, 10], sampling_strategy = 0.5, random_state = 0, n_jobs = -1)
model = Pipeline([('smtnc', smtnc), ('clf', knn)])

#specifying a domain of hyperparameter values and running a grid search over them
params = {'clf__n_neighbors': [your_range]} #as 'your_range' specify all possible values of k to search among; all values to 100 have been used in the project
grid = GridSearchCV(model, params, cv = 10, scoring = 'accuracy', n_jobs = -1)
grid.fit(Xo, yo)

#extracting the value(s) of the optimal hyperparameter(s); the pipeline step prefix 'clf__' is removed
#from the names, so that they can be passed directly to the classifier
res = grid.best_params_
final = {name[len('clf__'):]: value for name, value in res.items()}

#new instance of classifier to be executed with the optimal hyperparameter(s)
knn_new = KNeighborsClassifier()
knn_new.set_params(**final)
model_new = Pipeline([('smtnc', smtnc), ('clf', knn_new)])

#collection of results (evaluation metric values); all four metrics are scored on the same fitted folds,
#so every repetition fits the 10 folds once instead of once per metric
#NOTE(review): with an integer cv the folds are presumably identical in every repetition, so repetitions
#differ only through randomness inside the pipeline - confirm that this is intended
metric_names = ['accuracy', 'roc_auc', 'precision', 'recall']
for j in range(100):
    scores = cross_validate(model_new, Xo, yo, scoring = metric_names, cv = 10)
    knn_acc.extend(scores['test_accuracy'])
    knn_auc.extend(scores['test_roc_auc'])
    knn_gin.extend(2*scores['test_roc_auc'] - 1) #Gini coefficient derived from AUC
    knn_pr.extend(scores['test_precision'])
    knn_rec.extend(scores['test_recall'])

In [5]:
#optimal hyperparameter value(s) selected by the search in the previous cell (displayed as a copy of 'final')
dict(final)

In [6]:
#printing the mean and the standard deviation of every collected evaluation metric (one metric per output line)
parts = []
for tag, values in [('acc', knn_acc), ('auc', knn_auc), ('gin', knn_gin), ('pr', knn_pr), ('rec', knn_rec)]:
    prefix = '\n' if parts else ''
    parts += [prefix + tag + ' (mean): ', sum(values)/len(values), tag + ' (std): ', np.std(values)]
print(*parts)

Random forest

In [15]:
#the procedure is analogous to that with k-NN above, therefore only short explanations are given
rf_acc = []
rf_auc = []
rf_pr = []
rf_rec = []
rf_gin = []
rf = RandomForestClassifier()
smtnc = SMOTENC(categorical_features = [7, 8, 9, 10], sampling_strategy = 0.5, random_state = 0, n_jobs = -1)
model = Pipeline([('smtnc', smtnc), ('clf', rf)])
params = {'clf__n_estimators': [your_range],
          'clf__max_features': (your_values), 'clf__max_depth': [your_range],
          'clf__min_samples_leaf': [your_range]} #refer to the documentation on scikit-learn with regard to all possible hyperparameters; in this project, these hyperparameters have been tuned
grid = GridSearchCV(model, params, cv = 10, scoring = 'accuracy', n_jobs = -1)
grid.fit(Xo, yo)
res = grid.best_params_
#removing the pipeline step prefix 'clf__' from the names of the optimal hyperparameters
final = {name[len('clf__'):]: value for name, value in res.items()}
rf_new = RandomForestClassifier()
rf_new.set_params(**final)
model_new = Pipeline([('smtnc', smtnc), ('clf', rf_new)])
#all four metrics are scored on the same fitted folds: one 10-fold fit per repetition instead of four
metric_names = ['accuracy', 'roc_auc', 'precision', 'recall']
for j in range(100):
    scores = cross_validate(model_new, Xo, yo, scoring = metric_names, cv = 10)
    rf_acc.extend(scores['test_accuracy'])
    rf_auc.extend(scores['test_roc_auc'])
    rf_gin.extend(2*scores['test_roc_auc'] - 1) #Gini coefficient derived from AUC
    rf_pr.extend(scores['test_precision'])
    rf_rec.extend(scores['test_recall'])

In [7]:
#optimal hyperparameter value(s) selected by the search in the previous cell (displayed as a copy of 'final')
dict(final)

In [8]:
#printing the mean and the standard deviation of every collected evaluation metric (one metric per output line)
parts = []
for tag, values in [('acc', rf_acc), ('auc', rf_auc), ('gin', rf_gin), ('pr', rf_pr), ('rec', rf_rec)]:
    prefix = '\n' if parts else ''
    parts += [prefix + tag + ' (mean): ', sum(values)/len(values), tag + ' (std): ', np.std(values)]
print(*parts)

Support vector machine

In [32]:
#the procedure is analogous to that with k-NN above, therefore only short explanations are given
svm_acc = []
svm_auc = []
svm_pr = []
svm_rec = []
svm_gin = []
svm = SVC()
smtnc = SMOTENC(categorical_features = [7, 8, 9, 10], sampling_strategy = 0.5, random_state = 0, n_jobs = -1)
model = Pipeline([('smtnc', smtnc), ('clf', svm)])
params = {'clf__C': [your_range], 'clf__kernel': (your_values),
          'clf__gamma': [your_range], 'clf__coef0': [your_range], 'clf__degree': [your_range]} #refer to the documentation on scikit-learn with regard to all possible hyperparameters; in this project, these hyperparameters have been tuned
#a randomized (not exhaustive) search is used here; the seed makes the sampled candidates reproducible
grid = RandomizedSearchCV(model, params, cv = 10, scoring = 'accuracy', n_jobs = -1, random_state = 0)
grid.fit(Xo, yo)
res = grid.best_params_
#removing the pipeline step prefix 'clf__' from the names of the optimal hyperparameters
final = {name[len('clf__'):]: value for name, value in res.items()}
svm_new = SVC()
svm_new.set_params(**final)
model_new = Pipeline([('smtnc', smtnc), ('clf', svm_new)])
#all four metrics are scored on the same fitted folds: one 10-fold fit per repetition instead of four
metric_names = ['accuracy', 'roc_auc', 'precision', 'recall']
for j in range(100):
    scores = cross_validate(model_new, Xo, yo, scoring = metric_names, cv = 10)
    svm_acc.extend(scores['test_accuracy'])
    svm_auc.extend(scores['test_roc_auc'])
    svm_gin.extend(2*scores['test_roc_auc'] - 1) #Gini coefficient derived from AUC
    svm_pr.extend(scores['test_precision'])
    svm_rec.extend(scores['test_recall'])

In [9]:
#optimal hyperparameter value(s) selected by the search in the previous cell (displayed as a copy of 'final')
dict(final)

In [10]:
#printing the mean and the standard deviation of every collected evaluation metric (one metric per output line)
parts = []
for tag, values in [('acc', svm_acc), ('auc', svm_auc), ('gin', svm_gin), ('pr', svm_pr), ('rec', svm_rec)]:
    prefix = '\n' if parts else ''
    parts += [prefix + tag + ' (mean): ', sum(values)/len(values), tag + ' (std): ', np.std(values)]
print(*parts)

AdaBoost

In [11]:
#the procedure is analogous to that with k-NN above, therefore only short explanations are given
abc_acc = []
abc_auc = []
abc_pr = []
abc_rec = []
abc_gin = []
abc = AdaBoostClassifier()
smtnc = SMOTENC(categorical_features = [7,8,9,10], sampling_strategy = 0.5, random_state = 0, n_jobs = -1)
model = Pipeline([('smtnc', smtnc), ('clf', abc)])
#NOTE(review): newer scikit-learn releases name the 'base_estimator' parameter 'estimator' - confirm against the installed version
params = {'clf__base_estimator': (your_estimators_with_hyperparameters),
          'clf__n_estimators': [your_range],
          'clf__learning_rate': [your_range]} #refer to the documentation on scikit-learn with regard to all possible hyperparameters; in this project, these hyperparameters have been tuned
grid = GridSearchCV(model, params, cv = 10, scoring = 'accuracy', n_jobs = -1)
grid.fit(Xo, yo)
res = grid.best_params_
#removing the pipeline step prefix 'clf__' from the names of the optimal hyperparameters
final = {name[len('clf__'):]: value for name, value in res.items()}
abc_new = AdaBoostClassifier()
abc_new.set_params(**final)
model_new = Pipeline([('smtnc', smtnc), ('clf', abc_new)])
#all four metrics are scored on the same fitted folds: one 10-fold fit per repetition instead of four
metric_names = ['accuracy', 'roc_auc', 'precision', 'recall']
for j in range(100):
    scores = cross_validate(model_new, Xo, yo, scoring = metric_names, cv = 10)
    abc_acc.extend(scores['test_accuracy'])
    abc_auc.extend(scores['test_roc_auc'])
    abc_gin.extend(2*scores['test_roc_auc'] - 1) #Gini coefficient derived from AUC
    abc_pr.extend(scores['test_precision'])
    abc_rec.extend(scores['test_recall'])

In [12]:
#optimal hyperparameter value(s) selected by the search in the previous cell (displayed as a copy of 'final')
dict(final)

In [13]:
#printing the mean and the standard deviation of every collected evaluation metric (one metric per output line)
parts = []
for tag, values in [('acc', abc_acc), ('auc', abc_auc), ('gin', abc_gin), ('pr', abc_pr), ('rec', abc_rec)]:
    prefix = '\n' if parts else ''
    parts += [prefix + tag + ' (mean): ', sum(values)/len(values), tag + ' (std): ', np.std(values)]
print(*parts)

Multilayer perceptron

In [16]:
#the procedure is analogous to that with k-NN above, therefore only short explanations are given
mlp_acc = []
mlp_auc = []
mlp_pr = []
mlp_rec = []
mlp_gin = []
mlp = MLPClassifier()
smtnc = SMOTENC(categorical_features = [7,8,9,10], sampling_strategy = 0.5, random_state = 0, n_jobs = -1)
model = Pipeline([('smtnc', smtnc), ('clf', mlp)])
#every key carries the pipeline step prefix 'clf__' (including the solver), otherwise the search
#rejects it as an unknown parameter of the pipeline
params = {'clf__hidden_layer_sizes': (your_architectures),
          'clf__activation': (your_values), 'clf__solver': (your_values),
          'clf__alpha': [your_range],
          'clf__learning_rate': (your_values),
         'clf__learning_rate_init': [your_range]} #refer to the documentation on scikit-learn with regard to all possible hyperparameters; in this project, these hyperparameters have been tuned
grid = GridSearchCV(model, params, cv = 10, scoring = 'accuracy', n_jobs = -1)
grid.fit(Xo, yo)
res = grid.best_params_
#removing the pipeline step prefix 'clf__' from the names of the optimal hyperparameters
final = {name[len('clf__'):]: value for name, value in res.items()}
mlp_new = MLPClassifier()
mlp_new.set_params(**final)
model_new = Pipeline([('smtnc', smtnc), ('clf', mlp_new)])
#all four metrics are scored on the same fitted folds: one 10-fold fit per repetition instead of four
metric_names = ['accuracy', 'roc_auc', 'precision', 'recall']
for j in range(100):
    scores = cross_validate(model_new, Xo, yo, scoring = metric_names, cv = 10)
    mlp_acc.extend(scores['test_accuracy'])
    mlp_auc.extend(scores['test_roc_auc'])
    mlp_gin.extend(2*scores['test_roc_auc'] - 1) #Gini coefficient derived from AUC
    mlp_pr.extend(scores['test_precision'])
    mlp_rec.extend(scores['test_recall'])

In [14]:
#optimal hyperparameter value(s) selected by the search in the previous cell (displayed as a copy of 'final')
dict(final)

In [15]:
#printing the mean and the standard deviation of every collected evaluation metric (one metric per output line)
parts = []
for tag, values in [('acc', mlp_acc), ('auc', mlp_auc), ('gin', mlp_gin), ('pr', mlp_pr), ('rec', mlp_rec)]:
    prefix = '\n' if parts else ''
    parts += [prefix + tag + ' (mean): ', sum(values)/len(values), tag + ' (std): ', np.std(values)]
print(*parts)

Logistic regression

In [22]:
#the procedure is analogous to that with k-NN above, therefore only short explanations are given
lr_acc = []
lr_auc = []
lr_pr = []
lr_rec = []
lr_gin = []
lr = LogisticRegression()
smtnc = SMOTENC(categorical_features = [7,8,9,10], sampling_strategy = 0.5, random_state = 0, n_jobs = -1)
model = Pipeline([('smtnc', smtnc), ('clf', lr)])
params = {'clf__penalty': (your_values),
          'clf__C': [your_range],
          'clf__max_iter': [your_values]} #refer to the documentation on scikit-learn with regard to all possible hyperparameters; in this project, these hyperparameters have been tuned
grid = GridSearchCV(model, params, cv = 10, scoring = 'accuracy', n_jobs = -1)
grid.fit(Xo, yo)
res = grid.best_params_
#removing the pipeline step prefix 'clf__' from the names of the optimal hyperparameters
final = {name[len('clf__'):]: value for name, value in res.items()}
lr_new = LogisticRegression()
lr_new.set_params(**final)
model_new = Pipeline([('smtnc', smtnc), ('clf', lr_new)])
#all four metrics are scored on the same fitted folds: one 10-fold fit per repetition instead of four
metric_names = ['accuracy', 'roc_auc', 'precision', 'recall']
for j in range(100):
    scores = cross_validate(model_new, Xo, yo, scoring = metric_names, cv = 10)
    lr_acc.extend(scores['test_accuracy'])
    lr_auc.extend(scores['test_roc_auc'])
    lr_gin.extend(2*scores['test_roc_auc'] - 1) #Gini coefficient derived from AUC
    lr_pr.extend(scores['test_precision'])
    lr_rec.extend(scores['test_recall'])

In [17]:
#optimal hyperparameter value(s) selected by the search in the previous cell (displayed as a copy of 'final')
dict(final)

In [18]:
#printing the mean and the standard deviation of every collected evaluation metric (one metric per output line)
parts = []
for tag, values in [('acc', lr_acc), ('auc', lr_auc), ('gin', lr_gin), ('pr', lr_pr), ('rec', lr_rec)]:
    prefix = '\n' if parts else ''
    parts += [prefix + tag + ' (mean): ', sum(values)/len(values), tag + ' (std): ', np.std(values)]
print(*parts)

Decision tree

In [19]:
#the procedure is analogous to that with k-NN above, therefore only short explanations are given
dt_acc = []
dt_auc = []
dt_pr = []
dt_rec = []
dt_gin = []
dt = DecisionTreeClassifier()
smtnc = SMOTENC(categorical_features = [7,8,9,10], sampling_strategy = 0.5, random_state = 0, n_jobs = -1)
model = Pipeline([('smtnc', smtnc), ('clf', dt)])
params = {'clf__splitter': (your_values), 'clf__max_depth': [your_range],
          'clf__min_samples_split': [your_range],
          'clf__max_features': (your_values)} #refer to the documentation on scikit-learn with regard to all possible hyperparameters; in this project, these hyperparameters have been tuned
grid = GridSearchCV(model, params, cv = 10, scoring = 'accuracy', n_jobs = -1)
grid.fit(Xo, yo)
res = grid.best_params_
#removing the pipeline step prefix 'clf__' from the names of the optimal hyperparameters
final = {name[len('clf__'):]: value for name, value in res.items()}
dt_new = DecisionTreeClassifier()
dt_new.set_params(**final)
model_new = Pipeline([('smtnc', smtnc), ('clf', dt_new)])
#all four metrics are scored on the same fitted folds: one 10-fold fit per repetition instead of four
metric_names = ['accuracy', 'roc_auc', 'precision', 'recall']
for j in range(100):
    scores = cross_validate(model_new, Xo, yo, scoring = metric_names, cv = 10)
    dt_acc.extend(scores['test_accuracy'])
    dt_auc.extend(scores['test_roc_auc'])
    dt_gin.extend(2*scores['test_roc_auc'] - 1) #Gini coefficient derived from AUC
    dt_pr.extend(scores['test_precision'])
    dt_rec.extend(scores['test_recall'])

In [20]:
#optimal hyperparameter value(s) selected by the search in the previous cell (displayed as a copy of 'final')
dict(final)

In [21]:
#printing the mean and the standard deviation of every collected evaluation metric (one metric per output line)
parts = []
for tag, values in [('acc', dt_acc), ('auc', dt_auc), ('gin', dt_gin), ('pr', dt_pr), ('rec', dt_rec)]:
    prefix = '\n' if parts else ''
    parts += [prefix + tag + ' (mean): ', sum(values)/len(values), tag + ' (std): ', np.std(values)]
print(*parts)

Bagging

In [22]:
#the procedure is analogous to that with k-NN above, therefore only short explanations are given
bag_acc = []
bag_auc = []
bag_pr = []
bag_rec = []
bag_gin = []
bag = BaggingClassifier()
smtnc = SMOTENC(categorical_features = [7,8,9,10], sampling_strategy = 0.5, random_state = 0, n_jobs = -1)
model = Pipeline([('smtnc', smtnc), ('clf', bag)])
params = {'clf__n_estimators': [your_range],
          'clf__max_samples': [your_range],
          'clf__max_features': [your_range]} #refer to the documentation on scikit-learn with regard to all possible hyperparameters; in this project, these hyperparameters have been tuned
grid = GridSearchCV(model, params, cv = 10, scoring = 'accuracy', n_jobs = -1)
grid.fit(Xo, yo)
res = grid.best_params_
#removing the pipeline step prefix 'clf__' from the names of the optimal hyperparameters
final = {name[len('clf__'):]: value for name, value in res.items()}
bag_new = BaggingClassifier()
bag_new.set_params(**final)
model_new = Pipeline([('smtnc', smtnc), ('clf', bag_new)])
#all four metrics are scored on the same fitted folds: one 10-fold fit per repetition instead of four
metric_names = ['accuracy', 'roc_auc', 'precision', 'recall']
for j in range(100):
    scores = cross_validate(model_new, Xo, yo, scoring = metric_names, cv = 10)
    bag_acc.extend(scores['test_accuracy'])
    bag_auc.extend(scores['test_roc_auc'])
    bag_gin.extend(2*scores['test_roc_auc'] - 1) #Gini coefficient derived from AUC
    bag_pr.extend(scores['test_precision'])
    bag_rec.extend(scores['test_recall'])

In [24]:
#optimal hyperparameter value(s) selected by the search in the previous cell (displayed as a copy of 'final')
dict(final)

In [25]:
#printing the mean and the standard deviation of every collected evaluation metric (one metric per output line)
parts = []
for tag, values in [('acc', bag_acc), ('auc', bag_auc), ('gin', bag_gin), ('pr', bag_pr), ('rec', bag_rec)]:
    prefix = '\n' if parts else ''
    parts += [prefix + tag + ' (mean): ', sum(values)/len(values), tag + ' (std): ', np.std(values)]
print(*parts)

Gradient boosting

In [26]:
#the procedure is analogous to that with k-NN above, therefore only short explanations are given
gb_acc = []
gb_auc = []
gb_pr = []
gb_rec = []
gb_gin = []
gb = GradientBoostingClassifier()
smtnc = SMOTENC(categorical_features = [7,8,9,10], sampling_strategy = 0.5, random_state = 0, n_jobs = -1)
model = Pipeline([('smtnc', smtnc), ('clf', gb)])
params = {'clf__loss': (your_values), 'clf__learning_rate': [your_range],
         'clf__n_estimators': [your_range],
         'clf__max_depth': [your_range],
          'clf__min_samples_split': [your_range]} #refer to the documentation on scikit-learn with regard to all possible hyperparameters; in this project, these hyperparameters have been tuned
grid = GridSearchCV(model, params, cv = 10, scoring = 'accuracy', n_jobs = -1)
grid.fit(Xo, yo)
res = grid.best_params_
#removing the pipeline step prefix 'clf__' from the names of the optimal hyperparameters
final = {name[len('clf__'):]: value for name, value in res.items()}
gb_new = GradientBoostingClassifier()
gb_new.set_params(**final)
model_new = Pipeline([('smtnc', smtnc), ('clf', gb_new)])
#all four metrics are scored on the same fitted folds: one 10-fold fit per repetition instead of four
metric_names = ['accuracy', 'roc_auc', 'precision', 'recall']
for j in range(100):
    scores = cross_validate(model_new, Xo, yo, scoring = metric_names, cv = 10)
    gb_acc.extend(scores['test_accuracy'])
    gb_auc.extend(scores['test_roc_auc'])
    gb_gin.extend(2*scores['test_roc_auc'] - 1) #Gini coefficient derived from AUC
    gb_pr.extend(scores['test_precision'])
    gb_rec.extend(scores['test_recall'])

In [27]:
#optimal hyperparameter value(s) selected by the search in the previous cell (displayed as a copy of 'final')
dict(final)

In [28]:
#printing the mean and the standard deviation of every collected evaluation metric (one metric per output line)
parts = []
for tag, values in [('acc', gb_acc), ('auc', gb_auc), ('gin', gb_gin), ('pr', gb_pr), ('rec', gb_rec)]:
    prefix = '\n' if parts else ''
    parts += [prefix + tag + ' (mean): ', sum(values)/len(values), tag + ' (std): ', np.std(values)]
print(*parts)

Naïve Bayes

In [29]:
#reading the data set for naive Bayes (file 'example_data.csv', fields separated by ',')
#columns in this example: 'sensor role', 'supply chain leg', 'distance to previous generated event (leg_rel)',
#'distance to expected next event (leg_abs)', 'atmosphere temperature at current location (temp_p)', 'setpoint deviation (spd)',
#'slope of two recent measurements', 'average deviation before a triggered alarm within one hour',
#'average deviation after a triggered alarm within the next hour' (estimated with the help of a random forest regressor),
#'alarm label' (target feature)
#!!! The continuous features for naive Bayes must already be discretized with the multi-interval discretization method by Fayyad and Irani (1993)
dfd = pd.read_csv(filepath_or_buffer = 'example_data.csv', delimiter = ',')

In [30]:
#target feature and predictor features for naive Bayes
yonb = dfd.loc[:, 'label']
Xonb = dfd.loc[:, ['sen_role', 'sc_leg', 'leg_rel', 'leg_abs', 'temp_p', 'spd', 'slope', 'db_1h', 'da_1h']]

In [31]:
#replacing every categorical variable by dummy variables (one additional column per category, named '<feature>_<category>')
Xonb = pd.get_dummies(data = Xonb, prefix_sep = '_')
#showing all resulting columns (to be used for deletion of superfluous dummy features and
#declaration of lists 'features' and 'remaining_features' in the next steps)
Xonb.columns

In [32]:
#removing the dummy columns that carry no additional information (i.e., one of the columns representing each feature;
#it means that n dummy features should be deleted for n initial features)
Xonb = Xonb.drop(columns = [columns_to_drop_separated_by_comma]) #instead of 'columns_to_drop_separated_by_comma' specify what columns should be dropped

In [33]:
#the procedure is analogous to that with k-NN above, but without a hyperparameter search
bnb_acc = []
bnb_auc = []
bnb_pr = []
bnb_rec = []
bnb_gin = []
bnb = BernoulliNB(parameters) #parameters may be skipped if no additional assumptions are made
#random oversampling is used here instead of SMOTENC; the variable and step name 'smtnc' are kept as in the other cells
smtnc = RandomOverSampler(sampling_strategy = 0.5, random_state = 0)
model = Pipeline([('smtnc', smtnc), ('clf', bnb)])
#every metric, including accuracy, is scored on the oversampling pipeline, so the metrics are comparable
#with each other and with the other classifiers; all four come from the same fitted folds
metric_names = ['accuracy', 'roc_auc', 'precision', 'recall']
for j in range(100):
    scores = cross_validate(model, Xonb, yonb, scoring = metric_names, cv = 10)
    bnb_acc.extend(scores['test_accuracy'])
    bnb_auc.extend(scores['test_roc_auc'])
    bnb_gin.extend(2*scores['test_roc_auc'] - 1) #Gini coefficient derived from AUC
    bnb_pr.extend(scores['test_precision'])
    bnb_rec.extend(scores['test_recall'])

In [34]:
#printing the mean and the standard deviation of every collected evaluation metric (one metric per output line)
parts = []
for tag, values in [('acc', bnb_acc), ('auc', bnb_auc), ('gin', bnb_gin), ('pr', bnb_pr), ('rec', bnb_rec)]:
    prefix = '\n' if parts else ''
    parts += [prefix + tag + ' (mean): ', sum(values)/len(values), tag + ' (std): ', np.std(values)]
print(*parts)