In [1]:
#importing all necessary libraries
import pandas as pd
import numpy as np
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict
from sklearn import metrics
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC, SMOTE, RandomOverSampler #using these oversampling methods, since the original data set is imbalanced
import operator
from ast import literal_eval
import sys
#silencing all warnings, but only if no warning options were passed to the interpreter
if len(sys.warnoptions) == 0:
    import warnings
    warnings.simplefilter(action = "ignore")

Using TensorFlow backend.


In [1]:
#loading the data set from the file 'example_data.csv' (values separated by ',')
#columns in this example:
#  - 'sensor role'
#  - 'supply chain leg'
#  - 'distance to previous generated event (leg_rel)'
#  - 'distance to expected next event (leg_abs)'
#  - 'atmosphere temperature at current location (temp_p)'
#  - 'setpoint deviation (spd)'
#  - 'slope of two recent measurements'
#  - 'average deviation before a triggered alarm within one hour'
#  - 'average deviation after a triggered alarm within one next hour' (estimated with the help of a random forest regressor)
#  - 'alarm label' (target feature)
#!!! Important: for naive Bayes the continuous features should be discretized beforehand with the
#multi-interval discretization method by Fayyad and Irani (1993)
df = pd.read_csv(filepath_or_buffer = 'example_data.csv', sep = ',')
#showing the first row as a quick check of the loaded columns
df.head(n = 1)

In [3]:
#separating the loaded data into the predictor features (X) and the target feature (y)
X = df.loc[:, ['sen_role', 'sc_leg', 'leg_rel', 'leg_abs', 'temp_p', 'spd', 'slope', 'db_1h', 'da_1h']]
y = df.loc[:, 'label']

In [2]:
#replacing every categorical variable by dummy variables (additional columns are created for this purpose)
X = pd.get_dummies(data = X)
#showing all resulting columns; they are needed for the deletion of superfluous dummy features and
#for the declaration of the lists 'features' and 'remaining_features' in the next steps
X.columns

In [5]:
#removing the dummy columns that carry no additional information: one of the dummy columns representing
#each feature is redundant, i.e., n dummy columns should be deleted for n initial features
#replace 'columns_to_drop_separated_by_comma' with the names of the columns that should be dropped
X = X.drop(columns = [columns_to_drop_separated_by_comma])

In [6]:
#declaring the lists of features (or sets of features for initially categorical features)
#'features': must hold all predictor features that are left after the deletion of superfluous dummy features
#in the previous step; the exact listing depends on the number of multiple intervals found with the method
#by Fayyad and Irani (1993)
features = list()
#'remaining_features': must hold the same features as 'features'; the feature ranking step indexes every
#entry like a sequence of column names and removes entries from this list
remaining_features = list()

In [3]:
#running an automated feature ranking procedure (backward feature elimination)
#in every round each remaining feature is left out in turn (all of its columns are dropped), the model is
#evaluated with repeated 10-fold cross-validation, and the feature whose removal yields the highest mean
#accuracy is eliminated
#this chunk of code will output the scores for n, n-1, ..., and two remaining features and the standard
#deviation of the accuracy scores that belong to the feature eliminated in that round
n_repeats = 100 #number of repetitions of the 10-fold cross-validation per left-out feature

#flat list with the column names of all remaining features (every entry of 'remaining_features' is
#expected to be a list of column names)
cols = [col for group in remaining_features for col in group]

nb = BernoulliNB(parameters) #parameters may be skipped if no additional assumptions are made
#NOTE(review): 'smtnc' is not defined in the cells shown here; an oversampler object has to be assigned to
#this name before this cell is run (SMOTENC, SMOTE and RandomOverSampler are imported at the top) - confirm
model = Pipeline([('smtnc', smtnc), ('clf', nb)])

#going through the loop and considering all feature combinations in the backward feature elimination procedure
for _ in range(len(features) - 1):
    if len(remaining_features) < 2:
        break #nothing left to compare; only reached if 'features' and 'remaining_features' differ in length
    rNBbe = {} #mean accuracy per left-out feature (key: string representation of its column list)
    acc_std = {} #standard deviation of the accuracy scores per left-out feature (same keys as 'rNBbe')
    to_remove = None #feature whose removal gives the highest mean accuracy so far
    X_new = X[cols]
    for group in remaining_features:
        X_test = X_new.drop(group, axis = 1)
        acc = [] #all fold scores of all repetitions for this left-out feature
        for _rep in range(n_repeats):
            acc.extend(cross_val_score(model, X_test, y, scoring = 'accuracy', cv = 10))
        key = str(group)
        #averaging over every collected score (repetitions x folds) so that the result is a real accuracy
        rNBbe[key] = np.mean(acc)
        acc_std[key] = np.std(acc)
        #strict comparison keeps the first feature in case of ties
        if to_remove is None or rNBbe[key] > rNBbe[str(to_remove)]:
            to_remove = group
    #the reported standard deviation belongs to the feature that is eliminated in this round
    print(len(remaining_features), ' best features are: ', rNBbe, 'std is ', acc_std[str(to_remove)])
    remaining_features.remove(to_remove)
    cols = [col for group in remaining_features for col in group]