In [1]:
#importing all necessary libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict
from sklearn import metrics
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC, SMOTE, RandomOverSampler #using these oversampling methods, since the original data set is imbalanced
import operator
from ast import literal_eval
import sys
#silencing every warning, unless warning options were passed to the interpreter explicitly
if len(sys.warnoptions) == 0:
    import warnings
    warnings.simplefilter(action = "ignore")

Using TensorFlow backend.


In [1]:
#loading data from the semicolon-separated file 'example_data.csv'
#columns in this example, in file order:
#  'sensor role'
#  'supply chain leg'
#  'distance to previous generated event (leg_rel)'
#  'distance to expected next event (leg_abs)'
#  'atmosphere temperature at current location (temp_p)'
#  'setpoint deviation (spd)'
#  'slope of two recent measurements'
#  'average deviation before a triggered alarm within one hour'
#  'average deviation after a triggered alarm within one next hour' (estimated with the help of random forest regressor)
#  'lower threshold'
#  'higher threshold'
#  'alarm label' (target feature)
df = pd.read_csv('example_data.csv', delimiter = ';')
#showing the first row as a quick check that the file was parsed as expected
df.head(n = 1)

In [3]:
#rescaling ambient temperature in terms of setpoint deviation units:
#0 corresponds to the middle between the two thresholds, +1 to the higher threshold and -1 to the lower threshold
#column 10 of df corresponds to higher threshold, column 9 - to lower, and column 4 contains the initial unscaled ambient temperature value
#the whole column is computed in one vectorised step instead of reading and writing single cells row by row
#NOTE: rows whose temperature is neither above nor below the middle (exactly in the middle, or a missing value)
#are set to 0, exactly as the original row-by-row loop did
#NOTE: this cell overwrites column 4, so running it twice rescales already rescaled values - reload df before re-running
temp_raw = df.iloc[:, 4].astype(float)
lower_threshold = df.iloc[:, 9].astype(float)
higher_threshold = df.iloc[:, 10].astype(float)
half_span = (higher_threshold - lower_threshold)/2
band_middle = half_span + lower_threshold
off_middle = (temp_raw > band_middle) | (temp_raw < band_middle)
#replacing the whole column (rather than writing cell by cell) also avoids dtype problems when the raw column holds integers
df[df.columns[4]] = ((temp_raw - band_middle)/half_span).where(off_middle, 0.0).values

In [4]:
#keeping only the columns that are needed: the target feature goes to y, the predictor features go to X
predictor_columns = ['sen_role', 'sc_leg', 'leg_rel', 'leg_abs', 'temp_p', 'spd', 'slope', 'db_1h', 'da_1h']
y = df.loc[:, 'label']
X = df.loc[:, predictor_columns]

In [5]:
#one-hot encoding: every categorical variable is replaced by one additional dummy column per category
X = pd.get_dummies(data = X)

In [6]:
#removing the dummy columns that do not carry additional information
redundant_dummies = ['sen_role_AMB', 'sc_leg_e']
X = X.drop(columns = redundant_dummies)

In [2]:
#looking at the first rows to check that the conversion went without errors
X.head(n = 5)

In [8]:
#creating lists of feature groups: a group is a list of column names that are added or removed together
#(one column for most features, several dummy columns for an initially categorical feature)
single_column_features = ['leg_rel', 'leg_abs', 'temp_p', 'spd', 'slope', 'db_1h', 'da_1h', 'sen_role_REG']
features = [[name] for name in single_column_features]
features.append(['sc_leg_f', 'sc_leg_h', 'sc_leg_p'])
#independent copy of the groups, so that it can be shrunk later without touching features
remaining_features = [list(group) for group in features]

In [3]:
#running an automated feature ranking procedure (backward feature elimination)
#in every round each remaining feature group is left out in turn, the reduced feature set is scored with repeated
#cross-validation, and the group whose removal gives the highest mean accuracy is removed for good
#this chunk of code will output the scores for n, n-1, ..., and two remaining feature groups (keyed by the group that
#was left out) and the standard deviation of the fold accuracies of the candidate that was selected
N_REPEATS = 100 #how many times the cross-validation is repeated for every candidate
N_FOLDS = 10 #number of folds of each cross-validation run

categorical = ['sen_role_REG', 'sc_leg_f', 'sc_leg_h', 'sc_leg_p']

def flatten_groups(groups):
    """Return the column names of all feature groups as one flat list, keeping their order."""
    return [col for group in groups for col in group]

def make_sampler(col_list):
    """Return the oversampler that fits the mix of categorical and continuous columns in col_list.

    col_list is the list of column names of the feature matrix that will be resampled.
    SMOTENC needs both kinds of columns, so RandomOverSampler is used when every column is
    categorical and plain SMOTE when none is.
    """
    #positions of the categorical (dummy) columns that are still present in this feature set
    cat_feat = [col_list.index(name) for name in categorical if name in col_list]
    if len(cat_feat) == len(col_list):
        return RandomOverSampler(sampling_strategy = 0.5, random_state = 0)
    if len(cat_feat) == 0:
        return SMOTE(sampling_strategy = 0.5, random_state = 0, n_jobs = -1)
    return SMOTENC(categorical_features = cat_feat, sampling_strategy = 0.5, random_state = 0, n_jobs = -1)

cols = flatten_groups(remaining_features)

clf = ExampleClassifier(optimal_parameters) #this line should be replaced by the initialization of any of the classifiers compared and optimal parameters are specified based on grid hyperparameter selection procedure       

#going through the loop and considering all feature combinations in the backward feature elimination procedure
for elimination_round in range(len(features) - 1):
    rCLFbe = {} #dictionary for collecting left-out feature names and their corresponding mean accuracy in evaluation runs
    fold_accuracies = {} #all single fold accuracies per candidate, needed for the standard deviation
    candidates = {} #maps the printed name of a candidate back to the feature group itself
    X_new = X[cols]
    for group in remaining_features:
        X_test = X_new.drop(group, axis = 1)
        smtnc = make_sampler(list(X_test.columns))
        model = Pipeline([('smtnc', smtnc), ('clf', clf)])
        acc = []
        #NOTE(review): with an integer cv scikit-learn presumably builds the same unshuffled folds in every repeat and
        #the samplers are seeded, so the repeats only differ if clf itself is stochastic - confirm this is intended
        for repeat in range(N_REPEATS):
            acc.extend(cross_val_score(model, X_test, y, scoring = 'accuracy', cv = N_FOLDS))
        name = str(group)
        #mean over all folds of all repeats (the former sum(acc)/100 reported 10 times the mean accuracy)
        rCLFbe[name] = np.mean(acc)
        fold_accuracies[name] = acc
        candidates[name] = group
    #the first candidate with the highest score wins, i.e. leaving this group out hurts the least
    to_remove = max(rCLFbe.items(), key = operator.itemgetter(1))[0]
    #the standard deviation now belongs to the selected candidate, not to whichever candidate was evaluated last
    print(len(remaining_features), ' best features are: ', rCLFbe, 'std is ', np.std(fold_accuracies[to_remove]))
    remaining_features.remove(candidates[to_remove])
    cols = flatten_groups(remaining_features)