In [1]:
# This notebook is used for the feature selection stage of the AML project.
# It takes the pre-processed data as input and then tries several different feature selection techniques.
# The output is a list of the selected features or a reduced dimension dataset.
# The final chosen feature selection method will be used in the final .py file for the project.

In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA

np.random.seed(999)

  from numpy.core.umath_tests import inner1d


In [2]:
# Read in pre-processed training data
file = 'ScaledDataSet.csv'
data = pd.read_csv(file)

In [4]:
# Define our features and our target classifier
X = data.iloc[:,:-1]
Y = data.iloc[:,-1]

# Initial no. of features
print(len(X.columns))

152


In [5]:
# First, remove all features with zero variance i.e. features with constant values. 
#This will be a step in every feature selection process we run.
X = X.loc[:,X.apply(pd.Series.nunique) != 1]

# Reduced no. of features
print(len(X.columns))

78


In [6]:
# Chi squared test will not work as we have some negative values

In [6]:
# Try RFE with logistic regression
model = LogisticRegression()

# Try 10 features
rfe_log = RFE(model, 10)
fit_rfe_log = rfe_log.fit(X, Y)
# Create list with names of features
rfe_log_features = X.columns[fit_rfe_log.get_support()]

# Print names of selected features
for f in rfe_log_features:
    print(f)

radiotap.datarate
wlan.fc.pwrmgt
wlan.fc.moredata
wlan.fc.protected
wlan_mgt.fixed.capabilities.preamble
wlan_mgt.fixed.capabilities.short_slot_time
wlan_mgt.fixed.timestamp
wlan_mgt.fixed.auth_seq
wlan_mgt.rsn.akms.type
wlan.wep.key


In [9]:
# Try Feature Importance using a tree classifier
model_tree = ExtraTreesClassifier()
fit_tree = model_tree.fit(X,Y)
# Get feature importances
feature_importance = fit_tree.feature_importances_

# Join feature importances to feature names and rank by importance
feature_ranking_data = {'Feature':X.columns.values, 'Importance': feature_importance}
tree_features = pd.DataFrame(data=feature_ranking_data).nlargest(10,'Importance')
# Get list object of the top 10 features
tree_features = list(tree_features['Feature'])
for f in tree_features:
    print(f)

wlan.fc.subtype
radiotap.channel.type.cck
wlan.fc.type_subtype
wlan.fc.pwrmgt
radiotap.channel.type.ofdm
frame.cap_len
wlan.fc.type
wlan.qos.priority
wlan.wep.icv
radiotap.mactime


In [10]:
# How similar are results from trees classifier to RFE logistic model
tree_set = set(tree_features)
rfe_log_set = set(rfe_log_features)

common_features = (tree_set & rfe_log_set)
no_common_features = len(common_features)

print('Number of common features: %s' % no_common_features)
print(common_features)

Number of common features: 1
{'wlan.fc.pwrmgt'}


In [11]:
# Try RFE with SVC
model_svm = LinearSVC()
rfe_svm = RFE(model_svm, 10)
fit_rfe_svm = rfe_svm.fit(X,Y)

# Create list with names of features
rfe_svm_features = X.columns[fit_rfe_svm.get_support()]

# Print names of selected features
for f in rfe_svm_features:
    print(f)

wlan.fc.moredata
wlan.fc.protected
wlan.bssid
wlan_mgt.fixed.capabilities.ess
wlan_mgt.fixed.capabilities.preamble
wlan_mgt.fixed.listen_ival
wlan_mgt.fixed.timestamp
wlan_mgt.tim.dtim_period
wlan_mgt.rsn.akms.type
data.len


In [12]:
# Find common features between svm and logistic model
rfe_svm_set = set(rfe_svm_features)

common_features_svm_log = (rfe_svm_set & rfe_log_set)
common_features_svm_trees = (rfe_svm_set & tree_set)

print('Number of common features with svm and log: %s' % len(common_features_svm_log))
print(common_features_svm_log)

print('Number of common features with svm and tree: %s' % len(common_features_svm_trees))
print(common_features_svm_trees)

Number of common features with svm and log: 5
{'wlan.fc.protected', 'wlan_mgt.rsn.akms.type', 'wlan.fc.moredata', 'wlan_mgt.fixed.timestamp', 'wlan_mgt.fixed.capabilities.preamble'}
Number of common features with svm and tree: 0
set()


In [13]:
# Try PCA on the data

pca = PCA(n_components=5)
fit_pca = pca.fit(X)

# summarize components
print("Explained Variance: %s" % fit_pca.explained_variance_ratio_) 
print("Total variance explained by first 5 components: %s" % sum(fit_pca.explained_variance_ratio_))

Explained Variance: [0.80629725 0.12093112 0.02414355 0.0183569  0.01252285]
Total variance explained by first 5 components: 0.982251662135151


In [25]:
type(rfe_log_features)

pandas.core.indexes.base.Index

In [8]:
# Create a dataframe with just the features we're interested in
cols_to_use = rfe_log_features.union(['class'])
print(cols_to_use)
fs_output = data[cols_to_use]

Index(['class', 'radiotap.datarate', 'wlan.fc.moredata', 'wlan.fc.protected',
       'wlan.fc.pwrmgt', 'wlan.wep.key', 'wlan_mgt.fixed.auth_seq',
       'wlan_mgt.fixed.capabilities.preamble',
       'wlan_mgt.fixed.capabilities.short_slot_time',
       'wlan_mgt.fixed.timestamp', 'wlan_mgt.rsn.akms.type'],
      dtype='object')


In [34]:
fs_output.to_csv(r'FeatureSelectionOutput.csv', index=False)

In [10]:
# Select same features on test data for prediction

# Read in pre-processed test data
test_file = 'ScaledTestDataSet.csv'
test_data = pd.read_csv(file)

fs_test_output = test_data[cols_to_use]
#fs_test_output.to_csv(r'FeatureSelectionTestOutput.csv',index=False)

NameError: name 'cols_to_use' is not defined

In [11]:
len(test_data)

97044