In [1]:
# This notebook is used for the feature selection stage of the AML project.
# It takes the pre-processed data as input and then tries several different feature selection techniques.
# The output is a list of the selected features or a reduced dimension dataset.
# The final chosen feature selection method will be used in the final .py file for the project.

In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
# Read in pre-processed training data.
# The path is relative, so the CSV must sit in the kernel's working directory.
file = 'ScaledDataSet.csv'
data = pd.read_csv(file)

In [3]:
# Split the frame positionally: every column but the last becomes the feature
# matrix X, and the last column becomes Y (presumably the class label - confirm
# against the pre-processing step that wrote the CSV).
X, Y = data.iloc[:, :-1], data.iloc[:, -1]

In [4]:
# Number of candidate features before any selection step
print(X.shape[1])

152


In [5]:
# Drop constant features (exactly one distinct value), which carry no information.
# This filter is applied before every feature selection method tried below.
distinct_counts = X.nunique()
X = X.loc[:, distinct_counts != 1]

In [6]:
# Number of features left after removing the constant columns
print(X.shape[1])

78


In [7]:
# Per-feature summary statistics (count, mean, std, min, quartiles, max) of the
# remaining columns; the min row shows whether any feature takes negative values.
X.describe()

Unnamed: 0,frame.time_delta,frame.time_delta_displayed,frame.len,frame.cap_len,radiotap.length,radiotap.present.tsft,radiotap.present.flags,radiotap.present.channel,radiotap.present.dbm_antsignal,radiotap.present.antenna,...,wlan_mgt.rsn.capabilities.ptksa_replay_counter,wlan_mgt.tcprep.trsmt_pow,wlan.wep.iv,wlan.wep.key,wlan.wep.icv,wlan.tkip.extiv,wlan.ccmp.extiv,wlan.qos.tid,wlan.qos.priority,data.len
count,97044.0,97044.0,97044.0,97044.0,97044.0,97044.0,97044.0,97044.0,97044.0,97044.0,...,97044.0,97044.0,97044.0,97044.0,97044.0,97044.0,97044.0,97044.0,97044.0,97044.0
mean,0.56921,0.56921,9.175521,9.175521,-0.000216,-0.000216,-0.000216,-0.000216,-0.000216,-0.000216,...,1e-05,4e-06,-0.118054,0.003775,0.143891,0.00554,0.000346,0.028436,0.028436,6.475976
std,3.473759,3.473759,20.856995,20.856995,0.014709,0.014709,0.014709,0.014709,0.014709,0.014709,...,0.00321,0.00122,0.582379,0.036295,0.628309,0.060476,0.013305,0.062765,0.062765,15.083074
min,-0.827695,-0.827695,-2.23067,-2.23067,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,0.0,-0.729577,0.0,-0.481892,0.0,0.0,0.0,0.0,-1.0
25%,-0.506057,-0.506057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.729577,0.0,-0.481892,0.0,0.0,0.0,0.0,-1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.493943,0.493943,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.270423,0.0,0.518108,0.0,0.0,0.0,0.0,0.0
max,217.876123,217.876123,56.613628,56.613628,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.38,1.509213,1.0,1.911506,0.99424,0.86556,1.0,1.0,40.888326


In [8]:
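# The chi-squared test (e.g. sklearn's chi2 scorer) requires non-negative feature values, so it cannot be used here: the scaled data contains negative values (see the min row of the summary above).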

In [None]:
# Try RFE (recursive feature elimination) with logistic regression as the ranking estimator
model = LogisticRegression()

# Try 10 features.
# n_features_to_select is passed by keyword: scikit-learn 1.0+ declares it
# keyword-only, so the positional form RFE(model, 10) raises a TypeError there.
rfe_log = RFE(model, n_features_to_select=10)
fit_rfe_log = rfe_log.fit(X, Y)
# Create list with names of features (get_support() is a boolean mask over X's columns)
rfe_log_features = X.columns[fit_rfe_log.get_support()]

# Print names of selected features
for f in rfe_log_features:
    print(f)

In [None]:
# Try RFE with SVC. A linear kernel is used so the fitted model exposes coef_,
# which RFE uses to rank the features.
# NOTE(review): a kernel SVC refit repeatedly on ~97k rows may be very slow - confirm this is feasible.
model_svc = SVC(kernel="linear")
# n_features_to_select is passed by keyword: scikit-learn 1.0+ declares it
# keyword-only, so the positional form RFE(model_svc, 10) raises a TypeError there.
rfe_svc = RFE(model_svc, n_features_to_select=10)
fit_rfe_svc = rfe_svc.fit(X,Y)

In [None]:
# Names of the features retained by the SVC-based RFE (boolean support mask applied to X's columns)
X.loc[:, fit_rfe_svc.get_support()].columns

In [None]:
# Try Feature Importance using a tree classifier.
# ExtraTreesClassifier lives in sklearn.ensemble and must be imported in the imports cell.
# The trees are randomized, so fix the seed to make the fitted importances
# reproducible across kernel restarts.
model_tree = ExtraTreesClassifier(random_state=0)
fit_tree = model_tree.fit(X,Y)

In [None]:
# Try Forward selection