# Dissected Feature Selection - Select K-Best

## Libraries and Configurations

Import configuration files

In [38]:
from configparser import ConfigParser

config = ConfigParser()
config.read("../config.ini")

['../config.ini']

Import **data libraries**

In [39]:
import pandas as pd

Import **other libraries**

In [40]:
from rich.progress import Progress
from rich import traceback

traceback.install()

<bound method InteractiveShell.excepthook of <ipykernel.zmqshell.ZMQInteractiveShell object at 0x7815d33c0f50>>

Custom helper scripts

In [41]:
%cd ..
from scripts import plotHelper, encodingHelper
%cd data_exploration_cleaning

/home/bacci/COMPACT/notebooks
/home/bacci/COMPACT/notebooks/data_exploration_cleaning


## Dissected Data

In [42]:
dissected_df_raw_csv = (
    config["DEFAULT"]["interim_path"] + "dissected/std_dissected_df_raw.csv"
)

In [43]:
df = pd.read_csv(dissected_df_raw_csv, index_col=0)

In [44]:
df

Unnamed: 0,MAC Address,Channel,DS Channel,Vendor Specific Tags,Length,Label,Supported Rates 1,Supported Rates 2,Supported Rates 3,Supported Rates 4,...,TIM_Broadcast,BSS_Transition,Multiple_BSSID,Timing_Measurement,SSID_List,DMS,Interworking,QoS_Map,WNM_Notification,Operating_Mode_Notification
0,1a:e6:5a:fe:34:4c,1,1.0,,111,iPhone7_F,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,1a:e6:5a:fe:34:4c,1,1.0,,111,iPhone7_F,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2,1a:e6:5a:fe:34:4c,11,1.0,,111,iPhone7_F,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,1a:e6:5a:fe:34:4c,1,2.0,,111,iPhone7_F,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,1a:e6:5a:fe:34:4c,11,2.0,,111,iPhone7_F,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13939,da:a1:19:00:17:f9,6,1.0,0050f208002400,182,XiaomiRedmiNote7_S,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
13940,da:a1:19:1a:cc:8f,6,8.0,0050f208002400,182,XiaomiRedmiNote7_S,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
13941,da:a1:19:41:c9:b1,11,5.0,0050f208002400,143,XiaomiRedmiNote7_S,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
13942,da:a1:19:c7:24:b1,1,3.0,0050f208002400,182,XiaomiRedmiNote7_S,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0


In [45]:
to_drop = ["MAC Address"]

# Backing up the dataframe
df_backup = df.copy()

df = df.drop(to_drop, axis=1)

## Label Encode Categorical Features

In [46]:
# Import label encoder
from sklearn import preprocessing

In [47]:
label_encoder = preprocessing.LabelEncoder()

df["Vendor Specific Tags"] = label_encoder.fit_transform(df["Vendor Specific Tags"])

## Filling `NaN`

In [48]:
df.fillna("-1", inplace=True)

  df.fillna("-1", inplace=True)


## Data Normalization

In [51]:
from sklearn.preprocessing import MinMaxScaler

In [52]:
scaler = MinMaxScaler()
columns_to_normalize = df.columns[df.columns != "Label"]
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

## Splitting Labelled Data

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
X = df.drop(columns=["Label"])
y = df["Label"]

## Mutual Information

In [53]:
from sklearn.feature_selection import mutual_info_classif

In [54]:
mi_scores = mutual_info_classif(X, y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)

mi_scores  # show a few features with their MI scores

Length                                                 2.641826
Vendor Specific Tags                                   2.250775
Min_MPDCU_Start_Spacing                                1.552503
WNM_Notification                                       1.084997
Interworking                                           1.018736
DMS                                                    1.018440
WNM_Sleep_Mode                                         1.017905
QoS_Map                                                1.014470
SSID_List                                              0.953191
RX_MSC_Bitmask                                         0.795849
Operating_Mode_Notification                            0.777593
Rx_STBC                                                0.775735
FMS                                                    0.764478
TIM_Broadcast                                          0.763583
TFS                                                    0.761536
Extended_Channel_Switching              

In [55]:
mi_scores[:10]

Length                     2.641826
Vendor Specific Tags       2.250775
Min_MPDCU_Start_Spacing    1.552503
WNM_Notification           1.084997
Interworking               1.018736
DMS                        1.018440
WNM_Sleep_Mode             1.017905
QoS_Map                    1.014470
SSID_List                  0.953191
RX_MSC_Bitmask             0.795849
Name: MI Scores, dtype: float64

## Select K-Best

In [56]:
from sklearn.feature_selection import SelectKBest, chi2

In [57]:
selector = SelectKBest(score_func=chi2, k=5)
selector.fit(X, y)