# Dissected Feature Selection - Select K-Best

## Libraries and Configurations

Import configuration files

In [51]:
from configparser import ConfigParser

config = ConfigParser()
config.read("../config.ini")

['../config.ini']

Import **data libraries**

In [52]:
import pandas as pd

Import **other libraries**

In [53]:
from rich.progress import Progress
from rich import traceback

traceback.install()

<bound method InteractiveShell.excepthook of <ipykernel.zmqshell.ZMQInteractiveShell object at 0x75cac96db250>>

Custom helper scripts

In [54]:
%cd ..
from scripts import plotHelper, encodingHelper
%cd data_exploration_cleaning

/home/bacci/COMPACT/notebooks
/home/bacci/COMPACT/notebooks/data_exploration_cleaning


## Dissected Data

In [55]:
dissected_df_raw_csv = (
    config["DEFAULT"]["interim_path"] + "dissected/std_dissected_df_raw.csv"
)

In [56]:
df = pd.read_csv(dissected_df_raw_csv, index_col=0)

In [57]:
df

Unnamed: 0,MAC Address,Channel,DS Channel,Vendor Specific Tags,Length,Label,Supported Rates 1,Supported Rates 2,Supported Rates 3,Supported Rates 4,...,TIM_Broadcast,BSS_Transition,Multiple_BSSID,Timing_Measurement,SSID_List,DMS,Interworking,QoS_Map,WNM_Notification,Operating_Mode_Notification
0,1a:e6:5a:fe:34:4c,1,1.0,,111,iPhone7_F,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,1a:e6:5a:fe:34:4c,1,1.0,,111,iPhone7_F,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2,1a:e6:5a:fe:34:4c,11,1.0,,111,iPhone7_F,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,1a:e6:5a:fe:34:4c,1,2.0,,111,iPhone7_F,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,1a:e6:5a:fe:34:4c,11,2.0,,111,iPhone7_F,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13939,da:a1:19:00:17:f9,6,1.0,0050f208002400,182,XiaomiRedmiNote7_S,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
13940,da:a1:19:1a:cc:8f,6,8.0,0050f208002400,182,XiaomiRedmiNote7_S,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
13941,da:a1:19:41:c9:b1,11,5.0,0050f208002400,143,XiaomiRedmiNote7_S,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
13942,da:a1:19:c7:24:b1,1,3.0,0050f208002400,182,XiaomiRedmiNote7_S,1.0,2.0,5.5,11.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0


In [58]:
to_drop = ["MAC Address"]

# Backing up the dataframe
df_backup = df.copy()

df = df.drop(to_drop, axis=1)

## Label Encode Categorical Features

In [59]:
# Import label encoder
from sklearn import preprocessing

In [60]:
label_encoder = preprocessing.LabelEncoder()

df["Vendor Specific Tags"] = label_encoder.fit_transform(df["Vendor Specific Tags"])

## Filling `NaN`

In [61]:
df.fillna("-1", inplace=True)

  df.fillna('-1', inplace=True)


## Splitting Labelled Data

In [62]:
from sklearn.model_selection import train_test_split

In [63]:
X = df.drop(columns=["Label"])
y = df["Label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

## Mutual Information

In [64]:
from sklearn.feature_selection import mutual_info_classif

In [65]:
mi_scores = mutual_info_classif(X, y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)

mi_scores  # show a few features with their MI scores

Length                                                 2.640208
Vendor Specific Tags                                   2.249891
Min_MPDCU_Start_Spacing                                1.551315
WNM_Notification                                       1.088323
Interworking                                           1.023924
QoS_Map                                                1.019215
WNM_Sleep_Mode                                         1.017699
DMS                                                    1.016362
SSID_List                                              0.948083
RX_MSC_Bitmask                                         0.800867
Operating_Mode_Notification                            0.779820
Rx_STBC                                                0.776421
TFS                                                    0.765684
FMS                                                    0.764663
TIM_Broadcast                                          0.756842
Extended_Channel_Switching              

In [67]:
mi_scores[:10]

Length                     2.640208
Vendor Specific Tags       2.249891
Min_MPDCU_Start_Spacing    1.551315
WNM_Notification           1.088323
Interworking               1.023924
QoS_Map                    1.019215
WNM_Sleep_Mode             1.017699
DMS                        1.016362
SSID_List                  0.948083
RX_MSC_Bitmask             0.800867
Name: MI Scores, dtype: float64