# UJI Dataset Cleaning

## Libraries and Configurations

Import configuration files

In [1]:
from configparser import ConfigParser

# Load the project configuration.
# ConfigParser.read() silently skips files it cannot open and returns only the
# paths it actually parsed, so check that list here: a wrong working directory
# then fails with a clear message instead of a KeyError at the first lookup.
config = ConfigParser()
loaded_config_files = config.read("../../config.ini")
if not loaded_config_files:
    raise FileNotFoundError(
        "Could not read '../../config.ini' (resolved against the current working directory)"
    )

# Echo the list of parsed files as the cell output
loaded_config_files

['../../config.ini']

Import **data libraries**

In [2]:
import pandas as pd

Import **other libraries**

In [3]:
from rich.progress import Progress
from rich import traceback

# install() returns a handler object that the notebook would otherwise echo as
# cell output (a repr containing a memory address that changes on every run).
# The trailing semicolon suppresses that output.
traceback.install();

<bound method InteractiveShell.excepthook of <ipykernel.zmqshell.ZMQInteractiveShell object at 0x10fb50ed0>>

Custom helper scripts

In [4]:
# Move two directories up before importing the project-local `scripts` package
# (presumably that is where `scripts` lives — TODO confirm).
%cd ../../
from scripts import plotHelper, encodingHelper
# NOTE(review): this enters `data_exploration_cleaning` relative to the directory
# reached above; relative paths used by later cells resolve against it.
# Re-running this cell starts from that new directory, so it is not idempotent.
%cd data_exploration_cleaning

/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks/data_exploration_cleaning


## Import Data

In [5]:
# Path of the combined dataframe CSV, built from the configured extraction directory
uji_df_csv = f'{config["UJI"]["extracted_path"]}/uji_probes.csv'

In [6]:
# Read the combined CSV and, in the same chained expression, parse `Timestamp`
# (interpreted as a number of seconds, unit="s") into pandas datetimes.
uji_df = pd.read_csv(uji_df_csv).assign(
    Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"], unit="s")
)

## Drop Irrelevant Columns

Since the dataset is not labelled, we can drop the `Label` column

In [7]:
# Remove the `Label` column.
# errors="ignore" keeps the cell re-runnable: executing it again, when the
# column is already gone, is a no-op instead of raising KeyError.
uji_df = uji_df.drop(columns=["Label"], errors="ignore")

In [8]:
# Preview the frame (pandas shows a truncated view of the first and last rows)
uji_df

Unnamed: 0,Timestamp,MAC Address,Channel,DS Channel,HT Capabilities,Extended Capabilities,Vendor Specific Tags,SSID,Supported Rates,Extended Supported Rates,VHT Capabilities,HE Capabilities,Length
0,2023-02-28 10:06:13.608247995,16:d2:8e:2a:d5:47,6,5.0,2d4017ff00000000000000000000000000000000000000...,0000080400000040,0017f20a00010400000000,SSID_70757882,02040b16,0c1218243048606c,,,145
1,2023-02-28 10:06:13.619611979,16:d2:8e:2a:d5:47,6,5.0,2d4017ff00000000000000000000000000000000000000...,0000080400000040,0017f20a00010400000000,SSID_70757882,02040b16,0c1218243048606c,,,145
2,2023-02-28 10:06:15.874474049,aa:a2:d1:e3:8c:22,6,6.0,2d4017ff00000000000000000000000000000000000000...,0000080400000040,0017f20a00010400000000,SSID_56211587,02040b16,0c1218243048606c,,,145
3,2023-02-28 10:06:17.232656956,86:6f:da:1b:96:4b,9,8.0,2d0917ff00000000000000000000000000000000000000...,050008800000004080,0050f208001200,,02040b160c121824,3048606c,,,107
4,2023-02-28 10:06:17.326913118,60:83:34:df:f8:74,10,9.0,2c0103ff00000000000000000000000000000000000000...,,0050f208000000,SSID_07527841,02040b16,0c1218243048606c,,,109
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1410829,2023-03-29 10:36:00.696099997,c6:6f:dd:56:6d:7e,11,10.0,ef0113ffff000000000000000000000000000000000000...,00000a8200400000000120,0050f208002800,SSID_44969871,02040b16,0c1218243048606c,92f19033faff6203faff6203,020028,163
1410830,2023-03-29 10:36:00.717555046,c6:6f:dd:56:6d:7e,11,10.0,ef0113ffff000000000000000000000000000000000000...,00000a8200400000000120,0050f208002800,,02040b16,0c1218243048606c,92f19033faff6203faff6203,020014,150
1410831,2023-03-29 10:36:00.722501040,c6:6f:dd:56:6d:7e,11,10.0,ef0113ffff000000000000000000000000000000000000...,00000a8200400000000120,0050f208002800,SSID_44969871,02040b16,0c1218243048606c,92f19033faff6203faff6203,020013,163
1410832,2023-03-29 10:36:03.044989109,ce:1c:b2:ac:5c:9c,2,1.0,ad0917ffff000000000000000000000000000000000000...,04000a8201400040802120,0050f208002a00,,02040b16,0c1218243048606c,92f99133faff0c03faff0c03,2303010082400004334c890d0180c8000c00fafffaff19...,168


## SSID Length Cleaning

In [9]:
# `Length NO SSID` = `Length` minus the number of characters in `SSID`.
# Rows whose SSID is not a str (e.g. NaN for a missing SSID) subtract 0, so
# their `Length` is carried over unchanged.
# Mapping over the single SSID column avoids DataFrame.apply(axis=1), which
# builds a Series object for every row and is very slow on large frames.
ssid_char_count = uji_df["SSID"].map(
    lambda ssid: len(ssid) if isinstance(ssid, str) else 0
)
uji_df["Length NO SSID"] = uji_df["Length"] - ssid_char_count

Export to CSV

In [10]:
# Write the cleaned frame to the interim directory, omitting the row index
clean_csv_path = config["DEFAULT"]["interim_path"] + "/uji_dataset/uji_clean.csv"
uji_df.to_csv(clean_csv_path, index=False)

## Label Encoding Features

Filling empty fields with `-1`

In [11]:
# Replace every missing value with the placeholder -1
# (rebinding the name rather than mutating the frame in place)
uji_df = uji_df.fillna(-1)

Fixing column data types

In [12]:
# Inspect the current dtype of every column
uji_df.dtypes

Timestamp                   datetime64[ns]
MAC Address                         object
Channel                              int64
DS Channel                         float64
HT Capabilities                     object
Extended Capabilities               object
Vendor Specific Tags                object
SSID                                object
Supported Rates                     object
Extended Supported Rates            object
VHT Capabilities                    object
HE Capabilities                     object
Length                               int64
Length NO SSID                       int64
dtype: object

In [13]:
# Converting Timestamp to datetime
uji_df["Timestamp"] = pd.to_datetime(uji_df["Timestamp"])

# Converting SSID to string
uji_df["SSID"] = uji_df["SSID"].astype(str)

# Converting MAC Address to string
uji_df["MAC Address"] = uji_df["MAC Address"].astype(str)

# Converting HT Capabilities to string
uji_df["HT Capabilities"] = uji_df["HT Capabilities"].astype(str)

# Converting Extended Capabilities to string
uji_df["Extended Capabilities"] = uji_df["Extended Capabilities"].astype(str)

# Converting Vendor Specific Tags to string
uji_df["Vendor Specific Tags"] = uji_df["Vendor Specific Tags"].astype(str)

# Converting Supported Rates to string
uji_df["Supported Rates"] = uji_df["Supported Rates"].astype(str)

# Converting Extended Supported Rates to string
uji_df["Extended Supported Rates"] = uji_df["Extended Supported Rates"].astype(str)

# Converting VHT Capabilities to string
uji_df["VHT Capabilities"] = uji_df["VHT Capabilities"].astype(str)

# Converting HE Capabilities to string
uji_df["HE Capabilities"] = uji_df["HE Capabilities"].astype(str)

**Label encoding** the categorical features for better readability. `-1` is kept as is to indicate an empty field.

In [14]:
# Columns replaced by their label-encoded form, processed in this order.
# SSID and MAC Address are not in the list and stay as plain strings.
# Presumably the helper leaves the "-1" placeholder untouched, as the notebook
# text states — confirm against scripts/encodingHelper.
label_encoded_columns = [
    "HT Capabilities",
    "HE Capabilities",
    "Extended Capabilities",
    "Vendor Specific Tags",
    "Supported Rates",
    "Extended Supported Rates",
    "VHT Capabilities",
]
for column_name in label_encoded_columns:
    uji_df[column_name] = encodingHelper.label_encode_with_exception(
        uji_df[column_name]
    )

Saving **encoded dataset**

In [15]:
# Destination of the label-encoded dataset inside the interim directory
encoded_csv_path = (
    config["DEFAULT"]["interim_path"] + "/uji_dataset/encoded_LABEL_uji_df.csv"
)
# index=True is the pandas default, spelled out here: the row index is written
# as an unnamed first column.
# NOTE(review): the clean CSV export in this notebook passes index=False;
# confirm that readers of this file expect the extra index column.
uji_df.to_csv(encoded_csv_path, index=True)