# Information Elements Sum Encoding

## Libraries and Configurations

Import configuration files

In [85]:
from configparser import ConfigParser

config = ConfigParser()
# read() returns the list of files it managed to parse and silently skips any it
# cannot open, so the list echoed below is the only sign the file was really loaded.
config.read("../config.ini")

['../config.ini']

Import **data libraries**

In [86]:
import pandas as pd

Import **other libraries**

In [87]:
from rich.progress import Progress
from rich import traceback

# Route uncaught exceptions through rich's traceback renderer. The call's return
# value is echoed as the cell output (a bound excepthook method of the shell).
traceback.install()

<bound method InteractiveShell.excepthook of <ipykernel.zmqshell.ZMQInteractiveShell object at 0x111908a10>>

Custom helper scripts

In [88]:
# Step up one directory, import the helper modules from `scripts`, then step back
# into this notebook's folder.
# NOTE(review): presumably `scripts` lives in the parent directory; this only works
# when the kernel's working directory is data_exploration_cleaning — confirm before re-running.
%cd ..
from scripts import plotHelper, encodingHelper
%cd data_exploration_cleaning

/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks/data_exploration_cleaning


## Import Data

In [89]:
# Path of the combined (balanced, raw) dataframe CSV inside the interim data folder
combined_df_csv = f'{config["DEFAULT"]["interim_path"]}balanced_df_raw.csv'

In [90]:
# Load the combined dataset (the first CSV column becomes the index) and convert
# the Timestamp column to datetimes in the same step.
df = pd.read_csv(combined_df_csv, index_col=0).assign(
    Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"])
)

Filling empty fields with `-1`

In [91]:
# Replace every missing value with the -1 sentinel used by the encoding cells
df = df.fillna(-1)

In [92]:
# Show the loaded frame; the rendering below is abbreviated to the first and last rows.
df

Unnamed: 0,Timestamp,MAC Address,Channel,DS Channel,HT Capabilities,Extended Capabilities,Vendor Specific Tags,SSID,Supported Rates,Extended Supported Rates,VHT Capabilities,HE Capabilities,Length,Label
0,2023-05-20 13:52:01.864465952,d2:6b:aa:b5:fb:ed,1,1.0,2d001bff00000000000000000000000000000000000000...,80000000040000020,-1,-1,82848b96,0c1218243048606c,-1,23010808180080203002000d009f08000000fdfffdff39...,135,iPhone12Pro_C
1,2023-05-20 13:52:01.884716034,d2:6b:aa:b5:fb:ed,1,1.0,2d001bff00000000000000000000000000000000000000...,80000000040000020,-1,-1,82848b96,0c1218243048606c,-1,23010808180080203002000d009f08000000fdfffdff39...,135,iPhone12Pro_C
2,2023-05-20 13:52:01.910542011,d2:6b:aa:b5:fb:ed,6,6.0,2d001bff00000000000000000000000000000000000000...,80000000040000020,-1,-1,82848b96,0c1218243048606c,-1,23010808180080203002000d009f08000000fdfffdff39...,135,iPhone12Pro_C
3,2023-05-20 13:52:01.930788994,d2:6b:aa:b5:fb:ed,6,6.0,2d001bff00000000000000000000000000000000000000...,80000000040000020,-1,-1,82848b96,0c1218243048606c,-1,23010808180080203002000d009f08000000fdfffdff39...,135,iPhone12Pro_C
4,2023-05-20 13:52:01.968745947,d2:6b:aa:b5:fb:ed,11,11.0,2d001bff00000000000000000000000000000000000000...,80000000040000020,-1,-1,82848b96,0c1218243048606c,-1,23010808180080203002000d009f08000000fdfffdff39...,135,iPhone12Pro_C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13939,2021-07-07 12:02:57.579541922,da:a1:19:00:17:f9,6,1.0,2d0113ff00000000000000000000000000000000000000...,04000a02004000408001,0050f208002400,-1,02040b16,0c1218243048606c,92719033feff8601feff8601,02000f,182,XiaomiRedmiNote7_S
13940,2021-07-07 11:32:04.533828019,da:a1:19:1a:cc:8f,6,8.0,2d0113ff00000000000000000000000000000000000000...,04000a02004000408001,0050f208002400,Wind3 HUB-6D1619,02040b16,0c1218243048606c,92719033feff8601feff8601,020010,198,XiaomiRedmiNote7_S
13941,2021-07-07 11:46:50.089955091,da:a1:19:41:c9:b1,11,5.0,2d0113ff00000000000000000000000000000000000000...,04000a02004000408001,0050f208002400,-1,02040b16,0c1218243048606c,92719033feff8601feff8601,020025,143,XiaomiRedmiNote7_S
13942,2021-07-07 12:16:31.309731960,da:a1:19:c7:24:b1,1,3.0,2d0113ff00000000000000000000000000000000000000...,04000a02004000408001,0050f208002400,-1,02040b16,0c1218243048606c,92719033feff8601feff8601,020010,182,XiaomiRedmiNote7_S


Dropping columns that are used by only a small percentage of devices, based on the analysis in `data_visualization_statistics.ipynb`

In [93]:
# Discard the information elements that only a small share of devices populate
df = df.drop(
    columns=[
        "SSID",
        "HE Capabilities",
        "VHT Capabilities",
    ]
)

## Hashing Columns using SUM function

### HT Capabilities

Number of unique `HT Capabilities`

In [94]:
# Count the distinct HT Capabilities values (the -1 sentinel counts as one value)
print(f'Unique HT Capabilities: {df["HT Capabilities"].nunique()}')

Unique HT Capabilities: 23


Hashing

In [95]:
# Encode each HT Capabilities string as the sum of its characters' code points;
# the -1 missing-value sentinel is passed through unchanged.
df["HT Capabilities SUM"] = df["HT Capabilities"].apply(
    lambda value: sum(map(ord, value)) if value != -1 else -1
)

Checking collisions

In [96]:
# NOTE(review): check_collisions is defined in the external encodingHelper module.
# Judging by the output below, it lists each SUM value shared by several distinct
# original strings, with the device labels using them — confirm against the helper.
encodingHelper.check_collisions(df, "HT Capabilities")

2667 : 	 6e0103ff00000000000000000000000000000000000000000000 ['XiaomiA2_E' 'XiaomiRedmi4_B' 'XiaomiRedmi5_J']
	 2d0117ff00000000000000000000000000000000000000000000 ['SamsungM31_A']
	 2d1017ff00000000000000000000000000000000000000000000 ['SamsungS4_C']



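From this we can see that several devices share the same SUM value for the `HT Capabilities` column: `XiaomiA2_E`, `XiaomiRedmi4_B`, `XiaomiRedmi5_J`, `SamsungM31_A`, `SamsungS4_C`. A sum of character codes ignores character order, so, for example, `2d0117ff…` and `2d1017ff…` (the same characters with two of them swapped) necessarily collide. We can conclude that SUM is not a good hashing function for this kind of information element.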

### Extended Capabilities

In [97]:
# Count the distinct Extended Capabilities values (the -1 sentinel counts as one value)
print(f'Unique Extended Capabilities: {df["Extended Capabilities"].nunique()}')

Unique Extended Capabilities: 26


In [98]:
# Encode each Extended Capabilities string as the sum of its characters' code
# points; the -1 missing-value sentinel is passed through unchanged.
df["Extended Capabilities SUM"] = df["Extended Capabilities"].apply(
    lambda value: sum(map(ord, value)) if value != -1 else -1
)

Checking for collisions

In [99]:
# NOTE(review): check_collisions comes from the external encodingHelper module; it
# appears to report original values that share a SUM value (here it printed
# "No collision detected") — confirm against the helper.
encodingHelper.check_collisions(df, "Extended Capabilities")

No collision detected


### Vendor Specific Tags

In [100]:
# Count the distinct Vendor Specific Tags values (the -1 sentinel counts as one value)
print(f'Unique Vendor Specific Tags: {df["Vendor Specific Tags"].nunique()}')

Unique Vendor Specific Tags: 63


In [101]:
# Encode each Vendor Specific Tags string as the sum of its characters' code
# points; the -1 missing-value sentinel is passed through unchanged.
df["Vendor Specific Tags SUM"] = df["Vendor Specific Tags"].apply(
    lambda value: sum(map(ord, value)) if value != -1 else -1
)

Check collisions

In [102]:
# NOTE(review): check_collisions comes from the external encodingHelper module.
# Judging by the output below, it groups distinct original tags under the SUM value
# they share, with the device labels using each tag — confirm against the helper.
encodingHelper.check_collisions(df, "Vendor Specific Tags")

742 : 	 0050f208000100 ['XiaomiRedmi4_B' 'XiaomiRedmi5_J']
	 0050f208001000 ['XiaomiRedmi4_B' 'XiaomiRedmi5_J']

743 : 	 0050f208001100 ['SamsungM31_A' 'XiaomiRedmi5_J']
	 0050f208000200 ['XiaomiRedmi4_B' 'XiaomiRedmi5_J']

749 : 	 0050f208006200 ['XiaomiRedmi4_B' 'XiaomiRedmi5_J']
	 0050f208008000 ['XiaomiRedmi4_B' 'XiaomiRedmi5_J']
	 0050f208000800 ['XiaomiRedmi4_B' 'XiaomiRedmi5_J']

750 : 	 0050f208002700 ['GooglePixel3A_V' 'XiaomiRedmiNote9S_T' 'GooglePixel3A_L']
	 0050f208007200 ['XiaomiRedmi4_B']
	 0050f208006300 ['XiaomiRedmi4_B' 'XiaomiRedmi5_J']
	 0050f208009000 ['XiaomiRedmi5_J']

751 : 	 0050f208002800 ['OppoFindX3Neo_A']
	 0050f208006400 ['XiaomiRedmi4_B']
	 0050f208009100 ['XiaomiRedmi5_J']

794 : 	 0050f208000e00 ['XiaomiRedmi4_B']
	 0050f208001d00 ['XiaomiRedmi4_B']
	 0050f208002c00 ['XiaomiRedmi5_J']

842 : 	 0050f20800120011 ['HuaweiP10_Q']
	 0050f20800120002 ['HuaweiP10_Q']

844 : 	 0050f20800120040 ['HuaweiP10_Q']
	 0050f20800dc00 ['XiaomiRedmi4_B']

845 : 	 0050f20

### Supported Rates

In [103]:
# Count the distinct Supported Rates values (the -1 sentinel counts as one value)
print(f'Unique Supported Rates: {df["Supported Rates"].nunique()}')

Unique Supported Rates: 6


Hashing

In [104]:
# Encode each Supported Rates string as the sum of its characters' code points;
# the -1 missing-value sentinel is passed through unchanged.
df["Supported Rates SUM"] = df["Supported Rates"].apply(
    lambda value: sum(map(ord, value)) if value != -1 else -1
)

Checking collisions

In [105]:
# NOTE(review): check_collisions comes from the external encodingHelper module; it
# appears to report original values that share a SUM value (here it printed
# "No collision detected") — confirm against the helper.
encodingHelper.check_collisions(df, "Supported Rates")

No collision detected


### Extended Supported Rates

In [106]:
# Count the distinct Extended Supported Rates values (the -1 sentinel counts as one value)
print(f'Unique Extended Supported Rates: {df["Extended Supported Rates"].nunique()}')

Unique Extended Supported Rates: 4


Hashing

In [107]:
# Encode each Extended Supported Rates string as the sum of its characters' code
# points; the -1 missing-value sentinel is passed through unchanged.
df["Extended Supported Rates SUM"] = df["Extended Supported Rates"].apply(
    lambda value: sum(map(ord, value)) if value != -1 else -1
)

Checking collisions

In [108]:
# NOTE(review): check_collisions comes from the external encodingHelper module; it
# appears to report original values that share a SUM value (here it printed
# "No collision detected") — confirm against the helper.
encodingHelper.check_collisions(df, "Extended Supported Rates")

No collision detected


## Saving encoded dataframe

Remove original columns

In [109]:
# Keep the metadata columns and the SUM-encoded columns, in this order; the raw
# information-element columns are left out of the result.
df = df.loc[
    :,
    [
        # capture metadata
        "Timestamp",
        "MAC Address",
        "Channel",
        "DS Channel",
        # SUM-encoded information elements
        "HT Capabilities SUM",
        "Extended Capabilities SUM",
        "Vendor Specific Tags SUM",
        "Supported Rates SUM",
        "Extended Supported Rates SUM",
        # frame length and device label
        "Length",
        "Label",
    ],
]

In [110]:
# Write the encoded dataframe next to the other interim datasets
df.to_csv(
    path_or_buf=f'{config["DEFAULT"]["interim_path"]}encoded_SUM_balanced_df.csv'
)