# Binary Dataset - Probabilistic Fingerprint

**Description**: Generate a binary dataset, starting from individual device files, to be used for Probabilistic Fingerprint train and test

**Inputs**: `data/binary/*`

**Outputs**: `data/interim/PF_binary.csv`

## Notebook Setup

### Libraries

In [1]:
import os
import sys
sys.path.append('C:/Users/fabio/Documents/GitHub/CompactProbes/notebooks')
from modules import fancyData
import numpy as np
import pandas as pd
from rich import traceback

In [2]:
traceback.install()

<bound method InteractiveShell.excepthook of <ipykernel.zmqshell.ZMQInteractiveShell object at 0x00000253B3CCEF90>>

### Configurations

In [3]:
from configparser import ConfigParser

config = ConfigParser()
config.read("../config.ini")

['../config.ini']

## Import Data

In [4]:
binary_path = config["DEFAULT"]["binary_path"]

In [5]:
df = fancyData.load_and_concat_csv(binary_path)

In [6]:
df = df.astype(str)

Remove noise from dataset

In [7]:
df = df[df["mac"] != "00:0f:00:6a:68:8b"]

## Split Non-Random Bursts

In [8]:
def generate_random_mac():
    import random

    return ":".join(f"{random.randint(0, 255):02x}" for _ in range(6))

In [9]:
def split_non_random_bursts(data: pd.DataFrame, labels: list) -> pd.DataFrame:
    for label in labels:
        # Filter rows with the specific label
        label_data = data[data["label"] == label]
        label_data = label_data.sort_index().reset_index()

        # Initialize variables to track the current burst
        random_mac = generate_random_mac()  # Initial random MAC for the first burst
        start_index = 0  # Start index of the current burst

        for i in range(1, len(label_data)):
            # Check for a drop in DS Channel
            if int(label_data.at[i, "dsss_parameter"], 2) < int(
                label_data.at[i - 1, "dsss_parameter"], 2
            ):
                # Update all rows in the current burst with the current random MAC
                for j in range(start_index, i):
                    original_index = label_data.loc[j, "index"]
                    data.loc[original_index, "mac"] = random_mac

                # Generate a new random MAC for the next burst
                random_mac = generate_random_mac()
                # print(f"Channel dropped; assigning new MAC: {random_mac}")

                # Update the start index for the next burst
                start_index = i

        # Update the last burst (from the last drop to the end)
        for j in range(start_index, len(label_data)):
            original_index = label_data.loc[j, "index"]
            data.loc[original_index, "mac"] = random_mac

    return data


In [10]:
non_randomizing_devices = [
    "iPhone12Pro_C",
    "SamsungS6_H",
    "HuaweiL21_D",
    "HuaweiP10_Q",
    "HuaweiP20_G",
    "SamsungS4_C",
]

In [11]:
df = split_non_random_bursts(df, non_randomizing_devices)

## Process Data

In [12]:
df = df.drop(columns=["frame_check_seq", "len_dsss", "ssid"])

In [13]:
# Columns to drop
drop_starts_with = ["e_id_"]
df = df.drop(
    columns=[col for col in df.columns if col.startswith(tuple(drop_starts_with))]
)

In [14]:
df = df.replace("nan", "U")

In [15]:
df = fancyData.pad_columns(df, symbol="U", exclude=["mac", "label"])


In [None]:
len_vst_fixed = 1336
new_padded_vst = fancyData.pad_columns(df[["mac","vst"]],symbol="U",length=len_vst_fixed)
df["vst"] = new_padded_vst["vst"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna("").astype(str).str.ljust(max_length, symbol)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna("").astype(str).str.ljust(max_length, symbol)


In [17]:
# print avg length of the concatenated column
print(
    f"Average length of the VST column: {df['vst'].str.len().mean()}"
)

Average length of the VST column: 1336.0


Switch from **probe-view** to **burst-view**

In [18]:
def concat_values(series):
    return "".join(series)

In [19]:
# Columns to group by
groupby_column = "mac"

# Column to concatenate
concatenate_column = "dsss_parameter"

agg_dict = {
    col: "first"
    for col in df.columns
    if col != groupby_column and col != concatenate_column
}
agg_dict[concatenate_column] = concat_values

df = df.groupby(groupby_column).agg(agg_dict).reset_index()

In [20]:
dsss_fixed_len = 128
# Determine the mean length of the strings in the column
#mean_length = df["dsss_parameter"].str.len().mean()
mean_length = dsss_fixed_len

# Round the mean length up to the next integer
rounded_length = int(np.ceil(mean_length))

# Cut any strings that exceed the rounded length
df["dsss_parameter"] = df["dsss_parameter"].str[:rounded_length]

# Zero-pad the strings to the rounded length
df["dsss_parameter"] = df["dsss_parameter"].str.zfill(rounded_length)

In [21]:
df

Unnamed: 0,mac,len_ssid,len_sup_rates,supported_rates,len_ext_sup_rates,ext_sup_rates,len_ht_cap,ht_cap,len_vht_cap,vht_cap,len_ext_cap,ext_cap,len_vst,vst,len_ext_tags,ext_tags,label,dsss_parameter
0,00:5b:fa:b2:cc:04,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000000000000000000000000000000000001000000...
1,00:93:26:55:d5:73,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000000000000000000010000000100000010000000...
2,00:c3:b1:aa:4d:6d,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000100000001000000100000001000000011000000...
3,01:03:4b:84:6b:c8,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000100000001000000100000001000000011000000...
4,01:31:aa:b6:c8:31,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000000000001000000010000001000000011000000...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5969,fe:fc:aa:d1:89:d1,00000000,00000100,10000010100001001000101110010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110101000000000110111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000010000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00011100,0010001100000001000010000000100000011000000000...,iPhone12_W,0000000000000000000000000000000000000000000000...
5970,fe:ff:ff:0f:64:5d,00000000,00000100,10000010100001001000101110010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110101000000000110111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000010000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00011100,0010001100000001000010000000100000011000000000...,iPhone11_B,0000000000000000000000000000000000000000000000...
5971,ff:10:3b:8b:65:cd,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000100000001000000100000001000000011000000...
5972,ff:70:3e:34:2a:24,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000000000001000000010000001000000010000000...


Drop other columns: mostly IE length fields, and fields that have been proven to not contribute that much.

In [22]:
columns_to_consider = [col for col in df.columns if col not in ["mac", "label"]]
df = df.drop_duplicates(subset=columns_to_consider)
print(columns_to_consider)

['len_ssid', 'len_sup_rates', 'supported_rates', 'len_ext_sup_rates', 'ext_sup_rates', 'len_ht_cap', 'ht_cap', 'len_vht_cap', 'vht_cap', 'len_ext_cap', 'ext_cap', 'len_vst', 'vst', 'len_ext_tags', 'ext_tags', 'dsss_parameter']


In [23]:

df = df.drop(
    columns=[
        "len_ssid",
        "len_sup_rates",
        "len_ext_sup_rates",
        "len_vht_cap",
        "len_ext_tags",
        "supported_rates",
        "ext_sup_rates",
        "vht_cap",
        "ext_tags",
    ]
)
print(df.columns)
#cols = ["len_ssid","len_sup_rates","supported_rates",
#       "len_ext_sup_rates","ext_sup_rates","dsss_parameter",
#       "len_ht_cap","ht_cap","len_ext_cap","ext_cap","len_vht_cap",
#       "vht_cap","len_vst","vst","len_ext_tags","ext_tags"]

Index(['mac', 'len_ht_cap', 'ht_cap', 'len_ext_cap', 'ext_cap', 'len_vst',
       'vst', 'label', 'dsss_parameter'],
      dtype='object')


## Save Data

In [24]:
df["concatenated"] = (
    df.drop(columns=["label", "mac"]).astype(str).apply(lambda x: "".join(x), axis=1)
)

# If you want to keep the 'label' column in the final dataframe:
df_result = df[["label", "concatenated"]]

df_result = df_result.sort_values("label")

In [25]:
for col in df.columns:
    print(
    f"Average length of the column {col}: {df[col].str.len().mean()}"
)

Average length of the column mac: 17.0
Average length of the column len_ht_cap: 8.0
Average length of the column ht_cap: 208.0
Average length of the column len_ext_cap: 8.0
Average length of the column ext_cap: 88.0
Average length of the column len_vst: 8.0
Average length of the column vst: 1336.0
Average length of the column label: 12.428571428571429
Average length of the column dsss_parameter: 128.0
Average length of the column concatenated: 1784.0


Save to CSV

In [26]:
df_result.to_csv(
    f"{config['DEFAULT']['interim_path']}/binary_U_random_NONDROPPED.csv", index=False
)

In [27]:
# print avg length of the concatenated column
print(
    f"Average length of the concatenated column: {df_result['concatenated'].str.len().mean()}"
)

Average length of the concatenated column: 1784.0
