# Binary Dataset - Probabilistic Fingerprint

**Description**: Generate a binary dataset, starting from the individual device files, to be used for training and testing the Probabilistic Fingerprint.

**Inputs**: `data/binary/*`

**Outputs**: `data/interim/binary_U_random.csv` (written under the `interim_path` configured in `config.ini`)

## Notebook Setup

### Libraries

In [3]:
import os
import sys
sys.path.append('C:/Users/fabio/Documents/GitHub/CompactProbes/notebooks')
from modules import fancyData
import numpy as np
import pandas as pd
from rich import traceback

In [4]:
# Install rich's traceback handler so exceptions raised in later cells are rendered by rich.
traceback.install()

<bound method InteractiveShell.excepthook of <ipykernel.zmqshell.ZMQInteractiveShell object at 0x000001DB913AECF0>>

### Configurations

In [6]:
from configparser import ConfigParser

# Load the project settings from the INI file one directory above the working directory.
config = ConfigParser()
# read() returns the list of files it managed to parse and silently skips missing ones,
# so an empty list in this cell's output means the configuration was NOT loaded.
config.read("../config.ini")

['../config.ini']

## Import Data

In [7]:
# Location of the input files, read from the DEFAULT section of config.ini.
binary_path = config["DEFAULT"]["binary_path"]

In [9]:
# Load the input data into a single DataFrame.
# NOTE(review): load_and_concat_csv is a project helper not shown here; presumably it reads
# every CSV under binary_path and concatenates them - confirm, and check whether the
# resulting index is unique (later cells address rows by index label).
df = fancyData.load_and_concat_csv(binary_path)

In [10]:
# Treat every column as text: later cells parse and concatenate the fields as strings.
# Side effect: missing values become the literal string "nan" (replaced with "U" further down).
df = df.astype(str)

Remove noise from the dataset: drop every probe sent by the MAC address `00:0f:00:6a:68:8b`.

In [11]:
# Rows sent by this MAC address are treated as noise and excluded from the dataset.
is_noise = df["mac"] == "00:0f:00:6a:68:8b"
df = df[~is_noise]

## Split Non-Random Bursts

In [12]:
def generate_random_mac(rng=None):
    """Return a random MAC-style string: six lowercase hex octets joined by ":".

    Parameters
    ----------
    rng : random.Random, optional
        Source of randomness exposing ``randint(a, b)``. When omitted, the
        module-level generator of the ``random`` module is used, which is the
        original behaviour. Pass a seeded ``random.Random`` instance to obtain
        reproducible addresses.

    Returns
    -------
    str
        An address such as ``"3a:00:ff:10:9c:7e"``. Every octet is drawn
        uniformly from 0-255; no bit is forced to a particular value.
    """
    if rng is None:
        # Imported lazily so the function stays self-contained within its cell.
        import random

        rng = random

    return ":".join(f"{rng.randint(0, 255):02x}" for _ in range(6))

In [13]:
def split_non_random_bursts(
    data: pd.DataFrame, labels: list, mac_factory=None
) -> pd.DataFrame:
    """Give every burst of the listed devices its own synthetic MAC address.

    For each label, the matching rows are visited in index order. A new burst
    starts whenever the channel encoded in ``dsss_parameter`` (a base-2 string)
    is lower than the one in the previous row. All rows of a burst receive the
    same freshly generated MAC, written into the ``mac`` column.

    Rows are addressed by position rather than by index label, so a duplicated
    index cannot cause rows of other devices to be overwritten.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with at least the columns ``label``, ``mac`` and
        ``dsss_parameter``. It is modified in place.
    labels : list
        Values of ``label`` whose rows should be split into bursts.
    mac_factory : callable, optional
        Zero-argument callable returning the MAC for a new burst. Defaults to
        ``generate_random_mac``; pass a deterministic callable for
        reproducible output.

    Returns
    -------
    pd.DataFrame
        The same ``data`` object, returned for convenient reassignment.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a ``dsss_parameter`` value of a selected row is not a base-2 string.
    """
    if mac_factory is None:
        # Looked up at call time so the notebook-level generator remains the default.
        mac_factory = generate_random_mac

    mac_position = data.columns.get_loc("mac")
    index_values = data.index.to_numpy()
    channel_bits = data["dsss_parameter"].to_numpy()

    for label in labels:
        positions = np.flatnonzero((data["label"] == label).to_numpy())
        if positions.size == 0:
            continue

        # Walk the rows in index order; the stable sort keeps row order on ties.
        positions = positions[np.argsort(index_values[positions], kind="stable")]
        channels = [int(bits, 2) for bits in channel_bits[positions]]

        # A drop in channel number marks the first row of the next burst.
        burst_ids = [0]
        for previous, current in zip(channels, channels[1:]):
            burst_ids.append(burst_ids[-1] + (1 if current < previous else 0))

        # One MAC per burst, generated in burst order, then broadcast to its rows.
        burst_macs = [mac_factory() for _ in range(burst_ids[-1] + 1)]
        data.iloc[positions, mac_position] = [burst_macs[b] for b in burst_ids]

    return data


In [14]:
# Labels of the devices whose rows are split into bursts and given synthetic MACs below.
# NOTE(review): the name suggests these devices do not randomize their MAC address - confirm.
non_randomizing_devices = [
    "iPhone12Pro_C",
    "SamsungS6_H",
    "HuaweiL21_D",
    "HuaweiP10_Q",
    "HuaweiP20_G",
    "SamsungS4_C",
]

In [15]:
# Assign a separate generated MAC to every burst of the listed devices.
# By default the MACs come from the unseeded `random` module, so they differ between runs.
df = split_non_random_bursts(df, non_randomizing_devices)

## Process Data

In [16]:
# Discard these three columns; they take no part in the rest of the processing.
df = df.drop(columns=["frame_check_seq", "len_dsss", "ssid"])

In [17]:
# Prefixes identifying the columns to discard
drop_starts_with = ["e_id_"]

# str.startswith accepts a tuple of prefixes, so one call tests all of them.
unwanted_prefixes = tuple(drop_starts_with)
prefixed_columns = [
    name for name in df.columns if name.startswith(unwanted_prefixes)
]
df = df.drop(columns=prefixed_columns)

In [18]:
# The earlier astype(str) turned missing values into the literal "nan";
# mark them with the "U" placeholder instead. Only cells equal to "nan" are replaced.
df = df.replace("nan", "U")

In [19]:
# NOTE(review): pad_columns is a project helper not shown here; presumably it pads the values
# of each column to a common width using "U", leaving `mac` and `label` untouched - confirm.
df = fancyData.pad_columns(df, symbol="U", exclude=["mac", "label"])

Switch from **probe-view** to **burst-view**

In [20]:
def concat_values(series):
    """Join all strings of ``series`` into one string, in iteration order, without a separator."""
    separator = ""
    return separator.join(series)

In [21]:
# One output row per MAC (i.e. per burst)
groupby_column = "mac"

# Column whose per-probe values are joined into one string per burst
concatenate_column = "dsss_parameter"

# Every remaining column keeps the value of the first probe in the burst.
passthrough_columns = [
    name
    for name in df.columns
    if name not in (groupby_column, concatenate_column)
]
agg_dict = dict.fromkeys(passthrough_columns, "first")
agg_dict[concatenate_column] = concat_values

grouped = df.groupby(groupby_column)
df = grouped.agg(agg_dict).reset_index()

In [22]:
# Target width: the mean length of the concatenated strings, rounded up.
burst_lengths = df["dsss_parameter"].str.len()
mean_length = burst_lengths.mean()
rounded_length = int(np.ceil(mean_length))

# Truncate longer strings to the target width, then left-pad shorter ones with "0".
# NOTE(review): zfill pads on the LEFT, which shifts the original bits of short bursts to the
# right, and the target width is not guaranteed to be a multiple of 8 - confirm this is intended.
trimmed = df["dsss_parameter"].str.slice(stop=rounded_length)
df["dsss_parameter"] = trimmed.str.zfill(rounded_length)

In [23]:
# Inspect the burst-level frame: one row per MAC after the groupby above.
df

Unnamed: 0,mac,len_ssid,len_sup_rates,supported_rates,len_ext_sup_rates,ext_sup_rates,len_ht_cap,ht_cap,len_vht_cap,vht_cap,len_ext_cap,ext_cap,len_vst,vst,len_ext_tags,ext_tags,label,dsss_parameter
0,00:0e:37:20:a0:b9,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000100000001000000100000001100000011000001...
1,00:21:8e:89:56:97,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000100000001000000100000001100000011000001...
2,00:b1:97:26:e2:88,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000100000001000000110000001100000110000010...
3,01:26:28:01:e9:d7,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000100000001000000100000001100000011000000...
4,02:00:00:00:00:00,00000000,00001000,0000001000000100000010110001011000001100000100...,00000100,00110000010010000110000001101100UUUUUUUUUUUUUU...,00011010,0010110100000001000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001001,0000000100000000000010000000000000000000000000...,00000111,0000000001010000111100100000100000000000000100...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungM31_A,0000000100000001000000100000001100000011000001...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5969,ff:22:bf:93:ad:ef,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000100000010000000110000001100000011000001...
5970,ff:36:e6:f8:81:71,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000100000001000000100000001100000100000001...
5971,ff:42:ca:c8:ae:95,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000100000001000000100000001100000011000001...
5972,ff:53:32:d3:42:84,00000000,00000100,00000010000001000000101100010110UUUUUUUUUUUUUU...,00001000,0000110000010010000110000010010000110000010010...,00011010,0010110100010000000101111111111100000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,00001000,0000000000000000000010000000000000000000000000...,00000111,0000000000010000000110000000001000000000000000...,UUUUUUUU,UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU...,SamsungS4_C,0000000100000001000000100000001100000011000001...


Drop duplicate bursts (rows that are identical in every column except `mac` and `label`), then drop other columns: mostly IE length fields, and fields that have been shown to contribute little.

In [24]:
# Two bursts are duplicates when all feature columns match; the identifying columns are
# ignored. The first occurrence of each duplicate group is kept.
identifier_columns = ("mac", "label")
columns_to_consider = [
    name for name in df.columns if name not in identifier_columns
]
df = df.drop_duplicates(subset=columns_to_consider)

In [25]:
# Information-element length fields
length_fields = [
    "len_ssid",
    "len_sup_rates",
    "len_ext_sup_rates",
    "len_vht_cap",
    "len_ext_tags",
]
# Remaining fields excluded from the fingerprint
other_fields = [
    "supported_rates",
    "ext_sup_rates",
    "vht_cap",
    "ext_tags",
]
df = df.drop(columns=length_fields + other_fields)

## Save Data

In [26]:
# Build one string per row by joining every remaining feature column, in column order.
feature_frame = df.drop(columns=["label", "mac"]).astype(str)
df["concatenated"] = feature_frame.apply("".join, axis=1)

# Keep only the label and its concatenated features, ordered by label.
df_result = df[["label", "concatenated"]].sort_values("label")

Save to CSV

In [27]:
# Write the result under the interim directory configured in config.ini, without the index.
interim_path = config["DEFAULT"]["interim_path"]
df_result.to_csv(f"{interim_path}/binary_U_random.csv", index=False)