# BAMBOO: Binary descriptor based on AsymMetric pairwise BOOsting

In this notebook we include the implementation of the BAMBOO descriptor to provide a compressed representation of probe requests.

## Libraries and Configurations

Import configuration files

In [2]:
from configparser import ConfigParser

config = ConfigParser()
config.read("../config.ini")

['../config.ini']

Import **data libraries**

In [3]:
import pandas as pd

Import **other libraries**

In [4]:
from rich.progress import Progress
from rich import traceback

traceback.install()

from tqdm.notebook import tqdm

In [5]:
import numpy as np

## Import Data

## BAMBOO

Input:
- Ground truth relationships $\langle x_{a(n)}, x_{b(n)}; y_n\rangle$
  - $n=1,..,N$
  - $y_n \in \{+1, -1\}$
- A set of filters $\mathcal{H} = \{h_1 , ..., h_F\}$
- A set of binarization thresholds $\mathcal{T} = \{t_1 , ..., t_T\}$

Output:
- A set of $M<F$ filters $[h_{i(1)}, ..., h_{i(M)}]$
- Corresponding set of binarization thresholds $[t_{j(1)}, ..., t_{j(M)}]$

Importing **concatenated columns** and **pairs** datasets

In [71]:
pairs_df = pd.read_csv("../../data/interim/pairs_df.csv", index_col=0)

In [72]:
pairs_df

Unnamed: 0,Item 1,Item 2,Equality
0,0,1,1
1,0,2,1
2,0,3,1
3,0,4,1
4,0,5,1
...,...,...,...
11493610,4791,4793,1
11493611,4791,4794,1
11493612,4792,4793,1
11493613,4792,4794,1


In [74]:
strings_df = pd.read_csv("../../data/interim/string_df.csv", index_col=0)

In [75]:
strings_df

Unnamed: 0,0
0,0000000000000100000000100000010000001011000101...
1,0000000000000100000000100000010000001011000101...
2,0000000000000100000000100000010000001011000101...
3,0000000000000100000000100000010000001011000101...
4,0000000000000100000000100000010000001011000101...
...,...
4790,0000110000000100000000100000010000001011000101...
4791,0000101100000100000000100000010000001011000101...
4792,0001001000000100000000100000010000001011000101...
4793,0000100100000100000000100000010000001011000101...


Importing bitmask **filters**

In [76]:
filters = pd.read_csv("../../data/filters/bitmasks.csv", index_col=0)

In [77]:
filters

Unnamed: 0,Columns,Bitmask
0,"('len_ssid',)",1111111100000000000000000000000000000000000000...
1,"('len_sup_rates',)",0000000011111111000000000000000000000000000000...
2,"('supported_rates',)",0000000000000000111111111111111111111111111111...
3,"('len_ext_sup_rates',)",0000000000000000000000000000000000000000000000...
4,"('ext_sup_rates',)",0000000000000000000000000000000000000000000000...
...,...,...
65530,"('len_ssid', 'len_sup_rates', 'supported_rates...",1111111111111111111111111111111111111111111111...
65531,"('len_ssid', 'len_sup_rates', 'len_ext_sup_rat...",1111111111111111000000000000000000000000000000...
65532,"('len_ssid', 'supported_rates', 'len_ext_sup_r...",1111111100000000111111111111111111111111111111...
65533,"('len_sup_rates', 'supported_rates', 'len_ext_...",0000000011111111111111111111111111111111111111...


In [12]:
def generate_thresholds(bitmasks):
    """
    Generate thresholds for each bitmask in a set.

    Parameters:
        bitmasks (set): A set containing the bitmasks.

    Returns:
        dict: A dictionary where keys are bitmasks and values are sets of thresholds.
    """
    thresholds_dict = {}
    for bitmask in bitmasks:
        max_ones = bitmask.count("1")
        thresholds = set(range(max_ones + 1))
        thresholds_dict[bitmask] = thresholds
    return thresholds_dict

In [13]:
# # Example usage:
# thresholds_dict = generate_thresholds(filters)
# for bitmask, thresholds in thresholds_dict.items():
#     print(f"Bitmask: {bitmask}, Thresholds: {thresholds}")

In [14]:
thresholds = [0, 1, 2, 3, 4]

## Mockup Implementation

In [31]:
dataset = [
    ("000000001111111100000000", "000000011111111000000000", 1),
    ("000000001111111100000000", "000000111111110000000000", 1),
    ("000000001111111100000000", "000001111111100000000000", 1),
    ("000000001111111100000000", "000001111000000011111111", -1),
    ("000000001111111100000000", "000001110000000011111111", -1),
    ("000000011111111000000000", "000000111111110000000000", 1),
    ("000000011111111000000000", "000001111111100000000000", 1),
    ("000000011111111000000000", "000001111000000011111111", -1),
    ("000000011111111000000000", "000001110000000011111111", -1),
    ("000000111111110000000000", "000001111111100000000000", 1),
    ("000000111111110000000000", "000001111000000011111111", -1),
    ("000000111111110000000000", "000001110000000011111111", -1),
    ("000001111111100000000000", "000001111000000011111111", -1),
    ("000001111111100000000000", "000001110000000011111111", -1),
    ("111111110000000011111111", "111111111000000011111111", 1),
]

In [32]:
dataset

[('000000001111111100000000', '000000011111111000000000', 1),
 ('000000001111111100000000', '000000111111110000000000', 1),
 ('000000001111111100000000', '000001111111100000000000', 1),
 ('000000001111111100000000', '000001111000000011111111', -1),
 ('000000001111111100000000', '000001110000000011111111', -1),
 ('000000011111111000000000', '000000111111110000000000', 1),
 ('000000011111111000000000', '000001111111100000000000', 1),
 ('000000011111111000000000', '000001111000000011111111', -1),
 ('000000011111111000000000', '000001110000000011111111', -1),
 ('000000111111110000000000', '000001111111100000000000', 1),
 ('000000111111110000000000', '000001111000000011111111', -1),
 ('000000111111110000000000', '000001110000000011111111', -1),
 ('000001111111100000000000', '000001111000000011111111', -1),
 ('000001111111100000000000', '000001110000000011111111', -1),
 ('111111110000000011111111', '111111111000000011111111', 1)]

In [33]:
len(dataset)

15

In [36]:
def bitwise_and(bit_str1, bit_str2):
    # Convert bit strings to integers
    int1 = int(bit_str1, 2)
    int2 = int(bit_str2, 2)

    # Perform bitwise AND operation
    result = int1 & int2

    # Convert result back to binary string
    result_str = bin(result)[2:]  # [2:] to remove '0b' prefix

    # Return result
    return result_str.zfill(max(len(bit_str1), len(bit_str2)))

In [37]:
bitwise_and(filters8[0], dataset[4][1])

'000001110000000000000000'

In [38]:
def hamming_distance(bit_str1, bit_str2):
    # Ensure both strings have the same length
    if len(bit_str1) != len(bit_str2):
        raise ValueError("Bit strings must have the same length")

    # Initialize distance counter
    distance = 0

    # Iterate over each pair of corresponding bits
    for bit1, bit2 in zip(bit_str1, bit_str2):
        if bit1 != bit2:
            distance += 1

    # Return Hamming distance
    return distance

In [39]:
hamming_distance(filters8[0], dataset[4][1])

13

In [48]:
def sumFilter(bitwise_and: str) -> int:
    sum = 0
    for i in bitwise_and:
        sum += int(i)
    return sum

In [50]:
import numpy as np

In [51]:
def h(x_a: str, x_b: str) -> int:
    sgn = np.sign(x_a * x_b)
    return sgn

In [52]:
# Initialize weights as uniform distribution
weights = []
weights = [1 / len(dataset)] * len(dataset)

In [53]:
def deltaDirac(h: int, y: int) -> int:
    if h == y:
        return 1
    else:
        return 0

In [54]:
errors = []

In [55]:
for filters_entry in filters:
    filters_list, thresholds = filters_entry
    for filter_str, threshold_list in zip(
        filters_list, [thresholds] * len(filters_list)
    ):
        for threshold in threshold_list:
            error = 0
            for pair in range(len(dataset)):
                x_a = sumFilter(bitwise_and(filter_str, dataset[pair][0])) - threshold
                x_b = sumFilter(bitwise_and(filter_str, dataset[pair][1])) - threshold
                filtered = h(x_a, x_b)
                errors.append(
                    (filter_str, threshold, deltaDirac(filtered, dataset[pair][2]))
                )
    print(errors)

[('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 0), ('111100000000000000000000', 0, 1), ('111100000000000000000000', 1, 1), ('111100000000000000000000', 1, 1), ('111100000000000000000000', 1, 1), ('111100000000000000000000', 1, 0), ('111100000000000000000000', 1, 0), ('111100000000000000000000', 1, 1), ('111100000000000000000000', 1, 1), ('111100000000000000000000', 1, 0), ('111100000000000000000000', 1, 0), ('111100000000000000000000', 1, 1), ('111100000000000000000000', 1, 0), ('111100000000000000000000', 1, 0), ('111100000000000000000000'

In [56]:
best_errors = min(errors, key=lambda x: x[2])
best_filter = best_errors[0]
best_threshold = best_errors[1]

In [57]:
print("best filter", best_filter, "\nbest threshold", best_threshold)

best filter 111100000000000000000000 
best threshold 0


In [59]:
def sign(number: int) -> int:
    if number < 0:
        return -1
    elif number >= 0:
        return 1

In [60]:
def weak_classifier(pair: tuple, threshold: int, filter: str) -> int:
    print(pair, threshold, filter)
    filtered1 = sumFilter(bitwise_and(pair[0], filter))
    filtered2 = sumFilter(bitwise_and(pair[1], filter))
    return sign((filtered1 - threshold) * (filtered2 - threshold))

In [61]:
def delta(prediction: int, ground_truth: int) -> int:
    if prediction != ground_truth:
        return 1
    else:
        return 0

In [62]:
def get_error(weigth: float, prediction: int, ground_truth: int) -> float:
    error = weigth * delta(prediction, ground_truth)

    print(
        "Weight",
        weigth,
        "\nPrediction",
        prediction,
        "\nGround Truth",
        ground_truth,
        "\nERROR:",
        error,
    )

    return error

In [63]:
# Input
dataset
filters
M = 10

# Initial weights
weights = np.ones(len(dataset)) / len(dataset)

# Errors per iteration
errors = {}

In [64]:
import math

In [65]:
for m in range(M):  # iterations
    for filters_entry in filters:  # for each filter
        filters_list, threshold_list = filters_entry
        for filter, thresholds in zip(
            filters_list, [threshold_list] * len(filters_list)
        ):
            for threshold in thresholds:  # for each threshold
                error = 0
                for pair in range(len(dataset)):  # for each pair
                    prediction = weak_classifier(dataset[pair][0:2], threshold, filter)
                    error += get_error(weights[pair], prediction, dataset[pair][2])
                    print("LABEL", dataset[pair][2])
                print("[!] ERROR", error)
                errors[(filter, threshold)] = error
        print("errors", errors)
    best_filter, best_threshold = min(errors, key=lambda k: abs(errors[k]))

    print("Best Filter:", best_filter)
    print("Best Threshold:", best_threshold)

    min_error = errors[(best_filter, best_threshold)]
    print(min_error)
    confidence = math.log(
        (1 - min_error) / min_error
    )  # confidence of the weak classifier
    print("Confidence:", confidence)

    # Asymmetric Weight Update
    for pair in range(len(dataset)):
        if dataset[pair][2] == +1:
            if (
                weak_classifier(dataset[pair][0:2], best_threshold, best_filter)
                != dataset[pair][2]
            ):
                weights[pair] = weights[pair] * math.exp(confidence)

    for pair in range(len(dataset)):
        if dataset[pair][2] == +1:
            weights[pair] = weights[pair] / sum(
                weights[pair] for pair in range(len(dataset)) if dataset[pair][2] == +1
            )

('000000001111111100000000', '000000011111111000000000') 0 111100000000000000000000
Weight 0.06666666666666667 
Prediction 1 
Ground Truth 1 
ERROR: 0.0
LABEL 1
('000000001111111100000000', '000000111111110000000000') 0 111100000000000000000000
Weight 0.06666666666666667 
Prediction 1 
Ground Truth 1 
ERROR: 0.0
LABEL 1
('000000001111111100000000', '000001111111100000000000') 0 111100000000000000000000
Weight 0.06666666666666667 
Prediction 1 
Ground Truth 1 
ERROR: 0.0
LABEL 1
('000000001111111100000000', '000001111000000011111111') 0 111100000000000000000000
Weight 0.06666666666666667 
Prediction 1 
Ground Truth -1 
ERROR: 0.06666666666666667
LABEL -1
('000000001111111100000000', '000001110000000011111111') 0 111100000000000000000000
Weight 0.06666666666666667 
Prediction 1 
Ground Truth -1 
ERROR: 0.06666666666666667
LABEL -1
('000000011111111000000000', '000000111111110000000000') 0 111100000000000000000000
Weight 0.06666666666666667 
Prediction 1 
Ground Truth 1 
ERROR: 0.0
LABEL 

  (1 - min_error) / min_error


1 
ERROR: 0.0
LABEL 1
('000000001111111100000000', '000000111111110000000000') 4 000000000000111100000000
Weight 0.1521934963481843 
Prediction 1 
Ground Truth 1 
ERROR: 0.0
LABEL 1
('000000001111111100000000', '000001111111100000000000') 4 000000000000111100000000
Weight 0.13308268214839686 
Prediction 1 
Ground Truth 1 
ERROR: 0.0
LABEL 1
('000000001111111100000000', '000001111000000011111111') 4 000000000000111100000000
Weight 0.06666666666666667 
Prediction 1 
Ground Truth -1 
ERROR: 0.06666666666666667
LABEL -1
('000000001111111100000000', '000001110000000011111111') 4 000000000000111100000000
Weight 0.06666666666666667 
Prediction 1 
Ground Truth -1 
ERROR: 0.06666666666666667
LABEL -1
('000000011111111000000000', '000000111111110000000000') 4 000000000000111100000000
Weight 0.12070997766873139 
Prediction 1 
Ground Truth 1 
ERROR: 0.0
LABEL 1
('000000011111111000000000', '000001111111100000000000') 4 000000000000111100000000
Weight 0.11192812608668681 
Prediction 1 
Ground Truth

In [66]:
weights

array([0.21585288, 0.17008903, 0.14595941, 0.06666667, 0.06666667,
       0.13062344, 0.11988164, 0.06666667, 0.06666667, 0.11188857,
       0.06666667, 0.06666667, 0.06666667, 0.06666667, 0.10568903])

In [67]:
print("Best Filter:", best_filter)
print("Best Threshold:", best_threshold)
print("Min error", min_error)

Best Filter: 000000001111000000000000
Best Threshold: 2
Min error 0.0
