In [32]:
import pandas as pd

In [16]:
########################################################
###         HELPER FUNCTIONS FOR LABELING            ###
########################################################

def assign_destination(destination: str) -> int:
    if destination == "ff:ff:ff:ff:ff:ff":
        return 1
    else:
        return 0


def check_null_value(string_to_check: str) -> str:
    if pd.isna(string_to_check):
        return ""
    else:
        return string_to_check


def extract_rssi(rssi_string: str) -> int:
    rssi_value = rssi_string.replace(" dBm", "")
    return int(rssi_value)


def assign_device(device: str, source: str) -> str:
    # The source "e4:22:a6:0d:7c:4b" is uniquely assigned to the buds and was extensively tested
    if device == "empty" and source == "e4:22:a6:0d:7c:4b":
        return "Buds"
    elif device in ["Phone", "Phone Scan", "Phone Google"]:
        return "Phone"
    else:
        return device


def encode_company_id(company: str, source: str, service_data: str) -> str:
    # Defined, but not named company identifiers f.e. 0x34f5
    if str(company).startswith("0x") and len(company) == 6:
        return "Unknown"
    # Assign the Buds "Anonymous" source the company Samsung
    elif pd.isna(company) and source == "Anonymous":
        return "Samsung Electronics Co. Ltd."
    elif pd.isna(company) and str(service_data).startswith("4a17235"):
        return "Samsung Electronics Co. Ltd."
    elif pd.isna(company):
        return "Undefined"
    else:
        return company


def extract_time(delta_string: str) -> int:
    if pd.isna(delta_string):
        return 0
    else:
        return int(delta_string.replace("\\302\\265s", ""))


def check_company_id_existence(company_id: str) -> int:
    if company_id == "Undefined":
        return 0
    else:
        return 1


def extract_length(entry_to_exctract: str) -> int:
    if pd.isna(entry_to_exctract):
        return 0
    else:
        return len(entry_to_exctract)


def is_adv_channel(channel: int) -> bool:
    if channel in [37, 38, 39]:
        return True
    else:
        return False


def check_entry_existence(entry_to_check: str) -> int:
    if pd.isna(entry_to_check):
        return 0
    else:
        return 1

In [17]:
# Read the input CSV file and rename some columns
def read_data_to_df(file_name: str) -> pd.DataFrame:
    dataset_df = pd.read_csv(file_name, encoding='ISO-8859-1')
    a = dataset_df.columns
    dataset_df.columns = dataset_df.columns.str.replace("Packet time (start to end)", "packet_start_end")
    dataset_df.columns = dataset_df.columns.str.replace("Delta time (end to start)", "delta_end_start")
    dataset_df.columns = dataset_df.columns.str.replace("Delta time (start to start)", "delta_start_start")
    dataset_df.columns = dataset_df.columns.str.replace(" ", "_")
    dataset_df.columns = dataset_df.columns.str.replace("Test", "Device")
    return dataset_df

def initialize_source_dictionaries(all_unique_sources: list) -> None:
    for source in all_unique_sources:
        if str(source) != "nan":
            source_dictionaries[source] = {}
            source_dictionaries[source]["count"] = 0
            source_dictionaries[source]["malformed_count"] = 0
            source_dictionaries[source]["highest_rssi"] = 0
            source_dictionaries[source]["lowest_rssi"] = -100
            source_dictionaries[source]["first_occurence"] = -1
            source_dictionaries[source]["last_occurence"] = 0
            source_dictionaries[source]["device"] = "empty"
            source_dictionaries[source]["sub_device"] = ""

In [20]:
# Assign values for each row with either simple transformations or direct value usage
def fill_labelled_columns(packet_data: tuple, device: str, sub_device: str):

    row_data = {
        "time": packet_data.Time,
        "source": packet_data.Source,
        "destination": packet_data.Destination,
        "is_broadcast": assign_destination(packet_data.Destination),
        "length": packet_data.Length,
        "info": packet_data.Info,
        "rssi": extract_rssi(packet_data.RSSI),
        "company_id": encode_company_id(packet_data.Company_ID, packet_data.Source, packet_data.Service_Data),
        "has_company_id": check_company_id_existence(encode_company_id(packet_data.Company_ID, packet_data.Source, packet_data.Service_Data)),
        "channel": packet_data.Channel,
        "is_adv_channel": is_adv_channel(packet_data.Channel),
        "device_name": check_null_value(packet_data.Device_Name),
        "uuid16": check_null_value(packet_data.UUID16),
        "has_uuid16": check_entry_existence(packet_data.UUID16),
        "len_uuid16": extract_length(packet_data.UUID16),
        "uuid128": check_null_value(packet_data.UUID128),
        "has_uuid128": check_entry_existence(packet_data.UUID128),
        "data": check_null_value(packet_data.Data),
        "len_data": extract_length(packet_data.Data),
        "ad_type": check_null_value(packet_data.AD_Type),
        "len_ad_type": extract_length(packet_data.Data),
        "service_data": check_null_value(packet_data.Service_Data),
        "len_service_data": extract_length(packet_data.Service_Data),
        "crc": check_null_value(packet_data.CRC),
        "labelled_device": assign_device(device, packet_data.Source),
        "sublabel_device": sub_device,
        "time_start_end": extract_time(packet_data.packet_start_end),
        "delta_end_start": extract_time(packet_data.delta_end_start),
        "delta_start_start": extract_time(packet_data.delta_start_start)
    }

    # To ensure labeling quality, phone-based labels which do not a valid company_id are labeled empty
    # Valid company_ids include: ["Samsung Electronics Co. Ltd.", "Undefined", "Unknown"]
    # Invalid company_ids include: Any other company such as "Apple", "Conneqtech B.V." etc.
    if row_data["labelled_device"] in ['Phone'] and row_data["company_id"] not in phone_companies:
        row_data["labelled_device"] = "empty"
        row_data["sublabel_device"] = "empty"

    return row_data


In [22]:
# Main raw dataset input parsing method
def parse_dataframe(input_dataframe: pd.DataFrame):
    # Define some helper variables for tracking of outliers
    nan_discarded_counter = 0
    malformed_discarded_counter = 0
    smart_tag_list = []
    buds_list = []
    phone_list = []
    phone_data = []

    for row in input_dataframe.itertuples(index=False):
        # Skip the source "nan" and keep track of packets
        if str(row.Source) == "nan":
            nan_discarded_counter += 1
            continue

        # Any source != "nan" is valid, therefore we can increase the counter of source packets here
        source_dictionaries[row.Source]["count"] += 1

        # Skip malformed packets and keep track of occurrences
        if "Malformed Packet" in row.Info:
            source_dictionaries[row.Source]["malformed_count"] += 1
            malformed_discarded_counter += 1
            continue

        current_rssi = int(row.RSSI.split(" ")[0])

        # Assign and overwrite source dictionary values with the current row
        if source_dictionaries[row.Source]["first_occurence"] == -1:
            source_dictionaries[row.Source]["first_occurence"] = row.Time
        if row.Time > source_dictionaries[row.Source]["last_occurence"]:
            source_dictionaries[row.Source]["last_occurence"] = row.Time
        if current_rssi > source_dictionaries[row.Source]["lowest_rssi"]:
            source_dictionaries[row.Source]["lowest_rssi"] = current_rssi
        if current_rssi < source_dictionaries[row.Source]["highest_rssi"]:
            source_dictionaries[row.Source]["highest_rssi"] = current_rssi

        # Label conditions for the buds
        if row.Source == "Anonymous":
            buds_list.append(row.Source)
            source_dictionaries[row.Source]["device"] = "Buds"
            source_dictionaries[row.Source]["sub_device"] = "Buds"
        if row.Device_Name == "Joel's Buds2 Pro" or row.Device_Name == "Buds2 Pro":
            buds_list.append(row.Source)
            source_dictionaries[row.Source]["device"] = "Buds"
            source_dictionaries[row.Source]["sub_device"] = "Buds"

        # Label conditions for the smart tag
        if row.Device_Name == "Smart Tag":
            smart_tag_list.append(row.Source)
            source_dictionaries[row.Source]["device"] = "Smart Tag"
            source_dictionaries[row.Source]["sub_device"] = "Smart Tag"
        
        # Label conditions for the phone
        if row.UUID16 == "Google LLC" and row.Length == 63:
            source_dictionaries[row.Source]["device"] = "Phone"
            source_dictionaries[row.Source]["sub_device"] = "Phone Google"
        if str(row.Data).startswith("021861"):
            phone_list.append(row.Source)
            phone_data.append(row.Data)
            source_dictionaries[row.Source]["device"] = "Phone"
            source_dictionaries[row.Source]["sub_device"] = "Phone Phone"
        if row.Info == "SCAN_REQ" and row.Length == 38:
            source_dictionaries[row.Source]["device"] = "Phone"
            source_dictionaries[row.Source]["sub_device"] = "Phone Scan"

        # After assigning all values, create a transformed row for the final list of rows
        row_data = fill_labelled_columns(row, source_dictionaries[row.Source]["device"], source_dictionaries[row.Source]["sub_device"])
        final_list.append(row_data)

    return nan_discarded_counter, malformed_discarded_counter

In [23]:
def find_empty_and_malformed_sources():
    only_malformed_sources = []
    empty_sources = []

    for key in source_dictionaries.keys():
        count = source_dictionaries[key]["count"]
        malformed_count = source_dictionaries[key]["malformed_count"]
        device = source_dictionaries[key]["device"]

        # If there are only malformed packets, a source is "malformed-only"
        if (count - malformed_count) == 0:
            only_malformed_sources.append(key)
        
        # Get all sources which are not part of the target group (and contain at least 1 valid packet)
        if (count - malformed_count) != 0 and device == "empty":
            empty_sources.append(key)

    print(f"Only Malformed Sources: {len(only_malformed_sources)}")
    print(f"Empty Sources: {len(empty_sources)}")

    return only_malformed_sources, empty_sources

# Recursively reassign sources due to non-deterministic BLE packet contents
# --> Extensively described in the thesis
def reassign_sources():
    empty_occurrences = 0
    new_final_list = []

    for row_dict in final_list:
        if row_dict["labelled_device"] == "empty" and row_dict["company_id"] in phone_companies:
            current_source = row_dict["source"]
            row_dict["labelled_device"] = source_dictionaries[current_source]["device"]

        if row_dict["labelled_device"] != "empty":
            new_final_list.append(row_dict)
        else:
            new_final_list.append(row_dict)
            empty_occurrences += 1

    return empty_occurrences, new_final_list


In [34]:
# In case of several label assignments for source (may happen rarely), assign the source its majority label
# For example: A source has 700 packets labeled "Phone" and 3 packets labeled "empty" --> all packets get assigned "Phone"
def get_majority_label(group):
    majority_label = group['labelled_device'].mode()[0]
    group['labelled_device'] = majority_label
    return group

# Create the final dataframe based on the final_list rows and make sure all sources only have 1 label
def create_labelled_dataframe(entry_list: list):
    new_df = pd.DataFrame(data=entry_list, columns=list((final_list[0].keys())))
    relabeled_dataframe = new_df.groupby('source').apply(get_majority_label).reset_index(drop=True)
    return relabeled_dataframe

# Write the dataframe to a file
def write_new_dataframe(file_to_write: str, df_to_write: pd.DataFrame):
    df_to_write.to_csv(file_to_write, encoding='utf-8', index=False)
    print(f"New CSV written as: {file_to_write}")

# Extract information about the sources of a label
def info_extractor(df_input: pd.DataFrame, wanted_label: str):
    seen_sources = {}
    for row in df_input.itertuples(index=False):
        if row.source not in seen_sources.keys() and row.sublabel_device == wanted_label:
            seen_sources[row.source] = {}
            seen_sources[row.source]["first"] = row.time
            seen_sources[row.source]["last"] = row.time
            seen_sources[row.source]["count"] = source_dictionaries[row.source]["count"]
            continue
        if row.source in seen_sources.keys() and row.sublabel_device == wanted_label:
            if seen_sources[row.source]["last"] < row.time:
                seen_sources[row.source]["last"] = row.time

    return seen_sources


In [35]:
########################################################
###                   MAIN SCRIPT                    ###
########################################################

# Global variables
final_list = []
source_dictionaries = {}
phone_companies = ["Samsung Electronics Co. Ltd.", "Undefined", "Unknown"]
undefined_list = []

# Setup all lists, dictionaries and the labeled dataframe
raw_dataframe = read_data_to_df("../data/interference_apartement_6h_4.2.csv")
all_sources = raw_dataframe["Source"].unique()
initialize_source_dictionaries(all_sources)
nan_counter, malformed_counter = parse_dataframe(raw_dataframe)
malformed_list, empty_list = find_empty_and_malformed_sources()
empty_counter, filtered_list = reassign_sources()
labelled_dataframe = create_labelled_dataframe(filtered_list)
relevant_sources = labelled_dataframe["source"].unique()
print(labelled_dataframe["sublabel_device"].value_counts())

# Extract all labels based on sources
phone_google_sources = info_extractor(labelled_dataframe, "Phone Google")
phone_scan_sources = info_extractor(labelled_dataframe, "Phone Scan")
phone_phone_sources = info_extractor(labelled_dataframe, "Phone Phone")
phone_sources = phone_google_sources | phone_scan_sources | phone_phone_sources
bud_sources = info_extractor(labelled_dataframe, "Buds")
smart_tag_sources = info_extractor(labelled_dataframe, "Smart Tag")

# For the phone sources, labeling needs to be closely inspected and adjusted
# Therefore, each source is filtered according to some criteria and re-labeled if needed
for label in ["Phone Scan", "Phone Google", "Phone Phone"]:
    actual_phone_sources = []
    for source in phone_sources.keys():
        first = round(phone_sources[source]["first"], 3)
        last = round(phone_sources[source]["last"], 3)
        count = phone_sources[source]["count"]
        max_rssi = source_dictionaries[source]["highest_rssi"]
        low_rssi = source_dictionaries[source]["lowest_rssi"]

        # Find Phone Google labeled sources
        if label == "Phone Google" and source_dictionaries[source]["sub_device"] == label:
            # These conditions are dependent on the dataset
            # For the Apartment Dataset 1+3 and University Datset 1 --> max_rssi >= -70
            # For the Apartment Dataset 2 --> max_rssi >= -80
            if count > 30 and max_rssi >= -70:
                print(f"Source: {source}, First: {first}, Last: {last}, Count: {count}, Highest RSSI:{max_rssi}, Lowest RSSI:{low_rssi} Label: {label}")
                actual_phone_sources.append(source)

        # Find Phone Scan labeled sources
        if label == "Phone Scan" and source_dictionaries[source]["sub_device"] == label:
            # These conditions are dependent on the dataset
            # For the Apartment Dataset 1+3 and University Datset 1 --> max_rssi >= -40
            # For the Apartment Dataset 2 --> max_rssi >= -58
            if max_rssi >= -40:
                print(f"Source: {source}, First: {first}, Last: {last}, Count: {count}, Highest RSSI:{max_rssi}, Lowest RSSI:{low_rssi} Label: {label}")
                actual_phone_sources.append(source)

        # Find Phone Phone labeled sources
        if label == "Phone Phone" and source_dictionaries[source]["sub_device"] == label:
            # These conditions are dependent on the dataset
            # For the Apartment Dataset 1+3 and University Datset 1 --> low_rssi >= -50
            # For the Apartment Dataset 2 --> low_rssi >= -65
            if low_rssi >= -50:
                # Only consider packets with count > 3 because otherwise a lot of potentially noisy data is introduced
                # Sacrifice some minor labeling accuracy for increased dataset quality
                if count > 3:
                    print(f"Source: {source}, First: {first}, Last: {last}, Count: {count}, Highest RSSI:{max_rssi}, Lowest RSSI:{low_rssi} Label: {label}")
                    pass
                actual_phone_sources.append(source)

    # Reassign the labels based on the above conditions
    current_phone_sources = labelled_dataframe[labelled_dataframe['sublabel_device'] == label]["source"]
    diff_sources = set(current_phone_sources) - set(actual_phone_sources)
    labelled_dataframe.loc[labelled_dataframe['source'].isin(diff_sources), 'labelled_device'] = 'empty'

# Print information about dataframe length
print("--------------------------------------------------")
print("FINAL DATAFRAME\n")
print(f"Length Raw Dataframe: {len(raw_dataframe)}")
print(f"Length Final Dataframe: {len(labelled_dataframe)}")
print(f"Removed: {malformed_counter+nan_counter}, Diff: {len(raw_dataframe)-len(labelled_dataframe)}")

# Uncomment the line below to write the file
# write_new_dataframe("int_uni_1.csv", labelled_dataframe)


Only Malformed Sources: 25836
Empty Sources: 15933


  relabeled_dataframe = new_df.groupby('source').apply(get_majority_label).reset_index(drop=True)


sublabel_device
                1353459
Buds             673585
Phone Phone      337002
Phone Scan       129410
Phone Google      51472
Smart Tag         26089
empty               110
Name: count, dtype: int64
Source: 41:36:ff:e3:79:d4, First: 14304.659, Last: 14626.201, Count: 131, Highest RSSI:-25, Lowest RSSI:-22 Label: Phone Scan
Source: 42:68:98:3c:fa:ad, First: 6053.368, Last: 6737.097, Count: 437, Highest RSSI:-25, Lowest RSSI:-22 Label: Phone Scan
Source: 43:15:33:00:7a:0f, First: 11313.76, Last: 11819.429, Count: 277, Highest RSSI:-25, Lowest RSSI:-22 Label: Phone Scan
Source: 45:c5:15:26:f1:f7, First: 1916.491, Last: 2263.35, Count: 288, Highest RSSI:-24, Lowest RSSI:-22 Label: Phone Scan
Source: 46:61:f0:f7:18:00, First: 20071.958, Last: 20411.952, Count: 134, Highest RSSI:-25, Lowest RSSI:-22 Label: Phone Scan
Source: 47:86:7d:93:45:6d, First: 17648.596, Last: 18347.96, Count: 331, Highest RSSI:-25, Lowest RSSI:-22 Label: Phone Scan
Source: 48:19:7a:41:0a:97, First: 17302.3

In [31]:
print(len(final_list))

12855635
