# Test Information Elements (IE) SUM Clustering

In [1]:
import ast
import os

import numpy as np
import pandas as pd
from hdbscan import HDBSCAN
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import (completeness_score, homogeneity_score,
                             v_measure_score)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


## Importing Datasets

In [2]:
string_df = pd.read_csv("../../data/interim/string_df.csv")

combinations_df = pd.read_csv("../../data/train_test/10_combinations_df.csv", index_col=0)

In [3]:
def collect_csvs_and_concatenate(directory):
    df_list = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.csv'):
                csv_path = os.path.join(root, file)
                try:
                    df = pd.read_csv(csv_path)
                    df_list.append(df)
                except Exception as e:
                    print(f"Error reading {csv_path}: {e}")
    
    # Concatenate all dataframes
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

directory = "../../data/extracted"
combined_df = collect_csvs_and_concatenate(directory)


In [4]:
columns_to_drop = ['SSID', 'Supported Rates', 'Extended Supported Rates', 'VHT Capabilities', 'HE Capabilities', 'Length']
combined_df['Timestamp'] = pd.to_datetime(combined_df["Timestamp"], unit="s")
combined_df.drop(columns=columns_to_drop, inplace=True)

# Recreate Features

In [5]:
def arrival_order_gen(df):
    # Step 1: Create a copy of the input DataFrame
    df_arrival = df.copy()

    # Step 2: Sort the DataFrame by 'Timestamp' to ensure the correct order
    df_arrival.sort_values(by="Timestamp", inplace=True)

    # Step 3: Group by 'MAC' and calculate the sum of 'DSChannel' values for each MAC
    mac_sum_ds_channel = df_arrival.groupby("MAC Address")["DS Channel"].sum().reset_index()
    mac_sum_channel = df_arrival.groupby("MAC Address")["Channel"].sum().reset_index()

    # Step 4: Create new columns 'DSArrivalOrder' and 'ArrivalOrder' to represent the sequence of 'DSChannel' and 'Channel' values for each MAC
    mac_ds_order_of_arrival = {}
    mac_order_of_arrival = {}
    for mac in tqdm(df_arrival["MAC Address"].unique()):
        mac_ds_order_of_arrival[mac] = df_arrival[df_arrival["MAC Address"] == mac][
            "DS Channel"
        ].tolist()
        mac_order_of_arrival[mac] = df_arrival[df_arrival["MAC Address"] == mac][
            "Channel"
        ].tolist()

    mac_sum_ds_channel["DSArrivalOrder"] = mac_sum_ds_channel["MAC Address"].map(
        mac_ds_order_of_arrival
    )

    mac_sum_channel["ArrivalOrder"] = mac_sum_channel["MAC Address"].map(mac_order_of_arrival)

    # Merge the DataFrames
    df = pd.merge(
        df, mac_sum_ds_channel[["MAC Address", "DSArrivalOrder"]], on="MAC Address", how="left"
    )
    df = pd.merge(df, mac_sum_channel[["MAC Address", "ArrivalOrder"]], on="MAC Address", how="left")

    # Return the processed DataFrame
    return df

In [6]:
df = arrival_order_gen(combined_df)

  0%|          | 0/4795 [00:00<?, ?it/s]

In [7]:
df.fillna(-1, inplace=True)

In [8]:
unused_features = ['Channel', 'DS Channel']

In [9]:
df.drop(columns=unused_features, inplace=True)

Multi Channel Arrival Order column

In [10]:
df["MCAO"] = df.apply(
    lambda row: [a for a, b in zip(row["ArrivalOrder"], row["DSArrivalOrder"])],
    axis=1,
)

In [11]:
def sum_ascii_from_hex(df, columns):
    def hex_to_ascii_sum(value):
        # Ensure the value is treated as a string
        hex_string = str(value)
        try:
            # Convert hex string to bytes, then to ASCII characters, and calculate their sum
            return sum(ord(chr(int(hex_string[i:i+2], 16))) for i in range(0, len(hex_string), 2))
        except ValueError:
            # Handle invalid hex strings
            return None
    
    # Apply the function to each column specified in the list
    for column in columns:
        df[column] = df[column].apply(hex_to_ascii_sum)
    
    return df

In [12]:
sum_df = sum_ascii_from_hex(df, ["HT Capabilities", 'Extended Capabilities', 'Vendor Specific Tags'])

In [13]:
df = sum_df.copy()

In [14]:
df

Unnamed: 0,Timestamp,MAC Address,HT Capabilities,Extended Capabilities,Vendor Specific Tags,Label,DSArrivalOrder,ArrivalOrder,MCAO
0,2023-05-20 13:52:01.864465952,d2:6b:aa:b5:fb:ed,327.0,134.0,,iPhone12Pro_C,"[1.0, 1.0, 6.0, 6.0, 11.0, 11.0, 13.0, 13.0, 1...","[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11...","[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11..."
1,2023-05-20 13:52:01.884716034,d2:6b:aa:b5:fb:ed,327.0,134.0,,iPhone12Pro_C,"[1.0, 1.0, 6.0, 6.0, 11.0, 11.0, 13.0, 13.0, 1...","[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11...","[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11..."
2,2023-05-20 13:52:01.910542011,d2:6b:aa:b5:fb:ed,327.0,134.0,,iPhone12Pro_C,"[1.0, 1.0, 6.0, 6.0, 11.0, 11.0, 13.0, 13.0, 1...","[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11...","[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11..."
3,2023-05-20 13:52:01.930788994,d2:6b:aa:b5:fb:ed,327.0,134.0,,iPhone12Pro_C,"[1.0, 1.0, 6.0, 6.0, 11.0, 11.0, 13.0, 13.0, 1...","[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11...","[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11..."
4,2023-05-20 13:52:01.968745947,d2:6b:aa:b5:fb:ed,327.0,134.0,,iPhone12Pro_C,"[1.0, 1.0, 6.0, 6.0, 11.0, 11.0, 13.0, 13.0, 1...","[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11...","[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11..."
...,...,...,...,...,...,...,...,...,...
76699,2021-06-16 12:51:00.708627939,ec:9b:f3:75:8e:40,649.0,160.0,377.0,SamsungS6_H,"[1.0, 1.0, 4.0, 4.0, 6.0, 6.0, 7.0, 7.0, 7.0, ...","[1, 1, 6, 6, 6, 6, 11, 6, 6, 6, 6, 11, 11, 11,...","[1, 1, 6, 6, 6, 6, 11, 6, 6, 6, 6, 11, 11, 11,..."
76700,2021-06-16 12:51:00.732495070,ec:9b:f3:75:8e:40,649.0,160.0,377.0,SamsungS6_H,"[1.0, 1.0, 4.0, 4.0, 6.0, 6.0, 7.0, 7.0, 7.0, ...","[1, 1, 6, 6, 6, 6, 11, 6, 6, 6, 6, 11, 11, 11,...","[1, 1, 6, 6, 6, 6, 11, 6, 6, 6, 6, 11, 11, 11,..."
76701,2021-06-16 12:51:00.752908945,ec:9b:f3:75:8e:40,649.0,160.0,377.0,SamsungS6_H,"[1.0, 1.0, 4.0, 4.0, 6.0, 6.0, 7.0, 7.0, 7.0, ...","[1, 1, 6, 6, 6, 6, 11, 6, 6, 6, 6, 11, 11, 11,...","[1, 1, 6, 6, 6, 6, 11, 6, 6, 6, 6, 11, 11, 11,..."
76702,2021-06-16 12:51:00.776777029,ec:9b:f3:75:8e:40,649.0,160.0,377.0,SamsungS6_H,"[1.0, 1.0, 4.0, 4.0, 6.0, 6.0, 7.0, 7.0, 7.0, ...","[1, 1, 6, 6, 6, 6, 11, 6, 6, 6, 6, 11, 11, 11,...","[1, 1, 6, 6, 6, 6, 11, 6, 6, 6, 6, 11, 11, 11,..."


## Normalization

In [15]:
label_column = df["Label"]
df_hdbscan_columns = df[["Extended Capabilities", "Vendor Specific Tags"]]
df_hdbscan_columns.columns = df_hdbscan_columns.columns.astype(str)

# Apply the MinMaxScaler
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(
    scaler.fit_transform(df_hdbscan_columns),
    columns=df_hdbscan_columns.columns,
    index=df.index,
)

# Add the 'Label' column back in
df_normalized_labels = pd.concat([df_normalized, label_column], axis=1)

In [16]:
def similarity_matrix(df, column_name):
    # Define a function to pad arrays with zeros
    def pad_array_with_zeros(arr, max_length):
        return [0] * (max_length - len(arr)) + arr

    # Find the maximum length of arrays in the column
    max_length_ds = df[column_name].apply(len).max()

    # Pad all arrays in the column with zeros
    df[column_name] = df[column_name].apply(
        lambda x: pad_array_with_zeros(x, max_length_ds)
    )

    # Create a matrix where each row corresponds to a list in the specified column
    matrix_ds = np.array(df[column_name].tolist())

    # Calculate cosine similarity between all pairs of lists using vectorization
    cosine_similarity_matrix_ds = cosine_similarity(matrix_ds, matrix_ds)

    return cosine_similarity_matrix_ds

In [17]:
df_normalized_labels["MCAO"] = df["MCAO"]

In [18]:
df_normalized_labels.fillna(0, inplace=True)

In [19]:
df_normalized_labels

Unnamed: 0,Extended Capabilities,Vendor Specific Tags,Label,MCAO
0,0.307870,0.000000,iPhone12Pro_C,"[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11..."
1,0.307870,0.000000,iPhone12Pro_C,"[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11..."
2,0.307870,0.000000,iPhone12Pro_C,"[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11..."
3,0.307870,0.000000,iPhone12Pro_C,"[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11..."
4,0.307870,0.000000,iPhone12Pro_C,"[1, 1, 6, 6, 11, 11, 11, 11, 1, 1, 6, 6, 1, 11..."
...,...,...,...,...
76699,0.368056,0.036504,SamsungS6_H,"[1, 1, 6, 6, 6, 6, 11, 6, 6, 6, 6, 11, 11, 11,..."
76700,0.368056,0.036504,SamsungS6_H,"[1, 1, 6, 6, 6, 6, 11, 6, 6, 6, 6, 11, 11, 11,..."
76701,0.368056,0.036504,SamsungS6_H,"[1, 1, 6, 6, 6, 6, 11, 6, 6, 6, 6, 11, 11, 11,..."
76702,0.368056,0.036504,SamsungS6_H,"[1, 1, 6, 6, 6, 6, 11, 6, 6, 6, 6, 11, 11, 11,..."


In [20]:
df_normalized.dropna(inplace=True)

## Cosine Similarity Matrix

In [21]:
def similarity_matrix(df, column_name):
    # Define a function to pad arrays with zeros
    def pad_array_with_zeros(arr, max_length):
        return [0] * (max_length - len(arr)) + arr

    # Find the maximum length of arrays in the column
    max_length_ds = df[column_name].apply(len).max()

    # Pad all arrays in the column with zeros
    df[column_name] = df[column_name].apply(
        lambda x: pad_array_with_zeros(x, max_length_ds)
    )

    # Create a matrix where each row corresponds to a list in the specified column
    matrix_ds = np.array(df[column_name].tolist())

    # Calculate cosine similarity between all pairs of lists using vectorization
    cosine_similarity_matrix_ds = cosine_similarity(matrix_ds, matrix_ds)

    return cosine_similarity_matrix_ds

## Clustering

In [23]:
def str_to_list(s):
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return s 

In [24]:
df_normalized_labels = df_normalized_labels.fillna(0)

In [25]:
clusterer = HDBSCAN(
        gen_min_span_tree=True,
        min_samples=50,
        min_cluster_size=350,
        cluster_selection_method="eom",
        metric="manhattan"
    )

In [26]:


# Initialize results storage
results = []
results_pintor = []

# Set a random seed for reproducibility
np.random.seed(42)

# Iterate over the combinations_df
for index, row in tqdm(combinations_df.iterrows(), desc="Processing Combinations", total=combinations_df.shape[0]):
    labels_combination = str_to_list(index)  # Convert index to a list of labels if necessary
    length = row['length']  # Use length if needed in your logic

    # BACCICHET 

    df_subset = df_normalized_labels[df_normalized_labels["Label"].isin(labels_combination)]

    df_subset["precluster"] = clusterer.fit_predict(df_subset.drop(columns=["Label", "MCAO"]))

    for label in df_subset["precluster"].unique():
        if label == -1: # if noise, it remains noise
            df_subset.loc[df_subset["precluster"] == label, "cluster"] = "-1"
            continue

        df_subset.fillna(0, inplace=True)

        arrival_similarity = []
        arrival_similarity = similarity_matrix(df_subset[df_subset["precluster"] == label], "MCAO")
        
        threshold = 0.4 + 0.6 * (1 - np.mean(arrival_similarity))

        if len(arrival_similarity) >= 10:
            distortions = []
            K = range(1, 5)
            for k in K:
                kmeanModel = KMeans(n_clusters=k)
                kmeanModel.fit(arrival_similarity)
                distortions.append(kmeanModel.inertia_)

            distortions_diff_norm = np.gradient(distortions) / np.gradient(distortions).sum()
            sum_of_elements = 0
            for k, value in enumerate(distortions_diff_norm):
                sum_of_elements += value
                if sum_of_elements >= threshold:
                    break
            k = k + 1

            kmeanModel = KMeans(n_clusters=k)
            kmeanModel.fit(arrival_similarity)

            cluster_labels = [f"{label}_Cluster_{i}" for i in range(k)]
            cluster_indices = df_subset[df_subset["precluster"] == label].index
            for i in range(len(cluster_indices)):
                df_subset.loc[cluster_indices[i], "cluster"] = cluster_labels[kmeanModel.labels_[i]]
        else:
            df_subset.loc[df_subset["precluster"] == label, "cluster"] = "-1"

    # PINTOR clustering with DBSCAN

    df_subset_pintor = df_normalized_labels[df_normalized_labels["Label"].isin(labels_combination)]

    clusterer_pintor = DBSCAN(eps=0.01, min_samples=8).fit(df_subset_pintor.drop(columns=["Label", "MCAO"]))

    df_subset_pintor["cluster"] = clusterer_pintor.labels_

    # Compute clustering metrics

    homogeneity = homogeneity_score(df_subset["Label"], df_subset["cluster"])
    homogeneity_pintor = homogeneity_score(df_subset_pintor["Label"], df_subset_pintor["cluster"])

    v_measure = v_measure_score(df_subset["Label"], df_subset["cluster"])
    v_measure_pintor = v_measure_score(df_subset_pintor["Label"], df_subset_pintor["cluster"])

    completeness = completeness_score(df_subset["Label"], df_subset["cluster"])
    completeness_pintor = completeness_score(df_subset_pintor["Label"], df_subset_pintor["cluster"])

    no_clusters = len(np.unique(df_subset["cluster"])) - 1
    no_clusters_pintor = len(np.unique(df_subset_pintor["cluster"])) - 1

    no_noise = np.sum(np.array(df_subset["cluster"]) == -1, axis=0)
    no_noise_pintor = np.sum(np.array(df_subset_pintor["cluster"]) == -1, axis=0)

    # Append the results if clustering was performed
    results.append((labels_combination, homogeneity, v_measure, completeness, no_clusters, no_noise))
    results_pintor.append((labels_combination, homogeneity_pintor, v_measure_pintor, completeness_pintor, no_clusters_pintor, no_noise_pintor))

# Results processing can be done here, e.g., saving to file or further analysis.


Processing Combinations:   0%|          | 0/311 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset["precluster"] = clusterer.fit_predict(df_subset.drop(columns=["Label", "MCAO"]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset.loc[df_subset["precluster"] == label, "cluster"] = "-1"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset_pintor["cluster"] = clusterer_pintor.l

In [29]:
results

[(('OppoFindX3Neo_A', 'GooglePixel3A_V'), 0.0, 0.0, 1.0, 0, 0),
 (('XiaomiRedmiNote7_S', 'XiaomiRedmi4_B', 'XiaomiA2_E'),
  0.9622477253416633,
  0.45300832396487933,
  0.29623516331274724,
  18,
  0),
 (('S21Ultra_M', 'OnePlusNord_O', 'HuaweiP20_G', 'iPhone7_F'),
  0.7326292583964433,
  0.7592364834233164,
  0.7878491541449986,
  3,
  0),
 (('HuaweiHonor9_R',
   'GooglePixel3A_V',
   'HuaweiP20_G',
   'SamsungS7_I',
   'XiaomiA2_E'),
  0.9317234878510654,
  0.8100162987264573,
  0.7164317762535158,
  7,
  0),
 (('iPhoneXSMax_M',
   'GooglePixel3A_V',
   'XiaomiRedmiNote7_S',
   'OnePlusNord_O',
   'iPhone11_M',
   'iPhone11_C'),
  0.8520171468605702,
  0.8383634014718632,
  0.8251403619433363,
  6,
  0),
 (('iPhone6_N',
   'XiaomiA2_E',
   'iPhoneXR_U',
   'XiaomiRedmiNote9S_T',
   'S21Ultra_M',
   'iPhone12_W',
   'GooglePixel3A_L'),
  0.9487415609442708,
  0.8437527195520296,
  0.7596850868341206,
  9,
  0),
 (('SamsungM31_A',
   'iPhone7_F',
   'iPhoneXSMax_M',
   'S21Ultra_M',
   

In [30]:
results_pintor

[(('OppoFindX3Neo_A', 'GooglePixel3A_V'),
  0.9999999999999999,
  0.7030824420891552,
  0.5421180689555349,
  3,
  1),
 (('XiaomiRedmiNote7_S', 'XiaomiRedmi4_B', 'XiaomiA2_E'),
  0.962245939324341,
  0.8927955049887875,
  0.8326954240179285,
  4,
  0),
 (('S21Ultra_M', 'OnePlusNord_O', 'HuaweiP20_G', 'iPhone7_F'),
  0.9999999999999994,
  0.8442004844396933,
  0.730403909219882,
  7,
  2),
 (('HuaweiHonor9_R',
   'GooglePixel3A_V',
   'HuaweiP20_G',
   'SamsungS7_I',
   'XiaomiA2_E'),
  0.789969246140137,
  0.7939185587899985,
  0.7979075576354432,
  5,
  0),
 (('iPhoneXSMax_M',
   'GooglePixel3A_V',
   'XiaomiRedmiNote7_S',
   'OnePlusNord_O',
   'iPhone11_M',
   'iPhone11_C'),
  0.8659092616341753,
  0.862187448590967,
  0.858497492487401,
  6,
  0),
 (('iPhone6_N',
   'XiaomiA2_E',
   'iPhoneXR_U',
   'XiaomiRedmiNote9S_T',
   'S21Ultra_M',
   'iPhone12_W',
   'GooglePixel3A_L'),
  0.877662186749304,
  0.8714845215212602,
  0.865393214806208,
  8,
  0),
 (('SamsungM31_A',
   'iPhone7

In [31]:
# Convert the results array to a DataFrame
results_df = pd.DataFrame(results, columns=['Labels Combination', 'Homogeneity', 'V-Measure', 'Completeness', 'No. of Clusters', 'No. of Noise'])

# Export the DataFrame to a CSV file
results_df.to_csv('../../reports/CSV/clustering-sota/results.csv', index=False)

In [32]:
# Convert the results array to a DataFrame
results_pintor_df = pd.DataFrame(results, columns=['Labels Combination', 'Homogeneity', 'V-Measure', 'Completeness', 'No. of Clusters', 'No. of Noise'])

# Export the DataFrame to a CSV file
results_pintor_df.to_csv('../../reports/CSV/clustering-sota/results_pintor.csv', index=False)