In [1]:
# Number of best configurations to keep; 0 means "keep all of them"
# (see the cell that truncates best_configs_df).
M = 16
# Distance threshold passed as `tau` to predict() and cluster_dataset().
selected_tau = 2.45

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


## Importing Datasets

In [4]:
# Frame with a `label` column and a `concatenated` bit-string column.
string_df = pd.read_csv("../../data/interim/string_df.csv")

# Test pairs: duplicates removed, then re-indexed from 0.
balanced_pairs_df = (
    pd.read_csv("../../data/train_test/test_pairs.csv", index_col=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [5]:
# Display the loaded frame.
string_df

Unnamed: 0,label,concatenated
0,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
1,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
2,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
3,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
4,GooglePixel3A_L,0000000000000000000000000000000000000000000000...
...,...,...
951,iPhoneXSMax_M,0001101000101101000000000001101111111111000000...
952,iPhoneXSMax_M,0001101000101101010000000001101111111111000000...
953,iPhoneXSMax_M,0001101000101101000000000001101111111111000000...
954,iPhoneXSMax_M,0001101000101101010000000001101111111111000000...


In [6]:
# Turn every bit string into an integer NumPy array, one element per character.
string_df["concatenated"] = string_df["concatenated"].apply(
    lambda bits: np.array([int(bit) for bit in bits], dtype=int)
)

## Importing Best Configurations

In [7]:
import re
import pandas as pd


def parse_log_file(filename):
    """Parse a best-configuration log into a DataFrame.

    The log is scanned line by line for the markers ``Best Filter``,
    ``Best Threshold``, ``Min error`` and ``Confidence``. A record is emitted
    each time a ``Confidence: <value>`` line is parsed, using the most recent
    values seen for the other three fields (``None`` for a field that did not
    appear since the previous record).

    Parameters
    ----------
    filename : str or path-like
        Path of the log file to read (decoded as UTF-8).

    Returns
    -------
    pandas.DataFrame
        Columns ``Best Filter``, ``Best Threshold`` (parsed with ``int``),
        ``Min Error`` and ``Confidence`` (parsed with ``float``); one row per
        record, in file order.

    Raises
    ------
    ValueError
        If a threshold, error or confidence value cannot be converted.
    """
    # (marker that selects the line, pattern capturing the value, converter).
    # Order matters: a line is handled by the first marker it contains.
    fields = (
        ("Best Filter", r"Best Filter: (.+)", str),
        ("Best Threshold", r"Best Threshold: (.+)", int),
        ("Min error", r"Min error: (.+)", float),
        ("Confidence", r"Confidence: (.+)", float),
    )
    markers = [marker for marker, _, _ in fields]
    current = dict.fromkeys(markers)
    data = []

    # Filters are written with non-ASCII tile characters, so decode explicitly
    # as UTF-8 instead of relying on the platform's default encoding.
    with open(filename, "r", encoding="utf-8") as file:
        # Iterate lazily; there is no need to hold every line in memory.
        for line in file:
            for marker, pattern, convert in fields:
                if marker not in line:
                    continue
                match = re.search(pattern, line)
                if match:
                    current[marker] = convert(match.group(1).strip())
                    if marker == "Confidence":
                        # Confidence closes a record: store it and start afresh.
                        data.append(tuple(current[name] for name in markers))
                        current = dict.fromkeys(markers)
                break

    return pd.DataFrame(
        data, columns=["Best Filter", "Best Threshold", "Min Error", "Confidence"]
    )

In [8]:
# Usage example:
filename = "../../reports/best_config"
best_configs_df = parse_log_file(filename)

In [9]:
# Display the parsed configurations.
best_configs_df

Unnamed: 0,Best Filter,Best Threshold,Min Error,Confidence
0,0[312] 🀫🀫🀫🀫🀫🀫🀫🀫 0[1464],2,0.466147,0.135618
1,0[0] 🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫 0[1768],6,0.466313,0.134951
2,0[328] 🀫🀫🀫🀫🀫🀫🀫🀫 0[1448],1,0.469633,0.12162
3,0[248] 🀫🀫🀫🀫🀫🀫🀫🀫 0[1528],1,0.467389,0.13063
4,0[8] 🀫🀫🀫🀫🀫🀫🀫🀫 0[1768],3,0.466297,0.135016
5,0[1768] 🀆🀆🀆🀆🀫🀫🀫🀫 0[8],1,0.471781,0.112996
6,0[1752] 🀫🀫🀫🀫🀫🀫🀫🀫 0[24],1,0.470016,0.120079
7,0[240] 🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫 0[1528],1,0.471137,0.115582
8,0[1760] 🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫 0[8],2,0.463862,0.144805
9,0[312] 🀆🀆🀆🀆🀫🀫🀫🀫 0[1464],1,0.464184,0.143508


In [10]:
# M == 0 means "keep every configuration" (M then records how many there are);
# otherwise only the first M rows are kept.
if M == 0:
    M = len(best_configs_df)
else:
    best_configs_df = best_configs_df.head(M)

In [11]:
# Ratio between the length of one `concatenated` array and the number of
# retained configurations.
n_input_values = len(string_df["concatenated"].iloc[0])
n_configurations = best_configs_df.shape[0]
compression_rate = n_input_values / n_configurations

print("Compression Rate:", compression_rate)

Compression Rate: 111.5


## Filters Parser

In [12]:
def filter_parser(input_string: str) -> list:
    """Expand a compact filter description into a flat list of weights.

    The description is split on whitespace. A token starting with ``0[``
    (for example ``0[312]``) expands to that many zeros. In any other token
    each ``🀆`` contributes ``-1`` and each ``🀫`` contributes ``1``; every
    other character is ignored.
    """
    tile_values = {"🀆": -1, "🀫": 1}
    weights = []

    for token in input_string.split():
        if token.startswith("0["):
            # Run-length encoded zeros: the count sits between "0[" and "]".
            weights += [0] * int(token[2:-1])
        else:
            weights += [tile_values[tile] for tile in token if tile in tile_values]

    return weights

## Apply Filters

In [13]:
def apply_filter(item, filter):
    """Return the sum of the element-wise product of ``item`` and a parsed filter.

    ``item`` must provide ``astype`` (e.g. a NumPy array); ``filter`` is the
    textual description understood by ``filter_parser``.
    """
    values = item.astype(int)
    weights = filter_parser(filter)
    return np.multiply(values, weights).sum()

In [14]:
def apply_filter_threshold(item, filter, threshold) -> int:
    """Binarise the filter response: ``1`` if it exceeds ``threshold``, else ``-1``."""
    return 1 if apply_filter(item, filter) > threshold else -1

In [15]:
def apply_filter_threshold_pair(item_1, item_2, filter, threshold) -> int:
    """Return ``1`` when both items binarise to the same value, else ``-1``."""
    first = apply_filter_threshold(item_1, filter, threshold)
    second = apply_filter_threshold(item_2, filter, threshold)
    return 1 if first == second else -1

## Calculate Fingerprint

In [16]:
def hamming_distance(array1, array2, confidence):
    """Confidence-weighted Hamming distance between two equal-length sequences.

    Sums ``confidence[i]`` over every position ``i`` where the sequences
    differ, then rescales by ``len(confidence) / sum(confidence)`` so that
    uniform weights reproduce the plain mismatch count.

    NOTE: a later cell of this notebook redefines ``hamming_distance`` with
    two parameters, which shadows this weighted version from then on.

    Parameters
    ----------
    array1, array2 : sequence
        Sequences to compare element by element.
    confidence : sequence of numbers
        One weight per position; the weights must not sum to zero.

    Returns
    -------
    float
        The rescaled weighted mismatch total.

    Raises
    ------
    ValueError
        If the two sequences differ in length, or if ``confidence`` does not
        hold exactly one weight per position.
    """
    if len(array1) != len(array2):
        raise ValueError("Arrays must have the same length")
    # Without this check a longer `confidence` would silently distort the
    # normalisation and a shorter one would fail with an opaque IndexError.
    if len(confidence) != len(array1):
        raise ValueError("confidence must have one weight per array position")

    mismatch_weight = sum(
        weight
        for left, right, weight in zip(array1, array2, confidence)
        if left != right
    )

    return (mismatch_weight / sum(confidence)) * len(confidence)

In [17]:
def calculate_fingerprint(item, best_filters, best_thresholds, confidence):
    """Build the +1/-1 fingerprint of ``item``, one entry per (filter, threshold) pair.

    An entry is ``1`` when the filter response (sum of the element-wise product
    of ``item`` with the parsed filter) exceeds the paired threshold, and
    ``-1`` otherwise. ``confidence`` is accepted but not used here.
    """
    return [
        1
        if np.sum(np.multiply(item.astype(int), filter_parser(flt))) > threshold
        else -1
        for flt, threshold in zip(best_filters, best_thresholds)
    ]

In [18]:
# List that the next cell fills with one fingerprint per row of `string_df`.
fingerprints = []

In [19]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every fingerprint (the `fprint` column assignment would then
# fail on a length mismatch).
fingerprints = []

# The configuration columns are the same for every row, so extract them once.
# `confidence` is also read later by predict().
best_filters = best_configs_df["Best Filter"].tolist()
best_thresholds = best_configs_df["Best Threshold"].tolist()
confidence = best_configs_df["Confidence"].tolist()

for i, row in tqdm(string_df.iterrows(), total=string_df.shape[0]):
    # One +1/-1 entry per retained (filter, threshold) configuration.
    fingerprint = calculate_fingerprint(
        row["concatenated"], best_filters, best_thresholds, confidence
    )
    fingerprints.append(fingerprint)

100%|██████████| 956/956 [00:02<00:00, 338.67it/s]


In [20]:
# Attach the computed fingerprints as a new column (one list per row).
string_df["fprint"] = fingerprints

In [21]:
# Labels dropped from `string_df` before clustering.
labels_to_remove = [
    'iPhone11_F',
    'iPhone11_M',
    'iPhoneXR_A',
    'iPhoneXR_L',
    'iPhone7_F',
    'iPhone12_M',
    'iPhone11_B',
    'iPhone11_C',
    'iPhone12Pro_C',
    'S21Ultra_M',
    'OppoFindX3Neo_A',
]
keep_mask = ~string_df['label'].isin(labels_to_remove)
string_df = string_df[keep_mask]

In [22]:
# Renumber the remaining rows 0..n-1 and discard the old index.
string_df = string_df.reset_index(drop=True)

In [23]:
# Display the frame after filtering and re-indexing.
string_df


Unnamed: 0,label,concatenated,fprint
0,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, 1, 1..."
1,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, -1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, 1,..."
2,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, -1, ..."
3,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, -1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, 1,..."
4,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, 1, 1..."
...,...,...,...
738,iPhoneXSMax_M,"[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...","[-1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, -1..."
739,iPhoneXSMax_M,"[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...","[-1, 1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, ..."
740,iPhoneXSMax_M,"[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...","[-1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, -..."
741,iPhoneXSMax_M,"[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...","[-1, 1, -1, -1, 1, 1, 1, 1, 1, -1, 1, -1, -1, ..."


## Clustering w/ $\tau$

In [24]:
# Row 0, column position 2; the recorded output shows this is the `fprint` list.
string_df.iloc[0, 2]

[1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1]

In [25]:
# Weighted distance between rows 0 and 1 minus the threshold: a negative value
# means the pair falls below `selected_tau`.
hamming_distance(string_df.iloc[0, 2], string_df.iloc[1, 2], confidence) - selected_tau

-1.3168462331007678

In [26]:
def predict(string_1, string_2, tau):
    """Return ``1`` if the weighted distance between the inputs is below ``tau``, else ``-1``.

    The weights come from the notebook-level ``confidence`` list.
    """
    distance = hamming_distance(string_1, string_2, confidence)
    return 1 if distance < tau else -1

In [27]:
# Pair decision for rows 0 and 100 at the selected threshold
# (1 = distance below tau, -1 otherwise).
predict(string_df.iloc[0, 2], string_df.iloc[100, 2], selected_tau)

-1

## Clustering

In [28]:
import numpy as np
import pandas as pd
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score

In [29]:
def hamming_distance(str1, str2):
    """Count the positions at which two equal-length sequences differ.

    Raises ``ValueError`` when the lengths differ.
    """
    if len(str1) == len(str2):
        mismatches = [left != right for left, right in zip(str1, str2)]
        return sum(mismatches)
    raise ValueError("Strings must be of the same length")

def predict(string_1, string_2, tau):
    """Return ``1`` if the mismatch count between the inputs is below ``tau``, else ``-1``."""
    if hamming_distance(string_1, string_2) < tau:
        return 1
    return -1

In [30]:
def cluster_dataset(string_df, tau):
    """Greedy threshold clustering of the fingerprints in ``string_df['fprint']``.

    Rows are visited in order. Every row that has no cluster yet seeds a new
    cluster, and each later row whose fingerprint matches the seed
    (``predict(seed, other, tau) == 1``) is assigned to that cluster.

    Parameters
    ----------
    string_df : pandas.DataFrame
        Frame with an ``fprint`` column; any index is accepted.
    tau : number
        Threshold forwarded to ``predict``.

    Returns
    -------
    numpy.ndarray
        Integer cluster id for every row, in row order.
    """
    # Read the fingerprints positionally, once. The loop indices are positions,
    # whereas label-based `string_df['fprint'][i]` breaks on frames whose index
    # is not 0..n-1 (e.g. a filtered subset) and repeats the Series lookup in
    # the inner loop.
    fingerprints = string_df['fprint'].tolist()
    n = len(fingerprints)
    clusters = np.full(n, -1)  # -1 indicates that the item hasn't been clustered yet
    next_cluster_id = 0

    for i in range(n):
        if clusters[i] != -1:
            continue  # already a member of an earlier cluster

        clusters[i] = next_cluster_id
        seed = fingerprints[i]
        for j in range(i + 1, n):
            # NOTE(review): a row that already belongs to an earlier cluster is
            # re-assigned here when it also matches this seed; confirm that
            # this "last matching seed wins" behaviour is intended.
            if predict(seed, fingerprints[j], tau) == 1:
                clusters[j] = next_cluster_id
        next_cluster_id += 1

    return clusters


In [31]:
# Cluster every fingerprint with the selected threshold.
clusters = cluster_dataset(string_df, selected_tau)

# The `label` column serves as ground truth for the clustering metrics.
true_labels = string_df['label'].to_numpy()

# Score the clustering against the ground truth with each metric in turn.
homogeneity, completeness, v_measure = (
    score(true_labels, clusters)
    for score in (homogeneity_score, completeness_score, v_measure_score)
)

print(f"Homogeneity: {homogeneity}")
print(f"Completeness: {completeness}")
print(f"V-Measure: {v_measure}")

Homogeneity: 0.8236678560994436
Completeness: 0.7734837523133717
V-Measure: 0.7977873867952033


In [32]:
# Store the cluster id of every row next to its label.
string_df['cluster'] = clusters

In [33]:
# Display the frame with the new `cluster` column.
string_df

Unnamed: 0,label,concatenated,fprint,cluster
0,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, 1, 1...",0
1,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, -1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, 1,...",0
2,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, -1, ...",0
3,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, -1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, 1,...",0
4,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, 1, 1...",0
...,...,...,...,...
738,iPhoneXSMax_M,"[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...","[-1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, -1...",20
739,iPhoneXSMax_M,"[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...","[-1, 1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, ...",19
740,iPhoneXSMax_M,"[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...","[-1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, -...",24
741,iPhoneXSMax_M,"[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...","[-1, 1, -1, -1, 1, 1, 1, 1, 1, -1, 1, -1, -1, ...",20


In [34]:
# With a single scalar difference d, sqrt(mean(d**2)) reduces to |d|: the
# absolute gap between the number of distinct labels and distinct clusters.
print('RMSE:', np.sqrt(np.mean((string_df['label'].nunique() - string_df['cluster'].nunique())**2)))

RMSE: 3.0


In [None]:
def process_and_plot_combinations(df_normalized_labels, all_combinations_df, tau=0.5):
    """Cluster each label combination, score it, and plot the aggregated metrics.

    For every row of ``all_combinations_df``, the rows of
    ``df_normalized_labels`` whose ``Label`` is in ``row['combination']`` are
    clustered with ``cluster_dataset`` and scored against ``Label``
    (homogeneity, completeness, V-measure). Combinations with fewer than two
    distinct labels present are skipped with a message. The collected results
    are handed to ``plot_metrics``.

    Parameters
    ----------
    df_normalized_labels : pandas.DataFrame
        Must contain a ``Label`` column and the ``fprint`` column that
        ``cluster_dataset`` reads.
    all_combinations_df : pandas.DataFrame
        Must contain a ``combination`` column holding collections of labels.
    tau : number, default 0.5
        Threshold forwarded to ``cluster_dataset``. The default preserves the
        value that used to be hard-coded inside the loop.
    """
    results = []

    # Iterate over each combination
    for index, row in tqdm(
        all_combinations_df.iterrows(),
        desc="Processing Combinations",
        total=all_combinations_df.shape[0],
    ):
        labels_combination = row['combination']

        # Filter the dataset for the current combination of labels. Re-index
        # from 0 because cluster_dataset addresses rows by position 0..n-1 and
        # a filtered frame would otherwise keep its original index labels.
        df_subset = df_normalized_labels[
            df_normalized_labels["Label"].isin(labels_combination)
        ].reset_index(drop=True)

        # Ensure there's more than one unique label
        if len(df_subset["Label"].unique()) < 2:
            print(f"Skipping combination {labels_combination} due to insufficient label variety.")
            continue

        # Apply the clustering algorithm with the requested threshold
        clusters = cluster_dataset(df_subset, tau)

        # Ground truth labels
        true_labels = df_subset['Label'].to_numpy()

        # Calculate clustering metrics
        homogeneity = homogeneity_score(true_labels, clusters)
        completeness = completeness_score(true_labels, clusters)
        v_measure = v_measure_score(true_labels, clusters)

        # Calculate additional metrics. -1 would mark unassigned ("noise")
        # rows; cluster_dataset as defined in this notebook assigns an id to
        # every row, so noise_points is expected to be 0.
        count_of_devices = len(labels_combination)
        clusters_count = len(np.unique(clusters)) - (1 if -1 in clusters else 0)
        noise_points = np.sum(np.array(clusters) == -1)
        # sqrt of a single squared difference, i.e. the absolute difference.
        rmse = np.sqrt((count_of_devices - clusters_count) ** 2)

        # Save the results (tuple order must match the column list below)
        results.append((
            labels_combination,
            homogeneity,
            v_measure,
            completeness,
            count_of_devices,
            noise_points,
            rmse,
        ))

    # Convert results into a DataFrame
    results_df = pd.DataFrame(
        results,
        columns=[
            "Labels",
            "Homogeneity",
            "V-Measure",
            "Completeness",
            "Count of Devices",
            "Noise Points",
            "RMSE",
        ]
    )

    # Plot the results
    plot_metrics(results_df)

def plot_metrics(results_df):
    """Draw V-Measure, Homogeneity and Completeness against "Count of Devices".

    Prints a message and returns without plotting when ``results_df`` is
    empty. Otherwise rows whose ``RMSE`` equals ``-3`` are excluded and the
    three metrics are drawn as stacked line plots in one figure.
    """
    if results_df.empty:
        print("No results to plot.")
        return

    results_df = results_df[results_df["RMSE"] != -3]
    sns.set(style="whitegrid")
    fig, axes = plt.subplots(3, 1, figsize=(15, 15))

    fig.suptitle("Clustering Metrics by Count of Devices")

    # One panel per metric: (column, panel title, line colour).
    panels = (
        ("V-Measure", "V-Measure vs. Count of Devices", 'blue'),
        ("Homogeneity", "Homogeneity vs. Count of Devices", 'green'),
        ("Completeness", "Completeness vs. Count of Devices", 'red'),
    )
    for ax, (metric, title, colour) in zip(axes, panels):
        sns.lineplot(
            ax=ax,
            x="Count of Devices",
            y=metric,
            data=results_df,
            marker="o",
            linewidth=2,
            color=colour,
        )
        ax.set_title(title)
        ax.set_xlabel("Count of Devices")
        ax.set_ylabel(metric)

    # Leave room at the top for the figure-level title.
    plt.tight_layout(rect=[0, 0, 1, 0.95])

    # Show the combined figure
    plt.show()