In [28]:
# Number of best configurations to keep (taken from the top of the parsed
# report further down). 0 is a sentinel meaning "keep all of them".
M = 16

In [29]:
import re

import numpy as np
import pandas as pd

In [30]:
from tqdm.autonotebook import tqdm

## Importing Datasets

In [31]:
# One row per sample, indexed by device label, with the bit string in "concatenated".
string_df = pd.read_csv("../../data/interim/string_df.csv", index_col=0)

# Pair list with duplicate rows removed and a fresh 0..n-1 index.
# NOTE(review): the variable is called "balanced" but the file read is
# test_pairs.csv — confirm this is the intended input.
balanced_pairs_df = (
    pd.read_csv("../../data/train_test/test_pairs.csv", index_col=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [32]:
# Inspect the raw frame: index is the device label, "concatenated" is still a string here.
string_df

Unnamed: 0_level_0,concatenated
label,Unnamed: 1_level_1
GooglePixel3A_L,0000000000000000000000000000000000000000000000...
GooglePixel3A_L,0000000000000000000000000000000000000000000000...
GooglePixel3A_L,0000000000000000000000000000000000000000000000...
GooglePixel3A_L,0000000000000000000000000000000000000000000000...
GooglePixel3A_L,0000000000000000000000000000000000000000000000...
...,...
iPhoneXSMax_M,0001101000101101000000000001101111111111000000...
iPhoneXSMax_M,0001101000101101010000000001101111111111000000...
iPhoneXSMax_M,0001101000101101000000000001101111111111000000...
iPhoneXSMax_M,0001101000101101010000000001101111111111000000...


In [33]:
def _bits_to_int_array(bit_string):
    """Split a string into single characters and return them as an integer array."""
    return np.array(list(bit_string)).astype(int)


# Replace every bit string by its integer-array form so filters can be applied numerically.
string_df["concatenated"] = string_df["concatenated"].apply(_bits_to_int_array)

## Importing Best Configurations

In [34]:
import re
import pandas as pd


def parse_log_file(filename):
    """Parse a best-configuration log file into a DataFrame.

    The file is read line by line. Each line is tested, in this order, for the
    substrings ``Best Filter``, ``Best Threshold``, ``Min error`` and
    ``Confidence``; only the first substring found on a line is handled. The
    value after ``"<name>: "`` is stripped and converted:

    * ``Best Filter``    -> kept as a string
    * ``Best Threshold`` -> ``int``
    * ``Min error``      -> ``float``
    * ``Confidence``     -> ``float``

    A record is emitted every time a ``Confidence`` value is parsed, using the
    most recent values of the other three fields. Fields that were not seen
    since the previous record are ``None``.

    The file is opened explicitly as UTF-8: filter strings contain non-ASCII
    tile glyphs, so relying on the platform default encoding can fail or
    garble them on systems whose default is not UTF-8.

    Parameters
    ----------
    filename : str or path-like
        Path of the log file.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``"Best Filter"``,
        ``"Best Threshold"``, ``"Min Error"`` and ``"Confidence"``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a threshold, error or confidence value is not a valid number.
    """
    # (substring marker, extraction pattern, converter) — order matters, it
    # mirrors the precedence in which a line is classified.
    fields = (
        ("Best Filter", re.compile(r"Best Filter: (.+)"), str),
        ("Best Threshold", re.compile(r"Best Threshold: (.+)"), int),
        ("Min error", re.compile(r"Min error: (.+)"), float),
        ("Confidence", re.compile(r"Confidence: (.+)"), float),
    )
    closing_marker = "Confidence"  # parsing this field completes a record

    records = []
    current = [None] * len(fields)

    with open(filename, "r", encoding="utf-8") as file:
        # Iterate lazily instead of loading every line into memory first.
        for line in file:
            for position, (marker, pattern, convert) in enumerate(fields):
                if marker not in line:
                    continue

                match = pattern.search(line)
                if match:
                    current[position] = convert(match.group(1).strip())

                    if marker == closing_marker:
                        records.append(tuple(current))
                        # Reset so stale values never leak into the next record.
                        current = [None] * len(fields)

                # A line is attributed to the first marker it contains, even
                # when the pattern itself did not match.
                break

    return pd.DataFrame(
        records, columns=["Best Filter", "Best Threshold", "Min Error", "Confidence"]
    )

In [35]:
# Usage example:
filename = "../../reports/best_config"
best_configs_df = parse_log_file(filename)

In [36]:
# Inspect the parsed configurations before any truncation to M rows.
best_configs_df

Unnamed: 0,Best Filter,Best Threshold,Min Error,Confidence
0,0[304] 🀆🀆🀆🀆🀆🀆🀆🀆🀫🀫🀫🀫🀫🀫🀫🀫 0[1464],1,0.453672,0.185844
1,0[1752] 🀆🀆🀆🀆🀫🀫🀫🀫 0[24],1,0.459875,0.160845
2,0[328] 🀫🀫🀫🀫🀆🀆🀆🀆 0[1448],1,0.460774,0.157226
3,0[304] 🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫 0[1464],2,0.461820,0.153019
4,0[1664] 🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫🀫 0[104],1,0.459515,0.162296
...,...,...,...,...
203,0[808] 🀫🀫🀫🀫🀫🀫🀫🀫 0[968],1,0.288404,0.903146
204,0[1696] 🀫🀫🀫🀫🀆🀆🀆🀆 0[80],1,0.288404,0.903146
205,0[1424] 🀫🀫🀫🀫🀆🀆🀆🀆 0[352],1,0.288404,0.903146
206,0[72] 🀫🀫🀫🀫🀫🀫🀫🀫 0[1704],1,0.288404,0.903146


In [37]:
# M == 0 means "use every configuration": M is then set to the number of rows.
# Any other value keeps only the first M rows of the report.
if M == 0:
    M = len(best_configs_df)
else:
    best_configs_df = best_configs_df.head(M)

In [38]:
# Ratio between the length of one input vector (first row) and the number of
# retained configurations, i.e. the number of fingerprint entries per sample.
compression_rate = len(string_df["concatenated"].iat[0]) / len(best_configs_df)

print("Compression Rate:", compression_rate)

Compression Rate: 111.5


## Generate Matrix

## Filters Parser

In [39]:
def filter_parser(input_string: str) -> list:
    """Expand a compact filter description into a flat list of weights.

    The description is split on whitespace. A token starting with ``0[`` is
    read as ``0[<count>]`` and contributes ``count`` zeros. Every other token
    is read one character at a time: ``🀆`` contributes ``-1``, ``🀫``
    contributes ``1``, and any other character contributes nothing.

    Returns the weights in the order the tokens appear.
    """
    tile_weights = {"🀆": -1, "🀫": 1}
    weights = []

    for token in input_string.split():
        if token.startswith("0["):
            # Run of zeros: the count sits between "0[" and the final character.
            weights += [0] * int(token[2:-1])
            continue

        # Tile run: keep known tiles only, unknown characters are skipped.
        weights += [tile_weights[tile] for tile in token if tile in tile_weights]

    return weights

In [40]:
# import time

# time.sleep(100)

## Apply Filters

In [41]:
def apply_filter(item, filter):
    """Return the sum of the element-wise product of ``item`` and a parsed filter.

    ``item`` must provide ``astype`` (it is cast to int); ``filter`` is the
    textual filter description understood by ``filter_parser``.
    """
    values = item.astype(int)
    weights = filter_parser(filter)
    return np.multiply(values, weights).sum()

In [42]:
def apply_filter_threshold(item, filter, threshold) -> int:
    """Binarise a filter response: 1 if it is strictly above ``threshold``, else -1."""
    return 1 if apply_filter(item, filter) > threshold else -1

In [43]:
def apply_filter_threshold_pair(item_1, item_2, filter, threshold) -> int:
    """Return 1 when both items get the same thresholded filter response, else -1."""
    sign_1 = apply_filter_threshold(item_1, filter, threshold)
    sign_2 = apply_filter_threshold(item_2, filter, threshold)
    return 1 if sign_1 == sign_2 else -1

## Calculate Fingerprint

In [44]:
def hamming_distance(array1, array2, confidence):
    """Confidence-weighted Hamming distance between two equal-length sequences.

    Each position where the sequences differ contributes its confidence
    weight. The weighted sum is divided by the total weight and multiplied by
    the number of positions::

        distance = sum(confidence[i] for i where array1[i] != array2[i])
                   / sum(confidence) * len(confidence)

    Parameters
    ----------
    array1, array2 : sequence
        Sequences to compare position by position.
    confidence : sequence of numbers
        One weight per position.

    Returns
    -------
    number
        The weighted distance.

    Raises
    ------
    ValueError
        If the two sequences differ in length, or if ``confidence`` does not
        have one weight per position.
    ZeroDivisionError
        If the weights sum to zero (this includes empty input), because the
        normalisation is then undefined.
    """
    if len(array1) != len(array2):
        raise ValueError("Arrays must have the same length")

    # Without this check a short weight list fails with an IndexError and a
    # long one silently skews the normalisation.
    if len(confidence) != len(array1):
        raise ValueError("Confidence must have the same length as the arrays")

    total_weight = sum(confidence)
    if total_weight == 0:
        raise ZeroDivisionError(
            "Sum of confidence weights is zero; weighted distance is undefined"
        )

    mismatch_weight = sum(
        weight
        for left, right, weight in zip(array1, array2, confidence)
        if left != right
    )

    return (mismatch_weight / total_weight) * len(confidence)

In [45]:
def calculate_fingerprint(item, best_filters, best_thresholds, confidence):
    """Compute the +1/-1 fingerprint of one item.

    For every ``(filter, threshold)`` pair the filter is parsed with
    ``filter_parser``, multiplied element-wise with the item and summed. The
    fingerprint entry is ``1`` when that sum is strictly greater than the
    threshold and ``-1`` otherwise.

    Parameters
    ----------
    item : array-like providing ``astype``
        Input vector; it is cast to int once before the filters are applied.
    best_filters : iterable of str
        Filter descriptions understood by ``filter_parser``.
    best_thresholds : iterable of numbers
        One threshold per filter; pairs are formed with ``zip``, so extra
        entries on either side are ignored.
    confidence : any
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    list of int
        One ``1`` / ``-1`` entry per (filter, threshold) pair.
    """
    # The cast does not depend on the filter, so do it once rather than per filter.
    item_values = item.astype(int)

    fingerprint = []
    for best_filter, best_threshold in zip(best_filters, best_thresholds):
        response = np.sum(np.multiply(item_values, filter_parser(best_filter)))
        fingerprint.append(1 if response > best_threshold else -1)

    return fingerprint

In [46]:
# Accumulator: receives one fingerprint (list of 1 / -1) per row of string_df.
fingerprints = []

In [47]:
# Start from an empty list every time this cell runs. Appending to a list that
# was created in another cell doubles its length on a re-run, and the later
# column assignment then fails on the length mismatch.
fingerprints = []

# These do not change between rows, so extract them once instead of per iteration.
best_filters = best_configs_df["Best Filter"].tolist()
best_thresholds = best_configs_df["Best Threshold"].tolist()
confidence = best_configs_df["Confidence"].tolist()

# Only the "concatenated" column is needed, so iterate over it directly
# rather than materialising a Series per row with iterrows().
for item in tqdm(string_df["concatenated"], total=string_df.shape[0]):
    # One 1 / -1 entry per retained configuration.
    fingerprint = calculate_fingerprint(
        item, best_filters, best_thresholds, confidence
    )
    fingerprints.append(fingerprint)

  0%|          | 0/956 [00:00<?, ?it/s]

In [48]:
# Attach the fingerprints as a new column; the list must hold exactly one entry per row.
string_df["fprint"] = fingerprints

## Clustering

In [49]:
# Move the label index into a regular "label" column and switch to a 0..n-1 index.
# NOTE(review): this rebinds string_df, so running the cell a second time
# would add a further column holding the previous integer index.
string_df = string_df.reset_index()

In [50]:
# Inspect the frame now that it carries the label, the input vector and the fingerprint.
string_df

Unnamed: 0,label,concatenated,fprint
0,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1..."
1,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, -1,..."
2,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -..."
3,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, -1,..."
4,GooglePixel3A_L,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1..."
...,...,...,...
951,iPhoneXSMax_M,"[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...","[-1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1..."
952,iPhoneXSMax_M,"[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...","[-1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, -1,..."
953,iPhoneXSMax_M,"[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...","[-1, -1, -1, -1, -1, -1, 1, -1, -1, 1, 1, -1, ..."
954,iPhoneXSMax_M,"[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...","[-1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, ..."


In [51]:
# Fingerprint length of the first row: one entry per retained configuration.
len(string_df["fprint"].iloc[0])

16

### DBSCAN

In [52]:
import pandas as pd
from sklearn.cluster import DBSCAN
import numpy as np

# Example dataframe
# string_df = pd.DataFrame({'fprint': [[1, 1, 1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1], [1, -1, 1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1], ...]})

# Stack the per-row fingerprints into a 2-D array (rows = samples).
feature_matrix = np.array(string_df["fprint"].tolist())

# Fingerprint entries are +1 / -1, so under a Euclidean metric (assumed to be
# DBSCAN's default — confirm) two fingerprints differing in a single position
# are already 2.0 apart. With eps=0.05 only identical fingerprints are
# neighbours, and a cluster needs at least 15 of them.
dbscan = DBSCAN(eps=0.05, min_samples=15)

# Fit and read back the per-sample cluster labels in one step.
cluster_labels = dbscan.fit_predict(feature_matrix)

# Keep the assignment next to the data it describes.
string_df["cluster"] = cluster_labels

# Show the frame together with its cluster labels.
print(string_df)

               label                                       concatenated  \
0    GooglePixel3A_L  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
1    GooglePixel3A_L  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2    GooglePixel3A_L  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
3    GooglePixel3A_L  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
4    GooglePixel3A_L  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
..               ...                                                ...   
951    iPhoneXSMax_M  [0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...   
952    iPhoneXSMax_M  [0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...   
953    iPhoneXSMax_M  [0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...   
954    iPhoneXSMax_M  [0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...   
955    iPhoneXSMax_M  [0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, ...   

                                                fprint  cluster  
0    [1, 1, 1, 1, -1, 1, 1, -1, 1

In [53]:
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score

In [54]:
# Store the cluster assignment on the frame (repeats the assignment made in the
# clustering cell, so this cell can be re-run on its own once cluster_labels exists).
string_df["cluster"] = cluster_labels

# Ground truth: the device label of each row.
true_labels = string_df["label"]

# Compare the clustering against the ground truth.
# NOTE(review): the labels are passed as-is, so DBSCAN's noise label (-1, if
# present) is presumably scored as one more cluster — confirm this is intended.
homogeneity = homogeneity_score(true_labels, cluster_labels)
completeness = completeness_score(true_labels, cluster_labels)
v_measure = v_measure_score(true_labels, cluster_labels)

# Report the three scores with three decimals.
print(f"Homogeneity: {homogeneity:.3f}")
print(f"Completeness: {completeness:.3f}")
print(f"V-measure: {v_measure:.3f}")

Homogeneity: 0.702
Completeness: 0.675
V-measure: 0.688


In [59]:
# Signed difference between the number of clusters found and the number of
# distinct true labels (negative = fewer clusters than labels).
#
# DBSCAN marks noise with the label -1, which is not a cluster. Subtracting 1
# unconditionally is only right when noise is present; when every point is
# clustered it under-counts by one. Excluding -1 explicitly is right in both cases.
n_clusters = string_df.loc[string_df["cluster"] != -1, "cluster"].nunique()
n_labels = string_df["label"].nunique()

# NOTE(review): the printed name "RMSE" is kept so existing output stays
# comparable, but the value is a cluster-count difference, not a
# root-mean-square error — consider renaming.
print("RMSE:", n_clusters - n_labels)

RMSE: -2
