# Evaluate Similarity Grouping

In this notebook, we evaluate how effective a relation can be integrated using the NoiseAwareGroupBy Operator.
Therefore, we utilize the [Music Brainz 20K](https://dbs.uni-leipzig.de/research/projects/benchmark-datasets-for-entity-resolution).

The dataset contains modified (usign the DAPO data generator) song records from different sources.
The goal is to group same songs into buckets. E.g. The records {"title": "Daniel Balavoine - L'enfant aux yeux d'Italie", "artist": null, "album": "De vous à elle en passant par moi", ...} and {"name": L'enfant aux yeux d'Italie - De vous à elle en passant par moi", "artist": "Daniel Balavoine", "album": null} describe the same song.

The column "CID" describes the cluster of the record. Using the  `SoftAggregateScikit` operator, we determine clusters and calculate the metrics:
* Adjusted Rand Index (ARI)
* Normalized Mutual Information (NMI)
* Fowlkes-Mallows Index (FMI)


In [1]:
%%capture
!pip3 install faiss-gpu-cu12
!pip3 install pgvector

In [2]:
%%capture
!rm -rf SofteningQueryEvaluation
!git clone https://github.com/HackerBschor/SofteningQueryEvaluation
%cd SofteningQueryEvaluation

In [3]:
from huggingface_hub import notebook_login
notebook_login()

In [4]:
import pandas as pd
import kagglehub
import time

from models import ModelMgr
from models.embedding.SentenceTransformer import SentenceTransformerEmbeddingModel

from db.operators import Dummy, SoftAggregateScikit
from db.operators.Aggregate import SetAggregation
from sklearn.cluster import KMeans, DBSCAN, HDBSCAN

from sklearn.metrics import rand_score, adjusted_rand_score
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics import mutual_info_score, adjusted_mutual_info_score, normalized_mutual_info_score
from sklearn.metrics import homogeneity_score, completeness_score
from sklearn.metrics import v_measure_score, homogeneity_completeness_v_measure

from evaluation.util import calc_bleu

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nico\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Nico\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
m = ModelMgr()
stem = SentenceTransformerEmbeddingModel(m)

In [6]:
subset_categories = None

In [7]:
path = kagglehub.dataset_download("lakritidis/product-classification-and-categorization")

df_products = pd.read_csv(f"{path}/pricerunner_aggregate.csv", header=None, index_col=0)
df_products.drop(columns=[2, 3, 5], inplace=True)
df_products.columns = ["product", "category_1", "category_2"]

if subset_categories is not None:
    result = []
    x = df_products["category_1"].value_counts()
    for x in pd.Series(x[x >= 10].index).sample(n=subset_categories, random_state=42):
        result.append(df_products[df_products["category_1"] == x].sample(10))
    df_products = pd.concat(result)


print(len(df_products))
df_products.head(2)

35311


Unnamed: 0_level_0,product,category_1,category_2
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,apple iphone 8 plus 64gb silver,Apple iPhone 8 Plus 64GB,Mobile Phones
2,apple iphone 8 plus 64 gb spacegrau,Apple iPhone 8 Plus 64GB,Mobile Phones


In [12]:
def evaluate(df, eps, cluster_class, cluster_params, serialization_mode, reduce_dimensions, target):
    key = (target, str(eps), str(cluster_class), str(serialization_mode), str(reduce_dimensions))

    if cluster_class == KMeans:
        cluster_params = {"n_clusters": len(df[target].unique())}

    d = Dummy("data", ["id", "product"], [(x[0], x[1]) for x in df.itertuples(name=None)])
    agg = SoftAggregateScikit(
        d,
        ["product"],
        [SetAggregation("id", "ids"), SetAggregation("product", "products")],
        em=stem,
        cluster_class = cluster_class,
        cluster_params = cluster_params,
        serialization_mode = serialization_mode,
        reduce_dimensions = reduce_dimensions
    )

    tic = time.time()
    result = agg.open().fetch_all()
    toc = time.time()

    cluster_map = {x: i for i, x in enumerate(df[target].unique())}
    true_labels = [cluster_map[x] for x in df.sort_index()[target]]

    pred_ids, pred_text = [], []

    for i, row in enumerate(result):
        pred_ids.append(pd.Series([i for _ in range(len(row["ids"]))], index=[int(idx) for idx in row["ids"]]))
        pred_text.append(row["products"])

    pred_labels = list(pd.concat(pred_ids).sort_index())

    #gt_blue = set([x[0] for x in df_products.groupby("category_1")["product"].agg(function=lambda x: ", ".join(sorted(x))).values])
    #pred_blue = set([", ".join(sorted(x)) for x in pred_text])

    scores = {
        "rand_score": rand_score(true_labels, pred_labels),
        "adjusted_rand_score": adjusted_rand_score(true_labels, pred_labels),
        "fowlkes_mallows_score": fowlkes_mallows_score(true_labels, pred_labels),
        "mutual_info_score": mutual_info_score(true_labels, pred_labels),
        "adjusted_mutual_info_score": adjusted_mutual_info_score(true_labels, pred_labels),
        "normalized_mutual_info_score": normalized_mutual_info_score(true_labels, pred_labels),
        "homogeneity_score": homogeneity_score(true_labels, pred_labels),
        "completeness_score": completeness_score(true_labels, pred_labels),
        "v_measure_score": v_measure_score(true_labels, pred_labels),
        "homogeneity_completeness_v_measure": homogeneity_completeness_v_measure(true_labels, pred_labels),
        "runtime": toc-tic,
        "pred": pred_text,
    }

    # scores_bleu = calc_bleu(gt_blue, pred_blue)

    return key, scores #| scores_bleu

In [9]:
overall_results = {}

In [14]:
for target in ["category_1", "category_2"]:
    for eps in [.01, .02, .03, .04, .05, .06, .06, .07, .08, .09, .1, .2, .3, .4, .5, .6, .7, .8, .9]:
        cc = DBSCAN
        cp = {"eps": eps, "min_samples": 1, "metric": "cosine"}
        res = evaluate(df_products, eps, cluster_class=cc, cluster_params=cp, serialization_mode = "FULL_SERIALIZED", reduce_dimensions = None, target = target)
        overall_results[res[0]] = res[1]
        print(res[0], res[1]["adjusted_rand_score"])
        if res[1]["adjusted_rand_score"] == 0:
            break

('category_1', "<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FULL_SERIALIZED', 'None') 0.06511614036072338



KeyboardInterrupt



In [51]:
for target in ["category_1", "category_2"]:
    for eps in [.01, .02, .03, .04, .05, .06, .06, .07, .08, .09, .1, .2, .3, .4, .5, .6, .7, .8, .9]:
        cc = HDBSCAN
        cp = {"cluster_selection_epsilon": eps, "min_cluster_size": 2, "metric": "cosine"}
        res = evaluate(df_products, eps, cluster_class=cc, cluster_params=cp, serialization_mode = "FULL_SERIALIZED", reduce_dimensions = None, target = target)
        overall_results[res[0]] = res[1]
        print(res[0], res[1]["adjusted_rand_score"])

('category_1', "<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', 'None') 0.5071360019813628
('category_1', "<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', 'None') 0.4468573127438554
('category_1', "<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', 'None') 0.05595587956091534
('category_1', "<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', 'None') 0.05595587956091534
('category_1', "<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', 'None') 0.05595587956091534
('category_1', "<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', 'None') 0.05595587956091534
('category_1', "<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', 'None') 0.05595587956091534
('category_1', "<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', 'None') 0.05595587956091534
('category_1', "<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL

KeyboardInterrupt: 

In [72]:
keys = ["cluster", "serialization", "dimension", "drop_na"]
evaluation_results_list = [v | {ki: vi for ki, vi in zip(keys, k)} for k, v in overall_results.items()]
df_evaluation_results = pd.DataFrame.from_records(evaluation_results_list)
df_evaluation_results["cluster"] = df_evaluation_results["cluster"].apply(lambda x: x.split(".")[-1].replace("'>", ""))
df_evaluation_results = df_evaluation_results.set_index(keys)
df_evaluation_results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,rand_score,adjusted_rand_score,fowlkes_mallows_score,mutual_info_score,adjusted_mutual_info_score,normalized_mutual_info_score,homogeneity_score,completeness_score,v_measure_score,homogeneity_completeness_v_measure,runtime,pred
cluster,serialization,dimension,drop_na,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
KMeans,FIELD_SERIALIZED,,True,0.999987,0.722766,0.722914,7.937553,0.736875,0.998469,0.998429,0.998509,0.998469,"(0.9984286289107432, 0.9985094085043371, 0.998...",110.525394,4 0 5 1 9 2 10 ...
KMeans,FIELD_SERIALIZED,2.0,True,0.999951,-2.5e-05,0.0,7.901016,-2.6e-05,0.994026,0.993833,0.99422,0.994026,"(0.9938327384406908, 0.9942198343219951, 0.994...",53.860461,4 0 5 1 9 2 10 ...
KMeans,FIELD_SERIALIZED,10.0,True,0.999947,0.008485,0.008618,7.899162,0.00957,0.993938,0.9936,0.994277,0.993938,"(0.9935995138490926, 0.9942769177684889, 0.993...",54.437845,4 0 5 1 9 2 10 ...
KMeans,FIELD_SERIALIZED,50.0,True,0.999949,0.017519,0.017698,7.900311,0.019306,0.99404,0.993744,0.994336,0.99404,"(0.9937440661637883, 0.9943361763255057, 0.994...",56.846164,4 0 5 1 9 2 10 ...
KMeans,FIELD_SERIALIZED,100.0,True,0.999944,0.007973,0.008179,7.897666,0.009422,0.993843,0.993411,0.994276,0.993843,"(0.9934113839666784, 0.9942758401487835, 0.993...",60.259333,4 0 5 1 9 2 10 ...


In [81]:
df_evaluation_results.to_pickle("EvaluateClustering.pkl")
df_evaluation_results.drop(columns=["pred"]).to_csv("EvaluateClustering.csv")