# Evaluate Similarity Grouping

In this notebook, we evaluate how effectively a relation can be integrated using the NoiseAwareGroupBy operator.
For this evaluation, we use the [Music Brainz 20K](https://dbs.uni-leipzig.de/research/projects/benchmark-datasets-for-entity-resolution) dataset.

The dataset contains song records from different sources that were modified using the DAPO data generator.
The goal is to group records that describe the same song into buckets. For example, the records {"title": "Daniel Balavoine - L'enfant aux yeux d'Italie", "artist": null, "album": "De vous à elle en passant par moi", ...} and {"name": "L'enfant aux yeux d'Italie - De vous à elle en passant par moi", "artist": "Daniel Balavoine", "album": null} describe the same song.

The column "CID" holds the true cluster of each record. Using the `SoftAggregateScikit` operator, we determine clusters and calculate the following metrics:
* Adjusted Rand Index (ARI)
* Normalized Mutual Information (NMI)
* Fowlkes-Mallows Index (FMI)


In [1]:
from pathlib import Path

import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, HDBSCAN
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score

from db.operators import Dummy, SoftAggregateScikit
from db.operators.Aggregate import SetAggregation
from models import ModelMgr
from models.embedding.SentenceTransformer import SentenceTransformerEmbeddingModel

In [2]:
# Model manager handed to the embedding model below.
m = ModelMgr()
# Sentence-transformer embedding model; `evaluate` passes it to SoftAggregateScikit as `em`.
stem = SentenceTransformerEmbeddingModel(m)

# NOTE(review): this global does not appear to be read in the visible cells --
# `evaluate` takes its own `drop_na` argument and the experiment grid passes
# both True and False explicitly. Confirm before relying on it.
drop_na = True

In [3]:
# Attributes that are handed to the clustering operator as grouping columns.
significant_cols = ["title", "artist", "album", "year", "language"]

# Load the benchmark file; the first CSV column becomes the index and the
# "length" column is discarded.
df_music = (
    pd.read_csv("../data/musicbrainz-20-A01.csv", index_col=0)
    .drop(columns="length")
)
df_music.head()

Unnamed: 0_level_0,CID,CTID,SourceID,id,number,title,artist,album,year,language
TID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,1,2,MBox7368722-HH,9,Daniel Balavoine - L'enfant aux yeux d'Italie,,De vous à elle en passant par moi,75.0,French
2,2512,5,4,139137-A047,7,007,[unknown],Cantigas de roda (unknown),,Por.
3,2,1,2,MBox38440522-HH,17,Action PAINTING! - Mustard Gas,,There and Back Again Lane,95.0,English
4,3,1,5,4489993,10,Your Grace,Kathy Troccoli,Comfort,2005.0,English
5,4,1,5,10339621,2,Well You Needn't,Ernie Stadler Jazz Quintet,First Down,2010.0,English


In [4]:
def evaluate(df, cluster_columns, id_column, cluster_class, cluster_params, serialization_mode, reduce_dimensions, drop_na):
    """Cluster the records of ``df`` with ``SoftAggregateScikit`` and score the result.

    Args:
        df: Records to cluster. The index is used as the record id and must be
            convertible to ``int``.
        cluster_columns: Columns forwarded to ``SoftAggregateScikit`` as the
            columns to cluster on.
        id_column: Column of ``df`` holding the ground-truth cluster label.
        cluster_class: Clustering class forwarded to the operator. For
            ``KMeans`` the number of clusters is set to the number of distinct
            ground-truth labels.
        cluster_params: Keyword arguments for ``cluster_class`` (may be ``None``).
        serialization_mode: Forwarded unchanged to the operator.
        reduce_dimensions: Forwarded unchanged to the operator.
        drop_na: If true, rows containing any missing value are removed before
            clustering.

    Returns:
        ``(key, result)`` where ``key`` is the tuple of stringified
        ``(cluster_class, serialization_mode, reduce_dimensions, drop_na)`` and
        ``result`` maps metric name to score (ARI, NMI, FMI).

    Raises:
        ValueError: If the operator output does not assign every record of
            ``df`` to exactly one cluster.
    """
    key = (str(cluster_class), str(serialization_mode), str(reduce_dimensions), str(drop_na))

    if drop_na:
        df = df.dropna()

    if cluster_class == KMeans:
        # KMeans needs the cluster count up front. Take it from the ground-truth
        # column named by `id_column` (not a hard-coded column name) and keep
        # any other parameters the caller supplied.
        cluster_params = {
            **(cluster_params or {}),
            "n_clusters": df[id_column].nunique(dropna=False),
        }

    columns = [col.strip() for col in df.columns]
    # Every value is stringified; the first element of each tuple is the index ("tid").
    data = [[str(value) for value in record] for record in df.itertuples(name=None)]

    d = Dummy("data", ["tid"] + columns, data).open()
    agg = SoftAggregateScikit(
        d,
        cluster_columns,
        [SetAggregation("tid", "ids")],
        em=stem,
        cluster_class=cluster_class,
        cluster_params=cluster_params,
        serialization_mode=serialization_mode,
        reduce_dimensions=reduce_dimensions,
    )

    # Map each record id to the ordinal of the output row (cluster) it appears in.
    cluster_of = {}
    for cluster_no, row in enumerate(agg.open()):
        for tid in row["ids"]:
            tid = int(tid)
            if tid in cluster_of:
                raise ValueError(f"record {tid} was assigned to more than one cluster")
            cluster_of[tid] = cluster_no

    record_ids = [int(tid) for tid in df.index]
    missing = set(record_ids) - cluster_of.keys()
    unexpected = cluster_of.keys() - set(record_ids)
    if missing or unexpected:
        raise ValueError(
            f"cluster output does not match input records: "
            f"{len(missing)} missing, {len(unexpected)} unexpected"
        )

    # Look predictions up by record id in the frame's own row order, so true and
    # predicted labels stay paired even when the index is not sorted.
    true_labels = df[id_column]
    predicted_labels = pd.Series([cluster_of[tid] for tid in record_ids], index=df.index)

    ari = adjusted_rand_score(true_labels, predicted_labels)
    nmi = normalized_mutual_info_score(true_labels, predicted_labels)
    fmi = fowlkes_mallows_score(true_labels, predicted_labels)

    result = {"adjusted_rand_score": ari, "normalized_mutual_info_score": nmi, "fowlkes_mallows_score": fmi}

    print(key, result)

    return key, result

# Collects the outcome of every run: maps the key returned by `evaluate`
# to the metrics dict returned alongside it.
overall_results = {}

In [5]:
# Clustering algorithms under test, paired with their constructor parameters.
# KMeans carries no parameters here because `evaluate` sets `n_clusters` itself.
cluster_classes = [
    (KMeans, None),
    (DBSCAN, {"eps": 0.1, "min_samples": 1}),
    (HDBSCAN, {"min_cluster_size": 2}),
]

# Full grid: algorithm x serialization mode x NA handling x target dimensionality
# (None = no dimensionality reduction requested).
for algorithm, algorithm_params in cluster_classes:
    for mode in ["FIELD_SERIALIZED", "FULL_SERIALIZED"]:
        for remove_na in [True, False]:
            for n_dims in [2, 10, 50, 100, None]:
                run_key, run_metrics = evaluate(
                    df_music,
                    significant_cols,
                    "CID",
                    cluster_class=algorithm,
                    cluster_params=algorithm_params,
                    serialization_mode=mode,
                    reduce_dimensions=n_dims,
                    drop_na=remove_na,
                )
                overall_results[run_key] = run_metrics



("<class 'sklearn.cluster._kmeans.KMeans'>", 'FIELD_SERIALIZED', '2', 'True') {'adjusted_rand_score': -2.4531393179505998e-05, 'normalized_mutual_info_score': 0.9940243916923467, 'fowlkes_mallows_score': 0.0}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FIELD_SERIALIZED', '10', 'True') {'adjusted_rand_score': 0.023228762452172664, 'normalized_mutual_info_score': 0.9938931018904221, 'fowlkes_mallows_score': 0.023911404992940522}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FIELD_SERIALIZED', '50', 'True') {'adjusted_rand_score': 0.03289605635723605, 'normalized_mutual_info_score': 0.9940368569902648, 'fowlkes_mallows_score': 0.0335012605086404}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FIELD_SERIALIZED', '100', 'True') {'adjusted_rand_score': 0.030507245985220293, 'normalized_mutual_info_score': 0.9939579099007047, 'fowlkes_mallows_score': 0.03148825485739376}
("<class 'sklearn.cluster._kmeans.KMeans'>", 'FIELD_SERIALIZED', 'None', 'True') {'adjusted_rand_score': 0.7389102451369347, 'normalized_mutual_info_score': 0.9985753857782258, 'fowlkes_mallows_score': 0.7391404949863534}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FIELD_SERIALIZED', '2', 'False') {'adjusted_rand_score': 0.001035308634552346, 'normalized_mutual_info_score': 0.9064378656949035, 'fowlkes_mallows_score': 0.0011328376915326858}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FIELD_SERIALIZED', '10', 'False') {'adjusted_rand_score': 0.002914061088592513, 'normalized_mutual_info_score': 0.8966928417285542, 'fowlkes_mallows_score': 0.003273256646500345}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FIELD_SERIALIZED', '50', 'False') {'adjusted_rand_score': 0.0032003251951670127, 'normalized_mutual_info_score': 0.8978554257390012, 'fowlkes_mallows_score': 0.0035186516874743325}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FIELD_SERIALIZED', '100', 'False') {'adjusted_rand_score': 0.0037893864915525518, 'normalized_mutual_info_score': 0.8986826736787318, 'fowlkes_mallows_score': 0.00410564982286738}
("<class 'sklearn.cluster._kmeans.KMeans'>", 'FIELD_SERIALIZED', 'None', 'False') {'adjusted_rand_score': 0.1422541583252007, 'normalized_mutual_info_score': 0.9222967737707368, 'fowlkes_mallows_score': 0.147723088243923}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FULL_SERIALIZED', '2', 'True') {'adjusted_rand_score': -2.756085893098569e-05, 'normalized_mutual_info_score': 0.9938221109864703, 'fowlkes_mallows_score': 0.0}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FULL_SERIALIZED', '10', 'True') {'adjusted_rand_score': 0.01473204577127165, 'normalized_mutual_info_score': 0.9937275163019804, 'fowlkes_mallows_score': 0.01532668299572343}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FULL_SERIALIZED', '50', 'True') {'adjusted_rand_score': 0.07140159592453649, 'normalized_mutual_info_score': 0.994145555151454, 'fowlkes_mallows_score': 0.07470387248392132}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FULL_SERIALIZED', '100', 'True') {'adjusted_rand_score': 0.09051075938276242, 'normalized_mutual_info_score': 0.9944549190291403, 'fowlkes_mallows_score': 0.09212846639876111}
("<class 'sklearn.cluster._kmeans.KMeans'>", 'FULL_SERIALIZED', 'None', 'True') {'adjusted_rand_score': 0.7547111056763247, 'normalized_mutual_info_score': 0.9987500218345492, 'fowlkes_mallows_score': 0.7563680370631994}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FULL_SERIALIZED', '2', 'False') {'adjusted_rand_score': 0.15390209649627837, 'normalized_mutual_info_score': 0.9247439743197702, 'fowlkes_mallows_score': 0.15409167339972343}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FULL_SERIALIZED', '10', 'False') {'adjusted_rand_score': 0.42462016537083364, 'normalized_mutual_info_score': 0.9537479482951512, 'fowlkes_mallows_score': 0.43871081354174374}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FULL_SERIALIZED', '50', 'False') {'adjusted_rand_score': 0.44183974863769193, 'normalized_mutual_info_score': 0.9546620529058564, 'fowlkes_mallows_score': 0.45328921479317685}




("<class 'sklearn.cluster._kmeans.KMeans'>", 'FULL_SERIALIZED', '100', 'False') {'adjusted_rand_score': 0.43928056165051144, 'normalized_mutual_info_score': 0.9543528482265697, 'fowlkes_mallows_score': 0.45053489765287114}
("<class 'sklearn.cluster._kmeans.KMeans'>", 'FULL_SERIALIZED', 'None', 'False') {'adjusted_rand_score': 0.8177208995930404, 'normalized_mutual_info_score': 0.9884735437395927, 'fowlkes_mallows_score': 0.8187102514807594}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FIELD_SERIALIZED', '2', 'True') {'adjusted_rand_score': 0.00016076962804905103, 'normalized_mutual_info_score': 0.6565192710050302, 'fowlkes_mallows_score': 0.004889054115215978}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FIELD_SERIALIZED', '10', 'True') {'adjusted_rand_score': 0.004276257870122506, 'normalized_mutual_info_score': 0.9517533931491164, 'fowlkes_mallows_score': 0.017612947414016113}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FIELD_SERIALIZED', '50', 'True') {'adjusted_rand_score': 0.005474834652593021, 'normalized_mutual_info_score': 0.9612817606270778, 'fowlkes_mallows_score': 0.018499891602515212}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FIELD_SERIALIZED', '100', 'True') {'adjusted_rand_score': 0.0051461036281029516, 'normalized_mutual_info_score': 0.9563002389065188, 'fowlkes_mallows_score': 0.019334656026882564}
("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FIELD_SERIALIZED', 'None', 'True') {'adjusted_rand_score': 0.0, 'normalized_mutual_info_score': 0.9971193644382403, 'fowlkes_mallows_score': 0.0}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FIELD_SERIALIZED', '2', 'False') {'adjusted_rand_score': -6.45726999082863e-05, 'normalized_mutual_info_score': 0.4244311045956481, 'fowlkes_mallows_score': 0.0014990368777767363}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FIELD_SERIALIZED', '10', 'False') {'adjusted_rand_score': 0.00016410838783171552, 'normalized_mutual_info_score': 0.7439814273074392, 'fowlkes_mallows_score': 0.0014703391020101891}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FIELD_SERIALIZED', '50', 'False') {'adjusted_rand_score': 0.00028961640385466496, 'normalized_mutual_info_score': 0.7703414335934355, 'fowlkes_mallows_score': 0.0017673344373554025}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FIELD_SERIALIZED', '100', 'False') {'adjusted_rand_score': 0.0002761793623244787, 'normalized_mutual_info_score': 0.767771493379952, 'fowlkes_mallows_score': 0.0017290964786796073}
("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FIELD_SERIALIZED', 'None', 'False') {'adjusted_rand_score': 0.0008610930005805606, 'normalized_mutual_info_score': 0.9561426619562456, 'fowlkes_mallows_score': 0.020754980866510835}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FULL_SERIALIZED', '2', 'True') {'adjusted_rand_score': 0.00027300607603154365, 'normalized_mutual_info_score': 0.6224328510070495, 'fowlkes_mallows_score': 0.011405482293920735}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FULL_SERIALIZED', '10', 'True') {'adjusted_rand_score': 0.013676659324470817, 'normalized_mutual_info_score': 0.9484393888377555, 'fowlkes_mallows_score': 0.06267511942419624}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FULL_SERIALIZED', '50', 'True') {'adjusted_rand_score': 0.015642725669675846, 'normalized_mutual_info_score': 0.9541473379336133, 'fowlkes_mallows_score': 0.06343237021855025}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FULL_SERIALIZED', '100', 'True') {'adjusted_rand_score': 0.017306686289606972, 'normalized_mutual_info_score': 0.9558997177783989, 'fowlkes_mallows_score': 0.06997098713829958}
("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FULL_SERIALIZED', 'None', 'True') {'adjusted_rand_score': 0.2782563569718429, 'normalized_mutual_info_score': 0.9975867372378017, 'fowlkes_mallows_score': 0.40201512610368484}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FULL_SERIALIZED', '2', 'False') {'adjusted_rand_score': 0.0002945619284069862, 'normalized_mutual_info_score': 0.4071903346553901, 'fowlkes_mallows_score': 0.014861438618492843}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FULL_SERIALIZED', '10', 'False') {'adjusted_rand_score': 0.046245241572788094, 'normalized_mutual_info_score': 0.913793562283398, 'fowlkes_mallows_score': 0.12856499929559168}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FULL_SERIALIZED', '50', 'False') {'adjusted_rand_score': 0.052829305476311025, 'normalized_mutual_info_score': 0.9179391124387679, 'fowlkes_mallows_score': 0.13659432353925358}




("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FULL_SERIALIZED', '100', 'False') {'adjusted_rand_score': 0.05228121163797588, 'normalized_mutual_info_score': 0.9191013873206045, 'fowlkes_mallows_score': 0.13551327273825686}
("<class 'sklearn.cluster._dbscan.DBSCAN'>", 'FULL_SERIALIZED', 'None', 'False') {'adjusted_rand_score': 0.004664718769860029, 'normalized_mutual_info_score': 0.9562429617492978, 'fowlkes_mallows_score': 0.04655485577920836}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FIELD_SERIALIZED', '2', 'True') {'adjusted_rand_score': 0.00692762164443134, 'normalized_mutual_info_score': 0.9328748459562475, 'fowlkes_mallows_score': 0.0199952744026458}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FIELD_SERIALIZED', '10', 'True') {'adjusted_rand_score': 0.007287510651795684, 'normalized_mutual_info_score': 0.9271057026580594, 'fowlkes_mallows_score': 0.032976099381732266}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FIELD_SERIALIZED', '50', 'True') {'adjusted_rand_score': 0.005653163165870449, 'normalized_mutual_info_score': 0.9238929414622207, 'fowlkes_mallows_score': 0.025884804715589047}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FIELD_SERIALIZED', '100', 'True') {'adjusted_rand_score': 0.0073937529958919, 'normalized_mutual_info_score': 0.9276641746361443, 'fowlkes_mallows_score': 0.02833566903406462}
("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FIELD_SERIALIZED', 'None', 'True') {'adjusted_rand_score': 0.05192791393826211, 'normalized_mutual_info_score': 0.9683860835673215, 'fowlkes_mallows_score': 0.16008995236541204}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FIELD_SERIALIZED', '2', 'False') {'adjusted_rand_score': 0.0016708004432567252, 'normalized_mutual_info_score': 0.9001678242884283, 'fowlkes_mallows_score': 0.001775617797510228}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FIELD_SERIALIZED', '10', 'False') {'adjusted_rand_score': 0.004038618578119086, 'normalized_mutual_info_score': 0.8998895397336668, 'fowlkes_mallows_score': 0.004626072513792373}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FIELD_SERIALIZED', '50', 'False') {'adjusted_rand_score': 0.0006722905719256655, 'normalized_mutual_info_score': 0.8826317569394206, 'fowlkes_mallows_score': 0.0018297411153209856}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FIELD_SERIALIZED', '100', 'False') {'adjusted_rand_score': 0.0030033114562003967, 'normalized_mutual_info_score': 0.8957471448556394, 'fowlkes_mallows_score': 0.003800845448259246}
("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FIELD_SERIALIZED', 'None', 'False') {'adjusted_rand_score': 0.265633749556373, 'normalized_mutual_info_score': 0.9507731980003694, 'fowlkes_mallows_score': 0.2709739055803423}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', '2', 'True') {'adjusted_rand_score': 0.025843536625348235, 'normalized_mutual_info_score': 0.9329096294904448, 'fowlkes_mallows_score': 0.07525794130770143}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', '10', 'True') {'adjusted_rand_score': 0.029034524405226723, 'normalized_mutual_info_score': 0.9298896337353193, 'fowlkes_mallows_score': 0.10802806025746284}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', '50', 'True') {'adjusted_rand_score': 0.02512361554600598, 'normalized_mutual_info_score': 0.9278351031823528, 'fowlkes_mallows_score': 0.10349135227034695}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', '100', 'True') {'adjusted_rand_score': 0.024956877947641107, 'normalized_mutual_info_score': 0.9287032831342533, 'fowlkes_mallows_score': 0.10003626231835393}
("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', 'None', 'True') {'adjusted_rand_score': 0.020980038415771833, 'normalized_mutual_info_score': 0.9555327392749134, 'fowlkes_mallows_score': 0.10307015791448253}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', '2', 'False') {'adjusted_rand_score': 0.1907136598045463, 'normalized_mutual_info_score': 0.9188190400657554, 'fowlkes_mallows_score': 0.19286428952648177}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', '10', 'False') {'adjusted_rand_score': 0.5204453748525378, 'normalized_mutual_info_score': 0.956562231167877, 'fowlkes_mallows_score': 0.5305845184758278}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', '50', 'False') {'adjusted_rand_score': 0.5181536880148707, 'normalized_mutual_info_score': 0.9566549887859075, 'fowlkes_mallows_score': 0.528531015577852}




("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', '100', 'False') {'adjusted_rand_score': 0.527119186440687, 'normalized_mutual_info_score': 0.9572422943253446, 'fowlkes_mallows_score': 0.5362443961617454}
("<class 'sklearn.cluster._hdbscan.hdbscan.HDBSCAN'>", 'FULL_SERIALIZED', 'None', 'False') {'adjusted_rand_score': 0.871507546722749, 'normalized_mutual_info_score': 0.9923116420069015, 'fowlkes_mallows_score': 0.8731580103378349}


In [29]:
# Names for the four components of each result key, in key order.
keys = ["cluster", "serialization", "dimension", "drop_na"]

# One flat record per run: the metric values followed by the named key parts.
evaluation_results_list = [
    {**metrics, **dict(zip(keys, config))}
    for config, metrics in overall_results.items()
]
df_evaluation_results = pd.DataFrame.from_records(evaluation_results_list)

# Reduce the class repr (e.g. "<class 'pkg.mod.KMeans'>") to the bare class name.
df_evaluation_results["cluster"] = (
    df_evaluation_results["cluster"]
    .str.rsplit(".", n=1)
    .str[-1]
    .str.replace("'>", "", regex=False)
)
df_evaluation_results = df_evaluation_results.set_index(keys)
df_evaluation_results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,adjusted_rand_score,normalized_mutual_info_score,fowlkes_mallows_score
cluster,serialization,dimension,drop_na,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KMeans,FIELD_SERIALIZED,2.0,True,-2.5e-05,0.994024,0.0
KMeans,FIELD_SERIALIZED,10.0,True,0.023229,0.993893,0.023911
KMeans,FIELD_SERIALIZED,50.0,True,0.032896,0.994037,0.033501
KMeans,FIELD_SERIALIZED,100.0,True,0.030507,0.993958,0.031488
KMeans,FIELD_SERIALIZED,,True,0.73891,0.998575,0.73914


In [32]:
# Persist the evaluation table. The target directory is created first so the
# cell also succeeds when "results/" does not exist yet.
results_path = Path("results/EvaluateClustering.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
df_evaluation_results.to_csv(results_path)