# Evaluate Similarity Grouping

In this notebook, we evaluate how effectively a relation can be integrated using the NoiseAwareGroupBy operator.
To do so, we use the [Music Brainz 20K](https://dbs.uni-leipzig.de/research/projects/benchmark-datasets-for-entity-resolution) dataset.

The dataset contains song records from different sources that were modified using the DAPO data generator.
The goal is to group records of the same song into buckets. E.g., the records {"title": "Daniel Balavoine - L'enfant aux yeux d'Italie", "artist": null, "album": "De vous à elle en passant par moi", ...} and {"name": "L'enfant aux yeux d'Italie - De vous à elle en passant par moi", "artist": "Daniel Balavoine", "album": null} describe the same song.

The column "CID" contains the ground-truth cluster of each record. Using the `SoftAggregateScikit` operator, we determine clusters and calculate the following metrics:
* Adjusted Rand Index (ARI)
* Normalized Mutual Information (NMI)
* Fowlkes-Mallows Index (FMI)


In [1]:
import pandas as pd
from models import ModelMgr
from models.embedding.SentenceTransformer import SentenceTransformerEmbeddingModel
from models.semantic_validation import LLaMAValidationModel

from db.operators import Dummy, SoftAggregateScikit
from db.operators.Aggregate import SetAggregation, StringAggregation
from sklearn.cluster import KMeans, DBSCAN, HDBSCAN
import tqdm

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score

In [2]:
# Shared model manager; both model wrappers below are constructed with it.
m = ModelMgr()
# Embedding model that evaluate() passes to SoftAggregateScikit as `em`.
# The trailing comment records an optional model_path override that is currently not passed.
stem = SentenceTransformerEmbeddingModel(m) #model_path="sentence-transformers/all-MiniLM-L6-v2"
# NOTE(review): `sv` is not referenced by any other cell visible here — confirm it is
# still needed before paying the cost of constructing it.
sv = LLaMAValidationModel(m)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Load the benchmark CSV; index_col=0 makes the first CSV column the DataFrame index
# (displayed as TID in the preview below). evaluate() uses that index as the record id.
df_music = pd.read_csv("../data/musicbrainz-20-A01.csv", index_col=0)
# Preview the first rows.
df_music.head()

Unnamed: 0_level_0,CID,CTID,SourceID,id,number,title,length,artist,album,year,language
TID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,1,2,MBox7368722-HH,9,Daniel Balavoine - L'enfant aux yeux d'Italie,219,,De vous à elle en passant par moi,75.0,French
2,2512,5,4,139137-A047,7,007,1m 58sec,[unknown],Cantigas de roda (unknown),,Por.
3,2,1,2,MBox38440522-HH,17,Action PAINTING! - Mustard Gas,129,,There and Back Again Lane,95.0,English
4,3,1,5,4489993,10,Your Grace,unk.,Kathy Troccoli,Comfort,2005.0,English
5,4,1,5,10339621,2,Well You Needn't,321266,Ernie Stadler Jazz Quintet,First Down,2010.0,English


In [5]:
def evaluate(df, cluster_columns, id_column, cluster_class, cluster_params, serialization_mode, reduce_dimensions):
    """Cluster the records of ``df`` and score the clusters against the ground truth.

    Every row of ``df`` (including its index, exposed as column ``tid``) is
    converted to strings and wrapped in a ``Dummy`` source. ``SoftAggregateScikit``
    groups that source on ``cluster_columns`` and collects the ``tid`` values of
    each group under ``ids``. The position of a group in the operator's output
    is used as the predicted cluster label of all records in that group.

    The Adjusted Rand Index, Normalized Mutual Information and Fowlkes-Mallows
    Index between the predicted labels and ``df[id_column]`` are printed.

    Args:
        df: DataFrame holding the records; its index values must survive a
            ``str`` -> ``int`` round trip because they serve as record ids.
        cluster_columns: Column names passed to the operator as grouping columns.
        id_column: Name of the column in ``df`` holding the true cluster label.
        cluster_class: Clustering class forwarded to the operator
            (e.g. ``HDBSCAN``, ``DBSCAN``, ``KMeans``).
        cluster_params: Dict of parameters forwarded to the operator for
            ``cluster_class``.
        serialization_mode: Forwarded unchanged to the operator.
        reduce_dimensions: Forwarded unchanged to the operator.

    Returns:
        pd.Series mapping each record id (int) to its predicted cluster number,
        sorted by record id.

    Raises:
        ValueError: If the operator yields no groups, or if the groups do not
            assign every record of ``df`` to exactly one cluster.
    """
    columns = [col.strip() for col in df.columns]
    # itertuples(name=None) yields (index, *values); the index becomes the "tid" column.
    data = [[str(value) for value in record] for record in df.itertuples(name=None)]

    d = Dummy("data", ["tid"] + columns, data).open()
    agg = SoftAggregateScikit(
        d,
        cluster_columns,
        [SetAggregation("tid", "ids")],
        em=stem,
        cluster_class=cluster_class,
        cluster_params=cluster_params,
        serialization_mode=serialization_mode,
        reduce_dimensions=reduce_dimensions,
    )

    predictions = []
    for cluster_no, row in enumerate(agg.open()):
        member_ids = [int(idx) for idx in row["ids"]]
        predictions.append(pd.Series([cluster_no] * len(member_ids), index=member_ids))

    if not predictions:
        raise ValueError("The aggregation produced no groups; there is nothing to evaluate.")

    predicted_labels = pd.concat(predictions).sort_index()

    # Key the true labels by the same str -> int round trip the record ids went
    # through, so both label vectors can be aligned by id instead of by position.
    true_by_id = pd.Series(
        df[id_column].to_numpy(),
        index=[int(str(idx)) for idx in df.index],
    )

    # Positional comparison is only meaningful if every record received exactly
    # one predicted label; otherwise the metrics would be silently wrong.
    if (
        len(predicted_labels) != len(df)
        or not predicted_labels.index.is_unique
        or not predicted_labels.index.isin(true_by_id.index).all()
    ):
        raise ValueError(
            "Predicted clusters do not cover every record exactly once: "
            f"{len(predicted_labels)} assignments for {len(df)} records."
        )

    # Reorder the ground truth to the (sorted) order of the predictions.
    true_labels = true_by_id.loc[predicted_labels.index]

    ari = adjusted_rand_score(true_labels, predicted_labels)
    nmi = normalized_mutual_info_score(true_labels, predicted_labels)
    fmi = fowlkes_mallows_score(true_labels, predicted_labels)

    print(f"Adjusted Rand Index (ARI): {ari}")
    print(f"Normalized Mutual Information (NMI): {nmi}")
    print(f"Fowlkes-Mallows Index (FMI): {fmi}")

    return predicted_labels



# Columns whose content decides which records are grouped together.
cols = ["title", "artist", "album", "year", "language"]

# Active configuration: HDBSCAN on the full serialization, no dimensionality reduction.
pred = evaluate(
    df=df_music,
    cluster_columns=cols,
    id_column="CID",
    cluster_class=HDBSCAN,
    cluster_params=dict(min_cluster_size=2),
    serialization_mode="FULL_SERIALIZED",
    reduce_dimensions=None,
)

# Alternative configurations, kept disabled (updated to the current evaluate() signature):
# pred = evaluate(df_music, cols, "CID", cluster_class=DBSCAN, cluster_params={"eps": 0.1, "min_samples": 1}, serialization_mode="FULL_SERIALIZED", reduce_dimensions=100)
# pred = evaluate(df_music, cols, "CID", cluster_class=KMeans, cluster_params={"n_clusters": df_music["CID"].nunique()}, serialization_mode="FULL_SERIALIZED", reduce_dimensions=100)

Adjusted Rand Index (ARI): 0.871507546722749
Normalized Mutual Information (NMI): 0.9923116420069015
Fowlkes-Mallows Index (FMI): 0.8731580103378349
