# Evaluate Semantic Filtering


In [1]:
import time
from pathlib import Path

import kagglehub
import numpy as np
import pandas as pd
from tqdm import tqdm

from models import ModelMgr
from models.embedding.SentenceTransformer import SentenceTransformerEmbeddingModel
from models.semantic_validation import LLaMAValidationModel

from db.operators import Dummy, Select
from db.criteria import SoftEqual
from db.structure import Column, Constant

from util import calculate_metrics

[nltk_data] Downloading package punkt to /home/nico/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/nico/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# One ModelMgr instance is shared by both models constructed below.
m = ModelMgr()
# Embedding model; later handed to SoftEqual through its `em` argument.
stem = SentenceTransformerEmbeddingModel(m)
# Validation model; later handed to SoftEqual through its `sv` argument.
lsv = LLaMAValidationModel(m)
# Alternative validation model, currently disabled.
# NOTE(review): DeepSeekValidationModel is not imported in this notebook — add the import before enabling.
# lsv = DeepSeekValidationModel(m)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Earlier prompt pair, kept for reference only (it used named {a}/{b} placeholders):
#system_prompt = "You are an expert in checking semantic equivalence. Respond with \"yes\" and \"no\" only!"
#prompt_template = "Is \"{a}\" semantically equal to \"{b}\"?"

# Active prompt pair, forwarded to SoftEqual as zfs_system_prompt / zfs_prompt_template.
# The template carries two positional "{}" slots.
# NOTE(review): which operand fills which slot is decided inside SoftEqual — confirm there.
system_prompt = 'Respond with "yes"/ "no" only!'
prompt_template = "Can \"{}\" be referred as  \"{}\""

In [4]:
# Fetch the dataset through kagglehub; `path` is used below as the directory holding the CSV.
path = kagglehub.dataset_download("lakritidis/product-classification-and-categorization")

# Read the CSV without a header row, using its first column as the index,
# then discard columns 2, 3 and 5 and name the three columns that remain.
df = (
    pd.read_csv(f"{path}/pricerunner_aggregate.csv", header=None, index_col=0)
    .drop(columns=[2, 3, 5])
)
df.columns = ["product", "category_1", "category_2"]
df.head()

Unnamed: 0_level_0,product,category_1,category_2
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,apple iphone 8 plus 64gb silver,Apple iPhone 8 Plus 64GB,Mobile Phones
2,apple iphone 8 plus 64 gb spacegrau,Apple iPhone 8 Plus 64GB,Mobile Phones
3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,Apple iPhone 8 Plus 64GB,Mobile Phones
4,apple iphone 8 plus 64gb space grey,Apple iPhone 8 Plus 64GB,Mobile Phones
5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,Apple iPhone 8 Plus 64GB,Mobile Phones


In [5]:
# Distinct values of the `category_2` column, in order of first appearance.
categories = df["category_2"].unique().tolist()
print(categories)

['Mobile Phones', 'TVs', 'CPUs', 'Digital Cameras', 'Microwaves', 'Dishwashers', 'Washing Machines', 'Freezers', 'Fridge Freezers', 'Fridges']


In [6]:
# Evaluation subset: a fixed number of rows drawn from every category.
# The draw is seeded so that Restart & Run All evaluates the same products each time
# (outputs recorded from earlier, unseeded runs will not match exactly).
SAMPLES_PER_CATEGORY = 10
SAMPLE_SEED = 42

df_reduced = pd.concat(
    [
        df[df["category_2"] == category].sample(SAMPLES_PER_CATEGORY, random_state=SAMPLE_SEED)
        for category in categories
    ]
)

# Ground truth as a set of (product, category_2) pairs; being a set, duplicate pairs collapse.
gt = set(zip(df_reduced["product"], df_reduced["category_2"]))
print(str(gt)[0:100], "...", len(gt))

{('russell hobbs rhm2064g compact solo microwave grey', 'Microwaves'), ('indesit dsr15bk slimline di ... 100


In [7]:
# Collects one metrics dict per evaluation run, keyed by (method, threshold).
overall_result = {}

def evaluate(method, threshold, fetch_one = False, calc_bleu = False):
    """Run one SoftEqual selection per category and score the predictions against `gt`.

    For every entry of the notebook-level `categories` list, the products in `gt`
    are loaded into a `Dummy` operator and filtered by
    `Select(..., SoftEqual(Column("product"), Constant(category), ...))`.
    Each returned row is recorded as a predicted (product, category) pair.

    This function reads the notebook globals `categories`, `gt`, `stem`, `lsv`,
    `system_prompt` and `prompt_template`; they must be defined before calling it.

    Parameters
    ----------
    method : str
        Forwarded to SoftEqual as `method`. This notebook uses "threshold",
        "zero-few-shot" and "both".
    threshold : float or None
        Forwarded to SoftEqual as `threshold`.
    fetch_one : bool, default False
        If True, only a single row is fetched per category via `fetch_one()`;
        otherwise `fetch_all()` is used.
    calc_bleu : bool, default False
        Forwarded to `calculate_metrics` as `calc_bleu`.

    Returns
    -------
    tuple
        `(scores, pred)`: `scores` is the result of `calculate_metrics(gt, pred, ...)`
        with an added "Runtime" entry (mean seconds per category spent opening and
        fetching), and `pred` is the set of predicted (product, category) pairs.
    """
    pred = []
    runtimes = []
    for category in tqdm(categories):
        d = Dummy("products", ["product"], [(x[0], ) for x in gt])
        s = Select(d, SoftEqual(Column("product"), Constant(category), method=method, em=stem, sv=lsv, threshold=threshold, zfs_system_prompt=system_prompt, zfs_prompt_template = prompt_template))

        # perf_counter() is monotonic and high-resolution, so the measured interval
        # cannot be distorted by system clock adjustments the way time.time() can.
        tic = time.perf_counter()
        if fetch_one:
            row = s.open().fetch_one()
            # NOTE(review): assumes fetch_one() signals "no matching row" with None —
            # confirm against db.operators. Without this guard a None row would fail
            # on the x["product"] lookup below.
            result = [] if row is None else [row]
        else:
            result = s.open().fetch_all()
        toc = time.perf_counter()

        pred.extend((x["product"], category) for x in result)
        runtimes.append(toc - tic)

    # Compare as sets so duplicate predictions count once.
    pred = set(pred)

    scores = calculate_metrics(gt, pred, calc_bleu=calc_bleu)
    scores["Runtime"] = np.mean(runtimes)

    print(scores)

    return scores, pred

In [8]:
# Sweep the "threshold" method: 0.01 steps up to 0.1, then 0.1 steps up to 1.0.
threshold_grid = [
    0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
]
for thresh in threshold_grid:
    res, _ = evaluate("threshold", thresh, fetch_one=False, calc_bleu=True)
    overall_result["threshold", thresh] = res
    # The zero-recall run is still recorded; the sweep then stops
    # (presumably larger thresholds cannot match anything either).
    if res["Recall"] == 0.0:
        break

100%|██████████| 10/10 [00:04<00:00,  2.40it/s]


{'Precision': 0.10675381263616558, 'Recall': 0.98, 'F1 Score': 0.1925343811394892, 'bleu1': 0.9980909090909091, 'bleu2': 0.9979896122473321, 'bleu3': 0.9978719262778781, 'bleu4': 0.9977326571860027, 'Runtime': 0.4159417629241943}


100%|██████████| 10/10 [00:03<00:00,  3.02it/s]


{'Precision': 0.10738255033557047, 'Recall': 0.96, 'F1 Score': 0.193158953722334, 'bleu1': 0.9958505971793201, 'bleu2': 0.9954864951319502, 'bleu3': 0.9951630778719928, 'bleu4': 0.9947552641197736, 'Runtime': 0.33110885620117186}


100%|██████████| 10/10 [00:03<00:00,  2.98it/s]


{'Precision': 0.10983981693363844, 'Recall': 0.96, 'F1 Score': 0.19712525667351127, 'bleu1': 0.9958505971793201, 'bleu2': 0.9954864951319502, 'bleu3': 0.9951630778719928, 'bleu4': 0.9947552641197736, 'Runtime': 0.3356718301773071}


100%|██████████| 10/10 [00:03<00:00,  2.97it/s]


{'Precision': 0.11018957345971564, 'Recall': 0.93, 'F1 Score': 0.19703389830508472, 'bleu1': 0.9921655844155844, 'bleu2': 0.9916436320544126, 'bleu3': 0.991004109199491, 'bleu4': 0.9901868352060816, 'Runtime': 0.3363892316818237}


100%|██████████| 10/10 [00:03<00:00,  2.87it/s]


{'Precision': 0.11481481481481481, 'Recall': 0.93, 'F1 Score': 0.2043956043956044, 'bleu1': 0.9921655844155844, 'bleu2': 0.9916436320544126, 'bleu3': 0.991004109199491, 'bleu4': 0.9901868352060816, 'Runtime': 0.34853320121765136}


100%|██████████| 10/10 [00:03<00:00,  2.96it/s]


{'Precision': 0.11892583120204604, 'Recall': 0.93, 'F1 Score': 0.21088435374149658, 'bleu1': 0.9921655844155844, 'bleu2': 0.9916436320544126, 'bleu3': 0.991004109199491, 'bleu4': 0.9901868352060816, 'Runtime': 0.3370701313018799}


100%|██████████| 10/10 [00:03<00:00,  2.82it/s]


{'Precision': 0.12550607287449392, 'Recall': 0.93, 'F1 Score': 0.22116527942925088, 'bleu1': 0.9921655844155844, 'bleu2': 0.9916436320544126, 'bleu3': 0.991004109199491, 'bleu4': 0.9901868352060816, 'Runtime': 0.3537137985229492}


100%|██████████| 10/10 [00:03<00:00,  2.83it/s]


{'Precision': 0.13237410071942446, 'Recall': 0.92, 'F1 Score': 0.23144654088050315, 'bleu1': 0.9913322510822511, 'bleu2': 0.9907723413461653, 'bleu3': 0.9900897121636515, 'bleu4': 0.9892228552421801, 'Runtime': 0.35247604846954345}


100%|██████████| 10/10 [00:03<00:00,  2.72it/s]


{'Precision': 0.13740458015267176, 'Recall': 0.9, 'F1 Score': 0.2384105960264901, 'bleu1': 0.9866186449874319, 'bleu2': 0.9857661088408443, 'bleu3': 0.9847101373677408, 'bleu4': 0.9833343170867255, 'Runtime': 0.3673586130142212}


100%|██████████| 10/10 [00:03<00:00,  2.67it/s]


{'Precision': 0.1461038961038961, 'Recall': 0.9, 'F1 Score': 0.2513966480446927, 'bleu1': 0.9866186449874319, 'bleu2': 0.9857661088408443, 'bleu3': 0.9847101373677408, 'bleu4': 0.9833343170867255, 'Runtime': 0.3746285676956177}


100%|██████████| 10/10 [00:03<00:00,  2.62it/s]


{'Precision': 0.39, 'Recall': 0.78, 'F1 Score': 0.5200000000000001, 'bleu1': 0.9048744971981375, 'bleu2': 0.8896128849904242, 'bleu3': 0.8722066185527999, 'bleu4': 0.8533754233449994, 'Runtime': 0.38082983493804934}


100%|██████████| 10/10 [00:03<00:00,  2.95it/s]


{'Precision': 0.6052631578947368, 'Recall': 0.69, 'F1 Score': 0.6448598130841122, 'bleu1': 0.8037254109534087, 'bleu2': 0.7789657588275908, 'bleu3': 0.7522797754762613, 'bleu4': 0.7346664405087635, 'Runtime': 0.3379930019378662}


100%|██████████| 10/10 [00:03<00:00,  2.76it/s]


{'Precision': 0.7466666666666667, 'Recall': 0.56, 'F1 Score': 0.6400000000000001, 'bleu1': 0.7055459759285828, 'bleu2': 0.672738666539556, 'bleu3': 0.6345097662482369, 'bleu4': 0.6063735705543454, 'Runtime': 0.36239233016967776}


100%|██████████| 10/10 [00:03<00:00,  2.60it/s]


{'Precision': 0.6451612903225806, 'Recall': 0.2, 'F1 Score': 0.30534351145038163, 'bleu1': 0.48125943559400136, 'bleu2': 0.4099488982393146, 'bleu3': 0.3616004982584514, 'bleu4': 0.33162421874276204, 'Runtime': 0.3836843252182007}


100%|██████████| 10/10 [00:03<00:00,  2.64it/s]


{'Precision': 1.0, 'Recall': 0.01, 'F1 Score': 0.019801980198019802, 'bleu1': 0.14078107292873082, 'bleu2': 0.06784549570954383, 'bleu3': 0.05054231645060737, 'bleu4': 0.037295369118533546, 'Runtime': 0.3785877227783203}


100%|██████████| 10/10 [00:03<00:00,  2.90it/s]

{'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1, 'Runtime': 0.3439875841140747}





In [9]:
# Single run of the "zero-few-shot" method; no threshold is passed (None).
res, pred = evaluate("zero-few-shot", None, fetch_one=False, calc_bleu=True)
overall_result["zero-few-shot", None] = res

100%|██████████| 10/10 [00:26<00:00,  2.63s/it]


{'Precision': 0.7241379310344828, 'Recall': 0.42, 'F1 Score': 0.5316455696202531, 'bleu1': 0.6686946667770143, 'bleu2': 0.6088971052118335, 'bleu3': 0.5560174999684565, 'bleu4': 0.5190346640167757, 'Runtime': 2.6299542665481566}


In [10]:
# Sweep the "both" method over thresholds 0.1 .. 1.0 in 0.1 steps.
both_threshold_grid = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
for thresh in both_threshold_grid:
    res, _ = evaluate("both", thresh, fetch_one=False, calc_bleu=True)
    overall_result["both", thresh] = res
    # The zero-recall run is still recorded; the sweep then stops.
    if res["Recall"] == 0.0:
        break

100%|██████████| 10/10 [11:16<00:00, 67.68s/it]


{'Precision': 0.8333333333333334, 'Recall': 0.45, 'F1 Score': 0.5844155844155844, 'bleu1': 0.6794564658436427, 'bleu2': 0.6201113360556711, 'bleu3': 0.569423208084529, 'bleu4': 0.5369079675514906, 'Runtime': 67.67934219837188}


100%|██████████| 10/10 [03:38<00:00, 21.86s/it]


{'Precision': 0.803921568627451, 'Recall': 0.41, 'F1 Score': 0.543046357615894, 'bleu1': 0.6541716633060619, 'bleu2': 0.6027290750611993, 'bleu3': 0.5553724229062721, 'bleu4': 0.5206783911403058, 'Runtime': 21.86234359741211}


100%|██████████| 10/10 [02:10<00:00, 13.01s/it]


{'Precision': 0.8048780487804879, 'Recall': 0.33, 'F1 Score': 0.4680851063829787, 'bleu1': 0.5877178817609843, 'bleu2': 0.5244069924925503, 'bleu3': 0.4701255680455004, 'bleu4': 0.4320213766466749, 'Runtime': 13.010152983665467}


100%|██████████| 10/10 [01:26<00:00,  8.67s/it]


{'Precision': 0.9310344827586207, 'Recall': 0.27, 'F1 Score': 0.4186046511627907, 'bleu1': 0.4960495878448114, 'bleu2': 0.4264709319702338, 'bleu3': 0.3716630272788912, 'bleu4': 0.33507127083178195, 'Runtime': 8.670647740364075}


100%|██████████| 10/10 [00:36<00:00,  3.70s/it]


{'Precision': 0.7222222222222222, 'Recall': 0.13, 'F1 Score': 0.22033898305084748, 'bleu1': 0.4184024319273668, 'bleu2': 0.334317327567578, 'bleu3': 0.28064236169840034, 'bleu4': 0.24135212969021438, 'Runtime': 3.6956775903701784}


100%|██████████| 10/10 [00:04<00:00,  2.30it/s]

{'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1, 'Runtime': 0.4351715803146362}





In [21]:
# One row per evaluation run: the (method, threshold) key is split into two
# leading columns, followed by that run's metric entries.
df_results = pd.DataFrame(
    [
        {"method": method, "threshold": threshold, **scores}
        for (method, threshold), scores in overall_result.items()
    ]
)
df_results.head()

Unnamed: 0,method,threshold,Precision,Recall,F1 Score,bleu1,bleu2,bleu3,bleu4,Runtime
0,threshold,0.01,0.106754,0.98,0.192534,0.998091,0.99799,0.997872,0.997733,0.415942
1,threshold,0.02,0.107383,0.96,0.193159,0.995851,0.995486,0.995163,0.994755,0.331109
2,threshold,0.03,0.10984,0.96,0.197125,0.995851,0.995486,0.995163,0.994755,0.335672
3,threshold,0.04,0.11019,0.93,0.197034,0.992166,0.991644,0.991004,0.990187,0.336389
4,threshold,0.05,0.114815,0.93,0.204396,0.992166,0.991644,0.991004,0.990187,0.348533


In [22]:
# Persist the metrics table. The output directory is created first so the export
# does not fail when `results/` does not exist yet.
results_path = Path("results/EvaluateSemanticFiltering.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
df_results.to_csv(results_path)