# Evaluate Similarity Grouping


In [1]:
import time
from pathlib import Path

import kagglehub
import numpy as np
import pandas as pd
from tqdm import tqdm

from models import ModelMgr
from models.embedding.SentenceTransformer import SentenceTransformerEmbeddingModel
from models.semantic_validation import LLaMAValidationModel

from db.operators import Dummy, Select
from db.criteria import SoftEqual
from db.structure import Column, Constant

from evaluation.util import calculate_metrics, calc_bleu

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nico\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Nico\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Embedding model; later handed to SoftEqual as `em`. It gets its own ModelMgr.
embedding_mgr = ModelMgr()
stem = SentenceTransformerEmbeddingModel(embedding_mgr)

# Validation model; later handed to SoftEqual as `sv`. It also gets its own ModelMgr.
validation_mgr = ModelMgr()
lsv = LLaMAValidationModel(validation_mgr)
# NOTE(review): a `DeepSeekValidationModel(m)` alternative used to be commented
# out here; neither that class nor `m` is defined in this notebook.

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# --- Data selection -------------------------------------------------------
# Label column of the product frame that is used as the prediction target.
target = "category_2"
# Upper bound on how many distinct target labels are evaluated.
max_number_categories = 100
# Seed passed to pandas `.sample(...)` when drawing products.
random_state = 420

# --- Prompts (defaults of `evaluate`) --------------------------------------
# NOTE(review): the wording below is grammatically off ("no and yes",
# "Does ... describes"), but it is the literal model input, so it is kept
# verbatim; changing it would change the reported results.
ZERO_SHOT_SYSTEM_PROMPT = "You are a validator. Respond with \"no\" and \"yes\" only!"
ZERO_SHOT_PROMPTING_TEMPLATE = 'Does "{}" describes "{}"'

In [4]:
# Fetch the dataset via kagglehub; `path` is the directory that holds the CSV.
path = kagglehub.dataset_download("lakritidis/product-classification-and-categorization")

# The CSV is read without a header row and with its first column as the index.
# Columns 2, 3 and 5 are discarded and the three remaining ones are renamed.
df = (
    pd.read_csv(f"{path}/pricerunner_aggregate.csv", header=None, index_col=0)
    .drop(columns=[2, 3, 5])
    .set_axis(["product", "category_1", "category_2"], axis=1)
)
df.head()



Unnamed: 0_level_0,product,category_1,category_2
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,apple iphone 8 plus 64gb silver,Apple iPhone 8 Plus 64GB,Mobile Phones
2,apple iphone 8 plus 64 gb spacegrau,Apple iPhone 8 Plus 64GB,Mobile Phones
3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,Apple iPhone 8 Plus 64GB,Mobile Phones
4,apple iphone 8 plus 64gb space grey,Apple iPhone 8 Plus 64GB,Mobile Phones
5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,Apple iPhone 8 Plus 64GB,Mobile Phones


In [6]:
# Distinct target labels, capped at `max_number_categories`.
# The draw is seeded with `random_state` so that the selected categories (and
# their order) are the same on every Restart & Run All; without the seed each
# run could evaluate a different subset.
categories = pd.Series(df[target].unique())
categories = categories.sample(
    min(len(categories), max_number_categories),
    random_state=random_state,
).tolist()

# Two separate statements: `print(a), print(b)` builds a tuple, which the
# notebook would echo as a spurious `(None, None)` cell result.
print(len(categories))
print(categories)

10
['Fridge Freezers', 'Digital Cameras', 'Freezers', 'TVs', 'Mobile Phones', 'Dishwashers', 'Microwaves', 'Washing Machines', 'CPUs', 'Fridges']


(None, None)

In [7]:
# Draw one product per selected category (seeded with `random_state`).
per_category_samples = [
    df.loc[df[target] == category].sample(1, random_state=random_state)
    for category in categories
]
df_reduced = pd.concat(per_category_samples)

# Ground truth: the set of (product, label) pairs taken from the sampled rows.
gt = set(zip(df_reduced["product"], df_reduced[target]))
print(str(gt)[0:100], "...", len(gt))

{('bosch einbausp ler sbv88tx36e vollintegriert', 'Dishwashers'), ('hewlett packard enterprise intel ... 10


In [8]:
# Collected scores, keyed by (method, threshold); filled by the cells below.
overall_result = {}


def evaluate(method, threshold, system_prompt=ZERO_SHOT_SYSTEM_PROMPT, prompt_template=ZERO_SHOT_PROMPTING_TEMPLATE):
    """Run a soft-equality selection per category and score it against `gt`.

    For every entry of the notebook-level `categories` list, the products in
    the ground truth `gt` are loaded into a `Dummy` relation and filtered with
    `Select(..., SoftEqual(Column("product"), Constant(category), ...))`.
    Every returned row is recorded as a `(product, category)` prediction.

    Args:
        method: Forwarded to `SoftEqual` as `method`. This notebook uses
            "threshold", "zero-few-shot" and "both".
        threshold: Forwarded to `SoftEqual` as `threshold`; may be None.
        system_prompt: Forwarded to `SoftEqual` as `zfs_system_prompt`.
        prompt_template: Forwarded to `SoftEqual` as `zfs_prompt_template`.

    Returns:
        A tuple `(scores, pred)`: `scores` is whatever `calculate_metrics`
        returns for the ground truth, the predicted pair set and the mean
        per-category runtime in seconds; `pred` is the list of predicted
        `(product, category)` pairs.
    """
    pred = []
    runtimes = []
    for category in tqdm(categories):
        d = Dummy("products", ["product"], [(x[0], ) for x in gt])
        s = Select(
            d,
            SoftEqual(
                Column("product"),
                Constant(category),
                method=method,
                em=stem,
                sv=lsv,
                threshold=threshold,
                zfs_system_prompt=system_prompt,
                zfs_prompt_template=prompt_template,
            ),
        )

        # Only opening and fetching is timed, not building the operator tree.
        # perf_counter() is monotonic and high-resolution; time.time() is a
        # wall clock that can jump and is coarse on some platforms, which
        # matters for queries that take only tens of milliseconds.
        tic = time.perf_counter()
        result = s.open().fetch_all()
        toc = time.perf_counter()

        pred.extend([(x["product"], category) for x in result])
        runtimes.append(toc - tic)

    scores = calculate_metrics(gt, set(pred), np.mean(runtimes))

    print(method, threshold, scores["F1 Score"])

    return scores, pred

In [9]:
# Sweep the threshold for the "threshold" method in ascending order. As soon as
# a setting yields a recall of zero, the remaining (higher) values are skipped.
for cutoff in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0):
    metrics, _ = evaluate("threshold", cutoff)
    overall_result[("threshold", cutoff)] = metrics
    if metrics["Recall"] == 0.0:
        break

100%|██████████| 10/10 [00:01<00:00,  8.97it/s]


threshold 0.1 0.2647058823529411


100%|██████████| 10/10 [00:00<00:00, 13.64it/s]


threshold 0.2 0.5294117647058825


100%|██████████| 10/10 [00:00<00:00, 14.04it/s]


threshold 0.3 0.6666666666666666


100%|██████████| 10/10 [00:00<00:00, 13.78it/s]


threshold 0.4 0.7


100%|██████████| 10/10 [00:00<00:00, 12.86it/s]


threshold 0.5 0.16666666666666669


100%|██████████| 10/10 [00:00<00:00, 12.82it/s]

threshold 0.6 0





In [10]:
# Single run of the "zero-few-shot" method; it is called without a threshold.
zfs_key = ("zero-few-shot", None)
res, pred = evaluate(*zfs_key)
overall_result[zfs_key] = res

100%|██████████| 10/10 [00:05<00:00,  1.81it/s]

zero-few-shot None 0.6666666666666666





In [11]:
# Same ascending sweep as for "threshold", but with method "both". The loop
# stops at the first threshold whose recall is zero.
for cutoff in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0):
    metrics, _ = evaluate("both", cutoff)
    overall_result[("both", cutoff)] = metrics
    if metrics["Recall"] == 0.0:
        break

100%|██████████| 10/10 [00:02<00:00,  4.09it/s]


both 0.1 0.6666666666666666


100%|██████████| 10/10 [00:01<00:00,  7.08it/s]


both 0.2 0.6666666666666666


100%|██████████| 10/10 [00:01<00:00,  8.84it/s]


both 0.3 0.5714285714285715


100%|██████████| 10/10 [00:01<00:00,  9.73it/s]


both 0.4 0.4615384615384615


100%|██████████| 10/10 [00:00<00:00, 12.35it/s]

both 0.5 0





In [12]:
# Add the BLEU scores to every stored result. `calc_bleu` is given the ground
# truth and the entry's "pred" value, and each score it returns is copied into
# that entry under the same key.
for entry in tqdm(overall_result.values()):
    bleu = calc_bleu(gt, entry["pred"])
    entry.update({name: bleu[name] for name in bleu})

100%|██████████| 12/12 [00:00<00:00, 47.08it/s]


In [13]:
# One row per (method, threshold) run: the key is split into two columns and
# the score dict supplies the rest. The "pred" column is dropped from the table.
records = [
    {"method": method, "threshold": threshold, **scores}
    for (method, threshold), scores in overall_result.items()
]
df_results = pd.DataFrame(records).drop(columns=["pred"])
df_results

Unnamed: 0,method,threshold,Precision,Recall,F1 Score,tp,fn,fp,runtime,bleu1,bleu2,bleu3,bleu4
0,threshold,0.1,0.155172,0.9,0.264706,9,1,49,0.111067,0.985714,0.984515,0.982983,0.980911
1,threshold,0.2,0.375,0.9,0.529412,9,1,15,0.072988,0.916135,0.912323,0.905556,0.903945
2,threshold,0.3,0.571429,0.8,0.666667,8,2,6,0.070779,0.826786,0.809105,0.806724,0.806081
3,threshold,0.4,0.7,0.7,0.7,7,3,3,0.072155,0.734478,0.711637,0.708523,0.707634
4,threshold,0.5,0.5,0.1,0.166667,1,9,1,0.07764,0.288132,0.213444,0.204069,0.200907
5,threshold,0.6,0.0,0.0,0.0,0,10,0,0.077581,-1.0,-1.0,-1.0,-1.0
6,zero-few-shot,,1.0,0.5,0.666667,5,5,0,0.551632,0.59591,0.542653,0.531649,0.520222
7,both,0.1,1.0,0.5,0.666667,5,5,0,0.243688,0.59591,0.542653,0.531649,0.520222
8,both,0.2,1.0,0.5,0.666667,5,5,0,0.141115,0.59591,0.542653,0.531649,0.520222
9,both,0.3,1.0,0.4,0.571429,4,6,0,0.112696,0.50841,0.446879,0.434748,0.422999


In [15]:
# Persist the result table. The target directory is created first because
# `to_csv` does not create missing parent directories and would otherwise
# fail at the very end of the run on a fresh checkout.
results_path = Path("results/Products_mpnetBaseV2_LLama3B.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
df_results.to_csv(results_path)