In [1]:
import os

import pandas as pd
import wandb
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from sentence_transformers.util import cos_sim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
# Open the Weights & Biases run; the last cell of the notebook logs the final
# metrics to it and then closes it.
wandb.init(
    name="mpnet-finetune",
    project="product-matching-finetune",
)


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\naeko\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mnaekorashi[0m ([33mnaekorashi-smkn-3-bandung[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Labelled product-name pairs, one tuple per row:
# (ocr_product, pusat_product, label).
# NOTE(review): presumably label 1 means both names refer to the same product
# and 0 means they differ — confirm against the labelling guide.
PAIR_COLUMNS = ["ocr_product", "pusat_product", "label"]
labelled_pairs = [
    ("nivea pearl beauty 150ml", "nivea micellair pearl bright 125ml", 0),
    ("lifebuoy sabun cair 450ml", "lifebuoy body wash 450 ml", 1),
]

# Keep the raw records under their own name so `df` is only ever a DataFrame.
df = pd.DataFrame(data=labelled_pairs, columns=PAIR_COLUMNS)

In [7]:
# Split the labelled pairs 70/30 into train and test frames, then wrap every
# row as an InputExample for training and evaluation.

# Fail early with an actionable message if `df` is stale or was built with a
# different schema (e.g. the cell that creates it was not re-run).
REQUIRED_COLUMNS = ("ocr_product", "pusat_product", "label")
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"`df` is missing required column(s) {missing_columns}; "
        "re-run the cell that builds `df` before splitting."
    )

# A stratified split needs at least two rows of every class. On smaller data
# fall back to a plain random split instead of raising.
can_stratify = df["label"].value_counts().min() >= 2
if not can_stratify:
    print("Warning: a label class has fewer than 2 rows; splitting without stratification.")

train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df["label"] if can_stratify else None,
)


def _to_examples(frame):
    """Turn each row of `frame` into an InputExample.

    The two product-name columns become the example's text pair and the
    `label` column, cast to float, becomes its label.
    """
    return [
        InputExample(texts=[ocr_name, reference_name], label=float(label))
        for ocr_name, reference_name, label in zip(
            frame["ocr_product"], frame["pusat_product"], frame["label"]
        )
    ]


train_examples = _to_examples(train_df)
test_examples = _to_examples(test_df)


KeyError: 'label'

In [None]:
# Load the pretrained checkpoint that the following cells fine-tune.
model = SentenceTransformer(model_name_or_path='paraphrase-mpnet-base-v2')

# Training objective bound to the model above.
train_loss = losses.CosineSimilarityLoss(model=model)

# Shuffled batches of 16 training examples.
train_dataloader = DataLoader(dataset=train_examples, batch_size=16, shuffle=True)


In [None]:
# Unpack the held-out examples into three parallel lists: first text, second
# text and label. Later cells reuse these lists for scoring.
sentences1, sentences2, labels = [], [], []
for example in test_examples:
    sentences1.append(example.texts[0])
    sentences2.append(example.texts[1])
    labels.append(example.label)

# Evaluator over the held-out pairs; it is handed to model.fit in the next cell.
evaluator = BinaryClassificationEvaluator(sentences1, sentences2, labels)


In [None]:
# Fine-tune the model on the training pairs, evaluate with `evaluator`, and
# save the result under ./fine-tuned-product-mpnet.
NUM_EPOCHS = 3

# A fixed 100-step warm-up can be longer than the whole run on a small
# dataset, in which case the learning rate never reaches its peak. Warm up over
# 10% of the optimizer steps instead, capped at the previous value of 100.
total_training_steps = len(train_dataloader) * NUM_EPOCHS
warmup_steps = min(100, int(0.1 * total_training_steps))

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=NUM_EPOCHS,
    warmup_steps=warmup_steps,
    output_path='./fine-tuned-product-mpnet'
)


In [None]:
# Embed both sides of every held-out pair with the fine-tuned model.
embeddings1 = model.encode(sentences1)
embeddings2 = model.encode(sentences2)

# Cosine similarity of each aligned pair (item i of embeddings1 against item i
# of embeddings2), as a plain Python float.
# `cos_sim` comes from the notebook's import cell rather than a mid-notebook import.
predictions = [cos_sim(e1, e2).item() for e1, e2 in zip(embeddings1, embeddings2)]


In [None]:
# Similarity cut-off: a pair scoring at or above it is predicted as label 1,
# anything below as label 0.
threshold = 0.6
predicted_labels = [int(score >= threshold) for score in predictions]


In [None]:
# Score the thresholded predictions against the held-out labels.
accuracy = accuracy_score(labels, predicted_labels)
precision = precision_score(labels, predicted_labels)
recall = recall_score(labels, predicted_labels)
f1 = f1_score(labels, predicted_labels)

# Report each metric to four decimal places.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")


In [None]:
# Send the final test-set metrics to the active W&B run, then close the run.
final_metrics = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1_score": f1,
}
wandb.log(final_metrics)

wandb.finish()
