In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from matching.config import Settings

import matplotlib.pyplot as plt

import seaborn as sns

# Naïve Matching baseline

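This notebook implements a simple rule-based baseline for matching cultural assets: two records are predicted to be a match when their physical descriptions or their titles are identical.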


In [None]:
# Labelled record pairs, addressed relative to this notebook's directory.
labelled_data_path = "../../data/labelled_data.csv"
data = pd.read_csv(labelled_data_path)

# Summary statistics of the loaded frame as a quick sanity check.
data.describe()

In [None]:
# For records collected in "Linzer Sammlung", rebuild the creator's full name
# from the separate first-name / last-name columns. The same rule applies to
# both sides of a pair, so it is written once and run for the "1_" and "2_"
# column groups.
#
# The rows are selected with a boolean mask and written back in one `.loc`
# assignment instead of iterating over every row of the frame.
for side in ("1", "2"):
    from_linz = data[f"{side}_collectedIn_name"] == "Linzer Sammlung"
    if not from_linz.any():
        # Nothing to rewrite for this side; leave the frame untouched.
        continue

    first_names = data.loc[from_linz, f"{side}_createdBy_firstName"]
    last_names = data.loc[from_linz, f"{side}_createdBy_lastName"]

    # NOTE: a missing first or last name is formatted as the text "nan",
    # exactly as the f-string renders it.
    data.loc[from_linz, f"{side}_createdBy_name"] = pd.Series(
        [f"{first} {last}" for first, last in zip(first_names, last_names)],
        index=first_names.index,
    )

In [None]:
def is_naive_match(entity1: pd.Series, entity2: pd.Series) -> bool:
    """Decide whether two records match under the naive rule.

    Two records are considered a match when their physical descriptions are
    equal or their titles are equal. A field that is missing (NaN / None /
    pd.NA) on either side never counts as equal.

    Args:
        entity1: Fields of the first record; must provide
            "1_physicalDescription" and "1_title".
        entity2: Fields of the second record; must provide
            "2_physicalDescription" and "2_title".

    Returns:
        True if at least one of the two field pairs is present on both sides
        and equal, otherwise False.

    Raises:
        KeyError: If one of the required fields is absent.
    """
    compared_fields = (
        ("1_physicalDescription", "2_physicalDescription"),
        ("1_title", "2_title"),
    )
    for left_key, right_key in compared_fields:
        left, right = entity1[left_key], entity2[right_key]
        # Check for missing values explicitly: `pd.NA == pd.NA` is not a
        # boolean, and relying on `NaN != NaN` hides the intent.
        if pd.isna(left) or pd.isna(right):
            continue
        if left == right:
            return True
    return False

In [None]:
# Evaluate the naive rule on a fresh 80/20 split for every configured seed and
# collect the per-seed precision and recall.
precision_list = []
recall_list = []

# The column groups describing each side of a pair do not depend on the split,
# so they are determined once rather than once per row.
entity1_columns = [col for col in data.columns if col.startswith("1_")]
entity2_columns = [col for col in data.columns if col.startswith("2_")]

for seed in Settings.random_seeds():
    # The rule has nothing to fit, so `traindata` is not used below; only the
    # held-out part is scored.
    traindata, testdata = train_test_split(data, test_size=0.2, random_state=seed)

    # One 0/1 prediction per test row, in the same order as `labels`.
    predictions = [
        1 if is_naive_match(row[entity1_columns], row[entity2_columns]) else 0
        for _, row in testdata.iterrows()
    ]
    # NOTE(review): the sums printed below assume "label" is 0/1 — confirm.
    labels = testdata["label"].tolist()

    print(f"Seed: {seed}")
    print(f"Number of predicted matches: {sum(predictions)}")
    print(f"Number of actual matches: {sum(labels)}")
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)

    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    precision_list.append(precision)
    recall_list.append(recall)

In [None]:
# Mean precision and recall across all evaluated seeds.
avg_precision_with_outliers, avg_recall_with_outliers = (
    np.mean(scores) for scores in (precision_list, recall_list)
)

print(f"Average Precision: {avg_precision_with_outliers}")
print(f"Average Recall: {avg_recall_with_outliers}")

# Distribution of the per-seed precision scores.
precision_axes = sns.histplot(precision_list)
precision_axes.set_xlabel("Precision Score")

In [None]:
# Harmonic mean of the *averaged* precision and recall. This is not the same
# quantity as the mean of per-seed F1 scores.
#
# NOTE: assigning to `f1_score` rebinds the `sklearn.metrics.f1_score` function
# imported at the top of the notebook. The name is kept because the summary
# cell below reads this variable.
precision_recall_sum = avg_precision_with_outliers + avg_recall_with_outliers

if precision_recall_sum == 0:
    # Precision and recall are both zero: report 0.0 instead of dividing 0 by 0.
    f1_score = 0.0
else:
    f1_score = (
        2
        * (avg_precision_with_outliers * avg_recall_with_outliers)
        / precision_recall_sum
    )

print(f"F1 Score: {f1_score}")

In [None]:
# Final summary: each metric rendered with three decimals. The pieces are
# joined by print's default single-space separator.
summary_parts = [
    "F1 Score",
    format(f1_score, "0.3f"),
    "\nAverage precision:",
    format(avg_precision_with_outliers, "0.3f"),
    "\nAverage recall:",
    format(avg_recall_with_outliers, "0.3f"),
]
print(*summary_parts)