In [72]:
import os
import pandas as pd

In [73]:
def find_lostart_csv(id: int):
    """
    Find the csv file containing the Lost Art ID and return the corresponding dataframe.

    Every entry of ``data/lostart`` is read as a ``;``-separated CSV, in sorted
    file-name order, and the scan stops at the first file that has at least one
    row whose "Lost Art ID" column equals ``id``.

    Parameters
    ----------
    id : int
        Lost Art ID to look for. The name shadows the builtin ``id``; it is kept
        so that existing keyword callers continue to work.

    Returns
    -------
    tuple[str, pd.DataFrame]
        The file name (relative to ``data/lostart``) and the matching rows.

    Raises
    ------
    LookupError
        If no file contains the requested ID (including an empty directory).
        Previously the last file read was returned together with an empty
        frame, which silently pointed callers at the wrong file.
    """
    directory = "data/lostart"

    # Sorted so the result is deterministic if several files contain the ID;
    # os.listdir gives no ordering guarantee.
    for csv in sorted(os.listdir(directory)):
        df = pd.read_csv(f"{directory}/{csv}", sep=";")

        # Compute the filter once and reuse it for both the test and the result.
        matches = df.loc[df["Lost Art ID"] == id]
        if len(matches) > 0:
            return csv, matches

    raise LookupError(f"Lost Art ID {id} not found in any file under {directory!r}")

def remove_leakage(df: pd.DataFrame):
    """
    Return the dataframe without its leakage columns.

    The input frame is left untouched; a new frame is returned. A KeyError is
    raised by pandas if any of the listed columns is absent.
    """
    leaking = ["Inventarnummer/Signatur", "Provenienz", "Literatur / Quelle"]
    return df.drop(labels=leaking, axis="columns")

def get_concatenated_txt(series: pd.Series):
    """
    Render a series as text, one "label: value" line per entry.

    Entries whose value is missing are still listed, with nothing after the
    colon. Every line ends with a newline, including the last one.
    """
    lines = [
        # A present value is followed by a single space before the newline.
        f"{label}: {cell} \n" if pd.notna(cell) else f"{label}: \n"
        for label, cell in series.items()
    ]
    return "".join(lines)

# Lost Art ID: 589708
# The file name is not needed here, only the matching rows.
_, lostart = find_lostart_csv(id=589708)
lostart = lostart.pipe(remove_leakage)
# squeeze() collapses the frame to a Series when exactly one row matched.
lostart_txt = get_concatenated_txt(lostart.squeeze(axis=None))

In [74]:
# Read the MNR spreadsheet (.ods) into a dataframe.
pop = pd.read_excel(io="data/mnr_20250303_17h40m54s.ods")

# Remove potential leakage columns

def remove_leakage_mnr(df: pd.DataFrame):
    """
    Return the dataframe without its leakage columns and without the columns
    that are not needed.

    The input frame is left untouched. A KeyError is raised by pandas if any
    of the listed columns is absent.
    """
    return (
        df
        # Leakage columns go first...
        .drop(columns=["HIST4", "LOCA", "NOTE"])
        # ...then the columns that are simply not needed.
        .drop(columns=["POP_IMPORT", "VIDEO"])
    )


def add_column_with_concatenated_txt(df: pd.DataFrame):
    """
    Add a new column to the dataframe containing the concatenated text of the other columns.

    The text of each row is produced by ``get_concatenated_txt``. The column
    is named "CONCATENATED".

    Note: the frame passed in is modified in place (the column is assigned on
    ``df`` itself) and the same object is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to extend.

    Returns
    -------
    pd.DataFrame
        ``df`` itself, with the "CONCATENATED" column set.
    """
    # If the column already exists (e.g. the cell was run twice), leave it out
    # of the source so the previous text is not embedded into the new text.
    source = df.drop(columns=["CONCATENATED"], errors="ignore")
    df["CONCATENATED"] = source.apply(get_concatenated_txt, axis=1)
    return df

# Drop the unwanted columns, then attach the per-row text column.
pop = pop.pipe(remove_leakage_mnr).pipe(add_column_with_concatenated_txt)

In [75]:
from Embedding import EmbeddingFromPretrained

# The embedder is built with its default arguments. The name below is kept as
# a hint for an explicit model choice.
# NOTE(review): confirm which model the default constructor actually loads.
# model_name="sentence-transformers/all-mpnet-base-v2"
emb = EmbeddingFromPretrained()

# Get the embedding of the Lost Art text.
# The method name suggests mean pooling over token embeddings — TODO confirm
# against the Embedding module.
lostart_emb = emb.get_mean_pooling_embedding(lostart_txt)
# Bare trailing expression: the notebook renders the embedding as the cell output.
lostart_emb

tensor([[-1.5780e-01,  1.1512e-01,  1.6125e-02,  1.3090e-02, -5.2402e-02,
          1.5398e-01, -3.0527e-02,  1.0796e-01, -4.8301e-02, -9.5199e-02,
         -4.7553e-02, -6.7236e-02,  2.1684e-02, -4.3447e-02, -2.1051e-01,
         -5.2413e-02, -8.8275e-02,  4.3779e-02,  4.6275e-02,  4.3512e-02,
         -7.8799e-02, -6.6008e-02,  3.8625e-02,  9.1215e-02, -3.5481e-02,
         -3.0292e-02, -1.0008e-01,  6.3713e-03, -4.1287e-02, -2.1866e-02,
         -2.4563e-02,  9.7806e-02, -2.9754e-02, -4.9153e-02,  6.1578e-02,
          5.0376e-02, -4.4859e-02, -1.4834e-02, -2.9136e-02, -3.6950e-02,
         -2.5866e-02, -7.7251e-02, -1.3000e-01,  3.3486e-02,  6.5007e-02,
          7.3249e-03,  2.5164e-02,  6.6596e-02, -9.8494e-02,  1.9221e-02,
         -1.6584e-01,  6.9362e-02, -1.4390e-02, -1.1004e-01, -4.5777e-02,
          3.7522e-02,  2.0026e-02, -1.5729e-03,  6.1697e-02, -2.1725e-02,
          1.2318e-01, -3.7500e-02, -1.1722e-01, -1.8104e-02, -3.8951e-02,
         -9.6163e-02, -7.5673e-02,  2.

In [78]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def get_most_similar_text(emb: torch.Tensor, embedder, pop: pd.DataFrame, top_k: int = 10):
    """
    Get the rows of the dataframe whose text is most similar to the given embedding.

    Each value of the "CONCATENATED" column is embedded with
    ``embedder.get_mean_pooling_embedding`` and compared to ``emb`` by cosine
    similarity. Every row is embedded on every call, one text at a time.

    Parameters
    ----------
    emb : torch.Tensor
        Query embedding. The similarity result is reduced with ``.item()``, so
        it must hold a single vector (one row).
    embedder
        Object exposing ``get_mean_pooling_embedding(text)``.
    pop : pd.DataFrame
        Candidate rows; must contain a "CONCATENATED" text column.
    top_k : int, default 10
        Number of rows to return. The default keeps the previous hard-coded
        size.

    Returns
    -------
    pd.DataFrame
        The ``top_k`` rows of ``pop`` with the highest similarity, ordered from
        most to least similar.
    """

    def score(text):
        # Cosine similarity between the query and this row's embedding.
        candidate = embedder.get_mean_pooling_embedding(text)
        return cosine_similarity(emb, candidate).item()

    similarities = pop["CONCATENATED"].apply(score)
    # nlargest returns the labels sorted by decreasing similarity.
    return pop.loc[similarities.nlargest(top_k).index]

# Rank the MNR rows against the Lost Art embedding.
most_similar = get_most_similar_text(emb=lostart_emb, embedder=emb, pop=pop)

In [79]:
# Display the rows selected as most similar to the Lost Art record.
most_similar

Unnamed: 0,REF,POP_COMMENTAIRES,POP_FLAGS,AATT,AFFE,ATIT,ATTR,AUTI,AUTR,BASE,...,REPR,RESUME,SALLES,SCLE,SREP,STYL,SUITE,TECH,TITR,CONCATENATED
1044,JDP-0051,,,,,,,Paysage avec vase de ChrysanthèmesLes Quatre âges,BAUCHANT André,Rose Valland (MNR-Jeu de Paume),...,,,,,,,,,Dahlias au vase rose,REF: JDP-0051 \nPOP_COMMENTAIRES: \nPOP_FLAGS:...
2071,MNR00240,,,,Paris ; musée du Louvre ; département des Pein...,Tournoi,MAITRE DE LA NATIVITE DE KARLSRUHE. MAITRE DES...,,Anonyme,Rose Valland (MNR-Jeu de Paume),...,"scène (bataille, Moyen Age, équestre, cheval, ...",,"26/06/2019 : au rez-de-chaussée, salle Linet",15e siècle,,,,Bois;peinture,Histoire de Camille (panneau de cassone),REF: MNR00240 \nPOP_COMMENTAIRES: \nPOP_FLAGS:...
359,MNR00089,,,,,Portrait d'hommePortrait d'un architecte,RIVALZ ?,,LONGHI Alessandro,Rose Valland (MNR-Jeu de Paume),...,"portrait (Ferracina Bartolomeo, homme, de troi...",,,18e siècle,,,,Toile;peinture à l'huile,Portrait de Bartolomeo Ferracina,REF: MNR00089 \nPOP_COMMENTAIRES: \nPOP_FLAGS:...
1056,JDP-0076,,,,,,,,PISSARRO Camille,Rose Valland (MNR-Jeu de Paume),...,,,,,,,,,Chemin à travers champs,REF: JDP-0076 \nPOP_COMMENTAIRES: \nPOP_FLAGS:...
1830,MNAM0012,,,,Paris ; musée national d'art moderne,Vue de neige#Rue sous la neige,,,UTRILLO Maurice,Rose Valland (MNR-Jeu de Paume),...,,,,1er quart 20e siècle,,,,Toile;peinture à l'huile,Rue de village sous la neige,REF: MNAM0012 \nPOP_COMMENTAIRES: \nPOP_FLAGS:...
1086,JDP-0079,,,,,,,,PICASSO Pablo,Rose Valland (MNR-Jeu de Paume),...,,,,,,,,,Pomme,REF: JDP-0079 \nPOP_COMMENTAIRES: \nPOP_FLAGS:...
1667,ATP00010,,,,Marseille ; musée des civilisations de l'Europ...,,,,Anonyme,Rose Valland (MNR-Jeu de Paume),...,,,,?,,,,Cuivre sur âme acier;gravure,Plaque pour frapper les tissus,REF: ATP00010 \nPOP_COMMENTAIRES: \nPOP_FLAGS:...
1001,JDP-0016,,,,,,,Sémaphore avec tourelle pour drapeauGrands bât...,DE CHIRICO Giorgio,Rose Valland (MNR-Jeu de Paume),...,,,,,,,,,Le Jour de fête,REF: JDP-0016 \nPOP_COMMENTAIRES: \nPOP_FLAGS:...
1669,ATP00012,,,,Marseille ; musée des civilisations de l'Europ...,,,,Anonyme,Rose Valland (MNR-Jeu de Paume),...,,,,18e siècle,,,,Bois (acajou ?);cuivre repoussé;gravure,Canne,REF: ATP00012 \nPOP_COMMENTAIRES: \nPOP_FLAGS:...
1071,JDP-0080,,,,,,,Effet de soir,COROT Camille,Rose Valland (MNR-Jeu de Paume),...,,,,,,,,,Paysage de forêt,REF: JDP-0080 \nPOP_COMMENTAIRES: \nPOP_FLAGS:...
