In [15]:
import pandas as pd
from PIL import Image

import sys
import os


# Make the repository root (the parent of the notebook's working directory)
# importable so that the `src` package can be imported below.
# `__file__` is not defined inside a notebook; the previous code passed the
# *string* "__file__" to abspath(), which only resolved to the current working
# directory by accident. Use the working directory explicitly instead.
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
# Guard the append so re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from src.utils.download import download_image_in_memory, download_slider_from_page, download_images_from_page
from src.utils.processing_df import find_lostart_csvs

### Download Lost Art Images

In [18]:
# Lost Art IDs for which images should be downloaded.
found_lost_art = [589707, 589708, 614072, 526702, 567247, 429210]

# Look up the CSV rows that correspond to those IDs.
df = find_lostart_csvs(found_lost_art)

# NOTE(review): the recorded output shows files being written to
# ../data/images/lostart/lostart/, so the helper appears to append its own
# "lostart" sub-folder to download_dir — confirm whether "../data/images" was intended.
for lost_art_id, page_link in df[["Lost Art ID", "Link"]].to_numpy():
    download_images_from_page(page_link, lost_art_id, download_dir="../data/images/lostart")


Image downloaded and saved as ../data/images/lostart/lostart/589707.jpg
Image downloaded and saved as ../data/images/lostart/lostart/589708.jpg
Image downloaded and saved as ../data/images/lostart/lostart/614072.jpg
Image downloaded and saved as ../data/images/lostart/lostart/526702.jpg
Image downloaded and saved as ../data/images/lostart/lostart/567247.jpg
Image downloaded and saved as ../data/images/lostart/lostart/429210.jpg


In [6]:
# Load the MNR spreadsheet export (OpenDocument file) and preview its first rows.
mnr = pd.read_excel(io="../data/mnr_20250303.ods")
mnr.head(n=5)

Unnamed: 0,REF,POP_COMMENTAIRES,POP_FLAGS,POP_IMPORT,AATT,AFFE,ATIT,ATTR,AUTI,AUTR,...,REPR,RESUME,SALLES,SCLE,SREP,STYL,SUITE,TECH,TITR,VIDEO
0,MNR00329,,,6319fd4a4e2f920b275837a5;6319fd4a4e2f920b27583...,,Paris ; musée du Louvre ; département des Pein...,Saint Jérôme ; Saint Antoine,,,RIBERA Jusepe de (d'après),...,"figure (saint Jérôme, pénitence, crâne, nudité)",,,17e siècle,,,,Toile;peinture à l'huile,Saint Jérôme pénitent,mnr/MNR00329/MNR329-copyright-RMN-Grand_Palais...
1,MNR00330,,,6319fd4a4e2f920b275837a5;6319fd4a4e2f920b27583...,,Paris ; musée du Louvre ; département des Pein...,Bethléem,,,COLLANTES Francisco,...,"scène biblique (Sainte Famille, arrivée, Bethl...",,,17e siècle,Bible : Nouveau Testament,,,Toile;peinture à l'huile,Arrivée à Bethléem,mnr/MNR00330/MNR330_CopyrightRMN-GP_Franck-Rau...
2,MNR00331,,,6319fd4a4e2f920b275837a5;6319fd4a4e2f920b27583...,,Paris ; musée du Louvre ; département des Pein...,Les Beautés au balcon#Les Belles au balcon,,,"LUCAS Y VELAZQUEZ, Eugenio (autrefois LUCAS Y ...",...,"scène (homme : assis, femme : assis, espagnol,...",,12/09/2019 : oui,19e siècle,,,,Toile;peinture à l'huile,Hommes et femmes espagnols à une tribune,mnr/MNR00331/MNR331_copyright-MuseeduLouvre_1....
3,MNR00332,,,6319fd4a4e2f920b275837a5;6319fd4a4e2f920b27583...,,Paris ; musée du Louvre ; département des Pein...,Évêque avec sainte Agnès#Composition religieus...,VALDES Léal ; SHUT Cornelis.,,THULDEN Théodore van ? Anonyme,...,"figures (saint François d'Assise, saint August...",,"25/09/2019 : non, mais sera exposé après rénov...",17e siècle,,,,Toile;peinture à l'huile,"Religieuse avec saint Augustin, sainte Agnès e...",mnr/MNR00332/MNR332-copyright-Musee_Goya_Castr...
4,MNR00333,,,6319fd4a4e2f920b275837a5;6319fd4a4e2f920b27583...,,Paris ; musée du Louvre ; département des Pein...,Portrait d'homme puis Portrait d'un écrivain,"RAEBURN, Sir Henry",,REYNOLDS Sir Joshua,...,"portrait (Chambers William, homme, en buste, d...",,,18e siècle,,,,Bois;peinture à l'huile,Portrait de Sir William Chambers (1723-1796),mnr/MNR00333/MNR333-copyright_Musees_de_la_vil...


In [None]:
# Walk over every reference of the MNR table and download the image(s) shown
# on the matching POP notice page.
for ref_code in mnr["REF"]:
    print(ref_code)
    download_slider_from_page(
        "https://pop.culture.gouv.fr/notice/mnr/{}".format(ref_code),  # notice URL for this reference
        ref_code,
        download_dir="../data/images",
        only_first=True,  # presumably keeps only the first slider image — confirm in the helper
        verbose=True,
    )

In [7]:
# Project-local wrapper used later in this notebook to compute image embeddings.
from src.Embedding import ImageEmbeddingFromPretrained

# Instantiate the embedding model with its default arguments; `model` is reused
# by the embedding cell below (model.get_cls_embedding).
model = ImageEmbeddingFromPretrained()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [24]:
import os
from pathlib import Path
import torch
from PIL import Image, UnidentifiedImageError

# Compute one CLS embedding per downloaded MNR image and save them together with
# their references, so that embeddings_tensor[i] belongs to refs[i].
embeddings = []
refs = []

img_dir = Path("../data/images/mnr")
# glob() yields files in an arbitrary, filesystem-dependent order; sorting makes
# the saved refs/embeddings order reproducible across runs and machines.
for img_path in sorted(img_dir.glob("*.jpg")):
    try:
        # Downloads are time-limited, so some image files are truncated or unreadable.
        embedding = model.get_cls_embedding(str(img_path))
    except (OSError, UnidentifiedImageError) as e:
        print(f"Image corrompue ignorée : {img_path.name} ({e})")
        continue
    # Append both lists together so they always stay aligned.
    embeddings.append(embedding)
    refs.append(img_path.stem)  # stem = filename without suffix

# torch.stack() fails with an opaque error on an empty list; fail early with an
# actionable message instead (same exception type as before).
if not embeddings:
    raise RuntimeError(f"No readable *.jpg image found in {img_dir}; nothing to embed.")

embeddings_tensor = torch.stack(embeddings)  # shape (N, 1, D)

save_dict = {
    "refs": refs,
    "embeddings": embeddings_tensor
}

torch.save(save_dict, "../data/mnr_embeddings.pt")

Image corrompue ignorée : MNR00342.jpg (image file is truncated (1 bytes not processed))
Image corrompue ignorée : MNR00341.jpg (image file is truncated (5 bytes not processed))
Image corrompue ignorée : OAR00366.jpg (cannot identify image file '../data/images/mnr/OAR00366.jpg')
Image corrompue ignorée : OAR00274.jpg (image file is truncated (41 bytes not processed))
Image corrompue ignorée : OAR00516.jpg (image file is truncated (63 bytes not processed))
Image corrompue ignorée : MNR00333.jpg (image file is truncated (42 bytes not processed))
Image corrompue ignorée : OAR00517.jpg (image file is truncated (63 bytes not processed))




Image corrompue ignorée : OAR00518.jpg (image file is truncated (63 bytes not processed))
Image corrompue ignorée : OAR00256.jpg (image file is truncated (11 bytes not processed))
Image corrompue ignorée : OAR00283.jpg (image file is truncated (75 bytes not processed))
Image corrompue ignorée : OAR00241.jpg (image file is truncated (33 bytes not processed))
Image corrompue ignorée : OAR00250.jpg (image file is truncated (50 bytes not processed))




Image corrompue ignorée : MNR00339.jpg (image file is truncated (1 bytes not processed))
Image corrompue ignorée : MNR00377.jpg (image file is truncated (53 bytes not processed))
Image corrompue ignorée : OAR00425.jpg (cannot identify image file '../data/images/mnr/OAR00425.jpg')
Image corrompue ignorée : OAR00357.jpg (image file is truncated (43 bytes not processed))
Image corrompue ignorée : OAR00424.jpg (cannot identify image file '../data/images/mnr/OAR00424.jpg')
Image corrompue ignorée : OAR00342.jpg (image file is truncated (29 bytes not processed))
Image corrompue ignorée : OAR00232.jpg (image file is truncated (55 bytes not processed))
Image corrompue ignorée : OAR00233.jpg (image file is truncated (55 bytes not processed))
Image corrompue ignorée : OAR00423.jpg (cannot identify image file '../data/images/mnr/OAR00423.jpg')
Image corrompue ignorée : MNR00372.jpg (image file is truncated (19 bytes not processed))
Image corrompue ignorée : OAR00230.jpg (image file is truncated (

In [27]:
# Reload the embeddings written by the embedding cell of this notebook.
# The path must match the one passed to torch.save() there; the previous
# "mnr_cls_embeddings.pt" is not produced anywhere in this notebook, so a
# Restart & Run All would fail or silently read a stale file.
# NOTE: torch.load() unpickles the file — only load files you produced yourself.
data = torch.load("../data/mnr_embeddings.pt")
refs = data["refs"]
embeddings = data["embeddings"]  # shape (N, 1, D)

# Row i of `embeddings` must correspond to refs[i].
assert len(refs) == embeddings.shape[0], "refs and embeddings are misaligned"

print(f"Shape of embeddings: {embeddings.shape}")  # (N, 1, D)

Shape of embeddings: torch.Size([1586, 1, 384])
