In [19]:
import pandas as pd
import numpy as np
import requests
from PIL import Image
from io import BytesIO
import os
from transformers import CLIPProcessor, CLIPModel
import torch

In [2]:
df = pd.read_csv('./datasets/exhibition.csv')

In [3]:
df

Unnamed: 0,systemNumber,accessionNumber,objectType,_currentLocation_id,_currentLocation_displayName,_currentLocation_type,_currentLocation_site,_currentLocation_onDisplay,_currentLocation_detail_free,_currentLocation_detail_case,...,marksAndInscriptions_2_note,dimensions_6_dimension,dimensions_6_value,dimensions_6_unit,dimensions_6_qualifier,dimensions_6_date_text,dimensions_6_date_earliest,dimensions_6_date_latest,dimensions_6_part,dimensions_6_note
0,O22904,662 to C-1903,Teapot,THES49865,"Ceramics, Room 145",display,VA,True,,49,...,,,,,,,,,,
1,O187757,C.326-1910,Dish,THES49876,"Ceramics, Room 137, The Curtain Foundation Gal...",display,VA,True,,15,...,,,,,,,,,,
2,O185533,683-1902,Vase,THES49877,"Ceramics, Room 136, The Curtain Foundation Gal...",display,VA,True,,10,...,,,,,,,,,,
3,O162180,C.240-1909,Tankard,THES49787,"Europe 1600-1815, Room 5, The Friends of the V...",display,VA,True,,CA1,...,,,,,,,,,,
4,O151257,3096-1852,Pickle dish,THES49875,"Ceramics, Room 138, The Harry and Carol Djanog...",display,VA,True,,6,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1945,O157075,LOAN:GILBERT.988-2008,Pot,THES49618,In store,storage,VA,False,,,...,,,,,,,,,,
1946,O148514,3722-1901,Tyg,THES49875,"Ceramics, Room 138, The Harry and Carol Djanog...",display,VA,True,,A,...,,,,,,,,,,
1947,O8072,3839-1901,Mug,THES49242,"British Galleries, Room 56, The Djanogly Gallery",display,VA,True,,CA13,...,,,,,,,,,,
1948,O70493,240-1877,Tea bowl,THES49867,"Ceramics, Room 143, The Timothy Sainsbury Gallery",display,VA,True,,20,...,,,,,,,,,,


In [5]:
# 得到图文数据
df_brief = df[['systemNumber', '_images__primary_thumbnail', 'briefDescription']]

In [6]:
df_brief

Unnamed: 0,systemNumber,_images__primary_thumbnail,briefDescription
0,O22904,https://framemark.vam.ac.uk/collections/2006AW...,"Yixing ware. Teapot, brown stoneware in the fo..."
1,O187757,https://framemark.vam.ac.uk/collections/2010EB...,"Yixing ware. Dish, in the form of a peach with..."
2,O185533,https://framemark.vam.ac.uk/collections/2010EB...,"Yixing ware. Vase, brown stoneware with mottle..."
3,O162180,https://framemark.vam.ac.uk/collections/2014GY...,Salt-glazed stoneware tankard with pewter moun...
4,O151257,https://framemark.vam.ac.uk/collections/2008BV...,
...,...,...,...
1945,O157075,https://framemark.vam.ac.uk/collections/2009CP...,"Silver, parcel-gilt, London, (no hallmarks), c..."
1946,O148514,https://framemark.vam.ac.uk/collections/2008BT...,Red earthenware tyg with relief decoration in ...
1947,O8072,https://framemark.vam.ac.uk/collections/2006AJ...,"Delftware mug, painted in blue with birds, flo..."
1948,O70493,https://framemark.vam.ac.uk/collections/2008BT...,"Tea bowl, Raku ware, Kyoto, attributed to Raku..."


In [7]:
save_dir = 'downloaded_images'
os.makedirs(save_dir, exist_ok=True)

In [8]:
# 下载图片
def download_and_save_image(url, save_path):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            with open(save_path, 'wb') as f:
                f.write(response.content)
            return True
        else:
            print(f"Failed to download image from {url}")
            return False
    except Exception as e:
        print(f"Error downloading image from {url}: {e}")
        return False

In [9]:
# 检查图片是否已下载
def has_image_been_downloaded(save_path):
    """Check if the image file already exists at the save path."""
    return os.path.isfile(save_path)

In [12]:
image_paths = []

for idx, url in enumerate(df['_images__primary_thumbnail']):
    if pd.notna(url): 
        save_path = os.path.join(save_dir, f"{df.iloc[idx]['systemNumber']}.jpg")
    
        # 检查图片是否已下载
        if has_image_been_downloaded(save_path):
            # print(f"Image already exists: {save_path}")
            image_paths.append(save_path)
        else:
            if download_and_save_image(url, save_path):
                image_paths.append(save_path)
            else:
                image_paths.append(None) 
    else:
        image_paths.append(None)

In [13]:
file_paths = []
file_path_dict = {}

for root, dirs, files in os.walk(save_dir):
    for file in files:
        if file.endswith('.jpg'):
            system_number = os.path.splitext(file)[0]
            full_path = os.path.join(root, file)
            file_path_dict[system_number] = full_path

# 本地路径加入到df中
df_brief['local_image_path'] = df_brief['systemNumber'].map(file_path_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == "":


In [14]:
df_brief

Unnamed: 0,systemNumber,_images__primary_thumbnail,briefDescription,local_image_path
0,O22904,https://framemark.vam.ac.uk/collections/2006AW...,"Yixing ware. Teapot, brown stoneware in the fo...",downloaded_images\O22904.jpg
1,O187757,https://framemark.vam.ac.uk/collections/2010EB...,"Yixing ware. Dish, in the form of a peach with...",downloaded_images\O187757.jpg
2,O185533,https://framemark.vam.ac.uk/collections/2010EB...,"Yixing ware. Vase, brown stoneware with mottle...",downloaded_images\O185533.jpg
3,O162180,https://framemark.vam.ac.uk/collections/2014GY...,Salt-glazed stoneware tankard with pewter moun...,downloaded_images\O162180.jpg
4,O151257,https://framemark.vam.ac.uk/collections/2008BV...,,downloaded_images\O151257.jpg
...,...,...,...,...
1945,O157075,https://framemark.vam.ac.uk/collections/2009CP...,"Silver, parcel-gilt, London, (no hallmarks), c...",downloaded_images\O157075.jpg
1946,O148514,https://framemark.vam.ac.uk/collections/2008BT...,Red earthenware tyg with relief decoration in ...,downloaded_images\O148514.jpg
1947,O8072,https://framemark.vam.ac.uk/collections/2006AJ...,"Delftware mug, painted in blue with birds, flo...",downloaded_images\O8072.jpg
1948,O70493,https://framemark.vam.ac.uk/collections/2008BT...,"Tea bowl, Raku ware, Kyoto, attributed to Raku...",downloaded_images\O70493.jpg


In [20]:
# 使用训练好的CLIP模型
model = CLIPModel.from_pretrained(r"D:\_HuggingFace_\clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained(r"D:\_HuggingFace_\clip-vit-base-patch32")

In [16]:
# 编码文本
def encode_text(text, max_length=77):
    """
    嵌入到同一特征空间
    """    
    try:
        if not isinstance(text, str):
            raise ValueError("Text input must be a string.")

        text_inputs = processor(
            text=[text],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=max_length
        )
        
        with torch.no_grad():
            text_outputs = model.get_text_features(**text_inputs)
            text_features = text_outputs / text_outputs.norm(dim=-1, keepdim=True)
            text_features_np = text_features.squeeze().numpy()
        
        return text_features_np
    except Exception as e:
        print(f"Error processing text: {e}")
        return None

# 编码图像
def encode_image(image_path):
    """
    嵌入到同一特征空间
    """    
    try:
        if not os.path.isfile(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path}")
        
        image = Image.open(image_path).convert("RGB")
        
        image_inputs = processor(images=[image], return_tensors="pt", padding=True)
        
        with torch.no_grad():
            image_outputs = model.get_image_features(**image_inputs)
            image_features = image_outputs / image_outputs.norm(dim=-1, keepdim=True)
            image_features_np = image_features.squeeze().numpy()
        
        return image_features_np
    except Exception as e:
        print(f"Error processing image: {e}")
        return None

In [17]:
df_brief['text_features'] = None
df_brief['image_features'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
for idx, row in df_brief.iterrows():
    text = row['briefDescription']
    image_path = row['local_image_path']
    
    if pd.notna(text) and isinstance(text, str):
        text_features = encode_text(text)
        df_brief.at[idx, 'text_features'] = text_features
    else:
        1
        # print(f"Skipping text for row {idx} due to missing or invalid text.")

    if pd.notna(image_path) and os.path.isfile(image_path):
        image_features = encode_image(image_path)
        df_brief.at[idx, 'image_features'] = image_features
    else:
        1
        # print(f"Skipping image for row {idx} due to missing or invalid image path.")

In [22]:
# 导出数据
output_file_path = './datasets/exhibition_feature_offline.csv'
df_brief.to_csv(output_file_path, index=False)