In [1]:
import pandas as pd
import numpy as np
import os
from sentence_transformers import SentenceTransformer, util
from PIL import Image
import matplotlib.pyplot as plt

In [2]:
# Client for calling a large language model
class LLMClient:
    """Small wrapper around the `openai` client pointed at a custom server.

    On construction the client lists the models offered by the server at
    ``http://{host}:{port}/v1`` and stores the id of the single model found;
    `chat` and `complete` then send every request to that model.
    """

    def __init__(self, host: str='36.137.79.97', port: int=30250):
        """Connect to the server and select its only model.

        Args:
            host: Host name or IP address of the server.
            port: Port of the server.

        Raises:
            AssertionError: If the server does not list exactly one model.
        """
        # Imported locally because the notebook's import cell does not import
        # `openai`; without this the constructor fails with NameError.
        import openai

        self.client = openai.Client(
            api_key="empty",
            base_url=f"http://{host}:{port}/v1")
        models = self.client.models.list()
        assert len(models.data) == 1, (
            f"Expected exactly one model on the server, found {len(models.data)}")
        self.model = models.data[0].id
        print("Use Model:", self.model)

    @staticmethod
    def _usage_to_dict(usage):
        """Convert a response's usage record to a dict; a missing record becomes {}."""
        return {} if usage is None else dict(usage)

    def chat(self, **kwargs):
        """Send a chat-completion request to the selected model.

        Args:
            **kwargs: Forwarded unchanged to ``client.chat.completions.create``.

        Returns:
            A tuple ``(contents, usage)``: the message content of every
            returned choice, and the usage record as a dict.
        """
        res = self.client.chat.completions.create(
            model=self.model, **kwargs)
        return [choice.message.content for choice in res.choices], self._usage_to_dict(res.usage)

    def complete(self, **kwargs):
        """Send a text-completion request to the selected model.

        Args:
            **kwargs: Forwarded unchanged to ``client.completions.create``.

        Returns:
            A tuple ``(texts, usage)``: the text of every returned choice, and
            the usage record as a dict.
        """
        res = self.client.completions.create(
            model=self.model, **kwargs)
        return [choice.text for choice in res.choices], self._usage_to_dict(res.usage)

In [3]:
# Load the data from the exhibition CSV file
df = pd.read_csv('./datasets/exhibition.csv')  

# Clean the data: drop every row that has a missing value in any of the listed columns
df_cleaned = df.dropna(subset=['briefDescription', 'physicalDescription', '_images__primary_thumbnail'])

In [4]:
# Load the sentence-embedding model.
# The location can be overridden with the SBERT_MODEL_PATH environment variable so the
# notebook can run on machines where the original hard-coded path does not exist;
# when the variable is unset the original path is used.
sbert_model_path = os.environ.get('SBERT_MODEL_PATH', r'D:\_HuggingFace_\bge-large-en-v1.5')
model = SentenceTransformer(sbert_model_path)

In [5]:
def get_semantic_similarity(description: str, column_values: list):
    """Score how closely each usable column value matches ``description``.

    Every value is converted with ``str`` and stripped; values that are empty
    after stripping are dropped before scoring. The returned scores therefore
    line up with the filtered values, not with the positions of the input list.

    Args:
        description: Free-text query.
        column_values: Candidate values to compare against the query.

    Returns:
        An empty list when no usable value remains; otherwise the first row of
        ``util.pytorch_cos_sim`` applied to the query embedding and the
        embeddings of the kept values.
    """
    # Normalise to stripped strings and discard blanks
    usable_texts = []
    for raw_value in column_values:
        text = str(raw_value).strip()
        if text != '':
            usable_texts.append(text)

    # Nothing left to compare against
    if not usable_texts:
        return []

    # Embed the query and the candidates with the module-level model
    query_embedding = model.encode(description, convert_to_tensor=True)
    candidate_embeddings = model.encode(usable_texts, convert_to_tensor=True)

    # Cosine similarity of the query against every candidate
    return util.pytorch_cos_sim(query_embedding, candidate_embeddings)[0]

In [6]:
# Find the most similar row for each searched column
def get_most_similar_row(description: str, columns: list, df: pd.DataFrame, similarity_threshold=0.2):
    """Return, for each searched column, the row whose text best matches ``description``.

    Args:
        description: Free-text query.
        columns: Names of the columns to search; names not present in ``df``
            are ignored.
        df: Frame holding the searched columns and the result columns.
        similarity_threshold: A column is skipped when its best score is below
            this value.

    Returns:
        Dict mapping each column that produced a match to a one-row DataFrame
        with the columns ``systemNumber``, ``galleryLabels_0_text``,
        ``briefDescription``, ``physicalDescription`` and
        ``_images__primary_thumbnail``.
    """
    result_columns = ['systemNumber', 'galleryLabels_0_text', 'briefDescription', 'physicalDescription', '_images__primary_thumbnail']
    most_similar_row = {}

    for column in columns:
        if column not in df.columns:
            continue

        # Record the row position of every usable text. get_semantic_similarity
        # drops blank values, so its scores are indexed by the filtered list;
        # filtering here first keeps a score index -> row position mapping and
        # prevents the wrong row from being returned. Missing cells are skipped
        # too, so they are not scored as the literal text "nan".
        row_positions = []
        texts = []
        for position, value in enumerate(df[column].tolist()):
            if pd.isna(value):
                continue
            text = str(value).strip()
            if text == '':
                continue
            row_positions.append(position)
            texts.append(text)

        similarity_scores = get_semantic_similarity(description, texts) if texts else []

        # No usable value in this column
        if len(similarity_scores) == 0:
            print(f"Warning: No valid similarity scores for column '{column}' (empty or invalid values).")
            continue

        # Skip the column when even its best match is below the threshold
        if similarity_scores.max() < similarity_threshold:
            print(f"Warning: All similarity scores for column '{column}' are below the threshold.")
            continue

        # Convert to a plain int before indexing, then map the score index
        # back to the row position in df
        best_score_index = int(similarity_scores.argmax())
        best_row_position = row_positions[best_score_index]

        # Store the best-matching row for this column
        most_similar_row[column] = df.iloc[[best_row_position]][result_columns]

    return most_similar_row

In [7]:
# Show the results
def display_results(most_similar_row):
    """Print the matched record(s) stored for every searched column.

    Args:
        most_similar_row: Mapping of column name to a DataFrame whose rows
            provide ``systemNumber``, ``galleryLabels_0_text``,
            ``briefDescription``, ``physicalDescription`` and
            ``_images__primary_thumbnail``.
    """
    for column in most_similar_row:
        matches = most_similar_row[column]
        print(f"\nTop result for column: {column}")

        for _, row in matches.iterrows():
            print(f"System Number: {row['systemNumber']}")
            print(f"Gallery Label: {row['galleryLabels_0_text']}")
            print(f"Brief Description: {row['briefDescription']}")
            print(f"Physical Description: {row['physicalDescription']}")

            # The image hyperlink is read from the '_images__primary_thumbnail' column
            print(f"Image Link: {row['_images__primary_thumbnail']}")

            # Separator after each record
            print("=" * 40)

In [8]:
def get_relevant_artifacts(df: pd.DataFrame, description: "str | None" = None):
    """Search ``df`` for the artifacts that best match a description and print them.

    Args:
        df: Frame to search; passed unchanged to ``get_most_similar_row``.
        description: Query text. When omitted, the text is read interactively
            with ``input()``. Passing it explicitly lets the search run
            without a prompt, so the notebook can be re-run unattended.

    Returns:
        None. Results are printed by ``display_results``.
    """
    if description is None:
        description = input("请输入文物的描述信息：")

    # A blank query has nothing to match against; report it instead of
    # printing arbitrary "top" results.
    if not description.strip():
        print("Warning: Empty description; nothing to search for.")
        return

    # Columns to search
    columns_to_check = ['galleryLabels_0_text', 'briefDescription', 'physicalDescription']

    most_similar_row = get_most_similar_row(description, columns_to_check, df)

    display_results(most_similar_row)

# Run the search on the cleaned data; the description is requested via input()
get_relevant_artifacts(df_cleaned)

请输入文物的描述信息：Yixing ware. Dish, in the form of a peach with peach flower relief decoration, stoneware with white glaze, China, Qing dynasty, 18th century. Yixing ware. Vase, brown stoneware with mottled blue-green glaze, after the style of Song dynasty Jun ware, China, Qing dynasty, 18th century.

Top result for column: galleryLabels_0_text
System Number: O1726884
Gallery Label: 31.08 Vase; garden or flower pot decorated with dark brown slip; engine -turned and rouletted, applied swags and bows. White terra cotta stoneware with pearl glaze

c. 1785
Brief Description: Garden pot, white terra cotta stoneware decorated with dark brown slip and applied swags and bows, Josiah Wedgwood & sons, Etruria, ca. 1785
Physical Description: White terra cotta flower pot decorated with dark brown slip, engine-turned and rouletted, with applied swags and bows decoration
Image Link: https://framemark.vam.ac.uk/collections/2024NX4012/full/!100,100/0/default.jpg

Top result for column: briefDescription
Syst