# LAD Pipeline

In [None]:
from openai import OpenAI 
import pandas as pd
import base64
import requests
import json
import os
from enum import Enum
import re

## Stage I: Perception
* Image description
* Keywords

In [None]:
# Stage I image-description prompt (Chinese). The template contains no
# format placeholders. Presumably passed as `prompt1` to GPT_p1_1 — confirm
# against the caller.
prompt1_1_zh = '''
请结合中文文化背景，用一段话描述图片。你应该考虑图片中的角色、文字、颜色和布局。重点关注文字和图片中的重要元素。尽量简洁，同时确保描述的准确性。
'''

# English counterpart of the image-description prompt (no placeholders).
prompt1_1_en = '''
Please provide a description of the image in a paragraph. You should consider the role, text, color, and layout of the image. Focus on the text and important elements in the image. Try to be concise while ensuring the correctness of the description.
'''

In [None]:
# Stage I keyword prompt (Chinese). Contains exactly one `{}` placeholder,
# which receives the image description via str.format.
prompt1_2_zh = '''
请结合中文文化语境，根据图片描述提供七个与隐喻最相关的关键词。你应该考虑图片描述中的情感、领域和修辞手法。重点关注图片描述中的图片文字和重要实体。请注意，可能存在谐音梗和双关语，请识别并提供它们，但不要重复相同的元素。深呼吸并一步步思考，仅输出关键词。
图片描述：{}
'''

# English counterpart of the keyword prompt (one `{}` placeholder for the
# image description).
prompt1_2_en = '''
Please provide seven keywords most related to the metaphor based on the image description. You should consider the emotion, domain, and rhetoric in the image description. Focus on the image’s text and important entities in the image description. Note that there may be homophonic memes and puns, distinguish and provide them but do not repeat the same element. Take a deep breath and think step by step, only output the keywords.
Image description: {}
'''

In [None]:
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode; the base64 bytes are decoded as UTF-8
    so the result is a plain ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

In [None]:
def GPT_p1_1(prompt1, url, timeout=120):
    """Ask the chat-completions endpoint to describe an image.

    Args:
        prompt1: Instruction text sent alongside the image.
        url: Path of a local image file. Despite the name it is read from
            disk by ``encode_image`` and inlined as a base64 data URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The message content of the first choice in the JSON response.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-success HTTP status.
    """
    base64_image = encode_image(url)
    messages = [
        {"role": "user", "content": [
            {"type": "text", "text": prompt1},
            # The data URL always declares image/jpeg, whatever the file type.
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}", "detail": "high"}}
        ]}
    ]

    # Placeholders: fill in the key and endpoint before running.
    api_key = ""
    proxy_api_url = ''
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'}
    data = {'model': 'gpt-4o-mini', 'messages': messages, 'temperature': 0.7, 'top_p': 0.9}
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.post(proxy_api_url, headers=headers, json=data, timeout=timeout)
    # Fail with the HTTP error itself instead of a KeyError on the error body.
    response.raise_for_status()
    image_dep = response.json()["choices"][0]["message"]["content"]
    return image_dep

def GPT_p1_2(prompt2, image_dep, timeout=120):
    """Ask the chat-completions endpoint for keywords from an image description.

    Args:
        prompt2: Template with one ``{}`` placeholder for the description.
        image_dep: Image description inserted into ``prompt2``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The message content of the first choice in the JSON response.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-success HTTP status.
    """
    messages = [
        {"role": "user", "content": prompt2.format(image_dep)}
    ]

    # Placeholders: fill in the key and endpoint before running.
    api_key = ""
    proxy_api_url = ''
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'}
    data = {'model': 'gpt-4o-mini', 'messages': messages, 'temperature': 0.7, 'top_p': 0.9}
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.post(proxy_api_url, headers=headers, json=data, timeout=timeout)
    # Fail with the HTTP error itself instead of a KeyError on the error body.
    response.raise_for_status()
    keywords = response.json()["choices"][0]["message"]["content"]
    return keywords

## Stage II：Search
* Construct search questions
* Self-judge
* ModelSearch
* WebSearch
* Rank
* Summary


In [None]:
# Stage II prompt (Chinese) that asks the model to turn keywords into five
# search questions. Contains exactly one `{}` placeholder (the keywords).
# The example output inside the prompt is wrapped in `"""` lines, so the
# model may echo those delimiter lines in its answer.
prompt2_1_zh = '''
# Role
你是一位熟悉网络文化和梗的研究员，擅长挖掘和解析网络梗的深层含义。
## Attention
你负责基于关键词，设计出能够精确检索到梗文化或图片隐喻的检索问题。

## Skills
### Skill 1: 网络搜索技能
- 能够分析提供的关键词并选择最相关的关键词，理解其在隐喻文化中可能的含义和应用

### Skill 2: 文化分析
- 能够深入挖掘关键词在网络文化中的背景和历史，通过信息整理和分析，找出关键词背后的深层含义

### Skill 3: 隐喻理解
- 能够理解和分析关键词在图片中的隐喻意义，更好地理解梗图的文化背景

## Workflow:
1. 分析提供的关键词并选择最相关的关键词，理解其在隐喻文化中可能的含义和应用。
2. 设计具体且有针对性的检索问题，以提高搜索的精确度。
3. 确保检索问题能够引导用户找到与梗图相关的文化背景或隐喻解释。

## Constraints
- 检索问题需要围绕梗文化或图片隐喻，避免过于广泛或不相关的搜索结果
- 检索问题需要考虑关键词的组合，尽量避免单一关键词检索
- 检索问题总数为5个，请设计最相关的问题

## Example: 
输入：
"""
关键词：讽刺，环保，浪费，对比，空喊口号，行动缺失，言行不一致
"""
输出：
"""
1. "环保口号与实际行动不符的具体表现？"

2. "网络上流行的环保与浪费对比梗图的深刻内涵？"

3. "讽刺环保口号与行动缺失的双重信息图片？"

4. "环保主题中口号与实际行为矛盾的讽刺性对比图？"

5. "网络梗图如何展现环保口号与浪费行为的讽刺对比？"
"""

## Solve:
关键词：{}
'''

# Stage II prompt (English) that asks the model to turn keywords into five
# search questions. Contains exactly one `{}` placeholder (the keywords).
# The example output inside the prompt is wrapped in `"""` lines, so the
# model may echo those delimiter lines in its answer.
prompt2_1_en = '''
# Role
You are a researcher familiar with internet culture and memes, skilled at uncovering and analyzing the deeper meanings of internet memes.
## Attention
Your responsibility is to design precise search questions based on keywords that can accurately retrieve meme culture or image metaphors.

## Skills
### Skill 1: Internet Search Skills
- Analyze the provided keywords and select the most relevant ones, understanding their potential meanings and applications in metaphorical culture

### Skill 2: Cultural Analysis
- Deeply explore the background and history of keywords in internet culture, through information organization and analysis, to uncover the deeper meanings behind the keywords

### Skill 3: Metaphor Comprehension
- Understand and analyze the metaphorical meanings of keywords in images, to better comprehend the cultural background of meme images

## Workflow:
1. Analyze the provided keywords and select the most relevant ones, understanding their potential meanings and applications in metaphorical culture.
2. Design specific and targeted search questions to improve the accuracy of the search.
3. Ensure that the search questions can guide users to find cultural backgrounds or metaphorical explanations related to the meme images.

## Constraints
- Search questions should focus on meme culture or image metaphors, avoiding overly broad or irrelevant search results
- Search questions should consider keyword combinations, avoiding single keyword searches as much as possible
- The total number of search questions should be 5, please design the most relevant questions

## Example:
Input:
"""
Keywords: irony, environmental protection, waste, contrast, empty slogans, lack of action, inconsistency between words and deeds
"""
Output:
"""
1. "What are the specific manifestations of the inconsistency between environmental protection slogans and actual actions? "

2. "What is the profound connotation of the popular Internet pictures comparing environmental protection and waste? "

3. "Dual information pictures satirizing environmental protection slogans and lack of action? "

4. "Ironic contrast pictures of the contradiction between slogans and actual actions in environmental protection themes? "

5. "How do Internet pictures show the ironic contrast between environmental protection slogans and wasteful behavior? "
"""

## Solve:
keywords: {}
'''

In [None]:
# Self-judge prompt (Chinese): asks for a 1-5 confidence score plus a
# ModelSearch/WebSearch decision in the form "[score, decision]", which is
# the shape extract_score's regex looks for. One `{}` placeholder (the
# question).
# NOTE(review): "<显示>" in the workflow is paired with "<隐式>" and is
# probably a typo for "<显式>" (explicit) — confirm before changing the text.
prompt2_2_zh = '''
# Task
请自我评估知识水平，判断问题是否适合直接回答或需要外部知识支持，并根据评分准则提供置信度分数。

## Evaluation Standard
- 若问题中包含网络文化和梗文化，则置信度分数小于3
- 若问题中包含的知识流行度较高，则置信度分数小于3
- 若问题中包含的实时性内容占比多，则置信度分数小于3
- 若问题中包含的实体相对小众，则置信度分数小于3
- 若问题中包含的实体多于2个，则置信度分数小于3

## Workflow:
  1. <隐式>分析问题内容，识别是否包含网络文化和梗文化元素。
  2. <隐式>评估问题的流行度和实时性内容占比。
  3. 根据评分准则，<显示>提供置信度分数和决策。
  
## Constraints
- 置信度分数必须在1-5分，分数大于3时选择ModelSearch，分数小于等于3时选择WebSearch

## OutputFormat: 
[置信度分数(1-5分), 决策(ModelSearch/WebSearch)]

## Solve:
问题：{}
'''

# Self-judge prompt (English): asks for a 1-5 confidence score plus a
# ModelSearch/WebSearch decision in the form "[score, decision]", which is
# the shape extract_score's regex looks for. One `{}` placeholder (the
# question).
prompt2_2_en = '''
# Task
Please evaluate your knowledge level, determine whether the question is suitable for direct answering or requires external knowledge support, and provide a confidence score according to the Evaluation Standard.

## Evaluation Standard
- If the question contains Internet culture and meme culture, the confidence score is less than 3
- If the knowledge contained in the question is highly popular, the confidence score is less than 3
- If the real-time content contained in the question accounts for a large proportion, the confidence score is less than 3
- If the entity contained in the question is relatively niche, the confidence score is less than 3
- If the question contains more than 2 entities, the confidence score is less than 3

## Workflow:
1. <Implicitly> Analyze the content of the question and identify whether it contains Internet culture and meme culture elements.
2. <Implicitly> Evaluate the popularity of the question and the proportion of real-time content.
3. According to the scoring criteria, <Explicitly> provide a confidence score and decision.

## Constraints
- The confidence score must be between 1 and 5. If the score is bigger than 3, select ModelSearch. If the score is smaller than or equal to 3, select WebSearch.

## OutputFormat:
[Confidence score (1-5 points), Decision (ModelSearch/WebSearch)]

## Solve:
Question: {}
'''

In [None]:
# ModelSearch prompt (Chinese): asks the model to answer a search question
# about meme culture or image metaphors from its own knowledge. One `{}`
# placeholder (the question).
prompt2_3_zh = '''
# Role
你是一位熟悉网络文化和梗的研究员，擅长挖掘和解析网络梗的深层含义。
## Attention
你负责基于网络文化和梗文化的问题，提供关于网络梗和图片隐喻的清晰、专业、全面的解释，帮助用户理解其背后的文化含义和社会影响。

## Skills
### Skill 1: 隐喻理解
- 能够根据图片分析准确识别图片中涉及的梗或隐喻，并能提取深层含义

### Skill 2: 网络文化分析
- 能够识别经典影视角色、经典影视桥段并分析梗文化或图片隐喻的起源、发展和当前流行状态，让用户更好地理解梗图的文化背景

### Skill 3: 提供解释
- 能够为用户提供清晰、专业的解释，解释梗或图片隐喻背后的文化含义和社会影响
- 能够提供结构化的、易于理解的回答

## Workflow:
1. 确定用户提出的问题中涉及的梗或图片隐喻。
2. 分析梗或图片隐喻的起源、发展和当前流行状态。
3. 解释梗或图片隐喻背后的文化含义和社会影响。
4. 提供结构化的、易于理解的回答。

## Constraints
- 回答内容需要逻辑清晰，层次分明，确保读者易于理解
- 回答内容需要围绕梗文化或图片隐喻
- 回答部分需要全面且完备，不要出现"基于上述内容"等模糊表达，确保信息的可信度
- 语言风格需要专业、严谨，避免口语化表达
- 保持统一的语法和词汇使用，确保整体文档的一致性和连贯性

## Solve:
问题：{}
'''

# ModelSearch prompt (English): asks the model to answer a search question
# about meme culture or image metaphors from its own knowledge. One `{}`
# placeholder (the question).
prompt2_3_en = '''
# Role
You are a researcher familiar with internet culture and memes, skilled at uncovering and analyzing the deeper meanings of internet memes.
## Attention
You are responsible for providing clear, professional and comprehensive explanations of Internet memes and image metaphors based on Internet culture and meme culture, helping users understand the cultural meaning and social impact behind them.

## Skills
### Skill 1: Metaphor Understanding
- Ability to accurately identify the memes or metaphors involved in the picture based on picture analysis, and be able to extract the deep meaning

### Skill 2: Internet Culture Analysis
- Ability to analyze the origin, development and current popularity of memes or image metaphors, so that users can better understand the cultural background of memes

### Skill 3: Providing Explanations
- Ability to provide users with clear and professional explanations of the cultural meaning and social impact behind memes or image metaphors
- Ability to provide structured and easy-to-understand answers

## Workflow:
1. Identify the meme or picture metaphor involved in the question raised by the user.
2. Analyze the origin, development and current popularity of the meme or picture metaphor.
3. Explain the cultural meaning and social impact behind the meme or picture metaphor.
4. Provide a structured and easy-to-understand answer.

## Constraints
- The answer content needs to be logically clear and well-structured to ensure that readers can understand it easily
- The answer content needs to focus on the meme culture or picture metaphor
- The answer part needs to be comprehensive and complete, and there should be no vague expressions such as "based on the above content" to ensure the credibility of the information
- The language style needs to be professional and rigorous, and avoid colloquial expressions
- Maintain uniform grammar and vocabulary to ensure the consistency and coherence of the overall document

## Solve:
Question: {}
'''

In [None]:
# Ranking prompt (Chinese): asks the model to pick the three question-summary
# pairs most relevant to the image description. Two `{}` placeholders, in
# order: image description, question-summary pairs. The example output uses
# "问题：" lines, which is what search_result_top_zh searches for.
# NOTE(review): "<显示>" in the workflow is paired with "<隐式>" and is
# probably a typo for "<显式>" (explicit) — confirm before changing the text.
prompt2_4_zh = '''
# Role
你是一位熟悉网络文化和梗的研究员，擅长从图像中提取深层含义，并将这些含义与相关的问题和总结进行匹配和排序。
## Attention
你负责根据图片描述，对多个问题-总结对进行相关性排序，并选择最相关的3个，帮助用户理解图片的深层含义。

## Skills
### Skill 1: 图片分析
- 能够根据用户对图片的描述进行全面分析

### Skill 2: 隐喻理解
- 能够根据图片分析准确识别图片中的隐喻，并能提取深层含义

### Skill 3: 相关性排序
- 能够根据图片的隐喻理解，针对问题-总结对的内容是否与图片隐喻相关，对问题-总结对进行相关性排序
- 能够提供对应相关性排序理由，并分析该问题-总结对中低相关内容

## Workflow:
  1. 仔细分析图片描述，<隐式>提取图片中的隐喻和关键元素。
  2. 对每个问题-总结对进行分析，<隐式>评估其与图片隐喻的相关性。
  3. 根据相关性大小，<隐式>对问题-总结对进行排序。
  4. <显示>输出排序后最相关的3个问题-总结对和排序理由。

## Constraints
- 排序应基于图片隐喻的相关性，确保排序结果的准确性和逻辑性
- 若问题-总结对中某部分与图片隐喻无关，请在对应理由中提供
- 仅输出排序后的最相关的3个问题-总结对和排序理由，避免使用markdown格式

## Examples:
输入：
"""
图片描述："一个人站在分岔路口"，问题-总结对："1. 问题："人生选择的重要性"; 2. 问题："面对困难时的决策过程"; 3. 问题："城市规划的复杂性"; 4. 问题："旅行中的导航技巧"; 5. 问题："个人成长与自我发现""
"""
输出：
"""
1. 问题：人生选择的重要性
  - 排序理由：xxx
2. 问题：面对困难时的决策过程
  - 排序理由：xxx
3. 问题：个人成长与自我发现
  - 排序理由：xxx
"""

## Solve:
图片描述：{}
问题-总结对：{}
'''

# Ranking prompt (English): asks the model to pick the three question-summary
# pairs most relevant to the image description. Two `{}` placeholders, in
# order: image description, question-summary pairs. The example output uses
# "Question: " lines, which is what search_result_top_en searches for.
# NOTE(review): "<display>" in workflow step 4 sits where the other prompts
# use "<Explicitly>"; probably meant "<explicitly>" — confirm before changing.
prompt2_4_en = '''
# Role
You are a researcher familiar with Internet culture and memes, and are good at extracting deep meanings from images, and matching and sorting these meanings with relevant questions and summaries.
## Attention
You are responsible for sorting the relevance of multiple question-summary pairs based on the image description, and selecting the three most relevant ones to help users understand the deep meaning of the image.

## Skills
### Skill 1: Image Analysis
- Ability to conduct a comprehensive analysis based on the user's description of the image

### Skill 2: Metaphor Understanding
- Ability to accurately identify metaphors in images based on image analysis and extract deep meaning

### Skill 3: Relevance Sorting
- Ability to sort the relevance of question-summary pairs based on the metaphorical understanding of the image, based on whether the content of the question-summary pair is related to the image metaphor
- Ability to provide reasons for the corresponding relevance sorting, and analyze the low-relevance content of the question-summary pair

## Workflow:
1. Carefully analyze the image description and <implicitly> extract the metaphors and key elements in the image.
2. Analyze each question-summary pair and <implicitly> evaluate its relevance to the image metaphor.
3. According to the relevance, <implicitly> sort the question-summary pairs.
4. <display> Output the top 3 most relevant question-summary pairs after sorting and the reasons for sorting.

## Constraints
- Sorting should be based on the relevance of the image metaphor to ensure the accuracy and logic of the sorting results
- If a part of the question-summary pair is not related to the image metaphor, please provide it in the corresponding reason
- Only output the top 3 most relevant question-summary pairs after sorting and the reasons for sorting, avoid using markdown format

## Examples:
Input:
"""
Image Description: "A person standing at a crossroads", Question-Summary Pair: "1. Question: "The importance of life choices"; 2. Question: "Decision-making process in the face of difficulties"; 3. Question: "Complexity of urban planning"; 4. Question: "Navigation skills in travel"; 5. Question: "Personal growth and self-discovery""
"""
Output:
"""
1. Question: The importance of life choices
  - Sorting Reason: xxx
2. Question: Decision-making process in the face of difficulties
  - Sorting Reason: xxx
3. Question: Personal growth and self-discovery
  - Sorting Reason: xxx
"""

## Solve:
Image Description: {}
Question-Summary Pair: {}
'''

In [None]:
# Summary prompt (Chinese): asks the model to merge and rewrite the selected
# question-summary pairs into one passage without markdown. Two `{}`
# placeholders, in order: relevance ranking result, most relevant pairs.
prompt2_5_zh = '''
# Role
你是一位熟悉网络文化和梗的研究员，擅长根据相关性排序理由，对选定的“问题-总结对”进行改写，以提高内容的吸引力和信息的准确性。
## Attention
你负责根据相关性排序理由，对选定的“问题-总结对”进行改写，去掉重复语意、与网络文化和梗文化相关度低的部分，以及去除markdown和引用格式符，最终输出一篇融合并改写后的内容。

## Skills 
### Skill 1: 网络文化洞察
- 能够深入理解经典影视角色、经典影视桥段和梗文化，准确把握网络热点和流行趋势

### Skill 2: 内容分析与改写
- 能够对“问题-总结对”进行细致分析，识别与网络文化和梗文化的相关性
- 能够根据分析结果，对内容进行创造性改写，提高信息的准确性和吸引力

### Skill 3: 格式识别与处理
- 能够识别并去除markdown和引用格式符，确保输出内容的简洁性和易读性
- 能够整合改写后的内容，用适当的过渡语句连接，形成一篇连贯、流畅的文章

## Workflow:
1. 阅读并理解用户提供的相关性排序理由和选定的“问题-总结对”。
2. 根据排序理由，分析每个“问题-总结对”与网络文化和梗文化的关联度。
3. 对选定的3个“问题-总结对”进行改写，去掉重复语意、相关度低的部分，以及去除格式符。
4. 将改写后的内容进行整合，用适当的过渡语句连接，形成一篇连贯、流畅的文章。
  
## Constraints
- 改写后的内容应保持原意，不得改变原有的信息和观点
- 语言应简洁明了且易于理解，与图片隐喻相关
- 不得使用任何形式的markdown和引用格式符

## Solve:
相关性排序结果：{}
最相关的问题-总结对：{}
'''

# Summary prompt (English): asks the model to merge and rewrite the selected
# question-summary pairs into one passage without markdown. Two `{}`
# placeholders, in order: relevance sorting results, most relevant pairs.
prompt2_5_en = '''
# Role
You are a researcher familiar with Internet culture and memes, and are good at rewriting the selected "question-summary pairs" according to the relevance sorting reasons to improve the appeal of the content and the accuracy of the information.
## Attention
You are responsible for rewriting the selected "question-summary pairs" according to the relevance ranking reasons, removing repeated semantics, parts with low relevance to Internet culture and meme culture, and removing markdown and citation formatting symbols, and finally outputting a fused and rewritten content.
## Skills
### Skill 1: Internet Culture Insight
- Able to deeply understand classic film and television characters, classic film and television plots and meme culture, and accurately grasp network hot spots and popular trends

### Skill 2: Content Analysis and Rewriting
- Able to conduct detailed analysis of "question-summary pairs" and identify the relevance to Internet culture and meme culture
- Able to creatively rewrite content based on analysis results to improve the accuracy and attractiveness of information

### Skill 3: Format Recognition and Processing
- Able to identify and remove markdown and reference format symbols to ensure the simplicity and readability of output content
- Able to integrate rewritten content and connect it with appropriate transition sentences to form a coherent and fluent article

## Workflow:
1. Read and understand the relevance sorting results and all "question-summary pairs" provided by the user.
2. Based on the selection reasons, analyze the relevance of each "question-summary pair" to Internet culture and meme culture.
3. Rewrite the selected 3 "question-summary pairs", remove repeated semantics, low relevance parts, and format symbols.
4. Integrate the rewritten content, connect it with appropriate transition sentences to form a coherent and fluent article.

## Constraints
- The rewritten content should maintain the original meaning and should not change the original information and viewpoints
- The language should be concise, clear, easy to understand, and related to the image metaphor
- Do not use any form of markdown and reference format symbols

## Solve:
Relevance Sorting Results: {}
Most Relevant Question-Summary Pair: {}

'''

In [None]:
# SearchQuestions
def GPT_p2_1(prompt3, keywords, timeout=120):
    """Ask the chat-completions endpoint to build search questions from keywords.

    Args:
        prompt3: Template with one ``{}`` placeholder for the keywords.
        keywords: Keyword text inserted into ``prompt3``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The message content of the first choice in the JSON response.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-success HTTP status.
    """
    messages = [
        {"role": "user", "content": prompt3.format(keywords)}
    ]

    # Placeholders: fill in the key and endpoint before running.
    api_key = ""
    proxy_api_url = ''
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'}
    data = {'model': 'gpt-4o-mini', 'messages': messages, 'temperature': 0.7, 'top_p': 0.9}
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.post(proxy_api_url, headers=headers, json=data, timeout=timeout)
    # Fail with the HTTP error itself instead of a KeyError on the error body.
    response.raise_for_status()
    search_questions = response.json()["choices"][0]["message"]["content"]
    return search_questions

# Self-Judge
def GPT_P2_2(prompt4, search_question, timeout=120):
    """Ask the chat-completions endpoint to self-judge one search question.

    The request uses temperature 0.0.

    Args:
        prompt4: Template with one ``{}`` placeholder for the question.
        search_question: Question text inserted into ``prompt4``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The raw message content of the first choice; it is not parsed here.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-success HTTP status.
    """
    messages = [
        {"role": "user", "content": prompt4.format(search_question)}
    ]

    # Placeholders: fill in the key and endpoint before running.
    api_key = ""
    proxy_api_url = ''
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'}
    data = {'model': 'gpt-4o-mini', 'messages': messages, 'temperature': 0.0, 'top_p': 0.9}
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.post(proxy_api_url, headers=headers, json=data, timeout=timeout)
    # Fail with the HTTP error itself instead of a KeyError on the error body.
    response.raise_for_status()
    score = response.json()["choices"][0]["message"]["content"]
    return score

# ModelSearch
def GPT_P2_3(prompt5, search_question, timeout=120):
    """Answer a search question with the model itself (ModelSearch).

    Args:
        prompt5: Template with one ``{}`` placeholder for the question.
        search_question: Question text inserted into ``prompt5``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The message content of the first choice in the JSON response.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-success HTTP status.
    """
    messages = [
        {"role": "user", "content": prompt5.format(search_question)}
    ]

    # Placeholders: fill in the key and endpoint before running.
    api_key = ""
    proxy_api_url = ''
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'}
    data = {'model': 'gpt-4o-mini', 'messages': messages, 'temperature': 0.5, 'top_p': 0.9}
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.post(proxy_api_url, headers=headers, json=data, timeout=timeout)
    # Fail with the HTTP error itself instead of a KeyError on the error body.
    response.raise_for_status()
    search_result = response.json()["choices"][0]["message"]["content"]
    return search_result

    

In [None]:
# WebSearch: GPTAPI
class AgentStatusCode(Enum):
    """String-valued status codes; each member's value equals its name.

    NOTE(review): nothing in the visible part of this file references these
    members. Presumably they mirror states reported by the web-search
    server — confirm before relying on them.
    """
    STREAM_ING = "STREAM_ING"
    ANSWER_ING = "ANSWER_ING"
    PLUGIN_START = "PLUGIN_START"
    PLUGIN_END = "PLUGIN_END"
    PLUGIN_RETURN = "PLUGIN_RETURN"
    END = "END"

def streaming(raw_response):
    """Yield ``(response, current_node)`` pairs from a streamed HTTP reply.

    Each non-empty line is decoded as UTF-8. A line that is only a carriage
    return, or that starts with ``': ping - '``, is skipped. A leading
    ``'data: '`` prefix is removed. What remains is parsed as JSON, and its
    ``'response'`` and ``'current_node'`` entries are yielded as a tuple.
    """
    line_iter = raw_response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b'\n')
    for raw_line in line_iter:
        if not raw_line:
            continue
        text = raw_line.decode('utf-8')
        if text == '\r':
            continue
        if text.startswith('data: '):
            text = text[6:]
        elif text.startswith(': ping - '):
            continue
        payload = json.loads(text)
        yield (payload['response'], payload['current_node'])

def process_response(agent_return, node_name):
    """Classify one streamed item as planner or searcher output.

    Args:
        agent_return: The ``'response'`` payload yielded by ``streaming``.
        node_name: The ``'current_node'`` value yielded with it.

    Returns:
        A dict with ``'type': 'planner'`` and ``'output'`` when ``node_name``
        is empty, ``'root'`` or ``'response'``. Otherwise a dict with
        ``'type': 'searcher'``, ``'name'`` and ``'output'``.
    """
    if not node_name or node_name in ['root', 'response']:
        return {
            'type': 'planner',
            'output': agent_return['response']
        }

    # call_websearch reads result['output'] for searcher entries too. The
    # searcher dict used to omit the key, which raised an uncaught KeyError
    # on the first searcher node. The payload may be absent or not a dict
    # here, so read it defensively.
    if isinstance(agent_return, dict):
        searcher_output = agent_return.get('response')
    else:
        searcher_output = None
    return {
        'type': 'searcher',
        'name': node_name,
        'output': searcher_output,
    }

def call_websearch(inputs, server_url='http://127.0.0.1:8002/solve', retries=3):
    """Send ``inputs`` to the web-search server and collect its streamed output.

    Args:
        inputs: Value sent as the ``'inputs'`` field of the JSON body.
        server_url: Endpoint that receives the POST request.
        retries: Maximum number of attempts.

    Returns:
        A dict with ``'planner_output'`` (the last planner output seen, or
        ``''``) and ``'searcher_outputs'`` (a list of searcher outputs). If
        every attempt fails, whatever the final attempt collected is returned.
    """
    headers = {'Content-Type': 'application/json'}
    data = {
        'inputs': inputs,
        'agent_cfg': {}
    }

    attempt = 0
    result = {
        'planner_output': '',
        'searcher_outputs': []
    }

    while attempt < retries:
        # Each attempt replays the stream from the start, so begin with an
        # empty result; otherwise a retry would append duplicates.
        result = {
            'planner_output': '',
            'searcher_outputs': []
        }
        try:
            # The context manager releases the streamed connection even when
            # reading fails part-way through.
            with requests.post(server_url, headers=headers, json=data, stream=True) as raw_response:
                raw_response.raise_for_status()

                for agent_return, node_name in streaming(raw_response):
                    processed_result = process_response(agent_return, node_name)
                    if processed_result['type'] == 'planner':
                        result['planner_output'] = processed_result['output']
                    elif processed_result['type'] == 'searcher':
                        # A searcher entry may carry no output; indexing the
                        # key directly raised an uncaught KeyError.
                        searcher_output = processed_result.get('output')
                        if searcher_output is not None:
                            result['searcher_outputs'].append(searcher_output)

            return result
        except requests.exceptions.RequestException as e:
            print(f"调用 websearch API 时发生错误: {e}")
            attempt += 1
            print(f"第{attempt}次尝试重连...")
            if attempt == retries:
                # Report the configured retry count, not a fixed "3".
                # NOTE(review): the message mentions saving to a file, but
                # this function only returns the partial result.
                print(f"已尝试{retries}次重连，自动保存部分结果到文件")

    return result

In [None]:
def GPT_P2_4(prompt6, image_dep, search_result_all, timeout=120):
    """Ask the chat-completions endpoint to rank search results for an image.

    The request uses temperature 0.

    Args:
        prompt6: Template with two ``{}`` placeholders, filled in order with
            ``image_dep`` and ``search_result_all``.
        image_dep: Image description.
        search_result_all: Text holding all question/answer search results.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The message content of the first choice in the JSON response.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-success HTTP status.
    """
    messages = [
        {"role": "user", "content": prompt6.format(image_dep, search_result_all)}
    ]

    # Placeholders: fill in the key and endpoint before running.
    api_key = ""
    proxy_api_url = ''
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'}
    data = {'model': 'gpt-4o-mini', 'messages': messages, 'temperature': 0, 'top_p': 0.9}
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.post(proxy_api_url, headers=headers, json=data, timeout=timeout)
    # Fail with the HTTP error itself instead of a KeyError on the error body.
    response.raise_for_status()
    rank_search_result = response.json()["choices"][0]["message"]["content"]
    return rank_search_result

def GPT_P2_5(prompt7, rank_search_result, search_result_all, timeout=120):
    """Ask the chat-completions endpoint to summarize the ranked results.

    Args:
        prompt7: Template with two ``{}`` placeholders, filled in order with
            ``rank_search_result`` and ``search_result_all``.
        rank_search_result: Ranking text inserted first.
        search_result_all: Question/answer text inserted second.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The message content of the first choice in the JSON response.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-success HTTP status.
    """
    messages = [
        {"role": "user", "content": prompt7.format(rank_search_result, search_result_all)}
    ]
    # Placeholders: fill in the key and endpoint before running.
    api_key = ""
    proxy_api_url = ''
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'}
    data = {'model': 'gpt-4o-mini', 'messages': messages, 'temperature': 0.5, 'top_p': 0.9}
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.post(proxy_api_url, headers=headers, json=data, timeout=timeout)
    # Fail with the HTTP error itself instead of a KeyError on the error body.
    response.raise_for_status()
    overall_summary = response.json()["choices"][0]["message"]["content"]
    return overall_summary

In [None]:
# Split the generated search questions into a list
def split_search_questions(search_questions):
    """Split a block of numbered search questions into a list of questions.

    Blank lines, and lines made up only of quote or backtick characters
    (such as the ``\"\"\"`` delimiters shown in the prompt example), are
    dropped. A leading enumeration such as ``1. `` or ``2、`` is removed from
    each remaining line, doubled double quotes are collapsed to one, and the
    text is stripped.

    Args:
        search_questions: Multi-line text, one question per line.

    Returns:
        The questions in their original order.
    """
    # Strip only a leading enumeration. Splitting on the first ". " anywhere
    # in the line would cut off the start of an unnumbered question such as
    # "Mr. Bean meme meaning?".
    numbering = re.compile(r'^\s*\d+\s*[.．、)）]\s*')

    search_question_list = []
    for line in search_questions.split('\n'):
        # Skip blank lines and bare delimiter lines made only of quote marks.
        if not line.strip().strip('"\'`'):
            continue
        question = numbering.sub('', line, count=1).replace('""', '"').strip()
        if question:
            search_question_list.append(question)
    return search_question_list

In [None]:
# Extract the confidence score and the search decision (ModelSearch / WebSearch)
def extract_score(score):
    """Parse ``[score, decision]`` out of the self-judge model reply.

    Besides the plain ASCII form ``[2, WebSearch]``, this accepts a
    full-width comma or full-width brackets, an optional ``分`` after the
    digit, and padding spaces. These variants are plausible when the Chinese
    prompt is used, and previously fell through to the default.

    Args:
        score: Raw text returned by the model.

    Returns:
        ``(score_value, decision)``. Without a match the result is
        ``(0, 'ModelSearch')``.
    """
    match = re.search(
        r'[\[【]\s*(\d)\s*分?\s*[,，]\s*(WebSearch|ModelSearch)\s*[\]】]',
        score,
    )
    if not match:
        return 0, 'ModelSearch'
    # group(1) is the confidence score, group(2) the decision.
    return int(match.group(1)), match.group(2)


In [None]:
def WebSearch(search_question):
    """Run a web search and return the planner output.

    The search is requested up to three times. It is repeated while the
    planner output contains a "```python" fence. The output of the last
    request is returned even if it still contains that fence.
    """
    for attempt in range(1, 4):
        print("检索中...预计1-2分钟，请稍等...")
        result = call_websearch(search_question)['planner_output']
        if "```python" not in result:
            break
        print(f"识别到内容错误，尝试第 {attempt} 次重新请求...")
    return result

In [None]:
# Pick the search strategy and call WebSearch or ModelSearch
def choose_search_strategy(score_value, decision, prompt2_3_zh, search_question, i):
    """Dispatch one search question to WebSearch or ModelSearch.

    WebSearch is used when the score is at most 3 and the decision is
    "WebSearch". Every other combination goes to ModelSearch via
    ``GPT_P2_3``; an inconsistent score/decision pair is reported first.
    ``i`` only appears in the printed progress message.
    """
    if score_value <= 3 and decision == "WebSearch":
        print(f"第{i}个问题的置信度分数为：{score_value}，决策为：{decision}，需要进行WebSearch")
        return WebSearch(search_question)

    if score_value > 3 and decision == "ModelSearch":
        print(f"第{i}个问题的置信度分数为：{score_value}，决策为：{decision}，需要进行ModelSearch")
    else:
        # Inconsistent score and decision: fall back to ModelSearch.
        print(f"第{i}个问题的置信度分数为：{score_value}，决策为：{decision}，决策不符合预期")
    return GPT_P2_3(prompt2_3_zh, search_question)

In [None]:
def search_result_top_zh(rank_search_result, search_result_all):
    """Rebuild the top three question/answer pairs (Chinese format).

    Question texts are taken from the "问题：" lines of ``rank_search_result``
    with ``**`` markers removed, and only the first three are kept. For each
    one the matching answer is looked up in ``search_result_all``; a missing
    answer becomes an empty string.
    """
    ranked_lines = re.findall(r'问题：(.*?)(?=\n|$)', rank_search_result)
    top_questions = [re.sub(r'\*\*', '', item).strip() for item in ranked_lines][:3]

    entries = []
    for position, question in enumerate(top_questions, start=1):
        # Escape the question so it is matched literally inside the pattern.
        escaped_q = re.escape(question)
        answer_pattern = f'问题[0-9]?："{escaped_q}"\\n回答："""(.*?)"""'
        found = re.search(answer_pattern, search_result_all, re.DOTALL)
        if found:
            answer = found.group(1).strip()
        else:
            answer = ''
        entries.append(f'{position}. 问题："{question}"\n回答："""{answer}"""\n')

    return ''.join(entries)

In [None]:
def search_result_top_en(rank_search_result, search_result_all):
    """Rebuild the top three question/answer pairs (English format).

    Question texts are taken from the "Question: " lines of
    ``rank_search_result`` with ``**`` markers removed, and only the first
    three are kept. For each one the matching answer is looked up in
    ``search_result_all``; a missing answer becomes an empty string.
    """
    ranked_lines = re.findall(r'Question: (.*?)(?=\n|$)', rank_search_result)
    top_questions = [re.sub(r'\*\*', '', item).strip() for item in ranked_lines][:3]

    entries = []
    for position, question in enumerate(top_questions, start=1):
        # Escape the question so it is matched literally inside the pattern.
        escaped_q = re.escape(question)
        answer_pattern = f'Question [0-9]?: "{escaped_q}"\\nAnswer: """(.*?)"""'
        found = re.search(answer_pattern, search_result_all, re.DOTALL)
        if found:
            answer = found.group(1).strip()
        else:
            answer = ''
        entries.append(f'{position}. Question: "{question}"\nAnswer: """{answer}"""\n')

    return ''.join(entries)

## Stage III：Reasoning
* Reasoning format
* Explicit reasoning

In [None]:
# Stage III prompt (Chinese) asking for the image's metaphorical meaning.
# Three `{}` placeholders, in order: image description, keywords, keyword
# description.
prompt3_zh = '''
图片描述：{}；
关键词：{}；
关键词描述：{}；
请结合以上图片、图片关键词和描述信息，尽可能分析理解图文结合的深层含义。无需描述图文，仅回答图片隐喻。请保证回答的准确性并尽量简洁。
'''

# English counterpart (same three placeholders, same order). The trailing
# backslash joins the last two sentences into a single line of the string.
prompt3_en = '''
Image description: {}; 
Keywords: {}; 
Keywords description: {};
Please combine the image, keywords, and description information and try to understand the deep meaning of the combination of the image and text. \
No need to describe images and text, only answer metaphors. Ensure the accuracy of the answer and try to be concise as much as possible.
'''

In [None]:
# Stage III multiple-choice prompt (Chinese) requesting output in
# <think>...</think> <answer>...</answer> form. Four `{}` placeholders, in
# order: image description, keywords, keyword description, the question.
# Lines ending in a backslash are joined with the next line of the string.
prompt4_reasoning_zh = '''
图片描述：{}；
关键词：{}；
关键词描述：{}；
请结合以上图片、图片关键词和描述信息，尽可能分析理解图文结合的深层含义，回答以下单选题。直接回答正确选项 $LETTER。\
输出思考过程在 <think> </think> 标签中和最终正确答案在 <answer> </answer> 标签中。\
输出格式：<think>...</think> <answer>...</answer> \

单选题：{}
答案：
'''

# English counterpart (same four placeholders, same order).
prompt4_reasoning_en = '''
Image description: {}; 
Keywords: {}; 
Keywords description: {};
Please combine the image, keywords, and description information and try to understand the deep meaning of the combination of the image and text. \
Answer the following multiple-choice questions with the correct option $LETTER directly. \
Output the thinking process in <think> </think> and final correct answer in <answer> </answer> tags. \
The output format should be as follows: <think>...</think> <answer>...</answer> \

Multiple-choice questions: {}
Answer:
'''

In [None]:
# Multiple-choice question templates. Two `{}` placeholders, in order:
# the question text and the rendered option list.
prompt_MCQ_zh = '''
问题：{}
选项：{}
'''

prompt_MCQ_en = '''
Question: {}
Options: {}
'''

In [None]:
# Open-ended question: judge prompt that scores a model answer from 1 to 5
# against a human answer. Two `{}` placeholders, in order: the human
# answer, then the model answer.
# NOTE(review): the heading "## Standrad Answer:" inside the prompt looks
# like a typo for "Standard"; left untouched because the text is sent to
# the model as-is.
prompt_OSQ = '''
# Role
You are an impartial judge who is familiar with Internet culture and memes, and is good at digging out and analyzing the deep meaning of Internet memes.

## Attention
You are responsible for evaluating the quality of the answer provided by the model for Internet culture and memes. Your evaluation should refer to the human answer and image, and score based on the Evaluation Standard.

## Evaluation Standard:
[1 point]: Fails to capture key elements within the image (such as text, and important entities). Does not identify emotions, domain, or rhetorical devices. Only provides a superficial description of surface-level information, lacking depth and creativity, with a significant gap from the standard answer.

[2 points]: Captures some key elements within the image, but the identification of emotions, domain, and rhetorical devices is vague. The description of surface-level information is relatively complete, but there is a clear deficiency in exploring deeper meanings, showing a noticeable gap from the standard answer.

[3 points]: Effectively captures key elements within the image and initially identifies emotions, domain, and rhetorical devices. The description of surface-level information is relatively accurate, and there is some relevant expression of deep meanings. However, there is still room for improvement in depth and creativity, and it is generally close to the standard answer.

[4 points]: Accurately captures key elements within the image and clearly identifies emotions, domain, and rhetorical devices. The description of surface-level information is detailed and precise, with a relatively deep exploration of deep meanings, demonstrating a certain level of creativity and depth. It is largely consistent with the standard answer but may have minor deficiencies in some details or depth.

[5 points]: Accurately and precisely captures key elements within the image and profoundly identifies emotions, domain, and rhetorical devices. The description of surface-level information is comprehensive and precise, with unique insights into deep meanings, skillfully integrating image elements with metaphorical implications. It demonstrates exceptional creativity and depth, is highly consistent with the standard answer, and shows a profound grasp of metaphor creation and cultural understanding.

## Standrad Answer:
Human answer: {}

## Constraints
- Avoid any position biases and be as objective as possible
- Do not allow the length of the descriptions to influence your evaluation
- Output your final directly by strictly following this format: "[ratings]"

## Solve:
Model answer: {}
'''

In [None]:
# Generate the image's metaphorical meaning (p1 + p2 + p3)
def GPT_P3(url, prompt8, image_dep, keywords, overall_summary, timeout=120):
    """Ask the chat-completions endpoint for the implication of an image.

    Args:
        url: Path of a local image file. Despite the name it is read from
            disk by ``encode_image`` and inlined as a base64 data URL.
        prompt8: Template with three ``{}`` placeholders, filled in order
            with ``image_dep``, ``keywords`` and ``overall_summary``.
        image_dep: Image description.
        keywords: Keyword text.
        overall_summary: Summary text produced by the search stage.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The message content of the first choice in the JSON response.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-success HTTP status.
    """
    base64_image = encode_image(url)
    messages = [
        {"role": "user", "content": [
            {"type": "text", "text": prompt8.format(image_dep, keywords, overall_summary)},
            # The data URL always declares image/jpeg, whatever the file type.
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}", "detail": "high"}}
        ]}
    ]
    # Placeholders: fill in the key and endpoint before running.
    api_key = ""
    proxy_api_url = ''
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'}
    data = {'model': 'gpt-4o-mini', 'messages': messages, 'temperature': 0.7, 'top_p': 0.9}
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.post(proxy_api_url, headers=headers, json=data, timeout=timeout)
    # Fail with the HTTP error itself instead of a KeyError on the error body.
    response.raise_for_status()
    image_implication = response.json()["choices"][0]["message"]["content"]
    return image_implication

In [None]:
# Answer a multiple-choice question (p1 + p2 + p3)
def GPT_P4(url, prompt9, image_dep, keywords, overall_summary, multiple_questions, timeout=120):
    """Ask the chat-completions endpoint to answer a multiple-choice question.

    Args:
        url: Path of a local image file. Despite the name it is read from
            disk by ``encode_image`` and inlined as a base64 data URL.
        prompt9: Template with four ``{}`` placeholders, filled in order with
            ``image_dep``, ``keywords``, ``overall_summary`` and
            ``multiple_questions``.
        image_dep: Image description.
        keywords: Keyword text.
        overall_summary: Summary text produced by the search stage.
        multiple_questions: Rendered question with its options.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The message content of the first choice in the JSON response.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-success HTTP status.
    """
    base64_image = encode_image(url)
    messages = [
        {"role": "user", "content": [
            {"type": "text", "text": prompt9.format(image_dep, keywords, overall_summary, multiple_questions)},
            # The data URL always declares image/jpeg, whatever the file type.
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}", "detail": "high"}}
        ]}
    ]
    # Placeholders: fill in the key and endpoint before running.
    api_key = ""
    proxy_api_url = ''
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'}
    data = {'model': 'gpt-4o-mini', 'messages': messages, 'temperature': 0.5, 'top_p': 0.9}
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.post(proxy_api_url, headers=headers, json=data, timeout=timeout)
    # Fail with the HTTP error itself instead of a KeyError on the error body.
    response.raise_for_status()
    MCQ_answer = response.json()["choices"][0]["message"]["content"]
    return MCQ_answer

In [None]:
def construct_multiple_questions(prompt10, question_data):
    """Render one question and its options into the prompt template.

    Args:
        prompt10: Template formatted positionally with the question text and
            the rendered option list.
        question_data: Mapping with a ``'question'`` entry and an ``'options'``
            iterable.

    Returns:
        The formatted prompt. Each option becomes a ``"<letter>. <option>"``
        line; only as many options as there are letters A-F are rendered.
    """
    stem = question_data['question']
    choices = question_data['options']
    # zip() stops at the shorter input, so options beyond 'F' are dropped.
    rendered = [f'{letter}. {choice}\n' for letter, choice in zip('ABCDEF', choices)]
    return prompt10.format(stem, ''.join(rendered))

In [None]:
# Answer the open-ended question (p1 + p2 + p3)
def GPT_P5_EN(url, prompt11, explanation, image_implication):
    """Send the open-ended-question prompt for an image and return the reply.

    Args:
        url: Path of the image file; it is read through ``encode_image`` and
            embedded in the request as a base64 ``data:image/jpeg`` URL.
        prompt11: Prompt template; it is formatted positionally with
            ``explanation`` and ``image_implication`` (in that order).
        explanation: Text inserted into the first placeholder of the template.
        image_implication: Text inserted into the second placeholder.

    Returns:
        The ``content`` string of the first choice in the JSON response.
        The scoring cell later extracts the first integer from this text.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
    """
    base64_image = encode_image(url)
    messages = [
        {"role": "user", "content": [
            {"type": "text", "text": prompt11.format(explanation,image_implication)},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}", "detail": "high"}}
        ]}
    ]
    # Placeholders: both values are empty here and must be filled in before running.
    api_key = ""
    proxy_api_url = ''
    headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}', }
    # Uses a pinned model snapshot with temperature 0, unlike the generation calls.
    data = {'model': 'gpt-4o-2024-11-20', 'messages': messages, 'temperature': 0, 'top_p': 0.9}

    response = requests.post(proxy_api_url, headers=headers, json=data)
    # Report HTTP failures as a clear HTTPError instead of an opaque KeyError
    # on the missing "choices" field of an error payload.
    response.raise_for_status()
    score = response.json()["choices"][0]["message"]["content"]
    return score

def GPT_P5_ZH(url, prompt11, metaphorical_meaning, explanation, image_implication):
    """Send the open-ended-question prompt with a combined reference answer.

    The reference answer placed into the prompt is
    ``metaphorical_meaning + ';' + explanation``.

    Args:
        url: Path of the image file; it is read through ``encode_image`` and
            embedded in the request as a base64 ``data:image/jpeg`` URL.
        prompt11: Prompt template; it is formatted positionally with the
            combined reference answer and ``image_implication``.
        metaphorical_meaning: First part of the reference answer.
        explanation: Second part of the reference answer.
        image_implication: Text inserted into the second placeholder.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
    """
    base64_image = encode_image(url)
    human_answer = metaphorical_meaning + ';' + explanation
    messages = [
        {"role": "user", "content": [
            {"type": "text", "text": prompt11.format(human_answer,image_implication)},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}", "detail": "high"}}
        ]}
    ]
    # Placeholders: both values are empty here and must be filled in before running.
    api_key = ""
    proxy_api_url = ''
    headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}', }
    # Uses a pinned model snapshot with temperature 0, unlike the generation calls.
    data = {'model': 'gpt-4o-2024-11-20', 'messages': messages, 'temperature': 0, 'top_p': 0.9}

    response = requests.post(proxy_api_url, headers=headers, json=data)
    # Report HTTP failures as a clear HTTPError instead of an opaque KeyError
    # on the missing "choices" field of an error payload.
    response.raise_for_status()
    score = response.json()["choices"][0]["message"]["content"]
    return score

## Experiments
* 从数据集读入图片(中文和英文图片)
* 生成图片描述(GPT_p1_1)
* 生成图片关键词(GPT_p1_2)
* 生成检索问题(GPT_p2_1) + 拆分检索问题(split_search_questions)
* 检索问题 self-judge(GPT_P2_2) + 提取得分(extract_score)，选择search策略(choose_search_strategy)
* ModelSearch(GPT_p2_3) / WebSearch(MindSearch) (choose_search_strategy)
* 检索内容排序(GPT_P2_4)
* 内容改写补充(GPT_P2_5)
* 图片隐喻生成(GPT_P3)
* 回答MCQ单选题(GPT_P4)
* 回答OSQ问答题(GPT_P5_EN / GPT_P5_ZH)

In [None]:
# LAD Pipeline (perception + search + reasoning)
#
# For every dataset item this cell runs the perception, search and reasoning
# calls in sequence, then appends one result record and rewrites the results
# file, so partial progress survives an interruption.
# The commented-out `*_zh` lines are the Chinese-prompt variants of the call
# directly below them.
import json
import os
from tqdm.notebook import tqdm

# Single definition of the output path so the resume check and the
# incremental save cannot drift apart.
RESULT_PATH = 'experiment/en/4o_mini_en_no_multiturn_ii.json'

with open('dataset/II-Bench.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# If a results file already exists, load it so earlier work is kept.
if os.path.exists(RESULT_PATH):
    with open(RESULT_PATH, 'r', encoding='utf-8') as f:
        results = json.load(f)
else:
    results = []

# Create the output directory up front; otherwise every save would fail only
# after the API calls for an item have already been made.
os.makedirs(os.path.dirname(RESULT_PATH), exist_ok=True)

# Images stored by a previous run are skipped, so re-running the cell does not
# repeat their API calls or append duplicate records.
processed_urls = {entry.get('url') for entry in results}

# Iterate over all items with a progress bar.
for idx, item in enumerate(tqdm(data, desc="图片识别中")):
    # Reset per item so the error message never shows a stale or undefined path.
    url = None
    try:
        url = item['local_path']
        if url in processed_urls:
            continue

        # --- Stage 1: perception -------------------------------------------
        # image_dep = GPT_p1_1(prompt1_1_zh, url)
        image_dep = GPT_p1_1(prompt1_1_en, url)
        print("image_dep: ", image_dep)
        # keywords = GPT_p1_2(prompt1_2_zh, image_dep)
        keywords = GPT_p1_2(prompt1_2_en, image_dep)
        print("keywords: ", keywords)
        print("\n --------------------------------------------------- \n")

        # --- Stage 2: search -----------------------------------------------
        # search_questions_all = GPT_p2_1(prompt2_1_zh, keywords)
        search_questions_all = GPT_p2_1(prompt2_1_en, keywords)
        search_questions = split_search_questions(search_questions_all)
        search_result_all = ''
        for i, search_question in enumerate(search_questions):
            # score = GPT_P2_2(prompt2_2_zh, search_question)
            score = GPT_P2_2(prompt2_2_en, search_question)
            score_value, decision = extract_score(score)
            # search_result = choose_search_strategy(score_value, decision, prompt2_3_zh, search_question, i)
            search_result = choose_search_strategy(score_value, decision, prompt2_3_en, search_question, i)
            # search_result_single = f'问题{i}：{search_question}\n回答："""{search_result}"""\n'
            search_result_single = f'Question {i}: {search_question}\nAnswer: """{search_result}"""\n'
            print("search_result_single: ", search_result_single)
            search_result_all += search_result_single

        # rank_search_result = GPT_P2_4(prompt2_4_zh, image_dep, search_result_all)
        rank_search_result = GPT_P2_4(prompt2_4_en, image_dep, search_result_all)
        print("rank_search_result: ", rank_search_result)

        search_result_top = search_result_top_en(rank_search_result, search_result_all)
        # search_result_top = search_result_top_zh(rank_search_result, search_result_all)
        print("search_result_top: ", search_result_top)

        # Fall back to the full search output when no top result was selected.
        if search_result_top == '':
            print("search_result_top is empty")
            search_result_top = search_result_all

        # overall_summary = GPT_P2_5(prompt2_5_zh, rank_search_result, search_result_top)
        overall_summary = GPT_P2_5(prompt2_5_en, rank_search_result, search_result_top)
        print("overall_summary: ", overall_summary)

        # --- Stage 3: reasoning --------------------------------------------
        # image_implication = GPT_P3(url, prompt3_zh, image_dep, keywords, overall_summary)
        image_implication = GPT_P3(url, prompt3_en, image_dep, keywords, overall_summary)
        print("image_implication: ", image_implication)
        print("\n --------------------------------------------------- \n")

        # Only the first question of each item is asked.
        question_data = item['questions'][0]
        # multiple_questions = construct_multiple_questions(prompt_MCQ_zh, question_data)
        multiple_questions = construct_multiple_questions(prompt_MCQ_en, question_data)
        MCQ_answer = GPT_P4(url, prompt4_reasoning_en, image_dep, keywords, overall_summary, multiple_questions)
        print("MCQ_answer: ", MCQ_answer)
        print("\n --------------------------------------------------- \n")

        explanation = item['meta_data']['explanation']
        # metaphorical_meaning = item['meta_data']['metaphorical_meaning']
        # print(metaphorical_meaning + ';' + explanation)
        # OSQ_answer = GPT_P5_ZH(url, prompt_OSQ, metaphorical_meaning, explanation, image_implication)
        OSQ_answer = GPT_P5_EN(url, prompt_OSQ, explanation, image_implication)
        print("OSQ_answer: ", OSQ_answer)
        print("\n --------------------------------------------------- \n")

        # Assemble the record for this item.
        result = {
            'id': idx,
            'url': url,
            'image_dep': image_dep,
            'keywords': keywords,
            'search_questions': search_questions,
            'search_result_all': search_result_all,
            'rank_search_result': rank_search_result,
            'search_result_top': search_result_top,
            'overall_summary': overall_summary,
            'image_implication': image_implication,
            'multiple_questions': multiple_questions,
            'MCQ_answer': MCQ_answer,
            'OSQ_answer': OSQ_answer,
        }

        # Append the record to the in-memory list.
        results.append(result)

        # Rewrite the JSON file after every item so progress is not lost.
        with open(RESULT_PATH, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=4)

    except Exception as e:
        # Best-effort batch run: report the failure and move on to the next item.
        print(f'处理图片 {url} 时出错：{e}')
        continue

## Results 

In [None]:
# MCQ
#
# Compares the letter extracted from each stored MCQ_answer with the dataset's
# reference answer, prints the accuracy and writes a per-question report.
import json
import os
import re

MCQ_REPORT_PATH = 'results/en/MCQ/answer_4o_mini_en_no_multiturn_ii.txt'

with open('dataset/II-Bench.json', 'r', encoding='utf-8') as f:
    cii_data = json.load(f)

with open('experiment/en/4o_mini_en_no_multiturn_ii.json', 'r', encoding='utf-8') as f:
    test = json.load(f)

# Map each image path to the option letter found in the model output.
# The pattern accepts lower-case letters, so the letter is upper-cased here;
# otherwise "<answer>b" could never equal a reference answer "B".
answer_pattern = re.compile(r'<answer>\s*([A-Za-z])')  # reasoning format
test_dict = {}
for item in test:
    match = answer_pattern.search(item['MCQ_answer'])
    test_dict[item['url']] = match.group(1).upper() if match else None

# Compare answers and compute the accuracy.
# NOTE(review): the pipeline cell asks only item['questions'][0], yet every
# question of an image is compared against that single stored answer here.
# Confirm each dataset item carries exactly one question.
total_questions = 0
correct_count = 0
result_lines = []

for item in cii_data:
    local_path = item['local_path']
    for question in item['questions']:
        answer = question['answer'].strip()
        question_id = question['id']
        if local_path not in test_dict:
            # Images without a stored result are reported but not counted.
            result_lines.append(f"图片：{local_path}在文件中未找到对应条目。")
            print(f"报错：{question_id}未找到")
            continue

        total_questions += 1
        MCQ_answer = test_dict[local_path]
        if answer.upper() == MCQ_answer:
            correct_count += 1
            result = f"图片：{local_path}，题目ID：{question_id}，答案一致：{answer}"
        else:
            result = f"图片：{local_path}，题目ID：{question_id}，答案不一致，标准答案：{answer}，测试答案：{MCQ_answer}"
        result_lines.append(result)

# Guard against an empty or fully unmatched results file (no division by zero).
accuracy = correct_count / total_questions * 100 if total_questions else 0.0
result_lines.append(f"\n总题目数：{total_questions}，正确数量：{correct_count}，正确率：{accuracy:.2f}%")
print(f"总题目数：{total_questions}，正确数量：{correct_count}，正确率：{accuracy:.2f}%")

# Save the report; create the target directory first so the write cannot fail
# on a fresh checkout.
os.makedirs(os.path.dirname(MCQ_REPORT_PATH), exist_ok=True)
with open(MCQ_REPORT_PATH, 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in result_lines)
    print("结果已保存")

In [None]:
# OSQ
#
# Averages the integer score found in each stored OSQ_answer and writes a
# one-line summary.
import os
import re
import json

OSQ_REPORT_PATH = 'results/en/OSQ/answer_4o_mini_en_no_multiturn_ii.txt'

# Load the pipeline results.
with open('experiment/en/4o_mini_en_no_multiturn_ii.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Take the first run of digits in each answer as its score; an answer with no
# digits (or a missing OSQ_answer) counts as 0 and is still included.
scores = []
for instance in data:
    score_match = re.search(r'\d+', instance.get('OSQ_answer', '[]'))
    scores.append(int(score_match.group()) if score_match else 0)

count = len(scores)
total_score = sum(scores)

# Overall average; guarded so an empty results file does not divide by zero.
average_score = total_score / count if count else 0.0

summary = f"总体平均分数: {average_score:.2f}, 共{count}个实例"
print(summary)

# Save the summary; create the target directory first so the write cannot fail
# on a fresh checkout. json.dump writes the summary as a quoted JSON string,
# which is kept as-is to preserve the existing file format.
os.makedirs(os.path.dirname(OSQ_REPORT_PATH), exist_ok=True)
with open(OSQ_REPORT_PATH, 'w', encoding='utf-8') as f:
    json.dump(summary, f, ensure_ascii=False, indent=4)
    print("结果已保存")