In [2]:

tools = [
  {
    "type": "function",
    "function": {
      "name": "find_similar_docs",
      "description": "This is a Text-to-Text retrieval system that retrieves multimodal reference documents by first identifying semantically relevant texts for a set of input queries. It uses a vector (embedding) index to perform semantic similarity retrieval. Specifically, for each query, the tool retrieves `n` reference txext documents (where `n` is defined by the `top_n` parameter). Then, all documents retrieved from all queries are merged, and the top `n` reference documents are selected. The merging rules are as follows: 1) If a document is retrieved by multiple queries, it is given higher priority and ranks higher; 2) If documents have the same priority, they are sorted by similarity score.The retrieval tool is built on a multimodal corpus, where each text document is pre-associated with its corresponding multimodal content (images, image captions). Therefore, when relevant texts are retrieved, their linked multimodal documents are also obtained automatically as references. ",
      "parameters": {
        "type": "object",
        "properties": {
          "query": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "description": "A list of input text queries, where each element is a string."
          },
          "doc_type": {
            "type": "string",
            "description": "The type of documents to retrieve. Valid options are 'arxiv','recipe','web','wiki' or 'wit'",
            "enum": ["arxiv","recipe","web","wiki","wit"]
          },
          "top_n": {
            "type": "integer",
            "description": "The number of reference documents to retain after merging. Multiple retrievers share a common quota. When the sum of their top_n values exceeds the total quota, each retriever is allocated a portion of the quota proportionally based on its top_n value.Currently, the total quota is fixed at 6. ",
            "default": 3
          }
        },
        "required": ["query", "doc_type", "top_n"]
      }
    }
  },
  {
    "type": "function",
    "function": {
      "name": "find_similar_image_by_query",
      "description": "This is a Text-to-Image retrieval tool that retrieves multimodal reference documents by first identifying semantically relevant images for a set of input text queries. It uses a vector (embedding) index to perform semantic similarity retrieval. Specifically, for each query, the tool retrieves `n` reference images (where `n` is defined by the `top_n` parameter). Then, all images retrieved from all query are merged, and the top `n` reference images are selected. The merging rules are as follows: 1) If a document is retrieved by multiple images, it is given higher priority and ranks higher; 2) If documents have the same priority, they are sorted by similarity score.Each retrieved image is part of a multimodal document,once an image is selected, the tools identifies its source document and extracts the surrounding textual context to construct a multimodal reference document.",
      "parameters": {
        "type": "object",
        "properties": {
          "query": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "description": "A list of input text queries, where each element is a string."
          },
          "doc_type": {
            "type": "string",
            "description": "The type of documents to retrieve. Valid options are 'arxiv','recipe','web','wiki' or 'wit'",
            "enum": ["arxiv","recipe","web","wiki","wit"]
          },
          "top_n": {
            "type": "integer",
            "description": "The number of reference documents to retain after merging. Multiple retrievers share a common quota. When the sum of their top_n values exceeds the total quota, each retriever is allocated a portion of the quota proportionally based on its top_n value.Currently, the total quota is fixed at 6.",
            "default": 3
          }
        },
        "required": ["images", "doc_type", "top_n"]
      }
    }
  }
]

In [3]:
import json
import os
hop_data=[]
file_path = "/root/autodl-fs/OURMRAG/data/base_retrieval/mramg_train_0caption.jsonl"
out_file_path="/root/autodl-fs/OURMRAG/data/stage_2/retrieval_grpo_0captipn.jsonl"
with open(file_path, 'r') as infile:
    for line in infile:
        item=json.loads(line.strip())
        hop_data.append(item)
def map_images_to_indices(find_images, all_images_list):
    # 构建图片名到索引的映射字典
    image_to_index = {img: idx for idx, img in enumerate(all_images_list)}

    # 映射每组图片为索引
    index_result = []
    for image_group in find_images:
        index_group = [image_to_index[img] for img in image_group]
        index_result.append(index_group)

    return index_result

In [4]:
def read_all_json_files(directory):
    data_list = []
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                data_list.append(data)
    return data_list
# 用法示例
directory_path = '/root/autodl-fs/IMAGE/images_info'
all_image_info = read_all_json_files(directory_path)
image2format = {}
for item in all_image_info:
    for key in item:
        image2format[str(key)] = item[key]['image_path']

In [5]:
prefix_general = "/root/autodl-fs/IMAGE/images/"
def genQuestion(item):
    print(list(item))
    qa_image_index=map_images_to_indices(item['query_find_images'],item['all_images_list'])
    image_image_index=map_images_to_indices(item['image_find_images'],item['all_images_list'])
    image_caption_report = "".join([f"\nImage [{i+1}] 's caption : {caption}\n" for i, caption in enumerate(item['all_caption_list'])])
    qa_report = ""
    for i, (report, image_indices) in enumerate(zip(item['query_find_text'], qa_image_index)):
        # 转换为“第 N 张图”的形式（索引从 1 开始）
        image_str = ", ".join([f"Image [{idx + 1}]" for idx in image_indices])
        qa_report += f"Reference Document [{i + 1}]: {report}\n  The illustration of the Reference Document [{i+1}] : {image_str}\n\n"
    
    image_report=""
    for i, (report, image_indices) in enumerate(zip(item['image_find_text'], image_image_index)):
        # 转换为“第 N 张图”的形式（索引从 1 开始）
        image_str = ", ".join([f"Image [{idx + 1}]" for idx in image_indices])
        image_report += f"Reference Document [{i + 1}]: {report}\n   The illustration of the Reference Document [{i+1}] : {image_str}\n\n"
    # print(qa_report)
    # print(image_report)
    # print(all_abs_list)
    # return
    genQprompt = f'''You are asked to optimize the retrieval strategy for a Multimodal Retrieval-Augmented Generation (MRAG) task. Specifically, given a question , you need to analyze the inputs comprehensively and retrieve relevant multimodal reference documents to effectively answer the question.

You will receive the following input information:

- **Question**: {item['question']}

- **Available Retrieval Tools**: {tools}

- **Multimodal reference documents retrieved by basic retrieval strategy.**

### Basic Retrieval Strategy:
1. **Text-to-Text Retrieval**:
   - Query list: includes only the original question.
   - Tool: Text-to-Text retriever (`find_similar_docs`), retrieves 3 relevant documents (`top_n=3`) from specified document type and then obtains their associated multimodal reference documents.

2. **Text-to-Image Retrieval**:
   - Query list:  includes only the original question.
   - Tool: Text-to-Image retriever (`find_similar_image_by_query`), retrieves 3 relevant images (`top_n=3`) from specified document type then obtains their associated multimodal reference documents.

### Current Retrieval Results:
- **Multimodal documents retrieved by Text-to-Text retriever**: {qa_report}
- **Multimodal documents retrieved by Text-to-Image retriever**: {image_report}

We also provide detailed captions of the retrieved images to support your retrieval analysis:{image_caption_report}

Please follow these analytical steps to optimize the retrieval strategy:

### Analysis and Decision Steps:

1. **First, attempt to answer the question directly using your internal knowledge.**

   * If the question can be **reasonably and confidently answered** without any retrieval — for example, if it involves commonsense facts, general world knowledge, or widely known topics — you **must immediately choose** the decision type `"no_search_needed"` and **stop further analysis**.
   * Do **not proceed to Step 2 or beyond** if Step 1 results in a sufficient answer. Early exit is required.
   * Retrieval (textual or visual) should only be considered if you **cannot** answer with confidence, or if your answer would be **incomplete or unreliable**.

2. **Only if Step 1 fails**, proceed to evaluate the retrieved multimodal documents:

   * If the retrieved documents are sufficient to clearly answer the question, set the decision type to `"reference_sufficient"`.

3. **If the retrieved documents are also insufficient**, then:

   * Clearly explain what specific information is missing.
   * Set the decision type to `"further_search_required"` and design a revised retrieval strategy, which may include:

     * Decomposing or rephrasing the question to target specific aspects.
     * Adjusting `top_n` to optimize document scope.
     * Choosing between or combining Text-to-Text and Text-to-Image retrieval based on the query type.

4. Always aim for **efficiency and sufficiency**. If at any point a confident answer is available, **stop there** and finalize the decision.

### Note:
- If you decide "no_search_needed" or "reference_sufficient", set the "tool_uses" field to "null".

Provide your final response strictly in the following format:

First, clearly present your detailed analytical thought process within `<think>` tags. Then, output your structured JSON within `<answer>` tags, ensuring the JSON is correctly parsable by `json.loads`:

<think>
    Clearly describe your analytical thought process here...
</think>
<answer>
    ```json
    {{
        "type": "your_decision_type",
        "tool_uses": [
            {{
                "recipient_name": "function_name",
                "parameters": {{
                    "parameter1": "value1",
                    "parameter2": "value2"
                }}
            }}
        ]
    }}
    ```
</answer>
    ''' 
    return genQprompt

In [6]:
hop_data[0]

{'question_id': 'ARXIV_15',
 'dataset': 'ARXIV',
 'type': 'train',
 'question': 'How does the DriveDreamer framework utilize text prompts to adjust the driving scenario style?',
 'answer': 'In the DriveDreamer framework, text prompts are used to dynamically adjust the style of the driving scenarios, such as changing the weather or time of day. These prompts serve as additional inputs that guide the system in generating visual content that matches the specified conditions.<PIC>',
 'images_list': ['../ARXIV/2309_09777v2_2.png'],
 'ref_chunks': ['60007_2'],
 'image_find_chunks': ['60007_1', '60007_3', '60007_2'],
 'query_find_chunks': ['60007_3', '60006_2', '60006_1'],
 'ref_text': ['These models have re-cently garnered significant attention due to their exceptionalperformance in various applications, setting new bench-marks in image synthesis [1, 14, 49, 55, 57], video gener-ation [21, 23, 35, 60, 67, 74], and 3D content generation[6, 43, 53, 69]. To enhance the controllable generation c

In [7]:
cnt=0
new_data=[]
for data in hop_data:
    try:
        new_input=genQuestion(data)
        img_abs_path_prefix = prefix_general + data['dataset'].upper() + '/'
        all_abs_list=[img_abs_path_prefix+image2format[str(img)] for img in data['all_images_list']]
        data['all_images_path']=all_abs_list
        data['messages']=[
                {
                    "role": "user",
                    "content": new_input
                }
            ]
        data.pop('ref_text')
        data.pop('ref_images')
        data.pop('image_find_text')
        data.pop('image_find_images')
        data.pop('query_find_text')
        data.pop('query_find_images')
        data.pop('all_images_list')
        data.pop('all_caption_list')
        new_data.append(data)
    except:
        cnt+=1
        continue
    # print(data['question_id'])
    
print(cnt)
with open(out_file_path, 'w', encoding='utf-8') as f:
    for item in new_data:
        f.write(json.dumps(item,ensure_ascii=False) + "\n")
    


['question_id', 'dataset', 'type', 'question', 'answer', 'images_list', 'ref_chunks', 'image_find_chunks', 'query_find_chunks', 'ref_text', 'ref_images', 'image_find_text', 'image_find_images', 'query_find_text', 'query_find_images', 'all_images_list', 'all_caption_list']
['question_id', 'dataset', 'type', 'question', 'answer', 'images_list', 'ref_chunks', 'image_find_chunks', 'query_find_chunks', 'ref_text', 'ref_images', 'image_find_text', 'image_find_images', 'query_find_text', 'query_find_images', 'all_images_list', 'all_caption_list']
['question_id', 'dataset', 'type', 'question', 'answer', 'images_list', 'ref_chunks', 'image_find_chunks', 'query_find_chunks', 'ref_text', 'ref_images', 'image_find_text', 'image_find_images', 'query_find_text', 'query_find_images', 'all_images_list', 'all_caption_list']
['question_id', 'dataset', 'type', 'question', 'answer', 'images_list', 'ref_chunks', 'image_find_chunks', 'query_find_chunks', 'ref_text', 'ref_images', 'image_find_text', 'image_f

In [8]:
print(item['messages'][0]['content'])

You are asked to optimize the retrieval strategy for a Multimodal Retrieval-Augmented Generation (MRAG) task. Specifically, given a question , you need to analyze the inputs comprehensively and retrieve relevant multimodal reference documents to effectively answer the question.

You will receive the following input information:

- **Question**: Whom did Scott Blumstein defeat in the final match of the 2017 World Series of Poker Main Event?

- **Available Retrieval Tools**: [{'type': 'function', 'function': {'name': 'find_similar_docs', 'description': 'This is a Text-to-Text retrieval system that retrieves multimodal reference documents by first identifying semantically relevant texts for a set of input queries. It uses a vector (embedding) index to perform semantic similarity retrieval. Specifically, for each query, the tool retrieves `n` reference txext documents (where `n` is defined by the `top_n` parameter). Then, all documents retrieved from all queries are merged, and the top `n`