In [4]:
import os
import json
import base64
from tqdm import tqdm
from openai import OpenAI


In [5]:

# 初始化 API 客户端
client = OpenAI(
    base_url='https://api.nuwaapi.com/v1',
    api_key='sk-9j6iUV0GxuMYwI1EKB3RrZ9Gd3ZiLSO1mz7kze2s2ZfL2LUF'
)


In [6]:

# 系统提示：支持 image_description + reasoning + explanations
PROMPT = """You are a helpful and knowledgeable teacher AI designed to answer science VQA (Visual Question Answering) problems. Each problem includes:

- A question
- Multiple answer choices
- The correct choice index
- An image (already analyzed and encoded)

Your task is to:
1. Describe the image thoroughly and precisely. Write the description in a way that allows a student to answer the question **even without seeing the image**. Include all visual elements relevant to the question.
2. Provide a detailed, step-by-step reasoning for solving the question, like a teacher explaining to a student.
3. Analyze **each answer choice**, explaining why each is correct or incorrect.
4. Format your output strictly as JSON, with the following keys:

{
  "image_description": "...",
  "reasoning": "...",
  "explanations": "0: ...\\n1: ...\\n2: Is the correct answer\\n3: ..."
}

Be precise, use technical language when needed, and avoid guessing beyond what the image shows.
"""

In [7]:

# 工具函数：图像转 base64


def encode_image(image_path):
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


In [8]:
# 文件路径
input_path = "/workspace/Janus_fine_tuning/dataset/unpacked_dataset/TQA/val.jsonl"
output_path = "/workspace/Janus_fine_tuning/dataset/processed_dataset/val_cot_multimodal.jsonl"

In [12]:
from tqdm import tqdm

# 先加载已完成的样本 ID
processed_ids = set()
if os.path.exists(output_path):
    with open(output_path, "r") as fout:
        for line in fout:
            try:
                existing = json.loads(line)
                processed_ids.add(existing["id"])
            except:
                continue

print(f"已完成样本数：{len(processed_ids)}")

# 以追加方式写入结果
with open(input_path, "r") as fin, open(output_path, "a") as fout:
    for line in tqdm(fin):
        sample = json.loads(line)
        if sample["id"] in processed_ids:
            continue  # 跳过已处理样本

        question = sample["question"]
        options = sample["options"]
        answer = sample["answer"]
        image_path = sample["image_path"]

        try:
            base64_img = encode_image(image_path)
        except Exception as e:
            print(f"[跳过图像] {image_path} 出错: {e}")
            continue

        user_content = [
            {
                "type": "text",
                "text": f"""Here is a VQA sample. Please analyze the image and answer accordingly.\n\nQuestion: {question}\nOptions: {options}\nAnswer Index: {answer}\n\nNow generate your response as per the instructions."""
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{base64_img}"
                }
            }
        ]

        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": PROMPT},
                    {"role": "user", "content": user_content}
                ]
            )

            output_text = response.choices[0].message.content.strip()
            output_text = response.choices[0].message.content.strip()

            # 去除 markdown 包裹
            if output_text.startswith("```json"):
                output_text = output_text.lstrip("```json").rstrip("```").strip()
            elif output_text.startswith("```"):
                output_text = output_text.lstrip("```").rstrip("```").strip()

            # 修复尾部多余逗号（防止非法 JSON）
            if output_text.endswith(","):
                output_text = output_text[:-1].rstrip(" \n")

            # 或更安全地修复最后一个 '},' 为 '}'
            if output_text.rstrip().endswith("},"):
                output_text = output_text.rstrip()[:-1] + "\n}"

            try:
                parsed = json.loads(output_text)
            except Exception as je:
                print(f"[解析失败] {sample.get('id')} → JSON解析错误：{je}")
                print("原始输出:\n", output_text)
                continue

            # 保存结果
            sample["image_description"] = parsed["image_description"]
            sample["reasoning"] = parsed["reasoning"]
            sample["explanations"] = parsed["explanations"]
            fout.write(json.dumps(sample, ensure_ascii=False) + "\n")

        except Exception as e:
            print(
                f"[错误] {sample.get('id')} 处理失败：{e}\n Output is: {output_text}")

已完成样本数：86


0it [00:00, ?it/s]

137it [10:11, 16.48s/it]

[错误] val_1224 处理失败：Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: \r\nhttps://****/fwlink/  (request id: 20250406235940113410741UTWU6LX5)", 'param': 'prompt', 'code': 'content_filter'}}
 Output is: {
  "image_description": "The image depicts a colorful and detailed diagram of a cell, showcasing various organelles and structures within it. The cell has a visible outer boundary highlighted in an orange-yellow shade. This boundary is the plasma membrane, which is denoted by the letter 'A' in the diagram. Inside the cell, there are several organelles: mitochondria marked as 'C' in bright red, a nucleus represented by the label 'X' and surrounded by a nuclear envelope marked as 'T', and various other components such as ribosomes, endoplasmic reticulum, and cytoplasm. Each part of the 

171it [16:02,  8.73s/it]

[解析失败] val_1158 → JSON解析错误：Invalid control character at: line 4 column 746 (char 1597)
原始输出:
 {
  "image_description": "The image depicts a bacterium, which is represented in a simplified way. The bacterium is elongated and has various labeled parts. There are long, thin hair-like structures protruding from its surface, representing pili. Each labeled part, marked with letters A, E, L, Y, and C, points to different structures of the bacterium. The structure labeled 'Y' specifically corresponds to the pilus, which is typically observed as a filamentous appendage that aids in adhesion and conjugation in bacteria. The inner portion of the bacterium contains a coiled structure, which represents the genetic material, surrounded by the cytoplasm and a membrane. The labels are positioned such that they indicate the respective structures they refer to, and 'Y' being associated with the pilus is the most notable distinguishing feature.",
  
  "reasoning": "To determine which label refers to the

193it [19:20,  8.50s/it]

[解析失败] val_2090 → JSON解析错误：Expecting property name enclosed in double quotes: line 5 column 1 (char 1507)
原始输出:
 {
  "image_description": "The image depicts a diagram of the cell cycle, represented as a circular flowchart. Inside the circle, there are four distinct phases labeled as G1, S, G2, and M. Each phase is indicated with a colored arrow connecting them in a clockwise direction. The G1 phase is at the left, followed by the S phase at the top, G2 to the right, and the M phase at the bottom of the cycle. In the center of the diagram, there is a bold label that reads 'CELL CYCLE'. The cells shown are stylized with visible nuclei, and the division process during the M phase is indicated by two cells with condensed chromosomes.",
  "reasoning": "To answer the question about how many phases the cell cycle is made up of, we first recognize that the cell cycle traditionally includes four main phases: G1 phase (cell growth), S phase (DNA synthesis), G2 phase (preparation for mitosis), an

223it [25:08,  9.31s/it]

[解析失败] val_1184 → JSON解析错误：Expecting property name enclosed in double quotes: line 5 column 1 (char 2363)
原始输出:
 {
  "image_description": "The image displays a colorful, labeled diagram of a cell with several components identified by letters. The central part of the cell contains a large oval structure, which represents the nucleus, with a smaller circular structure inside it that may represent a nucleolus. Surrounding the nucleus are various organelles and structures. Notably, there are two bean-shaped structures that are typically indicative of mitochondria. The letters labeling the different parts are positioned with arrows indicating which part they refer to: 'A', 'E', 'D', and 'W'. The letter 'X' is placed towards the bottom left, while 'L' is towards the lower right area of the cell. Importantly, the labeling does not extend to include a structure that would clearly represent ribosomes, which are usually small dots or granules in a cell, typically seen in the cytoplasm or on the 

228it [25:47,  6.79s/it]


KeyboardInterrupt: 