In [13]:
import os
import json
import base64
from tqdm import tqdm
from openai import OpenAI


In [14]:

# Initialise the API client.
#
# The API key is read from the OPENAI_API_KEY environment variable rather than
# being embedded in the notebook, so the credential is never saved or shared
# together with the code. A missing variable raises KeyError immediately
# instead of failing later on the first request.
# NOTE(review): a key was previously committed in this cell; it should be
# treated as leaked and revoked with the provider.
client = OpenAI(
    base_url='https://api.nuwaapi.com/v1',
    api_key=os.environ["OPENAI_API_KEY"],
)


In [15]:

# System prompt sent with every request. It instructs the model to return a
# JSON object with three keys: image_description, reasoning and explanations.
# The doubled backslashes below put a literal "\n" (backslash + n) into the
# prompt text, i.e. the JSON escape sequence rather than a real line break.
PROMPT = """You are a helpful and knowledgeable teacher AI designed to answer science VQA (Visual Question Answering) problems. Each problem includes:

- A question
- Multiple answer choices
- The correct choice index
- An image (already analyzed and encoded)

Your task is to:
1. Describe the image thoroughly and precisely. Write the description in a way that allows a student to answer the question **even without seeing the image**. Include all visual elements relevant to the question.
2. Provide a detailed, step-by-step reasoning for solving the question, like a teacher explaining to a student.
3. Analyze **each answer choice**, explaining why each is correct or incorrect.
4. Format your output strictly as JSON, with the following keys:

{
  "image_description": "...",
  "reasoning": "...",
  "explanations": "0: ...\\n1: ...\\n2: Is the correct answer\\n3: ..."
}

Be precise, use technical language when needed, and avoid guessing beyond what the image shows.
"""

In [16]:

# Utility: encode an image file as a base64 string


def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is opened in binary mode, its bytes are base64-encoded, and the
    encoded bytes are decoded to a ``str`` using UTF-8.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")


In [17]:
# File paths.
# input_path: JSONL file that is read one sample per line.
# output_path: JSONL file that results are appended to; it is also read back
# at start-up to find the IDs of samples that were already processed.
input_path = "/workspace/Janus_fine_tuning/dataset/unpacked_dataset/TQA/val.jsonl"
output_path = "/workspace/Janus_fine_tuning/dataset/processed_dataset/val_cot_multimodal.jsonl"

In [18]:
from tqdm import tqdm

def _clean_model_output(text):
    """Return ``text`` with a surrounding Markdown code fence and a trailing comma removed.

    Handles replies wrapped as ```json ... ``` or ``` ... ```. The fence is
    removed as an exact prefix/suffix; ``str.lstrip``/``str.rstrip`` are not
    used because they strip *sets of characters* and could also eat leading
    content characters such as ``j``, ``s``, ``o`` or ``n``.
    """
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        if text.startswith("json"):
            text = text[4:]
        if text.endswith("```"):
            text = text[:-3]
        text = text.strip()
    # A single trailing comma after the JSON value would make it invalid JSON.
    if text.endswith(","):
        text = text[:-1].rstrip()
    return text


# Load the IDs of samples already present in the output file so that a
# re-run resumes where the previous one stopped.
processed_ids = set()
if os.path.exists(output_path):
    with open(output_path, "r", encoding="utf-8") as fout:
        for line in fout:
            try:
                existing = json.loads(line)
                processed_ids.add(existing["id"])
            except (json.JSONDecodeError, KeyError, TypeError):
                # Skip lines that are not a JSON object with an "id" key
                # (e.g. blank or truncated lines).
                continue

print(f"已完成样本数：{len(processed_ids)}")

# Append results to the output file. UTF-8 is set explicitly because records
# are written with ensure_ascii=False and may contain non-ASCII text.
with open(input_path, "r", encoding="utf-8") as fin, \
        open(output_path, "a", encoding="utf-8") as fout:
    for line in tqdm(fin):
        if not line.strip():
            continue  # ignore blank lines instead of failing in json.loads

        sample = json.loads(line)
        if sample["id"] in processed_ids:
            continue  # already processed in a previous run

        question = sample["question"]
        options = sample["options"]
        answer = sample["answer"]
        image_path = sample["image_path"]

        try:
            base64_img = encode_image(image_path)
        except Exception as e:
            print(f"[跳过图像] {image_path} 出错: {e}")
            continue

        user_content = [
            {
                "type": "text",
                "text": f"""Here is a VQA sample. Please analyze the image and answer accordingly.\n\nQuestion: {question}\nOptions: {options}\nAnswer Index: {answer}\n\nNow generate your response as per the instructions."""
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{base64_img}"
                }
            }
        ]

        # Reset per sample so the error handler below never raises NameError
        # (request failed before any output existed) and never prints the
        # previous sample's output.
        output_text = None
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": PROMPT},
                    {"role": "user", "content": user_content}
                ]
            )

            # The message content can be None; treat that as empty text so it
            # is reported as a parse failure below.
            output_text = _clean_model_output(
                response.choices[0].message.content or "")

            try:
                parsed = json.loads(output_text)
            except json.JSONDecodeError as je:
                print(f"[解析失败] {sample.get('id')} → JSON解析错误：{je}")
                print("原始输出:\n", output_text)
                continue

            # Save the result. Read all three keys before touching `sample`
            # so a missing key cannot leave it half-updated.
            image_description = parsed["image_description"]
            reasoning = parsed["reasoning"]
            explanations = parsed["explanations"]
            sample["image_description"] = image_description
            sample["reasoning"] = reasoning
            sample["explanations"] = explanations
            fout.write(json.dumps(sample, ensure_ascii=False) + "\n")
            # Flush after every record so finished work reaches the file even
            # if the run is stopped part-way through.
            fout.flush()
            # Guard against the same id appearing again later in the input.
            processed_ids.add(sample["id"])

        except Exception as e:
            print(
                f"[错误] {sample.get('id')} 处理失败：{e}\n Output is: {output_text}")

已完成样本数：224


0it [00:00, ?it/s]

260it [07:20,  1.69s/it]


KeyboardInterrupt: 