In [None]:
import os
import pandas as pd
from PIL import Image
from io import BytesIO
from tqdm import tqdm

# ============ Paths ============
parquet_path = '../data/raw/p2_muse512/data/train-00000-of-00001-d56c6394e70cbcea.parquet'
output_image_dir = '../examples/p2/prompt_images'
output_text_path = '../examples/p2/sentences.txt'


def clean_sentence(value):
    """Return *value* as a stripped, single-line string.

    Anything that is not a ``str`` (``None``, NaN, numbers, ...) yields ``""``
    so callers can treat "missing" and "blank" the same way. Line breaks and
    tabs are replaced by spaces because the output file stores exactly one
    sentence per line.
    """
    if not isinstance(value, str):
        return ""
    for ch in ("\r", "\n", "\t"):
        value = value.replace(ch, " ")
    return value.strip()


def pick_sentence(row):
    """Return the first usable sentence of *row*, or ``""`` if there is none.

    *row* only needs a ``.get(key)`` method (a ``dict`` or a pandas
    ``Series``). ``sentence_text_knowledge`` is preferred; ``Prompt`` is the
    fallback when the former is missing or blank after cleaning.
    """
    for key in ('sentence_text_knowledge', 'Prompt'):
        text = clean_sentence(row.get(key))
        if text:
            return text
    return ""


# The guard keeps the helpers importable without touching the filesystem;
# in a notebook or a script run __name__ is "__main__", so the export still
# executes and its variables remain module-level globals.
if __name__ == "__main__":
    os.makedirs(output_image_dir, exist_ok=True)
    # Make sure the text file's folder exists even if it is moved elsewhere.
    text_dir = os.path.dirname(output_text_path)
    if text_dir:
        os.makedirs(text_dir, exist_ok=True)

    # ============ Load the parquet file ============
    df = pd.read_parquet(parquet_path)

    # ============ Iterate and export ============
    sentences = []
    img_idx = 0  # index of the next image; line k of the text file describes k.png

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        # Validate the text first so a row without usable text never
        # produces an image or consumes an image index.
        sentence = pick_sentence(row)
        if not sentence:
            continue

        image_dict = row['images']

        # Decode the image
        if isinstance(image_dict, dict) and 'bytes' in image_dict:
            try:
                img = Image.open(BytesIO(image_dict['bytes'])).convert("RGB")
            except Exception as e:
                # Best effort: report the bad row and keep going.
                print(f"❌ 图像解析失败 idx={idx}: {e}")
                continue
        else:
            print(f"❌ 无效图像格式 idx={idx}")
            continue

        # Save the image as 0.png, 1.png, ...
        img_path = os.path.join(output_image_dir, f"{img_idx}.png")
        try:
            img.save(img_path, format="PNG")
        except Exception as e:
            print(f"❌ 图像保存失败 idx={idx}: {e}")
            continue

        # Record the sentence only after its image was written successfully,
        # so image indices and text lines stay aligned.
        sentences.append(sentence)
        img_idx += 1

    # Write the plain-text file (one sentence per line)
    with open(output_text_path, "w", encoding="utf-8") as fout:
        fout.write("\n".join(sentences))

    print(f"✅ 完成！共保存 {img_idx} 张图像和对应文本到：\n📁 {output_image_dir}\n📄 {output_text_path}")

100%|██████████| 1632/1632 [00:26<00:00, 62.66it/s]

✅ 完成！共保存 1632 张图像和对应文本到：
📁 ../examples/prompt_images
📄 ../examples/sentences.txt





In [11]:
import json
import os

# === Parameters (edit these) ===
jsonl_path = "../data/raw/t2i_factualbench/data/prompts/text_injection/SKCM/prompt.jsonl"   # source JSONL file
output_path = "../examples/SKCM/sentences.txt"  # output text file
n = 612  # index of the last input line to read: lines 0..n are processed (n + 1 lines)


def extract_sentence(line):
    """Return the single-line sentence stored in one JSONL record.

    Returns ``""`` when the line is blank, the record is not a JSON object,
    or its ``sentence_text_knowledge`` value is missing or not a string.
    Line breaks and tabs inside the sentence are replaced by spaces because
    the output file stores exactly one sentence per line.

    Raises:
        json.JSONDecodeError: if a non-blank *line* is not valid JSON.
    """
    line = line.strip()
    if not line:
        # Blank lines are not malformed records; skip them quietly.
        return ""
    data = json.loads(line)
    if not isinstance(data, dict):
        return ""
    sentence = data.get("sentence_text_knowledge", "")
    if not isinstance(sentence, str):
        return ""
    for ch in ("\r", "\n", "\t"):
        sentence = sentence.replace(ch, " ")
    return sentence.strip()


# The guard keeps the helper importable without touching the filesystem;
# in a notebook or a script run __name__ is "__main__", so the extraction
# still executes.
if __name__ == "__main__":
    # Create the output folder if needed; dirname is "" for a bare filename,
    # and os.makedirs("") would raise.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(jsonl_path, 'r', encoding='utf-8') as f_in, open(output_path, 'w', encoding='utf-8') as f_out:
        for i, line in enumerate(f_in):
            if i > n:
                break
            try:
                sentence = extract_sentence(line)
            except json.JSONDecodeError as e:
                print(f"JSON解析失败 at line {i}: {e}")
                continue
            if sentence:
                f_out.write(sentence + '\n')
