In [1]:
import os
import xml.etree.ElementTree as ET
from collections import defaultdict

def parse_story_files(story_folder):
    story_text_map = {}
    for file in os.listdir(story_folder):
        if file.startswith("Story_") and file.endswith(".xml"):
            tree = ET.parse(os.path.join(story_folder, file))
            root = tree.getroot()
            story_id = root.attrib.get("Self")
            texts = []
            for content in root.iter():
                if content.tag.endswith("Content"):
                    texts.append(content.text)
            if texts:
                story_text_map[story_id] = "\n".join(filter(None, texts))
    return story_text_map

def parse_spread_files(spread_folder):
    frames = []
    for file in os.listdir(spread_folder):
        if file.startswith("Spread_") and file.endswith(".xml"):
            tree = ET.parse(os.path.join(spread_folder, file))
            root = tree.getroot()

            for textframe in root.findall(".//TextFrame"):
                frame_info = {
                    "type": "text",
                    "id": textframe.attrib.get("Self"),
                    "story_id": textframe.attrib.get("ParentStory"),
                }
                frames.append(frame_info)

            for image in root.findall(".//Image"):
                image_id = image.attrib.get("Self")
                link = image.find(".//Link")
                link_uri = link.attrib.get("LinkResourceURI") if link is not None else "unknown"
                frame_info = {
                    "type": "image",
                    "id": image_id,
                    "link_uri": link_uri
                }
                frames.append(frame_info)
    return frames

def extract_idml_content(idml_root):
    story_folder = os.path.join(idml_root, "Stories")
    spread_folder = os.path.join(idml_root, "Spreads")

    story_map = parse_story_files(story_folder)
    frames = parse_spread_files(spread_folder)

    output = []
    for frame in frames:
        if frame["type"] == "text":
            text = story_map.get(frame["story_id"], "[No text found]")
            output.append({
                "type": "text",
                "story_id": frame["story_id"],
                "frame_id": frame["id"],
                "content": text
            })
        elif frame["type"] == "image":
            output.append({
                "type": "image",
                "frame_id": frame["id"],
                "link_uri": frame["link_uri"]
            })

    return output

# === CONFIGURATION ===
idml_folder = "mc2410_068069ukCAa1a"  # Change this to your folder path

# === EXECUTION ===
result = extract_idml_content(idml_folder)

# === DISPLAY ===
for item in result:
    if item["type"] == "text":
        print(f"\n📝 Text Frame [{item['frame_id']}] (Story {item['story_id']}):\n{item['content']}")
    elif item["type"] == "image":
        print(f"\n🖼️ Image Frame [{item['frame_id']}]: linked to {item['link_uri']}")



📝 Text Frame [u1e1b] (Story u1e1e):
[No text found]

📝 Text Frame [u1be0] (Story u1be3):
[No text found]

📝 Text Frame [u1bf8] (Story u1bfb):
[No text found]

📝 Text Frame [u1db7] (Story u1dba):
[No text found]

📝 Text Frame [u1dcf] (Story u1dd2):
[No text found]

📝 Text Frame [u1de7] (Story u1dea):
[No text found]

📝 Text Frame [u1dff] (Story u1e03):
[No text found]

📝 Text Frame [u1e33] (Story u1e36):
[No text found]

📝 Text Frame [u1e4b] (Story u1e36):
[No text found]

📝 Text Frame [u1e4f] (Story u1e54):
[No text found]

📝 Text Frame [u1ea7] (Story u1eaa):
[No text found]

📝 Text Frame [u1ebf] (Story u1ec2):
[No text found]

📝 Text Frame [u1ed7] (Story u1eda):
[No text found]

📝 Text Frame [u1efb] (Story u1efe):
[No text found]

📝 Text Frame [u1f13] (Story u1f16):
[No text found]

📝 Text Frame [u1f2b] (Story u1f2e):
[No text found]

📝 Text Frame [u1f46] (Story u1f49):
[No text found]

📝 Text Frame [u1f5e] (Story u1f63):
[No text found]

📝 Text Frame [u1f79] (Story u1f7c):
[No text 

In [1]:
import fitz
from pathlib import Path

# Paths
pdf_dir = "dataSources/2410/pdfs"
output_dir = "dataSources/2410/images"
path_pdf_dir = Path(pdf_dir)
path_output_dir = Path(output_dir)
path_output_dir.mkdir(parents=True, exist_ok=True)

# Loop over all PDF files in the folder
for pdf_path in sorted(path_pdf_dir.glob("*.pdf")):
    # Convert PDF to list of images (one per page)
    doc = fitz.open(pdf_path)
    page = doc[0]

    zoom = 2.0  # 2.0 → 150 DPI (approx)
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=mat)

    # Save image
    image_path = path_output_dir / f"{pdf_path.stem}.jpg"
    pix.save(str(image_path))
    print(f"Saved: {image_path}")


Saved: dataSources\2410\images\fg2410_023.jpg
Saved: dataSources\2410\images\fg2410_024.jpg
Saved: dataSources\2410\images\fg2410_025.jpg
Saved: dataSources\2410\images\fg2410_027.jpg
Saved: dataSources\2410\images\fg2410_028.jpg
Saved: dataSources\2410\images\fg2410_029.jpg
Saved: dataSources\2410\images\fg2410_031.jpg
Saved: dataSources\2410\images\fg2410_032.jpg
Saved: dataSources\2410\images\fg2410_033.jpg
Saved: dataSources\2410\images\fg2410_034.jpg
Saved: dataSources\2410\images\fg2410_037.jpg
Saved: dataSources\2410\images\fg2410_038.jpg
Saved: dataSources\2410\images\fg2410_039.jpg
Saved: dataSources\2410\images\fg2410_040.jpg
Saved: dataSources\2410\images\fg2410_042.jpg
Saved: dataSources\2410\images\fg2410_043.jpg
Saved: dataSources\2410\images\fg2410_044.jpg
Saved: dataSources\2410\images\fg2410_045.jpg
Saved: dataSources\2410\images\fg2410_046.jpg
Saved: dataSources\2410\images\fg2410_047.jpg
Saved: dataSources\2410\images\fg2410_048.jpg
Saved: dataSources\2410\images\fg2

In [5]:
chunks = [
    "Introducing Tudor And The Marine Nationale Go Worldwide With The Pelagos FXD GMT 'Zulu Time'\nAn FXD for the skies.",
    "What We Know\nAt 12:00 Zulu Time, Tudor went hot with a new Pelagos FXD, a watch originally developed with the French Naval special forces unit Commando Hubert as a purpose-built tool for underwater navigation and buddy diving. That watch became a platform for a variety of releases, including sailing chronographs, cycling chronographs, and a bunch of others. Tudor has now gone back to the core inspiration for another watch built for the rigors of the Aéronautique Navale with a new GMT version of the Pelagos FXD.\nCased in Grade 2 titanium, measuring 42mm by 12.7mm with a 52mm lug-to-lug fixed strap-bar design, the watch looks and feels similar to several previous Tudor releases, all combined into one tough-looking package. The watch features a matte black dial and beige applied indices (with blue emission lume), an orange GMT hand (with green emission lume matching the GMT bezel), and red accents for the product name on the dial. The bezel is also in titanium with a ceramic bi-directional GMT insert. The caseback features the engraving 'M.N.24' (Marine Nationale 2024) as a callback to the original military-issued Tudors of the '70s and '80s.\nInside, you'll find the Manufacture Calibre MT5652-U automatic 'flyer' GMT movement with 65 hours of power reserve and an independent jumping hour GMT hand. It's also METAS-certified. The watch comes on a 'flight-suit green,' one-piece fabric strap with a grade 2 titanium pin buckle and keeper. It also has an extra removable fabric keeper with the French Naval Aviation roundel if you really want to lean into the French military theme. The watch is listed at $4,625 and isn't a limited edition.",
    "What We Think\nWhen Tudor started teasing this release, I was pretty certain it would be a titanium version of the FXD Chrono platform because of the emphasis they put on helicopters. A flyback chronograph for pilots would have made a lot of sense and shown the breadth of what the Marine Nationale and, specifically, the Commando Hubert do on a mission basis, including airborne insertions. On the other hand, the Pelagos FXD GMT makes a lot of sense as a complementary tool for every member of the team, especially when they're operating boots dry or just on station in a land-based combat zone.\nMy immediate reaction to the image on Instagram was that I had to get one as soon as possible. I regret not picking up the original Pelagos FXD with the M.N.21 caseback upon release, and as someone who wears a GMT almost daily, I've been waiting for the right Tudor GMT to join the collection. This is like a mashup of that original FXD M.N. and the LHD (Left-Hand Drive). When I saw it was the older MT5652 movement, I paused. It's a solid movement, with a flyer GMT, but the Black Bay Pro has that same movement and an interesting design but falls on the chunkier side.\nThe new Pelagos FXD GMT feels more reasonable, with a wider 42mm case but thinner at 12.7mm (versus 14.6mm for the Pro), which isn't too far off from my Rolex 126710BLRO. The larger case size might also balance out nicely with the Grade 2 Titanium. I'm going to wait to reserve final judgment (and payment) until I finally get my hands on one in person, but even more importantly, we should get a follow-up from Hodinkee's King of Tudor, James Stacey, as soon as he can get his Pelagos-loving hands on one.",
    "The Basics\nBrand: Tudor\nModel: Pelagos FXD GMT\nReference Number: 2542G247NU\nDiameter: 42mm\nThickness: 12.7mm\nLug-to-Lug: 52mm\nCase Material: Grade 2 titanium\nDial Color: Matte black\nIndexes: Beige applied hour markers with matching 'Snowflake' handset\nLume: Blue and green emission Super-LumiNova\nWater Resistance: 200mm\nStrap/Bracelet: Flight-suit green one-piece fabric strap with grade 2 titanium pin buckle and keeper; extra removable fabric keeper with the French Naval Aviation roundel",
    "The Movement\nCaliber: Manufacture Calibre MT5652-U\nFunctions: Hours, minutes, seconds, date, flyer GMT\nDiameter: 31.8mm\nThickness: 7.5mm\nPower Reserve: 65 hours\nWinding: Automatic\nFrequency: 28,800 vph\nJewels: 28 jewels\nChronometer Certified: Yes, METAS certification",
    "Pricing & Availability\nPrice: $4,625\nAvailability: Immediately\nLimited Edition: No\nFor more, click here.",
]
print(len(chunks))

6


In [6]:
for i, chunk in enumerate(chunks):

    arr_previous_en = chunks[:i]
    previous_en = "\n\n".join(arr_previous_en)
    print(f'index :: {i}\nprevious en :: {previous_en}')
    # previous_jp = "\n\n".join(translated_chunks)


index :: 0
previous en :: 
index :: 1
previous en :: Introducing Tudor And The Marine Nationale Go Worldwide With The Pelagos FXD GMT 'Zulu Time'
An FXD for the skies.
index :: 2
previous en :: Introducing Tudor And The Marine Nationale Go Worldwide With The Pelagos FXD GMT 'Zulu Time'
An FXD for the skies.

What We Know
At 12:00 Zulu Time, Tudor went hot with a new Pelagos FXD, a watch originally developed with the French Naval special forces unit Commando Hubert as a purpose-built tool for underwater navigation and buddy diving. That watch became a platform for a variety of releases, including sailing chronographs, cycling chronographs, and a bunch of others. Tudor has now gone back to the core inspiration for another watch built for the rigors of the Aéronautique Navale with a new GMT version of the Pelagos FXD.
Cased in Grade 2 titanium, measuring 42mm by 12.7mm with a 52mm lug-to-lug fixed strap-bar design, the watch looks and feels similar to several previous Tudor releases, all

In [None]:
import json

def format_label_studio_for_donut(label_studio_json_path, output_jsonl_path, image_dir_prefix=""):
    """
    Converts Label Studio JSON export to JSONL format for Donut model fine-tuning.
    Handles a 'PageIndex' label by including it in 'gt_parses' by default,
    or can be modified to extract it as a top-level key.

    Args:
        label_studio_json_path (str): Path to the Label Studio exported JSON file.
        output_jsonl_path (str): Path to save the formatted JSONL file.
        image_dir_prefix (str): Optional prefix for image file names
                                (e.g., "path/to/your/images/").
    """
    formatted_data = []

    with open(label_studio_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for task in data:
        if not task.get('annotations'):
            print(f"Skipping task ID {task.get('id')} as it has no annotations.")
            continue

        annotation = task['annotations'][0]
        results = annotation['result']

        image_file_name = task.get('file_upload')
        if not image_file_name:
            image_path_in_label_studio = task.get('data', {}).get('image', "")
            if image_path_in_label_studio:
                image_file_name = image_path_in_label_studio.split('/')[-1]
            else:
                print(f"Warning: Could not determine image file name for task ID {task.get('id')}")
                image_file_name = f"unknown_image_{task.get('id')}.jpg"

        full_image_path = f"{image_dir_prefix}{image_file_name}"

        bboxes_info = {}
        texts_info = {}

        for res_item in results:
            item_id = res_item.get('id')
            if not item_id:
                continue

            if res_item['type'] == 'rectanglelabels' and 'value' in res_item and 'rectanglelabels' in res_item['value']:
                label = res_item['value']['rectanglelabels'][0]
                bboxes_info[item_id] = {"label": label}
            elif res_item['type'] == 'textarea' and 'value' in res_item and 'text' in res_item['value']:
                text_content = "\n".join(res_item['value']['text'])
                texts_info[item_id] = {"text": text_content}

        gt_parses_list = []
        page_index_text = None # For extracting PageIndex as a separate field

        for item_id, bbox_data in bboxes_info.items():
            if item_id in texts_info:
                label = bbox_data["label"]
                text = texts_info[item_id]["text"]

                # --- Optional: Modification for PageIndex as a separate field ---
                # If you want PageIndex as a top-level key like "page_index": "123"
                # in the ground_truth JSON, uncomment and adapt the following:
                #
                # if label.lower() == 'pageindex':
                #     page_index_text = text
                #     # Optionally, skip adding it to gt_parses_list if it's top-level only
                #     # continue
                # --- End Optional Modification ---

                gt_item = {
                    "label": label,
                    "text": text
                }
                gt_parses_list.append(gt_item)
            else:
                # Handle cases where a bbox might not have corresponding text
                label = bbox_data["label"]
                gt_item = {
                    "label": label,
                    "text": "" # Placeholder for image regions without text
                }
                # Allow "Image" (or similar visual-only labels) to have no text
                if label.lower() == "image":
                    gt_parses_list.append(gt_item)
                # If PageIndex must have text, it would fall into the warning below
                # Or, if PageIndex *can* be just a bbox without text (unlikely for page number):
                # elif label.lower() == 'pageindex':
                #    gt_parses_list.append(gt_item) # If allowing PageIndex without text
                else:
                    print(f"Warning: Bounding box with ID {item_id} and label '{label}' has no corresponding text in task ID {task.get('id')} for image {image_file_name}.")

        # Create the ground_truth structure
        ground_truth_dict = {"gt_parses": gt_parses_list}

        # --- Optional: Add PageIndex as a top-level key if extracted ---
        # if page_index_text is not None:
        #     ground_truth_dict["page_index"] = page_index_text
        # --- End Optional ---
        
        ground_truth_json_string = json.dumps(ground_truth_dict, ensure_ascii=False)

        formatted_data.append({
            "file_name": full_image_path,
            "ground_truth": ground_truth_json_string
        })

    with open(output_jsonl_path, 'w', encoding='utf-8') as outfile:
        for entry in formatted_data:
            json.dump(entry, outfile, ensure_ascii=False)
            outfile.write('\n')

    print(f"Successfully formatted data and saved to {output_jsonl_path}")

# --- How to use ---
# 1. Save your Label Studio export as a .json file.
# 2. Update the paths in the example usage below.

# Example Usage:
# Use the filename of your updated JSON export
label_studio_file = 'project-1-at-2025-05-17-11-36-ccf01f10.json' # Path to your updated Label Studio JSON export
output_file = 'donut_finetune_data_with_pageindex.jsonl' # Desired output path

image_directory_prefix = "" # Adjust if needed, e.g., "data/fashion_magazines/"

# Ensure the JSON file is accessible by the script.
format_label_studio_for_donut(label_studio_file, output_file, image_directory_prefix)

In [None]:
import json

# Assuming `data` contains the loaded JSON structure
with open("outputs/bbre/fg2410_174175BbreA1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

all_texts = []

# Iterate through each page
for page in data.get("pages", []):
    texts = page.get("texts", [])
    for text_entry in texts:
        content = text_entry.get("Content")
        if content:
            all_texts.append(content)

# Join and print all text content
full_text = "\n\n".join(all_texts)
print(full_text)

# Optional: Save to a file
with open("extracted_text.txt", "w", encoding="utf-8") as f:
    f.write(full_text)

「 乳がん 」
最新白書

刻々と進化する最新知見情報をキャッチアップ！

9人に1人がかかる身近な病気である乳がん。
乳がん治療はここ十数年で大きく進化しています。
罹患者数は増えていますが、
早期乳がんの割合が半数を超え、
治るがんになりました。
最新情報を更新して、
あなたのがん対策に役立ててください。流行中のハンサムを演出する女性に、
スーツほど相応しいアイテムはないでしょう。
イタリアの女性は上手にスーツを
着こなします。彼女たちはモード系や
デザイナーズの服ばかりを着ているのでは
ありません。ク１００wラシコイタリアが
ブームになってから、テーラードスーツ姿の女性を
見かけることが多くなりました。彼女たち

取材・文＝増田美加（女性医療ジャーナリスト）
イラスト＝River Rie（Softdesign）
写真協力＝Ggetty Iimages Shutterstock
編集＝石黒三惠（本誌）

いま、乳がんは医療の発展に伴い、
治るがんに
なっています。
乳がんほど、“患者ファースト”が
進んでいる
医療現場はありません。
心身ともに患者の負担を減らすための
個別化治療も進んでいます。
女性である以上、身近な病気であり続ける乳がん。
治療法も再建法も大きく進化していますが、
早期発見が何より
大切であることは変わっていません。

が乳がんに罹患したのは２００６年、いまから18年前。乳がんの取材活動は約30年前から行っていますが、この十数年で乳がんの検診、治療、ケア現場は大きく変わったと思います。乳がんの医療者は、患者とともに治療を行っていく姿勢がほかのがん治療に比べて突出しています。乳がんは患者ファーストが最も進んでいるがん医療現場で、患者とのコミュニケーションを特に大事にします。乳がん治療の医療者間で急速に広がっている患者と医療者の共同意思決定「シェアード・ディシジョン・メイキング（ＳＤМ）」という考え方もそのことを表しています（Ｐ１７９参照）。
その理由は、乳がんがほかのがんと比べて若い世代に多く、もの言う患者やサバイバーであるからかもしれません。ほかのがんは60代、70代、80代と年齢が上がるごとに増えていきますが、乳がんはちょっと違います。妊娠、出産前の若い年代にも発症し、ピークは働き盛りの40代から60代なのです（最近は70代以上の罹患も増え

In [5]:
import json
import argparse
import os

def sort_text_blocks(texts, tolerance=10):
    """
    Sorts a list of text block dictionaries based on natural reading order
    for Japanese magazines (top-to-bottom, right-to-left).

    Args:
        texts (list): A list of text dictionaries, each with a 'Bounds' key.
        tolerance (int, optional): The vertical tolerance in points to consider
                                   text blocks as being on the same line.
                                   Defaults to 10.

    Returns:
        list: The sorted list of text dictionaries.
    """
    if not texts:
        return []

    # Sort primarily by the top coordinate (y1) to establish vertical order
    texts.sort(key=lambda t: t['Bounds']['y1'])

    sorted_texts = []
    current_row = []
    if texts:
        current_row.append(texts[0])

    # Group texts into rows based on vertical proximity
    for i in range(1, len(texts)):
        prev_item = current_row[0]
        current_item = texts[i]
        # If the top of the current item is close to the top of the previous item,
        # consider them part of the same row.
        if abs(current_item['Bounds']['y1'] - prev_item['Bounds']['y1']) < tolerance:
            current_row.append(current_item)
        else:
            # Finalize the previous row: sort it right-to-left
            current_row.sort(key=lambda t: -t['Bounds']['x2']) # Sort by right edge, descending
            sorted_texts.extend(current_row)
            # Start a new row
            current_row = [current_item]

    # Don't forget to process the last row
    if current_row:
        current_row.sort(key=lambda t: -t['Bounds']['x2']) # Sort by right edge, descending
        sorted_texts.extend(current_row)
        
    return sorted_texts

input_file_path = 'outputs/kot/fg2410_054061kotB2a.json'

def test():
    try:
        with open(input_file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        print(f"📄 Successfully read data from '{input_file_path}'")
    except (json.JSONDecodeError, IOError) as e:
            print(f"❌ Error reading or parsing JSON from '{input_file_path}': {e}")
            return

    if "pages" not in data:
        print("❌ Error: Input JSON does not contain a 'pages' key.")
        return

    # Process each page
    for page in data["pages"]:
        if "texts" in page and page["texts"]:
            print(f"  Sorting texts for page '{page.get('page_name', 'Unknown')}'...")
            sorted_texts_for_page = sort_text_blocks(page["texts"])
            texts = [item['Content'] for item in sorted_texts_for_page]
            print(f'----------\n{texts}')
            # page["texts"] = sorted_texts_for_page

test()

📄 Successfully read data from 'outputs/kot/fg2410_054061kotB2a.json'
  Sorting texts for page '054'...
----------
['第1章', '古いものを\n今様に楽しむ\n骨董術', '時間の試練に耐えて残ってきた古いものだけがもつ魅力。\nそれを博物館のガラス越しに眺めるのではなく、\n現代の生活に取り入れることにこそ、骨董の醍醐味があります。\nここでは４人の骨董の達人たちに、いままさに暮らしを\nともにしている骨董の品々を披露していただきました。ナヤド造船所でつくられた全長メートルの\nヨットアルバトロス号がスウェーデンの\n東南端にあるカルマー５０字出帆した。２回目の\n初心者でも骨董生活を始めたくなるヒントが満載です。長距離航海に乗り出したのはシ\nョーバーグ一家アイリー夫のヘンリック人の\n子供歳１００字ン９歳の', 'お手本は“骨董の達人”']
  Sorting texts for page '055'...
----------
['“物”として強く、純粋な骨董。\n本当のお気に入りを華美でなく、\n生活に馴染むものを選んで使って、飾って、馴染んでいくう', '内田鋼一さん＆風知さん\n父［陶芸家］、娘［ミュージアム店長・菓子作家パティシエ］', '撮影＝木寺紀雄\u3000編集・文＝柏木敦子（本誌）', '日本スペイン製\n大正時代のたいしょ1900年代の木製ドア', '１８世紀0000年代のハンガリー製乳白カットガラス瓶\nハンガリー製ガラス瓶', 'かつてはゲストルームとして使っていたという広々としたリビングに、家具から器まで、さまざまな骨董が配されている。さていくつある？シンプルなかたちかつ黒色という黒色２０字あるスタイルに３０字だけに、どんな４０字マッチする懐深５０字っている。また６０字くりにヒールを７０字しているために８０字きやすいシンプルなかたちのにかつ１００字うシックなスタイル。それだけにど１２０字もマッチする懐の深さを持っている。また、厚めのつくり１５０字を周する航海計画を立てている小さなヨットの狭い空間に１８０字合う家族にとっては楽しいと同時に２００字に満ちた旅に違いないだが、アイリーショーバーグは楽天的に考える。あ判断力強靭

In [None]:
[
    "“物”として強く、純粋な骨董。\n本当のお気に入りを華美でなく、\n生活に馴染むものを選んで使って、飾って、馴染んでいくう",
    "内田鋼一さん＆風知さん\n父［陶芸家］、娘［ミュージアム店長・菓子作家パティシエ］",
    "撮影＝木寺紀雄\u3000編集・文＝柏木敦子（本誌）",
    "日本スペイン製\n大正時代のたいしょ1900年代の木製ドア",
    "１８世紀0000年代のハンガリー製乳白カットガラス瓶\nハンガリー製ガラス瓶",
    "かつてはゲストルームとして使っていたという広々としたリビングに、家具から器まで、さまざまな骨董が配されている。さていくつある？シンプルなかたちかつ黒色という黒色２０字あるスタイルに３０字だけに、どんな４０字マッチする懐深５０字っている。また６０字くりにヒールを７０字しているために８０字きやすいシンプルなかたちのにかつ１００字うシックなスタイル。それだけにど１２０字もマッチする懐の深さを持っている。また、厚めのつくり１５０字を周する航海計画を立てている小さなヨットの狭い空間に１８０字合う家族にとっては楽しいと同時に２００字に満ちた旅に違いないだが、アイリーショーバーグは楽天的に考える。あ判断力強靭な精神、それにユ２５０字センスがあれば問題ないはずよ以下は年の初これから最難関のホーン岬通過を前にして自分たちの為に３００字ナヤド造船所でつくられた全長メートルのヨットアルバトロス号がスウェーデンの東南端にあるカルマール港を出帆した。２回目の長距離航海に乗り出したのはショーバーグ一家アイリー夫のヘンリック人の子供歳４００字",
]

In [4]:
import json

with open("donut.json", "r") as file:
    data = json.load(file)

rose = None
for item in data:
    if item['id'] == 85:
        rose = item
        break
print(rose)

{'id': 85, 'annotations': [{'id': 70, 'completed_by': 1, 'result': [{'original_width': 1463, 'original_height': 1832, 'image_rotation': 0, 'value': {'x': 82.04716931843677, 'y': 9.071794412301044, 'width': 6.22688271387553, 'height': 26.477847347756757, 'rotation': 0, 'rectanglelabels': ['Title']}, 'id': '12u-5QZ7Bt', 'from_name': 'bbox', 'to_name': 'image', 'type': 'rectanglelabels', 'origin': 'manual'}, {'original_width': 1463, 'original_height': 1832, 'image_rotation': 0, 'value': {'x': 82.04716931843677, 'y': 9.071794412301044, 'width': 6.22688271387553, 'height': 26.477847347756757, 'rotation': 0, 'text': ['日本人とバラ']}, 'id': '12u-5QZ7Bt', 'from_name': 'text_content', 'to_name': 'image', 'type': 'textarea', 'origin': 'manual'}, {'original_width': 1463, 'original_height': 1832, 'image_rotation': 0, 'value': {'x': 68.4612433972538, 'y': 8.912055936562032, 'width': 12.858108461119599, 'height': 27.059068387097767, 'rotation': 0, 'rectanglelabels': ['TextBlock']}, 'id': 'Ng6ORmO3Vf', 'f

In [None]:
annotation = rose.get('annotations')
first_item = annotation[0]
result = first_item.get('result')
grouped_annotations = {}
for item in result:
    original_width = item.get('original_width')
    original_height = item.get('original_height')
    value = item.get('value')
    if value:
        x = value.get('x')
        y = value.get('y')
        width = value.get('width')
        height = value.get('height')
        # 
        x1_abs = int(x/100*original_width)
        y1_abs = int(y/100*original_height)
        x2_abs = int((x+width)/100*original_width)
        y2_abs = int((y+height)/100*original_height)
        bbox = {'x1':x1_abs,'y1':y1_abs,'x2':x2_abs,'y2':y2_abs}
        print(f'value : {value}\nbbox : {bbox}')
    else:
        print('value is None')  


value : {'x': 82.04716931843677, 'y': 9.071794412301044, 'width': 6.22688271387553, 'height': 26.477847347756757, 'rotation': 0, 'rectanglelabels': ['Title']}
bbox : {'x1': 1200.35008712873, 'y1': 166.1952736333551, 'x2': 1291.449381232729, 'y2': 651.269437044259}
value : {'x': 82.04716931843677, 'y': 9.071794412301044, 'width': 6.22688271387553, 'height': 26.477847347756757, 'rotation': 0, 'text': ['日本人とバラ']}
bbox : {'x1': 1200.35008712873, 'y1': 166.1952736333551, 'x2': 1291.449381232729, 'y2': 651.269437044259}
value : {'x': 68.4612433972538, 'y': 8.912055936562032, 'width': 12.858108461119599, 'height': 27.059068387097767, 'rotation': 0, 'rectanglelabels': ['TextBlock']}
bbox : {'x1': 1001.587990901823, 'y1': 163.2688647578164, 'x2': 1189.7021176880028, 'y2': 658.9909976094476}
value : {'x': 68.4612433972538, 'y': 8.912055936562032, 'width': 12.858108461119599, 'height': 27.059068387097767, 'rotation': 0, 'text': ['日本では、いつからバラが存在していたのでしょう。日本原種のバラが欧米のバラと交配されて新たな品種を生んだ歴史や、時代ごとの愛でられ方な

In [10]:
import json
import uuid

# Load Label Studio export
with open("donut.json", "r", encoding="utf-8") as f:
    ls_data = json.load(f)

converted = []

for item in ls_data:
    result = item["annotations"][0]["result"]
    page_name = item["file_upload"].split("-")[-1].replace(".jpg", "").replace(".jpeg", "")

    texts = []
    images = []

    # Map to find textarea contents
    id_map = {r['id']: r for r in result if 'id' in r}

    # Find page_id from PageIndex
    page_id = None
    for r in result:
        if r["type"] == "rectanglelabels" and r["value"]["rectanglelabels"][0].lower() == "pageindex":
            text_result = next((t for t in result if t["type"] == "textarea" and t["id"] == r["id"]), None)
            if text_result:
                try:
                    page_id = int(text_result["value"]["text"][0].strip())
                except:
                    page_id = None
            break
    if page_id is None:
        page_id = -1  # fallback if pageindex is not found

    for r in result:
        if r["type"] == "rectanglelabels":
            bbox = r["value"]
            orig_w = r["original_width"]
            orig_h = r["original_height"]

            x1 = bbox["x"] * orig_w / 100
            y1 = bbox["y"] * orig_h / 100
            x2 = x1 + (bbox["width"] * orig_w / 100)
            y2 = y1 + (bbox["height"] * orig_h / 100)

            box = {
                "Bounds": {
                    "x1": x1,
                    "y1": y1,
                    "x2": x2,
                    "y2": y2
                }
            }

            label = r["value"]["rectanglelabels"][0]
            uid = str(uuid.uuid4())[:7]

            # Find corresponding text content
            matching_text = next(
                (t for t in result if t["type"] == "textarea" and t["id"] == r["id"]), None
            )
            content = matching_text["value"]["text"][0] if matching_text else ""

            if label.lower() == "image":
                box["URI"] = "sc:" + uid
                box["ImageId"] = uid
                images.append(box)
            elif label.lower() != "pageindex":
                box["Content"] = content
                box["TextId"] = uid
                texts.append(box)

    converted.append({
        "page_name": page_name,
        "page_id": page_id,
        "images": images,
        "texts": texts
    })

# Save output
with open("bbox_test.json", "w", encoding="utf-8") as f:
    json.dump(converted, f, indent=2, ensure_ascii=False)


----- extract from idml file -----

In [None]:
import os
from extract_from_idml import extract

idmls_path = 'idmls'
out_path = 'outputs_idml_json'
idmls = os.listdir(idmls_path)
for folder_path in idmls:
    zips = os.listdir(os.path.join(idmls_path,folder_path))
    for zip_file in zips:
        idml_path = os.path.join(idmls_path,folder_path,zip_file)
        json_file = zip_file.replace('.zip','.json')
        output_path = os.path.join(out_path,folder_path,json_file)
        # print(f'idml file path : {idml_path}')
        # print(f'out path : {output_path}')
        extract(idml_path,output_path)
        

In [None]:
import cv2
import json
from PIL import Image, ImageDraw, ImageFont
import os
output_idml_json_path = 'outputs_idml_json'
original_width = 1463
original_height = 1832
margin = 50
page_width = 658
page_height = 842

for folder_path in os.listdir(output_idml_json_path):
    jsons = os.listdir(os.path.join(output_idml_json_path,folder_path))
    for json_file in jsons:
        with open(os.path.join(output_idml_json_path,folder_path,json_file), "r", encoding="utf-8") as f:
            json_obj = json.load(f)
        list_pages = json_obj['pages']
        for pages_json in list_pages:
            page_name = pages_json['page_name']
            # print(f'page name ---- {page_name}')
            img_path = f'fg2410_{page_name}.jpg'
            if os.path.exists(os.path.join('dataSources/2410/images',img_path)):
                pass
            else:
                print(f'image {img_path} do not exist')
            
            pass    
        

In [None]:
# Crop image margin
from PIL import Image, ImageDraw, ImageFont
import os

folder_path = os.path.join("dataSources", "2410", "images")
margin = 50

for img_file in os.listdir(folder_path):
    img_path = os.path.join(folder_path, img_file)
    # print(img_path)
    image_ori = Image.open(img_path).convert("RGB")
    ori_width, ori_height = image_ori.size
    cropped_image = image_ori.crop(
        (margin, margin, ori_width - margin, ori_height - margin)
    )
    # cropped_image.show()
    cropped_image.save(img_path)

OSError: broken data stream when reading image file

In [None]:
import json
from PIL import Image, ImageDraw, ImageFont
import os

# --- Configuration ---
original_width = 1351
original_height = 1720
margin = 50
page_width = 658
page_height = 842

# Resize scale factors
scale_x = original_width / page_width
scale_y = original_height / page_height


# --- Draw Bounding Boxes ---
def draw_box(draw, bounds, color):
    x1 = bounds["x1"] * scale_x
    y1 = bounds["y1"] * scale_y
    x2 = bounds["x2"] * scale_x
    y2 = bounds["y2"] * scale_y
    draw.rectangle([x1, y1, x2, y2], outline=color, width=2)


# --- Load Image ---
folder_path = os.path.join("dataSources", "2410", "images")
idml_path = "outputs_idml_json"
idml_bboxs_path = "idml_bboxs"
for idml_folder_name in os.listdir(idml_path):
    idml_folder_path = os.path.join(idml_path, idml_folder_name)
    for idml_json_name in os.listdir(idml_folder_path):
        idml_json_path = os.path.join(idml_folder_path, idml_json_name)
        # print(idml_json_path)
        pages_json = None
        if os.path.exists(idml_json_path):
            with open(idml_json_path, "r", encoding="utf-8") as f:
                pages_json = json.load(f)
        if pages_json:
            list_pages = pages_json["pages"]
            for page_obj in list_pages:
                page_name = page_obj["page_name"]
                img_name = f"fg2410_{page_name}.jpg"
                img_path = os.path.join(folder_path, img_name)
                image = Image.open(img_path).convert("RGB")
                draw = ImageDraw.Draw(image)
                for img in page_obj["images"]:
                    draw_box(draw, img["Bounds"], color="red")
                for text in page_obj["texts"]:
                    draw_box(draw, text["Bounds"], color="blue")
                out_img_path = os.path.join(idml_bboxs_path, img_name)
                image.save(out_img_path)
                # print(f'---- out img path ---- {out_img_path}')


# out_path = 'idml_bboxs'
# for file_name in os.listdir(folder_path):
#     img_path = os.path.join(folder_path,file_name)
#     print(img_path)

# def draw_bboxes(image_path,file_name):
#     pass
# image_path = "dataSources/2410/images/fg2410_102.jpg"  # Replace with your actual image path
# out_path = 'idml_bboxs/fg2410_102.jpg'
# # --- IDML Page Data (replace with your actual JSON if needed) ---
# with open("outputs_idml_json/rose/fg2410_102103roseB1n.json", "r", encoding="utf-8") as f:
#     page_obj = json.load(f)
# image = Image.open(image_path).convert("RGB")
# # cropped_image = image.crop((margin, margin, original_width - margin, original_height - margin))
# draw = ImageDraw.Draw(image)


# --- Draw Bounding Boxes ---
# def draw_box(draw,bounds, color):
#     x1 = bounds["x1"] * scale_x
#     y1 = bounds["y1"] * scale_y
#     x2 = bounds["x2"] * scale_x
#     y2 = bounds["y2"] * scale_y
#     draw.rectangle([x1, y1, x2, y2], outline=color, width=3)

# # Draw image boxes (red)
# for img in page_data["images"]:
#     draw_box(img["Bounds"], color="red")

# # Draw text boxes (blue)
# for text in page_data["texts"]:
#     draw_box(text["Bounds"], color="blue")

# # --- Save or Show Result ---
# # output_path = "output_with_boxes.jpg"
# # image.save(out_path)
# image.show()

In [None]:
import os
import json
from PIL import Image, ImageDraw

# --- Configuration ---
original_width = 1351
original_height = 1720
margin = 50
page_width = 658
page_height = 842

# Resize scale factors
scale_x = original_width / page_width
scale_y = original_height / page_height

# --- Draw Bounding Boxes ---
def draw_box(draw, bounds, color):
    x1 = bounds["x1"] * scale_x
    y1 = bounds["y1"] * scale_y
    x2 = bounds["x2"] * scale_x
    y2 = bounds["y2"] * scale_y
    draw.rectangle([x1, y1, x2, y2], outline=color, width=2)

# --- Load Image ---
folder_path = os.path.join("dataSources", "2410", "images")
idml_path = "outputs_idml_json"
idml_bboxs_path = "idml_bboxs"
os.makedirs(idml_bboxs_path, exist_ok=True)  # Make sure output folder exists

for idml_folder_name in os.listdir(idml_path):
    idml_folder_path = os.path.join(idml_path, idml_folder_name)
    for idml_json_name in os.listdir(idml_folder_path):
        idml_json_path = os.path.join(idml_folder_path, idml_json_name)
        print(f'idml folder : {idml_folder_name}  json file : {idml_json_name}')

        if os.path.exists(idml_json_path):
            with open(idml_json_path, "r", encoding="utf-8") as f:
                pages_json = json.load(f)

        if pages_json:
            list_pages = pages_json["pages"]
            for page_obj in list_pages:
                page_name = page_obj["page_name"]
                img_name = f"fg2410_{page_name}.jpg"
                img_path = os.path.join(folder_path, img_name)
                if not os.path.exists(img_path):
                    print(f"[SKIP] Image not found: {img_path}")
                    continue

                image = Image.open(img_path).convert("RGB")  # ✅ FIXED here
                draw = ImageDraw.Draw(image)

                for img in page_obj.get("images", []):
                    bounds = img.get("Bounds")
                    if bounds:
                        draw_box(draw, bounds, color="red")
                        print(f'drawing bbox for {img_name} ---- {bounds}')
                    else:
                        print(f"[WARN] Missing bounds in {img_name}")

                out_img_path = os.path.join(idml_bboxs_path, img_name)
                image.save(out_img_path)
                print(f"[OK] Saved image with boxes: {out_img_path}")

idml folder : bbre  json file : fg2410_174175BbreA1.json
[OK] Saved image with boxes: idml_bboxs\fg2410_174.jpg
drawing bbox for fg2410_175.jpg ---- {'x1': -11.832250648539457, 'y1': -326.5416508343056, 'x2': 1326.1498328989949, 'y2': 913.5217439083151}
drawing bbox for fg2410_175.jpg ---- {'x1': 46.896158636333666, 'y1': 563.1937959211288, 'x2': 123.89124293846965, 'y2': 678.566492336479}
drawing bbox for fg2410_175.jpg ---- {'x1': 36.57110927225449, 'y1': 772.1616604518294, 'x2': 65.47613482223358, 'y2': 801.0666860018084}
[OK] Saved image with boxes: idml_bboxs\fg2410_175.jpg
idml folder : bbre  json file : fg2410_176177BbreB1.json
drawing bbox for fg2410_176.jpg ---- {'x1': 491.81102362204683, 'y1': 641.6573167607298, 'x2': 600.0990236220468, 'y2': 707.8973167607298}
drawing bbox for fg2410_176.jpg ---- {'x1': 590.1343757160338, 'y1': 717.4375720390846, 'x2': 619.0394012660129, 'y2': 746.3425975890636}
[OK] Saved image with boxes: idml_bboxs\fg2410_176.jpg
drawing bbox for fg2410_1