In [None]:
# 原始檔
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

def extract_with_marker(pdf_path, converter=None):
    """Convert a PDF with marker and return the rendered text.

    Args:
        pdf_path: Path of the PDF file handed to the converter.
        converter: Optional, already constructed converter (any callable that
            accepts the path and returns a rendered object). Pass the same
            instance for several files so the model dictionary is not rebuilt
            on every call. When omitted, a new ``PdfConverter`` is created
            from ``create_model_dict()``, which is the previous behaviour.

    Returns:
        The first element of the tuple produced by ``text_from_rendered``.
    """
    if converter is None:
        converter = PdfConverter(artifact_dict=create_model_dict())
    rendered = converter(pdf_path)
    # Only the text is needed here; the other two tuple elements are dropped.
    text, _, _ = text_from_rendered(rendered)
    return text

def save_markdown(content, output_path):
    """Write ``content`` to ``output_path`` as UTF-8 text.

    The file is opened in "w" mode, so an existing file at that path is
    replaced.

    Args:
        content: Text to write.
        output_path: Destination file path.
    """
    with open(output_path, mode="w", encoding="utf-8") as md_file:
        md_file.write(content)

# Usage example: convert one PDF and store the resulting Markdown.
# The "your_file.*" paths look like placeholders — point them at real files
# before running this cell.
pdf_path, output_path = "your_file.pdf", "your_file.md"

md_text = extract_with_marker(pdf_path)
save_markdown(md_text, output_path)

print(f"✅ Markdown saved to {output_path}")


### 以下都是白做工；若有需要，直接用 sider.ai 解析就可以。

In [None]:
# from marker.schema import BlockTypes
# # 原始檔
# from marker.converters.pdf import PdfConverter
# from marker.models import create_model_dict
# from marker.output import text_from_rendered

# def extract_with_marker(pdf_path):
#     converter = PdfConverter(artifact_dict=create_model_dict())
#     rendered = converter(pdf_path)
#     text, _, images = text_from_rendered(rendered)
#     return text

# def save_markdown(content, output_path):
#     with open(output_path, "w", encoding="utf-8") as f:
#         f.write(content)

# # 使用範例
# pdf_path = "your_file.pdf"
# output_path = "your_file.md"
# md_text = extract_with_marker(pdf_path)
# save_markdown(md_text, output_path)
# print(f"✅ Markdown saved to {output_path}")


In [None]:
# from marker.converters.ocr import OCRConverter
# from marker.models import create_model_dict

# converter = OCRConverter(
#     artifact_dict=create_model_dict(),
# )
# rendered = converter("FUNCTIONBLOCKLIST.pdf")
# print(rendered)

In [None]:
# from marker.converters.pdf import PdfConverter
# from marker.models import create_model_dict
# from marker.config.parser import ConfigParser

# config = {
#     "output_format": "markdown",
#     "ADDITIONAL_KEY": "VALUE"
# }
# config_parser = ConfigParser(config)

# converter = PdfConverter(
#     config=config_parser.generate_config_dict(),
#     artifact_dict=create_model_dict(),
#     processor_list=config_parser.get_processors(),
#     renderer=config_parser.get_renderer(),
#     llm_service=config_parser.get_llm_service()
# )
# rendered = converter("FUNCTIONBLOCKLIST.pdf")
# print(rendered)

In [None]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
from marker.output import text_from_rendered

# Conversion settings: request Markdown output; nothing else is overridden.
config = {"output_format": "markdown"}
config_parser = ConfigParser(config)

# Build the converter from the pieces the parser derives from `config`.
# Keyword order is kept so the helper calls run in the same sequence.
converter = PdfConverter(
    config=config_parser.generate_config_dict(),
    artifact_dict=create_model_dict(),
    processor_list=config_parser.get_processors(),
    renderer=config_parser.get_renderer(),
    llm_service=config_parser.get_llm_service(),
)

pdf_path = "analog.pdf"
rendered = converter(pdf_path)

# Unpack all three values returned by text_from_rendered in one go.
md_text, structures, images = text_from_rendered(rendered)

# Example: write the Markdown text to disk.
with open("analog.md", mode="w", encoding="utf-8") as md_file:
    md_file.write(md_text)

# Example: inspect the second return value by printing its type and, when it
# exposes __len__, its length ("n/a" otherwise).
# NOTE(review): in marker's text_from_rendered the second element appears to be
# the output file extension (e.g. "md") rather than block structures — confirm
# against the installed marker version before relying on the name `structures`.
length_of = getattr(structures, "__len__", lambda: "n/a")
print("Structures/blocks example:", type(structures), length_of())

# Example: persist the third return value (the extracted images) to disk.
#
# Fix: the previous loop used `enumerate(images)`. When `images` is a mapping
# (marker's text_from_rendered returns {file name: PIL image} for Markdown
# output — NOTE(review): confirm against the installed marker version), that
# iterates over the string keys only, so every entry fell into the
# "Unknown image item type" branch and no file was ever written.
import base64
import os

# NOTE(review): this folder name is unrelated to `pdf_path`; it is kept
# unchanged so files land where the original cell put them. Image links inside
# the Markdown may need this folder as a prefix to resolve.
ASSETS_DIR = "FUNCTIONBLOCKLIST_assets"


def _save_image_item(item, out_path):
    """Write one extracted image to ``out_path``.

    Supported payloads:
      * dict with a ``"bytes"`` entry holding bytes/bytearray,
      * dict with a ``"data"`` entry holding a base64 string,
      * raw bytes/bytearray,
      * any object with a callable ``save(path)`` (e.g. a PIL image).

    Args:
        item: The image payload.
        out_path: Destination file path, including the extension.

    Returns:
        True when a file was written; False when the payload was not
        recognised (a diagnostic line is printed in that case).
    """
    if isinstance(item, dict):
        raw = item.get("bytes")
        if isinstance(raw, (bytes, bytearray)):
            with open(out_path, "wb") as fp:
                fp.write(raw)
            return True
        encoded = item.get("data")
        if isinstance(encoded, str):
            # base64 string -> bytes
            with open(out_path, "wb") as fp:
                fp.write(base64.b64decode(encoded))
            return True
        # Unexpected layout: show the keys so the format can be identified.
        print("Unknown image item keys:", item.keys())
        return False

    if isinstance(item, (bytes, bytearray)):
        with open(out_path, "wb") as fp:
            fp.write(item)
        return True

    if callable(getattr(item, "save", None)):
        # JPEG cannot store an alpha channel or a palette; convert first so
        # the save does not fail on such images.
        wants_jpeg = out_path.lower().endswith((".jpg", ".jpeg"))
        if wants_jpeg and getattr(item, "mode", "RGB") != "RGB":
            item = item.convert("RGB")
        item.save(out_path)
        return True

    print("Unknown image item type:", type(item))
    return False


os.makedirs(ASSETS_DIR, exist_ok=True)
saved_count = 0
if images:
    # Mapping -> (name, image) pairs; any other iterable -> (index, image).
    entries = images.items() if isinstance(images, dict) else enumerate(images)
    for position, (key, item) in enumerate(entries, start=1):
        # Reuse the mapping's own file name when there is one; otherwise fall
        # back to the original img_<n>.png numbering.
        file_name = os.path.basename(key) if isinstance(key, str) else ""
        if not file_name:
            file_name = f"img_{position}.png"
        elif not os.path.splitext(file_name)[1]:
            # An extension is needed so save() can pick an image format.
            file_name += ".png"
        if _save_image_item(item, os.path.join(ASSETS_DIR, file_name)):
            saved_count += 1

print(f"Saved {saved_count} image(s) to {ASSETS_DIR}")
print("✅ 已取得三個回傳值並保存 Markdown；影像若存在也已輸出。")
