### Hello Demo: converting a PDF to Markdown, JSON metadata, and images with `marker`

In [None]:
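# NOTE: earlier Colab draft of the conversion script, kept fully commented out (this cell runs nothing).
# # 1. Install the marker library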
# !pip install marker-pdf -q

# # 2. Import necessary components
# from marker.converters.pdf import PdfConverter
# from marker.models import create_model_dict
# from marker.output import text_from_rendered
# from pathlib import Path

# # 3. Define the path to your uploaded PDF
# pdf_path = Path("/content/2Q25_CFO_presentation.pdf")

# # 4. (NEW) Automatically create the folder structure based on the PDF name
# # This creates a main folder like "/content/2Q25_CFO_presentation/"
# main_output_folder = Path(pdf_path.stem)
# # This creates a subfolder like "/content/2Q25_CFO_presentation/images/"
# image_subfolder = main_output_folder / "images"

# main_output_folder.mkdir(exist_ok=True)
# image_subfolder.mkdir(exist_ok=True)

# # 5. Set up and run the converter
# try:
#     converter = PdfConverter(
#         artifact_dict=create_model_dict(),
#     )

#     print("Converting PDF... (This may take a moment)")
#     rendered = converter(str(pdf_path)) # Convert path object to string for the converter
#     text, _, images = text_from_rendered(rendered)

#     # 6. (UPDATED) Save the markdown file inside the main folder
#     output_md_path = main_output_folder / "output.md"
#     with open(output_md_path, "w", encoding="utf-8") as f:
#         f.write(rendered.markdown)
#     print(f"\nMarkdown content saved to '{output_md_path}'")

#     # 7. (UPDATED) Save images into the new subfolder
#     print(f"Found {len(images)} images. Saving them into '{image_subfolder}'...")
#     for img_filename, img_object in images.items():
#         # Prepend the subfolder path to the filename
#         save_path = image_subfolder / img_filename
#         img_object.save(save_path, format="PNG")
#     print("✅ All images have been saved successfully!")

# except FileNotFoundError:
#     print(f"❌ ERROR: The file was not found at '{pdf_path}'.")
# except Exception as e:
#     print(f"An unexpected error occurred: {e}")

In [2]:
# 1. Install the marker library
# This command should be run in your terminal or a Colab cell:
# !pip install marker-pdf -q

# 2. Import necessary components
import json
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from pathlib import Path

# 3. Define the path to your uploaded PDF
# Make sure to upload your PDF file to the environment if you are using a cloud notebook.
# For example, in Google Colab, upload it to the "/content/" directory.
pdf_path = Path("/Users/marcusfoo/Documents/GitHub/PTO_ICT3113_Grp1/All/2Q25_CFO_presentation.pdf")

# Check if the PDF file exists before proceeding
if not pdf_path.exists():
    print(f"❌ ERROR: The file was not found at '{pdf_path}'.")
    print("Please make sure you have uploaded the PDF file to the correct path.")
else:
    # 4. Create the output folders, named after the PDF.
    # Path(pdf_path.stem) is a *relative* path, so the folder is created in the
    # kernel's current working directory (e.g. "2Q25_CFO_presentation/"),
    # not next to the PDF.
    main_output_folder = Path(pdf_path.stem)
    # Images go into "<main_output_folder>/images/".
    image_subfolder = main_output_folder / "images"

    main_output_folder.mkdir(exist_ok=True)
    image_subfolder.mkdir(exist_ok=True)

    # 5. Set up and run the converter
    try:
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )

        print(f"Converting '{pdf_path.name}'... (This may take a moment)")
        rendered = converter(str(pdf_path)) # Convert path object to string for the converter

        # text_from_rendered returns (text, output_extension, images). The middle
        # value is only the output format string (e.g. "md"), NOT the metadata.
        text, output_ext, images = text_from_rendered(rendered)

        # The structured document metadata lives on the rendered object itself.
        doc_metadata = rendered.metadata

        # 6. Save the metadata as a JSON file inside the main folder
        output_json_path = main_output_folder / "output.json"
        with open(output_json_path, "w", encoding="utf-8") as f:
            # indent=4 gives a human-readable file; ensure_ascii=False keeps
            # non-ASCII characters as-is instead of \u escapes.
            json.dump(doc_metadata, f, indent=4, ensure_ascii=False)
        print(f"\n✅ JSON metadata saved to '{output_json_path}'")

        # 7. Save images into the images subfolder
        if images:
            print(f"Found {len(images)} images. Saving them into '{image_subfolder}'...")
            for img_filename, img_object in images.items():
                save_path = image_subfolder / img_filename
                # NOTE(review): format="PNG" forces PNG encoding regardless of the
                # extension in img_filename (marker's keys may end in ".jpeg") —
                # confirm this is intended.
                img_object.save(save_path, format="PNG")
            print("✅ All images have been saved successfully!")
        else:
            print("No images found in the document.")

    except Exception as e:
        # Best-effort: report the failure in the cell output instead of raising.
        print(f"An unexpected error occurred: {e}")




Converting '2Q25_CFO_presentation.pdf'... (This may take a moment)


Recognizing Layout: 100%|██████████| 29/29 [02:31<00:00,  5.23s/it]
Running OCR Error Detection: 100%|██████████| 8/8 [00:01<00:00,  7.66it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  2.00s/it]
Recognizing Text: 100%|██████████| 4/4 [00:19<00:00,  4.87s/it]
Recognizing tables: 100%|██████████| 4/4 [00:13<00:00,  3.42s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.34it/s]
Recognizing Text: 100%|██████████| 60/60 [00:24<00:00,  2.48it/s]



✅ JSON metadata saved to '2Q25_CFO_presentation/output.json'
Found 38 images. Saving them into '2Q25_CFO_presentation/images'...
✅ All images have been saved successfully!


In [4]:
# 1. Install the marker library
# This command should be run in your terminal or a Colab cell:
# !pip install marker-pdf -q

# 2. Import necessary components
import json
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from pathlib import Path

# 3. Define the path to your uploaded PDF
# Make sure to upload your PDF file to the environment if you are using a cloud notebook.
# For example, in Google Colab, upload it to the "/content/" directory.
pdf_path = Path("/Users/marcusfoo/Documents/GitHub/PTO_ICT3113_Grp1/All/2Q25_CFO_presentation.pdf")
# Check if the PDF file exists before proceeding
if not pdf_path.exists():
    print(f"❌ ERROR: The file was not found at '{pdf_path}'.")
    print("Please make sure you have uploaded the PDF file to the correct path.")
else:
    # 4. Create the output folders next to the PDF, named after it.
    # pdf_path.parent / pdf_path.stem puts the folder in the PDF's own directory,
    # e.g. ".../All/2Q25_CFO_presentation/", independent of the working directory.
    main_output_folder = pdf_path.parent / pdf_path.stem
    # Images go into ".../All/2Q25_CFO_presentation/images/".
    image_subfolder = main_output_folder / "images"

    main_output_folder.mkdir(exist_ok=True)
    image_subfolder.mkdir(exist_ok=True)

    # 5. Set up and run the converter
    try:
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )

        print(f"Converting '{pdf_path.name}'... (This may take a moment)")
        rendered = converter(str(pdf_path)) # Convert path object to string for the converter

        # text_from_rendered returns (text, output_extension, images). The middle
        # value is only the output format string (e.g. "md"), NOT the metadata.
        text, output_ext, images = text_from_rendered(rendered)

        # The structured document metadata lives on the rendered object itself.
        doc_metadata = rendered.metadata

        # 6. Save both Markdown and JSON files

        # Save the markdown content to a .md file
        output_md_path = main_output_folder / "output.md"
        with open(output_md_path, "w", encoding="utf-8") as f:
            f.write(rendered.markdown)
        print(f"\n✅ Markdown content saved to '{output_md_path}'")

        # Save the metadata as a JSON file
        output_json_path = main_output_folder / "output.json"
        with open(output_json_path, "w", encoding="utf-8") as f:
            # indent=4 gives a human-readable file; ensure_ascii=False keeps
            # non-ASCII characters as-is instead of \u escapes.
            json.dump(doc_metadata, f, indent=4, ensure_ascii=False)
        print(f"✅ JSON metadata saved to '{output_json_path}'")

        # 7. Save images into the images subfolder
        if images:
            print(f"\nFound {len(images)} images. Saving them into '{image_subfolder}'...")
            for img_filename, img_object in images.items():
                save_path = image_subfolder / img_filename
                # NOTE(review): format="PNG" forces PNG encoding regardless of the
                # extension in img_filename (marker's keys may end in ".jpeg") —
                # confirm this is intended.
                img_object.save(save_path, format="PNG")
            print("✅ All images have been saved successfully!")
        else:
            print("\nNo images found in the document.")

    except Exception as e:
        # Best-effort: report the failure in the cell output instead of raising.
        print(f"An unexpected error occurred: {e}")





Converting '2Q25_CFO_presentation.pdf'... (This may take a moment)


Recognizing Layout: 100%|██████████| 29/29 [02:22<00:00,  4.91s/it]
Running OCR Error Detection: 100%|██████████| 8/8 [00:00<00:00, 25.08it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.88it/s]
Recognizing Text: 100%|██████████| 4/4 [00:16<00:00,  4.13s/it]
Recognizing tables: 100%|██████████| 4/4 [00:13<00:00,  3.37s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.32it/s]
Recognizing Text: 100%|██████████| 60/60 [00:20<00:00,  2.95it/s]



✅ Markdown content saved to '/Users/marcusfoo/Documents/GitHub/PTO_ICT3113_Grp1/All/2Q25_CFO_presentation/output.md'
✅ JSON metadata saved to '/Users/marcusfoo/Documents/GitHub/PTO_ICT3113_Grp1/All/2Q25_CFO_presentation/output.json'

Found 38 images. Saving them into '/Users/marcusfoo/Documents/GitHub/PTO_ICT3113_Grp1/All/2Q25_CFO_presentation/images'...
✅ All images have been saved successfully!


In [6]:
# 1. Install the marker library
# This command should be run in your terminal or a Colab cell:
# !pip install marker-pdf -q

# 2. Import necessary components
import json
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from pathlib import Path

# 3. Define the path to your uploaded PDF
# Make sure to upload your PDF file to the environment if you are using a cloud notebook.
# For example, in Google Colab, upload it to the "/content/" directory.
pdf_path = Path("/Users/marcusfoo/Documents/GitHub/PTO_ICT3113_Grp1/All/2Q25_CFO_presentation.pdf")

# Check if the PDF file exists before proceeding
if not pdf_path.exists():
    print(f"❌ ERROR: The file was not found at '{pdf_path}'.")
    print("Please make sure you have uploaded the PDF file to the correct path.")
else:
    # 4. Create the output folders next to the PDF, named after it
    main_output_folder = pdf_path.parent / pdf_path.stem
    image_subfolder = main_output_folder / "images"

    main_output_folder.mkdir(exist_ok=True)
    image_subfolder.mkdir(exist_ok=True)

    # 5. Set up and run the converter
    try:
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )

        print(f"Converting '{pdf_path.name}'... (This may take a moment)")
        # Process the PDF once; every output below is derived from this object
        rendered = converter(str(pdf_path))

        # 6. Extract the outputs from the single rendered object.
        # text_from_rendered returns (text, output_extension, images). The middle
        # value is only the output format string (e.g. "md"), NOT the metadata.
        text, output_ext, images = text_from_rendered(rendered)

        # The structured document metadata lives on the rendered object itself.
        doc_metadata = rendered.metadata

        # 7. Save the markdown and JSON files
        # The markdown content is a direct property of the rendered object
        output_md_path = main_output_folder / "output.md"
        with open(output_md_path, "w", encoding="utf-8") as f:
            f.write(rendered.markdown)
        print(f"\n✅ Markdown content saved to '{output_md_path}'")

        # The metadata dict is what gets written to the JSON file
        output_json_path = main_output_folder / "output.json"
        with open(output_json_path, "w", encoding="utf-8") as f:
            json.dump(doc_metadata, f, indent=4, ensure_ascii=False)
        print(f"✅ JSON metadata saved to '{output_json_path}'")

        # 8. Save images extracted earlier
        if images:
            print(f"\nFound {len(images)} images. Saving them into '{image_subfolder}'...")
            for img_filename, img_object in images.items():
                save_path = image_subfolder / img_filename
                # NOTE(review): format="PNG" forces PNG encoding regardless of the
                # extension in img_filename (marker's keys may end in ".jpeg") —
                # confirm this is intended.
                img_object.save(save_path, format="PNG")
            print("✅ All images have been saved successfully!")
        else:
            print("\nNo images found in the document.")

    except Exception as e:
        # Best-effort: report the failure in the cell output instead of raising.
        print(f"An unexpected error occurred: {e}")





Converting '2Q25_CFO_presentation.pdf'... (This may take a moment)


Recognizing Layout: 100%|██████████| 29/29 [02:26<00:00,  5.04s/it]
Running OCR Error Detection: 100%|██████████| 8/8 [00:00<00:00, 17.80it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.90it/s]
Recognizing Text: 100%|██████████| 4/4 [00:16<00:00,  4.08s/it]
Recognizing tables: 100%|██████████| 4/4 [00:14<00:00,  3.74s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.86s/it]
Recognizing Text: 100%|██████████| 60/60 [00:19<00:00,  3.00it/s]



✅ Markdown content saved to '/Users/marcusfoo/Documents/GitHub/PTO_ICT3113_Grp1/All/2Q25_CFO_presentation/output.md'
✅ JSON metadata saved to '/Users/marcusfoo/Documents/GitHub/PTO_ICT3113_Grp1/All/2Q25_CFO_presentation/output.json'

Found 38 images. Saving them into '/Users/marcusfoo/Documents/GitHub/PTO_ICT3113_Grp1/All/2Q25_CFO_presentation/images'...
✅ All images have been saved successfully!
