In [1]:
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import Document
from pathlib import Path
from pydantic import BaseModel, Field
from uuid import uuid4
from typing import Optional

In [2]:
# Project folders. `data/` is looked up in the parent of the current working
# directory, so the notebook must be started from a folder that sits next to it.
DATA_PATH = Path.cwd().parent / "data"
CONTRACTS_PATH = DATA_PATH / "raw" / "contracts"  # raw input PDFs
FILE_PATH = CONTRACTS_PATH / "CW0348.pdf"  # sample contract used throughout this notebook
SILVER_PATH = DATA_PATH / "silver" / "contracts"  # destination for the parsed JSON

In [3]:
# Load the sample contract PDF with the LlamaIndex PyMuPDF reader.
# `documents` holds whatever the reader returns for FILE_PATH and is inspected in the next cells.
loader = PyMuPDFReader()
documents = loader.load(file_path=FILE_PATH)

In [4]:
# Quick look at the first loaded document (prints its ID and a truncated text preview).
print(documents[0])

Doc ID: c00051c1-f1eb-4bd2-b176-24778fd25228
Text: PROCUREMENT CONTRACT   This Procurement Contract (the
"Contract") is entered into between C11, hereinafter referred to  as
the "Supplier," and Plasma Corporation, hereinafter referred to as the
"Buyer."   1. TERM   1.1 Effective Date: This Contract shall become
effective on November 2020.   1.2 Expiry Date: The initial term of
this Contract shal...


# Working with the LlamaIndex PyMuPDF Reader

In [5]:
# Metadata the reader attached to the first document; parse() later relies on
# the "file_path" and "source" keys.
documents[0].metadata

{'total_pages': 2,
 'file_path': '/Users/datapsycho/PythonProjects/procure.me/data/raw/contracts/CW0348.pdf',
 'source': '1'}

In [6]:
# Raw extracted text of the first document, exactly as the reader produced it.
documents[0].text

' \n PROCUREMENT CONTRACT  \nThis Procurement Contract (the "Contract") is entered into between C11, hereinafter referred to \nas the "Supplier," and Plasma Corporation, hereinafter referred to as the "Buyer."  \n1. TERM  \n1.1 Effective Date: This Contract shall become effective on November 2020.  \n1.2 Expiry Date: The initial term of this Contract shall be for a period of 5 years from the \nEffective Date unless terminated earlier as per the terms of this Contract.  \n2. SUPPLIER DETAILS  \nSupplier Name: C11 Address: 456 Oak Avenue, Cityville, USA Contact Person: Jane Doe \nEmail: jane@xyzsuppliers.com Phone: +1-987-654-3210  \n3. PURCHASE DETAILS  \n3.1 Product/Service Description: The Supplier shall provide the following products/services to \nthe Buyer:  \nProduct X011 \nProduct Y011 \nProduct Z011 \n \n3.2 Volume Discounts: The Buyer and the Supplier agree to the following volume discounts \nbased on the cumulative purchase volume during the Contract term:  \n• \n• 100 units - 

In [7]:
class ParsedDocumentParts(BaseModel):
    """One part of a parsed document, built from a single document returned by the reader."""

    # ID of the reader document this part was created from.
    id_: str
    # Text of this part.
    text: str
    # Human-readable label for this part; the parser fills it as "part - <source>".
    part: str

In [8]:
class ParsedDocument(BaseModel):
    """A whole parsed file: the combined text plus the individual parts it was built from."""

    # Random UUID4 string, generated per instance when no ID is supplied.
    id_: str = Field(default_factory=lambda: str(uuid4()))
    # Page count recorded for the file.
    total_pages: int
    # Name of the source file.
    file_name: str
    # Combined text of all parts.
    text: str
    # The individual parts, in the order they were produced.
    parts: list[ParsedDocumentParts]

In [None]:
class DocumentParser:
    """Parse a PDF into a ``ParsedDocument`` and optionally save it as JSON.

    The reader output is kept part by part (one ``ParsedDocumentParts`` per
    document returned by the reader) and is also concatenated into a single
    text in which every part is followed by a ``page - [N]`` marker line.

    Attributes:
        file_path: Path of the PDF to parse.
        ir: Result of the last ``parse()`` call; ``None`` until then.
        write_file_name: File name used by ``save()``: ``<pdf file name>.json``.
    """

    def __init__(self, file_path: Path):
        """Remember the source file; nothing is read until ``parse()`` is called.

        Args:
            file_path: Path of the PDF to parse.
        """
        self.file_path = file_path
        # Must start as None: save() checks this to detect "not parsed yet".
        # Without it, save() before parse() fails with AttributeError.
        self.ir: Optional[ParsedDocument] = None
        self.write_file_name = f"{file_path.name}.json"

    def parse_to_markdown(self) -> list[Document]:
        """Load ``self.file_path`` with ``PyMuPDFReader`` and return its documents.

        Despite the name, the reader's documents are returned unchanged; no
        Markdown conversion happens here.
        """
        return PyMuPDFReader().load(file_path=self.file_path)

    def parse(self) -> ParsedDocument:
        """Read the file and build the ``ParsedDocument``.

        The result is also stored on ``self.ir`` so that ``save()`` can write it.

        Returns:
            The parsed document.

        Raises:
            ValueError: If the reader returned no documents for the file.
        """
        documents = self.parse_to_markdown()
        if not documents:
            raise ValueError(f"No content could be read from {self.file_path}")

        # Path(...).name instead of split("/") so Windows-style paths work too.
        file_name = Path(documents[0].metadata["file_path"]).name

        parts = []
        text_chunks = []
        for item in documents:
            page_number = item.metadata["source"]
            parts.append(
                ParsedDocumentParts(id_=item.id_, text=item.text, part=f"part - {page_number}")
            )
            # Each part's text is followed by a marker line naming its page.
            text_chunks.append(f"{item.text}\npage - [{page_number}]\n")

        self.ir = ParsedDocument(
            total_pages=len(documents),
            file_name=file_name,
            text="".join(text_chunks),
            parts=parts,
        )
        return self.ir

    def save(self, file_path: Path) -> None:
        """Write the parsed document as indented JSON into a directory.

        Args:
            file_path: Target *directory*. The file inside it is named
                ``self.write_file_name``. The directory is created if missing.

        Raises:
            ValueError: If ``parse()`` has not been called yet.
        """
        if self.ir is None:
            raise ValueError("Document is not parsed yet.")
        target_dir = Path(file_path)
        # Create the output folder on first use so a fresh checkout can run end to end.
        target_dir.mkdir(parents=True, exist_ok=True)
        write_path = target_dir.joinpath(self.write_file_name)
        # Explicit UTF-8: the extracted text contains non-ASCII characters (e.g. bullets)
        # and the platform default encoding is not guaranteed to handle them.
        write_path.write_text(self.ir.model_dump_json(indent=4), encoding="utf-8")
        print(f"Document saved to {file_path}")
        

In [10]:
# Parse the sample contract and print the resulting ParsedDocument.
parser = DocumentParser(file_path=FILE_PATH)
doc = parser.parse()
print(doc)

id_='7bf76f68-1b9f-4d04-914d-859b8f3ee0fc' total_pages=2 file_name='CW0348.pdf' text=' \n PROCUREMENT CONTRACT  \nThis Procurement Contract (the "Contract") is entered into between C11, hereinafter referred to \nas the "Supplier," and Plasma Corporation, hereinafter referred to as the "Buyer."  \n1. TERM  \n1.1 Effective Date: This Contract shall become effective on November 2020.  \n1.2 Expiry Date: The initial term of this Contract shall be for a period of 5 years from the \nEffective Date unless terminated earlier as per the terms of this Contract.  \n2. SUPPLIER DETAILS  \nSupplier Name: C11 Address: 456 Oak Avenue, Cityville, USA Contact Person: Jane Doe \nEmail: jane@xyzsuppliers.com Phone: +1-987-654-3210  \n3. PURCHASE DETAILS  \n3.1 Product/Service Description: The Supplier shall provide the following products/services to \nthe Buyer:  \nProduct X011 \nProduct Y011 \nProduct Z011 \n \n3.2 Volume Discounts: The Buyer and the Supplier agree to the following volume discounts \nba

In [12]:
# parse() also stores its result on the parser as `ir`; this is the object save() writes.
parser.ir

ParsedDocument(id_='7bf76f68-1b9f-4d04-914d-859b8f3ee0fc', total_pages=2, file_name='CW0348.pdf', text=' \n PROCUREMENT CONTRACT  \nThis Procurement Contract (the "Contract") is entered into between C11, hereinafter referred to \nas the "Supplier," and Plasma Corporation, hereinafter referred to as the "Buyer."  \n1. TERM  \n1.1 Effective Date: This Contract shall become effective on November 2020.  \n1.2 Expiry Date: The initial term of this Contract shall be for a period of 5 years from the \nEffective Date unless terminated earlier as per the terms of this Contract.  \n2. SUPPLIER DETAILS  \nSupplier Name: C11 Address: 456 Oak Avenue, Cityville, USA Contact Person: Jane Doe \nEmail: jane@xyzsuppliers.com Phone: +1-987-654-3210  \n3. PURCHASE DETAILS  \n3.1 Product/Service Description: The Supplier shall provide the following products/services to \nthe Buyer:  \nProduct X011 \nProduct Y011 \nProduct Z011 \n \n3.2 Volume Discounts: The Buyer and the Supplier agree to the following vol

In [13]:
# Write the parsed document as JSON into SILVER_PATH; the file is named "<pdf file name>.json".
parser.save(file_path=SILVER_PATH)

Document saved to /Users/datapsycho/PythonProjects/procure.me/data/silver/contracts
