In [1]:
from invoice_parser.utils import *
from collections import defaultdict
from invoice_parser.imports import *
from pydantic import BaseModel, Field
from langchain.chains import RetrievalQA
from langchain.document_loaders import PDFPlumberLoader
from langchain_ray.pdf.utils import pdf_to_docs, process_text
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

ModuleNotFoundError: No module named 'invoice_parser'

In [2]:
vline_settings = {
    "horizontal_strategy": "text",
    "vertical_strategy": "lines",
    "intersection_x_tolerance": 5,
    "snap_y_tolerance": 5,
    "join_x_tolerance": 5,
    "join_y_tolerance": 5,
}
hline_settings = {
    "horizontal_strategy": "lines",
    "vertical_strategy": "text",
    "intersection_x_tolerance": 5,
    "snap_y_tolerance": 5,
    "join_x_tolerance": 5,
    "join_y_tolerance": 5,
}
line_settings = {
    "horizontal_strategy": "lines",
    "vertical_strategy": "lines",
    "intersection_x_tolerance": 5,
    "snap_y_tolerance": 5,
    "join_x_tolerance": 5,
    "join_y_tolerance": 5,
}
text_settings = {
    "horizontal_strategy": "text",
    "vertical_strategy": "text",
    "intersection_x_tolerance": 5,
    "snap_y_tolerance": 5,
    "join_x_tolerance": 5,
    "join_y_tolerance": 5,
}
# text_settings = {
#     # "intersection_x_tolerance": 5,
#     # "snap_y_tolerance": 5,
#     # "join_x_tolerance": 5,
#     # "join_y_tolerance": 5,
#     "text_layout": True
# }


def get_fullest_row(table):
    rows = [r for r in table if full_row(r)]
    if len(rows) == 0:
        rows = table
    row = max(rows, key=len)
    return row, table.index(row)


def num_full_parts(row):
    return len([p for p in row if not empty_part(p)])


def get_table_items(table):
    if table is None or len(table) == 0:
        return []

    cols, cols_idx = get_fullest_row(table)
    for i, c in enumerate(cols):
        if empty_part(c):
            cols[i] = f"col_{i}"

    # let's assume that the first full row after the cols row is the first item
    first_order_row_idx = get_first_full_row(table[cols_idx + 1 :])[1]
    if first_order_row_idx is None:
        first_order_row_idx = get_first_non_empty_row(table[cols_idx + 1 :])[1]
    if first_order_row_idx is None:
        first_order_row_idx = 0
    first_order_row_idx += cols_idx + 1

    items = []
    item = {c: "" for c in cols}
    first_order_row_idx = min(first_order_row_idx, len(table) - 1)
    order_table = table[first_order_row_idx:]
    curr_row_len = num_full_parts(order_table[0])
    for row in order_table:
        if ((num_full_parts(row) == curr_row_len) or empty_row(row)) and len(item) > 0:
            items.append(item)
            item = {c: "" for c in cols}
            if not empty_row(row):
                curr_row_len = num_full_parts(row)
        for i, c in enumerate(cols):
            row_part = row[i]
            if not empty_part(row_part):
                row_part = " ".join(row[i].split("\n"))
                item[c] += row_part + " "
    items.append(item)
    return items


In [3]:
device = default_device()

embeddings = SentenceTransformerEmbeddings(
    model_name='sentence-transformers/all-mpnet-base-v2', model_kwargs={"device": default_device()}
)

llm = HuggingFacePipeline(
    pipeline=pipeline(
        "text2text-generation",
        model="google/flan-t5-large",
        device_map=default_device(),
        max_new_tokens=256,
    )
)

In [87]:
pdf = "/media/hamza/data2/wilson_tools/pdf/wt10.pdf"

data = pdfplumber.open(pdf)

img = data.pages[0].to_image()
docs = pdf_to_docs(pdf)[0]
text = [
    [t.strip() for t in p.extract_text(layout=True).splitlines()] for p in data.pages
]
vline_tables = [p.extract_table(table_settings=vline_settings) for p in data.pages]
hline_tables = [p.extract_table(table_settings=hline_settings) for p in data.pages]
line_tables = [p.extract_table(table_settings=line_settings) for p in data.pages]
text_tables = [p.extract_table(table_settings=text_settings) for p in data.pages]


In [85]:
tables = vline_tables
# tables = hline_tables
# tables = line_tables

In [86]:
dfs = [pd.DataFrame(get_table_items(t)) for t in tables]
dfs[0]

Unnamed: 0,Line ID,Item,Quantity,Unit Price,Amount
0,,,,,
1,1.0,"SO1962465-10P 13789 1-1/4 Thick HP2 Wilson ABS/WLS Custom Shape PUNCH BANANA TOOL.DXF, 12.5MM RAD INSIDE 20MM RAD OUT, 1962465",3.0,$160.01,$480.03
2,,,,,
3,2.0,"SO1962465-10D+.3 13789 1-1/4 Thick HP2 Wilson ABS/WLS Custom Shape DIE, BANANA TOOL.DXF, 12.5MM RAD INSIDE 20MM RAD OUT, SLUG =NEGATIVE/POSITIVE, CLR=0.300 MM, 1962465",2.0,$98.18,$196.36
4,,,,,
5,3.0,"SO1962465-10D+.5 13789 1-1/4 Thick HP2 Wilson ABS/WLS Custom Shape DIE, BANANA TOOL.DXF, 12.5MM RAD INSIDE 20MM RAD OUT, SLUG =NEGATIVE/POSITIVE, CLR=0.500 MM, 1962465",1.0,$98.18,$98.18


In [7]:
class ItemNER(BaseModel):
    material_description: str = Field(
        description="The material description of the order item.", default=None
    )
    quantity: str = Field(description="The quantity of the order item.", default=None)
    unit_price: float = Field(description="The unit price of the order item.", default=None)
    amount: float = Field(description="The amount of the order item.", default=None)


class OrderNER(BaseModel):
    order_items: List[ItemNER] = Field(description="The order items.", default=None)


class CustomerNER(BaseModel):
    order_number: int
    customer_number: int


order_parser = PydanticOutputParser(pydantic_object=CustomerNER)

order_temp = "Extract information from this text:\n{format_instructions}\n{text}\n"

order_prompt = PromptTemplate(
    template=order_temp,
    input_variables=["text"],
    partial_variables={"format_instructions": order_parser.get_format_instructions()},
)

order_temp = "Extract the total from this text: {text}\n"
order_prompt = PromptTemplate.from_template(order_temp)

In [8]:
# gen_chain = LLMChain(prompt=order_prompt, llm=llm)

In [14]:
docsearch = Chroma.from_documents(docs, embeddings)

In [42]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    # chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

In [47]:
query = "What is the total amount?"
query = "What is the order number?"
query = "What is the customer number?"
query = "What is the target shipping date?"
query = "Who is the customer?"
answer = qa({"query": query})
answer

Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


{'query': 'Who is the customer?',
 'result': 'Julie Skogen',
 'source_documents': [Document(page_content='identified. they may not be resold, transferred, or otherwise disposed of, to any other country or to any person other than the authorized ultimate consignee or end-user(s), either in', metadata={'page': 0, 'source': '/media/hamza/data2/wilson_tools/pdf/wt1.pdf', 'start_index': 1241}),
  Document(page_content='identified. they may not be resold, transferred, or otherwise disposed of, to any other country or to any person other than the authorized ultimate consignee or end-user(s), either in', metadata={'page': 1, 'source': '/media/hamza/data2/wilson_tools/pdf/wt1.pdf', 'start_index': 1967}),
  Document(page_content='identified. they may not be resold, transferred, or otherwise disposed of, to any other country or to any person other than the authorized ultimate consignee or end-user(s), either in', metadata={'page': 2, 'source': '/media/hamza/data2/wilson_tools/pdf/wt1.pdf', 'start

In [21]:
answer

'6,247.33'

In [10]:
res = gen_chain.run(dict(text=docs[-8]))
res


Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'6,247.33'