# Core

In [None]:
# | default_exp core


In [None]:
# | export

from invoice_parser.imports import *
from invoice_parser.utils import *


In [None]:
#| hide

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# | export


def page_to_order(page, table_settings=None):
    """Parse the order table found on a single PDF page.

    Args:
        page: Object exposing ``extract_table(table_settings=...)``; callers in
            this module pass pages of a ``pdfplumber`` document.
        table_settings: Settings forwarded to ``page.extract_table``. When
            ``None`` (the default) the text/lines strategy defined below is
            used.

    Returns:
        A ``(items, comments, total)`` tuple where ``items`` is a
        ``pandas.DataFrame`` built from the per-column value lists,
        ``comments`` is the collected comment cells joined with ``", "`` and
        ``total`` is the parsed total as a float, or ``0`` when the last two
        columns carry no surplus trailing value.
    """
    if table_settings is None:
        # Built per call so the default dict is never shared between calls.
        table_settings = {
            "horizontal_strategy": "text",
            "vertical_strategy": "lines",
            "intersection_x_tolerance": 5,
            "snap_y_tolerance": 5,
        }
    table = page.extract_table(table_settings=table_settings)

    # The header row is the one right after the first empty row.
    # NOTE(review): relies on first_empty_row() returning a row index - confirm
    # against the helper in utils.
    header_idx = first_empty_row(table) + 1
    cols = table[header_idx]
    cols_dict = {c: [] for c in cols}
    body_start = header_idx + 2

    # Scan the rows below the header: cells in the second column whose first
    # column is empty are collected as comments, until an empty/full row.
    comments = []
    info_id = 0  # keeps the offset defined when no rows follow the header
    for info_id, row in enumerate(table[body_start:]):
        if empty_row(row) or full_row(row):
            break
        elif empty_part(row[0]) and not empty_part(row[1]):
            comments.append(row[1])
    info_id += body_start

    # Walk the remaining rows. Second-column cells are buffered in `lines` and
    # flushed as one joined value whenever an empty row is reached.
    lines = []
    for row in table[info_id:]:
        if empty_row(row) and len(lines) > 0:
            cols_dict[cols[1]].append(", ".join(lines))
            lines = []
        else:
            if not empty_part(row[0]):
                cols_dict[cols[0]].append(row[0])
            if not empty_part(row[1]):
                lines.append(row[1])
            for r, c in zip(row[2:], cols[2:]):
                if not empty_part(r):
                    cols_dict[c].append(r)
    if len(lines) > 0:
        # Flush the buffer left over after the last row.
        cols_dict[cols[1]].append(", ".join(lines))

    cd0 = cols_dict[cols[0]]
    cd_1 = cols_dict[cols[-1]]
    cd_2 = cols_dict[cols[-2]]
    if len(cd_1) > len(cd0) and len(cd_2) > len(cd0):
        # The last two columns hold more values than the first one: the final
        # value of the last column is read as the total, and both columns are
        # trimmed back to the length of the first column.
        total = float(cd_1[-1].replace("$", "").replace(",", "").strip())
        cols_dict[cols[-1]] = cd_1[: len(cd0)]
        cols_dict[cols[-2]] = cd_2[: len(cd0)]
    else:
        total = 0
    return pd.DataFrame(cols_dict), ", ".join(comments), total


def pdf_to_order(pdf):
    """Parse every page of a PDF into one combined order table.

    Args:
        pdf: Value accepted by ``pdfplumber.open``.

    Returns:
        A ``(order, comments, total)`` tuple: the per-page frames concatenated
        with a fresh index, the comments of the first page, and the total of
        the last page.
    """
    dfs = []
    page_comments = []
    totals = []
    # The context manager closes the document even if a page fails to parse.
    with pdfplumber.open(pdf) as document:
        for page in document.pages:
            df, comments, total = page_to_order(page)
            dfs.append(df)
            page_comments.append(comments)
            totals.append(total)
    order = pd.concat(dfs).reset_index(drop=True)
    # Comments are taken from the first page, the total from the last page.
    return order, page_comments[0], totals[-1]


def page_to_extra_info(page):
    """Collect labelled header fields from the text of a page.

    For each primary label, the first text line starting with that label is
    used. The label is removed and the remainder becomes the primary value;
    when a secondary label also occurs on that line, the text before it is the
    primary value and the text after it (label and colons removed) is the
    secondary value.

    Args:
        page: Object exposing ``extract_text()``.

    Returns:
        Dict mapping each label found to its whitespace-stripped value.
    """
    # Guard against extract_text() yielding None for a page without text.
    text = (page.extract_text() or "").splitlines()
    ik2 = ["Purchase Order No.", "Incoterms", "Payment Terms", "Method of Shipping"]
    ik3 = ["Target Ship Date", "Ship To", "Currency"]
    info_dict = {}
    for k2 in ik2:
        for t in text:
            if not t.startswith(k2):
                continue
            t = t.replace(k2, "").strip()
            # Default: the whole remainder belongs to the primary label.
            info_dict[k2] = t
            for k3 in ik3:
                idx = t.find(k3)
                if idx != -1:
                    # Strip so the primary value does not keep the whitespace
                    # that separated it from the secondary label.
                    info_dict[k2] = t[:idx].strip()
                    info_dict[k3] = t[idx:].replace(k3, "").replace(":", "").strip()
                    break
            # Only the first line carrying this primary label is considered.
            break
    return info_dict


def pdf_to_info(pdf):
    """Build a one-row frame of header information from the first PDF page.

    Args:
        pdf: Value accepted by ``pdfplumber.open``.

    Returns:
        ``pandas.DataFrame`` with a single row holding the order fields, the
        address/phone/fax fields and the fields from ``page_to_extra_info``.
    """
    info_keys = [
        "Order Number",
        "Revision",
        "Document Date",
        "Customer No.",
        "Quotation",
        "Questions/Contact",
        "Email",
        "Requested by",
    ]
    # The context manager closes the document once the first page is read.
    with pdfplumber.open(pdf) as document:
        page = document.pages[0]
        tables = page.extract_tables()
        # Both blocks are read from the first cell of the second row of their
        # table, one field per line.
        info_table = tables[0][1][0].split("\n")
        address_table = tables[1][1][0].split("\n")
        extra_info = page_to_extra_info(page)

    # Keys are paired with the lines by position; the key text is removed from
    # its line to leave the value.
    info_dict = {k: t.replace(k, "").strip() for k, t in zip(info_keys, info_table)}
    address_dict = {
        # Everything except the last two lines forms the address.
        "Address": ", ".join(address_table[:-2]),
        "Tel": address_table[-2].replace("Tel#", "").strip(),
        "Fax": address_table[-1].replace("Fax#", "").strip(),
    }
    info_dict.update(address_dict)
    info_dict.update(extra_info)
    # Wrap each value in a list so the frame gets exactly one row.
    info_dict = {k: [v] for k, v in info_dict.items()}
    return pd.DataFrame(info_dict)


def pdf_to_dfs(pdf):
    """Parse one or more PDFs into an info frame and an order-items frame.

    Args:
        pdf: Value handed to ``resolve_data_path``; every resulting path whose
            suffix is ``.pdf`` is parsed.

    Returns:
        ``(info_df, order_df)``: one info row per parsed PDF, and all order
        rows with the ``Order Number`` column placed first.

    Raises:
        ValueError: If no path with a ``.pdf`` suffix is found.
    """
    info_dfs = []
    order_dfs = []
    # A distinct loop name keeps the `pdf` argument intact for the error below.
    for pdf_path in resolve_data_path(pdf):
        if Path(pdf_path).suffix != ".pdf":
            continue
        # Each document's header is parsed once.
        order_info = pdf_to_info(pdf_path)
        order, comments, total = pdf_to_order(pdf_path)
        order["Order Number"] = order_info["Order Number"][0]
        order_info["Comments"] = comments
        order_info["Total"] = total
        info_dfs.append(pd.DataFrame(order_info))
        # "Order Number" was just assigned above; list it first and keep the
        # remaining columns in their existing order.
        order_dfs.append(
            pd.DataFrame(order, columns=["Order Number"] + list(order.keys())[:-1])
        )
    if not info_dfs:
        # pd.concat([]) would raise a ValueError too, but with no hint about
        # which input produced nothing.
        raise ValueError(f"No .pdf files found for {pdf!r}")
    info_df = pd.concat(info_dfs).reset_index(drop=True)
    order_df = pd.concat(order_dfs).reset_index(drop=True)
    return info_df, order_df


def load_csv_chain(input_variables=["csv"], output_variables=["csv_data"], verbose=False):
    """Wrap ``load_csv`` with ``transform_chain``.

    The first entry of ``input_variables`` is mapped to the ``path`` keyword;
    all other arguments are forwarded to ``transform_chain`` unchanged.
    """
    source_variable = input_variables[0]
    kwargs_mapping = {source_variable: "path"}
    return transform_chain(
        load_csv,
        vars_kwargs_mapping=kwargs_mapping,
        input_variables=input_variables,
        output_variables=output_variables,
        verbose=verbose,
    )


def pdf_to_dfs_chain(
    input_variables=["pdf"], output_variables=["info_df", "order_df"], verbose=False
):
    """Wrap ``pdf_to_dfs`` with ``transform_chain``.

    The first entry of ``input_variables`` is mapped to the ``pdf`` keyword;
    all other arguments are forwarded to ``transform_chain`` unchanged.
    """
    source_variable = input_variables[0]
    kwargs_mapping = {source_variable: "pdf"}
    return transform_chain(
        pdf_to_dfs,
        vars_kwargs_mapping=kwargs_mapping,
        input_variables=input_variables,
        output_variables=output_variables,
        verbose=verbose,
    )

In [None]:
# | hide
# | eval: false

# Build the chain that takes a "pdf" input and yields "info_df" / "order_df".
pdf_chain = pdf_to_dfs_chain(
    input_variables=["pdf"], output_variables=["info_df", "order_df"], verbose=False
)

In [None]:
# | hide
# | eval: false

# NOTE(review): machine-specific absolute path - point this at a local sample
# invoice before evaluating the cell.
pdf = "/media/hamza/data2/wt1.pdf"

# Run the chain once and unpack both result frames.
pdf_dfs = pdf_chain({"pdf": pdf})
info_df, order_df = pdf_dfs["info_df"], pdf_dfs["order_df"]


In [None]:
# | hide
# | eval: false

# Display the info frame returned by the chain.
info_df


Unnamed: 0,Order Number,Revision,Document Date,Customer No.,Quotation,Questions/Contact,Email,Requested by,Address,Tel,Fax,Purchase Order No.,Target Ship Date,Incoterms,Ship To,Payment Terms,Currency,Method of Shipping,Comments,Total
0,2024916,0,05/31/2023,1013683,21640223,Julie Skogen,julie.skogen@wilsontool.com,Joey Perkins,"Anchor Fabrication, 141 Bain Street, LA VERGNE TN 37086",615-793-3000,615-793-3381,114969,06/13/2023,FOB White Bear Lake,1003101,Net due in 30 days,USD,UPS Ground,"Tool quote for NC1832, Thank You for the order!, Julie Skogen, 800-328-9646 ext 7165, Julies@wilsontool.com",6247.33


In [None]:
# | hide
# | eval: false

# Display the order-items frame returned by the chain.
order_df


Unnamed: 0,Order Number,Item,Material Description,Quantity,Unit Price,Amount
0,2024916,10,"25026 Trumpf Size 1 Flat Punch Shape, 2.36mm-30.00mm, SQ 0.4370 IN, FLAT,",2EA,97.65,195.3
1,2024916,20,"26740 Trumpf Size 1 Cupped GL Die Shape, SQ 0.4370 IN 0.0250 IN,CLEARANCE, SHAPE 1=90.0000 DEG,",2EA,99.68,199.36
2,2024916,30,25179 Coiled Spring Pin Ø3mm x 16mm Long,100 EA,2.25,225.0
3,2024916,40,"26727 Trumpf Size 2 GL Die Round, RD 1.7500 IN 0.0120 IN,CLEARANCE, ULTIMA PREMIUM DIE STEEL",2EA,112.05,224.1
4,2024916,50,"25781 Trumpf 241 Whisper Punch Round, 40.01mm-56.00mm, RD 1.7500 IN, ROOF TOP SHEAR,, ULTIMA PREMIUM PUNCH STEEL",1EA,108.22,108.22
5,2024916,60,"26727 Trumpf Size 2 GL Die Round, RD 1.3850 IN 0.0120 IN,CLEARANCE, ULTIMA PREMIUM DIE STEEL",2EA,112.05,224.1
6,2024916,70,"26739 Trumpf Size 1 Cupped GL Die Round, RD 0.3000 IN 0.0350 IN,CLEARANCE",2EA,52.88,105.76
7,2024916,80,"25004 Trumpf Size 1 Flat Punch Round, 2.36mm-30.00mm, RD 0.3000 IN, FLAT,",2EA,58.5,117.0
8,2024916,90,25972 Trumpf MT10 Gear Wheel,2EA,746.79,1493.58
9,2024916,100,25969 Trumpf MT10 Stripper With Ø11.70mm Holes,3EA,125.1,375.3


In [None]:
# | hide
import nbdev

# Run nbdev's export step for this project's notebooks.
nbdev.nbdev_export()
