In [1]:
# from dotenv import load_dotenv
import dotenv
from PyPDF2 import PdfReader
import docx2txt
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from langchain import PromptTemplate
import json
from openai import OpenAI

In [2]:
def _string_properties(descriptions):
    """Turn a {field name: description} mapping into JSON-schema string properties."""
    return {
        name: {"type": "string", "description": text}
        for name, text in descriptions.items()
    }


def _object_array(descriptions, summary):
    """Describe an array of objects whose fields are all plain strings."""
    return {
        "type": "array",
        "items": {
            "type": "object",
            "properties": _string_properties(descriptions),
        },
        "description": summary,
    }


# Top-level fields of the scanned document. The keys are the exact names the
# later cells read back from the model's answer, so they must not be renamed.
_HEADER_FIELDS = {
    "supplier": "Name of the supplier",
    "customer name": "Name of the customer",
    "date": "Date of the document",
    "ref.": "Reference number of the document",
    "item no.": "Item number of the document",
    "risk factor": "Risk factor number of the document",
    "model": "Model of the document",
}

# Fields of each entry in the "side specification" array.
_SIDE_SPECIFICATION_FIELDS = {
    "side": "side number",
    "media": "Name of the media",
    "sp. heat capacity": "Measure of the sp. heat capacity",
    "inlet temp": "Measure of the inlet temperature",
    "outlet temp": "Measure of the outlet temperature",
    "flow rate": "Measure of the flow rate",
    "liquid vol": "Measure of the liquid volume",
}

# Fields of each entry in the "remarks" array.
_REMARKS_FIELDS = {
    "side": "Side number",
    "test pressure": "Measure of the test pressure",
    "design pressure": "Measure of the design pressure",
    "max temperature": "Measure of the max temperature",
    "min temperature": "Measure of the min temperature",
    "net weight": "Measure of the net weight",
    "weight with water": "Measure of the weight with water",
    "heating surface": "Measure of the heating surface",
    "cooling capacity": "Measure of the cooling capacity",
    "heat tr. coeff": "Measure of the heat tr. coeff",
}

# Function schema handed to the chat model (passed as `functions=` in the scan
# cell). Only supplier, customer name and model are mandatory; every other
# field may be absent from the model's answer.
function_descriptions = [
    {
        "name": "scan_document",
        "description": "Scans a document and returns relevant information",
        "parameters": {
            "type": "object",
            "properties": {
                **_string_properties(_HEADER_FIELDS),
                "side specification": _object_array(
                    _SIDE_SPECIFICATION_FIELDS, "Heat specification"),
                "remarks": _object_array(
                    _REMARKS_FIELDS, "Additional remarks"),
            },
            "required": ["supplier", "customer name", "model"]
        }
    }
]

In [3]:
# Prompt for the extraction call; `{document}` is replaced with the text pulled
# from the PDF/DOCX. The backslash right after the opening quotes keeps the
# prompt from starting with an empty line (the previous "/" was sent verbatim
# to the model as the first line of every prompt).
template = """\
Scan the following document and return the relevant details.
If the data is missing just return N/A
Document: {document}
"""

In [4]:
# Load the variables declared in ./.env (resolved against the current working
# directory) into the process environment. The call is the cell's last
# expression, so its return value is shown as the cell output.
from dotenv import load_dotenv

load_dotenv(dotenv_path='./.env')

True

In [6]:

load_dotenv()

llm = ChatOpenAI(model="gpt-4-0613")

# Placeholder printed for any field the model left out. It is the same marker
# the prompt asks the model to use for data missing from the document.
MISSING_VALUE = "N/A"


def _extract_text(path):
    """Return the plain text of a .pdf or .docx document.

    Args:
        path: Location of the document on disk. The extension decides the
            parser and is matched case-insensitively.

    Returns:
        The concatenated text of all PDF pages, or the DOCX body text.

    Raises:
        ValueError: If the extension is neither .pdf nor .docx. Without this
            check an empty document would be sent to the model.
    """
    lowered = path.lower()
    if lowered.endswith('.pdf'):
        with open(path, "rb") as file:
            # Defensive `or ""`: a page that yields no text must not break
            # the concatenation.
            return "".join(
                page.extract_text() or "" for page in PdfReader(file).pages)
    if lowered.endswith('.docx'):
        return docx2txt.process(path)
    raise ValueError(
        f"Unsupported file type: {path!r} (expected a .pdf or .docx file)")


def _print_details(details):
    """Print the extracted fields in the report layout used by this notebook.

    Only 'supplier', 'customer name' and 'model' are required by the function
    schema, so every field is read with a default instead of being indexed
    directly; absent values are shown as MISSING_VALUE.

    Args:
        details: Dict decoded from the model's function-call arguments.
    """
    header_fields = [
        ("Supplier", "supplier"),
        ("Customer Name", "customer name"),
        ("Date", "date"),
        ("Ref", "ref."),
        ("Item No", "item no."),
        ("Risk Factor", "risk factor"),
        ("Model", "model"),
    ]
    print("## Details")
    for label, key in header_fields:
        print(f"{label}: {details.get(key, MISSING_VALUE)}")

    print("Side Specification:")
    # `or []` also covers the model returning an explicit null for the list.
    for spec in details.get('side specification') or []:
        print(f"""
            * Side: {spec.get('side', MISSING_VALUE)}
                - Media: {spec.get('media', MISSING_VALUE)}
                - Sp. Heat Capacity: {spec.get('sp. heat capacity', MISSING_VALUE)}
                - Inlet Temp: {spec.get('inlet temp', MISSING_VALUE)}
                - Outlet Temp: {spec.get('outlet temp', MISSING_VALUE)}
                - Flow Rate: {spec.get('flow rate', MISSING_VALUE)}
                - Liquid Vol: {spec.get('liquid vol', MISSING_VALUE)}
        """)

    print("Remarks:")
    for remark in details.get('remarks') or []:
        print(f"""
            * Side: {remark.get('side', MISSING_VALUE)}
                - Test Pressure: {remark.get('test pressure', MISSING_VALUE)}
                - Design Pressure: {remark.get('design pressure', MISSING_VALUE)}
                - Max Temperature: {remark.get('max temperature', MISSING_VALUE)}
                - Min Temperature: {remark.get('min temperature', MISSING_VALUE)}
                - Net Weight: {remark.get('net weight', MISSING_VALUE)}
                - Weight with Water: {remark.get('weight with water', MISSING_VALUE)}
                - Heating Surface: {remark.get('heating surface', MISSING_VALUE)}
                - Cooling Capacity: {remark.get('cooling capacity', MISSING_VALUE)}
                - Heat Tr. Coeff: {remark.get('heat tr. coeff', MISSING_VALUE)}
        """)


# Console input for file path. The prompt now says what is expected; an empty
# answer still skips the scan.
file_path = input("Path to the document to scan (.pdf or .docx): ").strip()

# List to store the data (read by the table-building cell further down)
data_list = []

if file_path:
    print("Scanning document...")
    text = _extract_text(file_path)

    prompt = PromptTemplate.from_template(template)
    content = prompt.format(document=text)

    response = llm.predict_messages(
        [HumanMessage(content=content)],
        functions=function_descriptions)

    # The model may answer in plain text instead of calling the function;
    # report that clearly rather than failing with a bare KeyError.
    function_call = response.additional_kwargs.get("function_call")
    if function_call is None:
        raise RuntimeError(
            "The model did not return a scan_document function call; "
            f"reply was: {response.content!r}")
    data = json.loads(function_call["arguments"])

    # Append the data to the list
    data_list.append(data)

    _print_details(data)

    # # Save the data to a file
    # with open('data.json', 'w') as f:
    #     json.dump(data, f)

    print("Document Scanned Successfully")


Scanning document...


  warn_deprecated(


## Details
Supplier: HE-SWHE Company
Customer Name: NOVIA RDI
Date: N/A
Ref: N/A
Item No: N/A
Risk Factor: N/A
Model: N/A
Side Specification:

            * Side: SIDE 1
                - Media: Water
                - Sp. Heat Capacity: 4190 J/kg/K
                - Inlet Temp: 68 °C
                - Outlet Temp: 89 °C
                - Flow Rate: 46 kg/s
                - Liquid Vol: 400 dm3
        

            * Side: SIDE 2
                - Media: Water
                - Sp. Heat Capacity: 4190 J/kg/K
                - Inlet Temp: 91 °C
                - Outlet Temp: 74 °C
                - Flow Rate: 56 kg/s
                - Liquid Vol: 400 dm3
        
Remarks:

            * Side: N/A
                - Test Pressure: 25 bar
                - Design Pressure: 20 bar
                - Max Temperature: 130 ° C
                - Min Temperature: 0 °C
                - Net Weight: 5800 kg
                - Weight with Water: 5000 kg
                - Heating Surface: 380 m2
    

In [19]:
import pandas as pd

# Only the first scanned document is tabulated.
record = data_list[0]

# The function schema requires only supplier, customer name and model, so the
# remaining keys may be absent from the model's answer. Missing values fall
# back to the same "N/A" marker the prompt asks the model to use.
not_available = "N/A"
supplier = record.get('supplier', not_available)
invoice_reference = record.get('ref.', not_available)

# Extracting data for Invoice (Supplier) Detail Table (one row)
invoice_data = {
    'Supplier': [supplier],
    'Customer Name': [record.get('customer name', not_available)],
    'Date': [record.get('date', not_available)],
    'Invoice Reference': [invoice_reference],
    'Item Number': [record.get('item no.', not_available)],
    'Risk Factor': [record.get('risk factor', not_available)],
    'Model': [record.get('model', not_available)]
}
invoice_df = pd.DataFrame(invoice_data)

# Extracting data for Side Specification Table (one row per side).
# `or []` covers both a missing key and an explicit null from the model.
side_spec_data = record.get('side specification') or []
side_spec_df = pd.DataFrame(side_spec_data)
# Inserting at position 0 twice leaves the columns as: Supplier, Invoice Reference, ...
side_spec_df.insert(0, 'Invoice Reference', invoice_reference)
side_spec_df.insert(0, 'Supplier', supplier)

# Extracting data for Additional Remark Table (one row per remarks entry)
remarks_data = record.get('remarks') or []
remarks_df = pd.DataFrame(remarks_data)
remarks_df.insert(0, 'Invoice Reference', invoice_reference)




In [20]:
# Display the one-row header table (supplier, customer, date, reference, item number, risk factor, model).
invoice_df

Unnamed: 0,Supplier,Customer Name,Date,Invoice Reference,Item Number,Risk Factor,Model
0,HE-SWHE Company 1,NOVIA RDI,,,,,


In [21]:
# Display the side specification table: one row per side, prefixed with the Supplier and Invoice Reference columns.
side_spec_df

Unnamed: 0,Supplier,Invoice Reference,side,media,sp. heat capacity,inlet temp,outlet temp,flow rate,liquid vol
0,HE-SWHE Company 1,,SIDE 1,Water,4190 J/kg/K,68 °C,89 °C,46 kg/s,400 dm3
1,HE-SWHE Company 1,,SIDE 2,Water,4190 J/kg/K,91 °C,74 °C,56 kg/s,400 dm3


In [22]:
# Display the remarks table: one row per remarks entry, prefixed with the Invoice Reference column.
remarks_df

Unnamed: 0,Invoice Reference,side,test pressure,design pressure,max temperature,min temperature,net weight,weight with water,heating surface,cooling capacity,heat tr. coeff
0,,SIDE 1,25 bar,20 bar,130 ° C,0 °C,5000 kg,5800 kg,380 m2,4200 kW,2354 W/m2/K
1,,SIDE 2,25 bar,20 bar,130 ° C,0 °C,5000 kg,5800 kg,380 m2,4200 kW,2354 W/m2/K


# To make these tables searchable
There are several options, depending on the scale of your data and your environment:

1. Database System
If you have a large amount of data or expect frequent searches, using a database management system (DBMS) like MySQL, PostgreSQL, or SQLite would be ideal. You can then import these tables into a database and use SQL queries to search the data.

2. Pandas in Python
For smaller datasets or for quick analysis, you can use Pandas in Python, as shown in the sample above that creates the tables. Pandas allows you to filter and search through DataFrames easily. You can use various methods like df.query(), df.loc[], or df[df['column'] == value] to search for specific rows in your data.

3. Spreadsheet Software
For example, Microsoft Excel or Google Sheets.