In [1]:
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

True

In [2]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = os.getenv("LANGCHAIN_API_KEY")
os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")

In [40]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain import PromptTemplate
from langchain.chains import RetrievalQA

In [41]:
# # Load documents

# loader = WebBaseLoader(
#     web_paths = ("https://lilianweng.github.io/posts/2023-06-23-agent/",),
#     bs_kwargs = dict(
#         parse_only = bs4.SoupStrainer(
#             class_=("post-content","post-title","post-header")
#         )
#     ),
# )
# docs = loader.load()

In [None]:
with open (r"C:\Projects\Langchain\Rag_doc\data.txt",'r') as f:
    content = f.read()

print(content)
docs = content


In [44]:
# split - chunking
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap=200)
splits = text_splitter.split_text(docs)

In [82]:
splits[0]

"Invoice Text: Invoice No. 79958 issued by Digital Works on 2024-10-01. Amount Due: $2813. Part: Circuit Board, Received: 2024-10-02. Buyer: ID 108, Olivia Green.\nAnnotation: {'invoice_number': '79958', 'vendor_name': 'Digital Works', 'date': '2024-10-01', 'total_amount': '$2813', 'part_description': 'Circuit Board', 'date_of_reception': '2024-10-02', 'buyer_id': '108', 'buyer_name': 'Olivia Green'}\nInvoice Text: Invoice No. 3649 issued by ABC Enterprises on 2024-10-02. Amount Due: Â£547. Part: Hydraulic Pump, Received: 2024-10-05. Buyer: ID 106, Emma Davis.\nAnnotation: {'invoice_number': '3649', 'vendor_name': 'ABC Enterprises', 'date': '2024-10-02', 'total_amount': 'Â£547', 'part_description': 'Hydraulic Pump', 'date_of_reception': '2024-10-05', 'buyer_id': '106', 'buyer_name': 'Emma Davis'}\nInvoice Text: Invoice No. 24580 issued by ABC Services on 2024-10-08. Amount Due: â‚¬4125. Part: Bearing Set, Received: 2024-10-09. Buyer: ID 110, Sophia Harris."

In [45]:
# Embed
model_name = "BAAI/bge-small-en"
model_kwargs = {"device":"cpu"}
encode_kwargs = {"normalize_embeddings" : True}
hf_embeddings = HuggingFaceBgeEmbeddings(
    model_name = model_name,model_kwargs = model_kwargs,encode_kwargs =encode_kwargs
)

In [88]:
len(splits)

50

In [87]:
import numpy as np
embeddings = hf_embeddings.embed_documents(splits)
embeddings_np = np.array(embeddings).astype('float32')
embeddings_np.shape

(50, 384)

In [49]:
vectorstore = FAISS.from_texts(
    texts = splits,
    embedding = hf_embeddings
)
retriever = vectorstore.as_retriever()

In [58]:
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="You are an expert in extracting entities from invoices. Try extracting entities from the input text with the help of sample annotation given Input invoice text = Invoice Text: Invoice No. 30001 issued by Harish Ltd. on 2024-10-07. Amount Due: $25536. Part: LED Display, Received: 2024-10-03. Buyer: ID 103, Prasanth. Use the annotations only as samples to help in the process of extracting entities on given invoice. Sample annotations: {context}, answer the question: {question}"
)

# prompt = """You are an expert in document extration. Answer the questions with relevant information by refering the context provided
# {context}
# Question :{question}"""

In [64]:
llm = ChatGroq(model = "llama3-8b-8192",temperature =0) # 8B parameters and 8192 input tokens

# Post-processing

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

#chain - Method 1
rag_chain = RetrievalQA.from_chain_type(
    llm,
    retriever = retriever ,
    return_source_documents = True,
    chain_type_kwargs = {"prompt" : prompt_template }
    
)
result = rag_chain({"query": "What is the name of the buyer in the given invoice?"})
print(len(result["source_documents"]))
for i,doc in enumerate(result["source_documents"]):
    print(f"Text {i}: {doc.page_content}")

4
Text 0: Invoice Text: Invoice No. 20230 issued by Quick Solutions on 2024-10-07. Amount Due: $1622. Part: Widget A, Received: 2024-10-08. Buyer: ID 109, Henry White.
Annotation: {'invoice_number': '20230', 'vendor_name': 'Quick Solutions', 'date': '2024-10-07', 'total_amount': '$1622', 'part_description': 'Widget A', 'date_of_reception': '2024-10-08', 'buyer_id': '109', 'buyer_name': 'Henry White'}
Invoice Text: Invoice No. 12589 issued by Fournisseur Global on 2024-10-07. Amount Due: Â£4749. Part: Control Valve, Received: 2024-10-04. Buyer: ID 105, David Wilson.
Annotation: {'invoice_number': '12589', 'vendor_name': 'Fournisseur Global', 'date': '2024-10-07', 'total_amount': 'Â£4749', 'part_description': 'Control Valve', 'date_of_reception': '2024-10-04', 'buyer_id': '105', 'buyer_name': 'David Wilson'}
Invoice Text: Invoice No. 48249 issued by New Horizons LLC on 2024-10-02. Amount Due: Â£811. Part: Widget A, Received: 2024-10-03. Buyer: ID 105, David Wilson.
Text 1: Invoice Text: 

In [60]:
# Chain
rag_chain = (
    {"context": retriever , "question": RunnablePassthrough()}
    | prompt_template
    | llm
    | StrOutputParser()
)

# Question
print(rag_chain.invoke("What is the name of the buyer in the given invoice?"))

Based on the input invoice text:

"Invoice Text: Invoice No. 30001 issued by Harish Ltd. on 2024-10-07. Amount Due: $25536. Part: LED Display, Received: 2024-10-03. Buyer: ID 103, Prasanth."

Using the sample annotations as a guide, I can extract the following entities:

* Invoice Number: 30001
* Vendor Name: Harish Ltd.
* Date: 2024-10-07
* Total Amount: $25536
* Part Description: LED Display
* Date of Reception: 2024-10-03
* Buyer ID: 103
* Buyer Name: Prasanth

Therefore, the name of the buyer in the given invoice is Prasanth.


In [65]:
# Query Translation
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_template = PromptTemplate.from_template(template)

generate_queries = (
    prompt_template
    |ChatGroq(temperature = 0)
    |StrOutputParser()
    |(lambda x: x.split("\n"))
)

In [66]:
generate_queries.invoke("extract buyer details from the given invoice text")

['1. "Can you identify the buyer\'s information from the provided invoice document?"',
 '2. "Could you pull out the buyer\'s details mentioned in the given invoice text?"',
 '3. "Is it possible to extract the buyer\'s particulars from the text of the invoice?"',
 '4. "I need to retrieve the buyer\'s details from the invoice text - can you help?"',
 '5. "Search the invoice text to find and provide the buyer\'s details, please."']

In [67]:
from langchain.load import dumps,loads
def get_unique_union(documents):
    unique_docs = {}

    for sublist in documents:
        for doc in sublist:
            doc_str = dumps(doc)

            if doc_str not in unique_docs:
                unique_docs[doc_str] = doc
    return list(unique_docs.values())


In [79]:
# Retrieve
question = "What is the buyer id in the given invoice?"
retrieval_chain = (
    generate_queries
    |retriever.map()
    |get_unique_union
)
docs = retrieval_chain.invoke({"question":question})
len(docs)

11

In [80]:
llm = ChatGroq(temperature = 0)

In [81]:
from operator import itemgetter
template = """You are an expert in extracting entities from invoices. Try extracting entities from the input text with the help of sample annotation given Input invoice text = Invoice Text: Invoice No. 30001 issued by Harish Ltd. on 2024-10-07. Total: $25536. Part: LED Display, Received: 2024-10-03. Identification number of buyer 103, Prasanth. Use the annotations only as samples to help in the process of extracting entities on given invoice. Sample annotations: {context}, answer the question: {question}
Extract only the requested text and don't extract anyother data"""
prompt_template = PromptTemplate.from_template(template)

final_rag_chain = (
    {"context" : retrieval_chain,"question":itemgetter("question")}
    |prompt_template
    |llm
    |StrOutputParser()
)

final_rag_chain.invoke({"question":question})

'The buyer id in the given invoice is 103.'