In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

In [2]:
doc_path = "../data/BOI.pdf"
model = "llama3"

# Local PDF file uploads.
# Fail fast when no path is configured: previously the else-branch only
# printed a hint and the cell then died with a NameError on `data` below.
if not doc_path:
    raise ValueError("Upload a PDF file: set doc_path to a local PDF path")

loader = UnstructuredPDFLoader(file_path=doc_path)
print("Started loading....")
data = loader.load()
print("done loading....")

# Guard the [0] lookup so an empty extraction produces a readable error
# instead of a bare IndexError.
if not data:
    raise ValueError(f"No content could be extracted from {doc_path!r}")

content = data[0].page_content
print(content)


Started loading....


  from .autonotebook import tqdm as notebook_tqdm


done loading....
Beneficial Ownership Information Report

Filing Instructions

Financial Crimes Enforcement Network

U.S. Department of the Treasury

Version 1.0 January 2024

Table of Contents

I.	 Who,	What,	When	of	Beneficial	Ownership	Information	Reporting	Requirements	........................3

II.	 Where	to	Report	Beneficial	Ownership	Information	............................................................................5

III.	How	to	Report	Beneficial	Ownership	Information	................................................................................6

a.	Recommendations	for	Successful	Filings	.......................................................................................6

b.	Item	Instructions	................................................................................................................................10

Disclaimer:	These	filing	instructions	are	explanatory	only	and	do	not	supplement	or	modify	any	 obligations	imposed	by	statute	or	regulation.		Fin

In [3]:
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma


In [4]:
# Cut the loaded document(s) into overlapping character windows: at most 1200
# characters per chunk, with 300 characters shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=300,
    chunk_size=1200,
)
chunks = text_splitter.split_documents(data)

print(
    "done splitting....",
    f"Number of chunks: {len(chunks)}",
    sep="\n",
)

done splitting....
Number of chunks: 42


In [13]:
# Report the character length of every chunk, one line per chunk.
for position, chunk in enumerate(chunks):
    print(f"Chunk {position}:", "Length of chunk:", len(chunk.page_content))

Chunk 0: Length of chunk: 1191
Chunk 1: Length of chunk: 656
Chunk 2: Length of chunk: 835
Chunk 3: Length of chunk: 1046
Chunk 4: Length of chunk: 1156
Chunk 5: Length of chunk: 1093
Chunk 6: Length of chunk: 1081
Chunk 7: Length of chunk: 1037
Chunk 8: Length of chunk: 977
Chunk 9: Length of chunk: 1163
Chunk 10: Length of chunk: 1166
Chunk 11: Length of chunk: 1194
Chunk 12: Length of chunk: 1174
Chunk 13: Length of chunk: 749
Chunk 14: Length of chunk: 1160
Chunk 15: Length of chunk: 381
Chunk 16: Length of chunk: 1192
Chunk 17: Length of chunk: 1174
Chunk 18: Length of chunk: 946
Chunk 19: Length of chunk: 1055
Chunk 20: Length of chunk: 1186
Chunk 21: Length of chunk: 1166
Chunk 22: Length of chunk: 920
Chunk 23: Length of chunk: 1155
Chunk 24: Length of chunk: 775
Chunk 25: Length of chunk: 1193
Chunk 26: Length of chunk: 1126
Chunk 27: Length of chunk: 1159
Chunk 28: Length of chunk: 1150
Chunk 29: Length of chunk: 1155
Chunk 30: Length of chunk: 817
Chunk 31: Length of chunk: 

In [15]:
import ollama

# Fetch the embedding model used by the vector store cell; the returned
# status object is left as the cell's last expression so it is displayed.
pull_status = ollama.pull("nomic-embed-text")
pull_status

ProgressResponse(status='success', completed=None, total=None, digest=None)

In [16]:
# Embed every chunk and store it in the "simple-rag" Chroma collection.
#
# Explicit, deterministic ids keep this cell idempotent. Without them each run
# is assigned fresh random ids, so re-executing the cell in the same kernel
# appends another copy of every chunk to the existing collection and the
# retriever starts returning duplicates. With stable ids a re-run overwrites
# the same entries instead.
chunk_ids = [f"chunk-{index}" for index in range(len(chunks))]

vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    collection_name="simple-rag",
    ids=chunk_ids,
)
print("done adding to vector database....")

done adding to vector database....


In [22]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser

from langchain_ollama import ChatOllama

from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [23]:
# Ollama-backed chat model. `model` is the model name assigned in the PDF
# loading cell ("llama3"). This one instance is used both to generate the
# alternative queries for the retriever and to answer inside the RAG chain.
llm = ChatOllama(model=model)

In [29]:
# Multi-query prompt: asks the LLM to rewrite the user's question in five
# different ways, one per line. It is passed to MultiQueryRetriever.from_llm as
# `prompt`; the idea is that several phrasings of the same question surface
# chunks that a single distance-based similarity search might miss.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],  # the only placeholder used by the template
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

In [25]:
# Wrap the plain vector-store retriever in a MultiQueryRetriever driven by
# `llm` and QUERY_PROMPT.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

In [38]:
# RAG prompt
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: iterable of retrieved documents, each exposing `page_content`.

    Returns:
        The documents' text separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever yields a list of Document objects. Piping it through
# format_docs puts only the chunk text into {context}; previously the list
# was interpolated as its Python repr (metadata, quoting and escapes included).
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [34]:
# Show the assembled chat prompt template for inspection.
print(f"prompt: {prompt}")

prompt: input_variables=['context', 'question'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based ONLY on the following context:\n{context}\nQuestion: {question}\n'), additional_kwargs={})]


In [39]:
# Pass the question as a plain string. The earlier form wrapped it in a
# one-element tuple, so the tuple's repr -- ('what is ...',) -- was what got
# substituted into both the query-generation prompt and the RAG prompt.
question = "what is the document about?"
# Other questions to try:
#   "what are the main points as a business owner I should be aware of?"
#   "how to report BOI?"

res = chain.invoke(input=question)

print(res)


Based on the provided context, it appears that the document is about Beneficial Ownership Information Reporting (BOIR) instructions. The document seems to be a guide or set of guidelines for reporting beneficial ownership information, including the types of identifying documents that may be used and how they should be reported.
