In [48]:
# Smoke test: prints a fixed message to confirm the kernel is executing cells.
print("all ok")

all ok


In [49]:
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [50]:
# Load variables from a .env file into the process environment (the True shown below is the call's return value).
# presumably this supplies the API keys for the Groq and Google clients created later — TODO confirm
load_dotenv()

True

In [51]:
# Groq-hosted chat model; it is the generation step of the RAG chain built later in this notebook.
# NOTE(review): the saved chain outputs start with a <think>...</think> reasoning section produced by
# this model; consider stripping or disabling it before showing answers — confirm the supported option.
llm=ChatGroq(model="qwen/qwen3-32b")

In [52]:
#llm.invoke("Top 5 Best Area to live in Auckland?")

In [53]:
# Google Generative AI embedding model; it is passed to FAISS when the vector index is built.
# The dimension check later in this notebook reports 768-dimensional vectors.
embedding_model=GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [54]:
#embedding_model.embed_query("Why sky is blue?")

## 1. Data Ingestion

In [55]:
from langchain.document_loaders import PyPDFLoader

In [56]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

In [57]:
import os

In [58]:
# Show the working directory; the data folder path in the next cell is resolved against it.
# NOTE(review): the rendered output exposes an absolute local path — consider clearing it before sharing.
os.getcwd()

'/Users/monikabadadhe/Documents/AI_Projects/document_portal_monika/notebook'

In [59]:
# Folder containing the PDFs for this property, resolved against the current working directory.
data_subdirectory = ("data", "8_Hague_Place")
folder_path = os.path.join(os.getcwd(), *data_subdirectory)

In [60]:
# Accumulator for the documents loaded from every PDF in folder_path.
all_docs: list = []

In [61]:
# Rebuild the list from scratch so that re-running this cell does not append duplicate pages.
all_docs = []

# os.listdir returns entries in arbitrary order; sorting keeps the document (and later chunk)
# order reproducible across runs and machines.
for file in sorted(os.listdir(folder_path)):
    # Compare the extension case-insensitively so files such as "REPORT.PDF" are not skipped.
    if file.lower().endswith(".pdf"):
        pdf_path = os.path.join(folder_path, file)
        loader = PyPDFLoader(pdf_path)
        # extend() flattens the per-file results into one flat list of documents.
        all_docs.extend(loader.load())


In [62]:
# Report how many documents were loaded across all PDFs.
print(f"Loaded a total of {len(all_docs)} document pages from all PDFs.")

Loaded a total of 60 document pages from all PDFs.


In [63]:
# Same count as the print in the previous cell, shown as a cell result.
len(all_docs)

60

##### Note: Each document loaded above corresponds to a single PDF page.

##### Note: Chunking is experimental; there is no deterministic way to split the text.

In [64]:

# Chunking parameters; sizes are measured with len, i.e. in characters of the page text.
splitter_settings = {
    "chunk_size": 500,
    "chunk_overlap": 150,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [65]:
# Split every loaded document into chunks with the splitter configured above;
# the value displayed is the resulting number of chunks.
docs=text_splitter.split_documents(all_docs)
len(docs)

295

In [66]:
# Inspect the metadata attached to the first chunk (source file, page number, PDF properties).
docs[0].metadata

{'producer': 'iText® 5.5.2 ©2000-2014 iText Group NV (AGPL-version)',
 'creator': 'PyPDF',
 'creationdate': '2025-07-10T14:23:22+12:00',
 'moddate': '2025-07-10T14:58:10+12:00',
 'source': '/Users/monikabadadhe/Documents/AI_Projects/document_portal_monika/notebook/data/8_Hague_Place/LIM - 8 Hague Place, Birkenhead.pdf',
 'total_pages': 34,
 'page': 0,
 'page_label': '1'}

In [67]:
# Inspect the raw text of the first chunk.
docs[0].page_content

'Cooper & Co Real Estate Ltd \nLicensed REAA 2008\nLand Information Memorandum (LIM) Disclaimer\nDisclaimer:\nThis Land Information Memorandum (LIM) has been obtained from the Auckland Council and copies are \nmade available to prospective Purchasers and interested parties for general information purposes only. \nThe information originates from a third party (Auckland Council) and not from the real estate agent, \nCooper and Co Real Estate Ltd, or from the Vendors.'

In [68]:
# Check the embedding dimensionality using the first chunk.
# embed_documents takes a list of texts. A bare string is itself a sequence of one-character
# strings, so it would not be embedded as a single document; wrap the chunk text in a list.
len(embedding_model.embed_documents([docs[0].page_content])[0])

768

In [69]:
# Build a FAISS vector store from the chunks, embedding them with embedding_model.
vectors=FAISS.from_documents(docs, embedding_model)

In [70]:
#### In-memory or on-disk vector store: FAISS
#### Cloud: Pinecone, Weaviate, Milvus, MongoDB Vector Search, Astra DB

## Retrieval Process

In [71]:
# Retrieve the chunks most similar to a sample question and display them.
development_query = "Can development be done on this property?"
relevant_document = vectors.similarity_search(development_query)
relevant_document

[Document(id='410571f2-386c-4ad7-a370-2a7b58029d10', metadata={'producer': 'iText® 5.5.2 ©2000-2014 iText Group NV (AGPL-version)', 'creator': 'PyPDF', 'creationdate': '2025-07-10T14:23:22+12:00', 'moddate': '2025-07-10T14:58:10+12:00', 'source': '/Users/monikabadadhe/Documents/AI_Projects/document_portal_monika/notebook/data/8_Hague_Place/LIM - 8 Hague Place, Birkenhead.pdf', 'total_pages': 34, 'page': 5, 'page_label': '6'}, page_content='Page 5 \nAuckland Council (09) 301 0101 or enquiries@aucklandcouncil.govt.nz8270611814LIM 10/07/2025\nResource Management\nPlanning\nThere are NO Planning resource consents recorded.\nSubdivisions\nThere are NO Subdivision resource consents recorded.\nEngineering Approvals\nThere are NO Engineering approvals recorded.\nIf there are any conditions, please refer to the Property File. The applicant should satisfy themselves as to'),
 Document(id='b8aa9662-16dd-448e-8ce6-19aa0280ce69', metadata={'producer': 'iText® 5.5.2 ©2000-2014 iText Group NV (AGPL-v

In [72]:
# Inspect the text of the third retrieved chunk (index 2).
relevant_document[2].page_content

'If there are any conditions, please refer to the Property File. The applicant should satisfy themselves as to \nwhether all conditions of resource consents for this property have been met. To obtain Resource Consent \ndocuments a Property File will need to be requested How to order a property file (aucklandcouncil.govt.nz) \n \nFurther Information\nThe Council may hold additional information for this property, for example concerning resource consents for'

In [73]:
# Same query, this time requesting 10 results via k; the result is stored but not displayed.
relevant_document=vectors.similarity_search("Can development be done on this property?",k=10)

In [74]:
# Expose the vector store as a retriever so it can be piped into the RAG chain.
retriever=vectors.as_retriever()

### Question: the user's question
### Context: information retrieved from the vector database based on the question

In [75]:
# Prompt text with two placeholders, {context} and {question}. It instructs the model to answer
# only from the supplied context and to reply with a fixed sentence when the context is insufficient.
prompt_template = """
        Answer the question based on the context provided below. 
        If the context does not contain sufficient information, respond with: 
        "I do not have enough information about this."

        Context: {context}

        Question: {question}

        Answer:"""


In [76]:
from langchain.prompts import PromptTemplate


In [77]:

# Wrap the raw template text in a PromptTemplate, declaring the two placeholders it expects.
template_variables = ["context", "question"]
prompt = PromptTemplate(input_variables=template_variables, template=prompt_template)

In [78]:
# Display the PromptTemplate to verify the template text and its input variables.
prompt


PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\n        Answer the question based on the context provided below. \n        If the context does not contain sufficient information, respond with: \n        "I do not have enough information about this."\n\n        Context: {context}\n\n        Question: {question}\n\n        Answer:')

In [79]:
from langchain_core.output_parsers import StrOutputParser


In [80]:
# Output parser instance.
# NOTE(review): the chain cell constructs its own StrOutputParser(), so this variable is not
# referenced by any cell visible here.
parser=StrOutputParser()


In [81]:
def format_docs(docs):
    """Join the page_content of each document into one string, separated by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with the documents' texts separated by two newlines;
        an empty string when ``docs`` is empty.
    """
    page_texts = (document.page_content for document in docs)
    return "\n\n".join(page_texts)

In [82]:
from langchain_core.runnables import RunnablePassthrough

In [83]:

# Map the incoming question to the two prompt variables: "context" is the retriever's result
# rendered by format_docs, and "question" is the input passed through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
# Fill the prompt, call the model, then parse the model reply into a plain string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [84]:
# Ask a question through the full chain; the saved answer includes the model's <think> section.
rag_chain.invoke("was there any damage because of any weather event in this property")

'<think>\nOkay, let\'s try to figure out if there was any damage to the property due to a weather event based on the context provided.\n\nFirst, looking at the context, there\'s a disclosure section from the vendor. They mention two possible severe weather events: flash flooding on specific dates in January 2023 and Cyclone Gabrielle in February 2023. The vendor\'s statement says "To the best of the Vendor\'s knowledge, the property was/ was not (delete one) affected by one or both of the following..." So, the key here is whether the vendor crossed out "was not" or "was". If they deleted "was not", then the property was affected; if they deleted "was", then it wasn\'t.\n\nBut wait, in the context provided, it\'s written as "was/ was not (delete one)". The way it\'s presented here, the user might not have the actual document where one is crossed out. Since the context as given includes both options and doesn\'t specify which was deleted, we don\'t know for sure. There\'s no indication i

In [87]:
# NOTE(review): the saved answer says the context was empty, and the execution counter jumps from
# 84 to 87, so this output may come from kernel state that differs from the cells shown — re-run
# from a fresh kernel to confirm.
rag_chain.invoke("What is the cladding material? give me the name of cladding")

'<think>\nOkay, I need to answer the user\'s question based on the provided context. Let me check the context first. Wait, the context is empty here. The user mentioned that if the context doesn\'t have enough info, I should respond with "I do not have enough information about this." Since there\'s no context given, I definitely can\'t find the answer there. So I should let them know I don\'t have enough info.\n</think>\n\nI do not have enough information about this.'