In [4]:
from langchain_community.llms import Ollama
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch
from operator import itemgetter
import os
from langchain.schema import Document


In [5]:
from langchain_community.document_loaders import PyPDFLoader

def load_pdfs_from_folder(folder_path):
    documents = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(root, file)
                loader = PyPDFLoader(file_path)
                documents.extend(loader.load_and_split())
    return documents


In [6]:

template = """
Answer the question based on the context below. If you cannot 
answer the question, reply "I do not know".

Context: {context}

Question: {question}
"""
prompt = PromptTemplate.from_template(template)


In [7]:

MODEL="llama2"
model = Ollama(model=MODEL)
embeddings = OllamaEmbeddings(model=MODEL)

  model = Ollama(model=MODEL)
  embeddings = OllamaEmbeddings(model=MODEL)


In [8]:

folder_path =r'D:\project\ragollama\pdfs'
print(f"Loading PDFs from folder: {folder_path}")
documents = load_pdfs_from_folder(folder_path)
documents

Loading PDFs from folder: D:\project\ragollama\pdfs


[Document(metadata={'source': "D:\\project\\ragollama\\pdfs\\OneDrive_2025-01-13\\IT FAQ's\\GIT_Procedure_Document\\GIT_Setup_procedure_document.pdf", 'page': 0}, page_content='Netsmartz GIT SETUP HELP DOCUMENT \n \n \n \nMandatory software to be installed from the given URLs: \n• GIT - https://git-scm.com/download/win \n• Tortoise Git - https://download.tortoisegit.org/tgit/2.12.0.0/TortoiseGit-2.12.0.0-64bit.msi \nTo Access Netsmartz GIT Repo over the web: - \n Open the URL https://git.netsmartz.net by chrome or edge browser: \n \nClick on the Active directory tab -→enter your AD login credentials-→ Click Sign In'),
 Document(metadata={'source': "D:\\project\\ragollama\\pdfs\\OneDrive_2025-01-13\\IT FAQ's\\GIT_Procedure_Document\\GIT_Setup_procedure_document.pdf", 'page': 1}, page_content='Netsmartz GIT SETUP HELP DOCUMENT \n \nCode Check Out Procedure. \n• Click on the assigned project to take the checkout: - \n \n• Click on the clone tab and copy the URL shown under the “clone with

In [9]:
def split_pdf_pages_into_smaller_chunks(documents, chunk_size=500, overlap=200):
    """
    Splits the content of a list of Document objects into smaller chunks while preserving metadata.

    :param documents: List of Document objects with metadata and text content.
    :param chunk_size: Maximum size of each chunk.
    :param overlap: Number of characters to overlap between chunks.
    :return: List of smaller Document objects.
    """
    smaller_chunks = []
    
    for document in documents:
        text = document.page_content  # Extract the text content of the Document
        metadata = document.metadata  # Preserve the metadata
        start = 0
        
        while start < len(text):
            end = start + chunk_size
            chunk = text[start:end]
            smaller_chunks.append(
                Document(page_content=chunk, metadata=metadata)
            )
            start += chunk_size - overlap  # Move start forward by chunk_size minus overlap
    
    return smaller_chunks


In [10]:
chunks = split_pdf_pages_into_smaller_chunks(documents)
chunks

[Document(metadata={'source': "D:\\project\\ragollama\\pdfs\\OneDrive_2025-01-13\\IT FAQ's\\GIT_Procedure_Document\\GIT_Setup_procedure_document.pdf", 'page': 0}, page_content='Netsmartz GIT SETUP HELP DOCUMENT \n \n \n \nMandatory software to be installed from the given URLs: \n• GIT - https://git-scm.com/download/win \n• Tortoise Git - https://download.tortoisegit.org/tgit/2.12.0.0/TortoiseGit-2.12.0.0-64bit.msi \nTo Access Netsmartz GIT Repo over the web: - \n Open the URL https://git.netsmartz.net by chrome or edge browser: \n \nClick on the Active directory tab -→enter your AD login credentials-→ Click Sign In'),
 Document(metadata={'source': "D:\\project\\ragollama\\pdfs\\OneDrive_2025-01-13\\IT FAQ's\\GIT_Procedure_Document\\GIT_Setup_procedure_document.pdf", 'page': 0}, page_content='s://git.netsmartz.net by chrome or edge browser: \n \nClick on the Active directory tab -→enter your AD login credentials-→ Click Sign In'),
 Document(metadata={'source': "D:\\project\\ragollama\\p

In [8]:
len(chunks)

186

In [9]:
len(documents)

48

In [10]:
chunks[100]

Document(metadata={'source': "D:\\project\\ragollama\\pdfs\\OneDrive_2025-01-13\\IT FAQ's\\Netsmartz_Information_Security_Guidelines\\NTZ_ITD_AUP_1.1_005.pdf", 'page': 11}, page_content="urity of any host, network, or account.  \no Introducing honeypots, honeynets, or similar technology on the NETSMARTZ network.  \no Interfering with or denying service to any user other than the employee's host (for \nexample, denial of service attack).  \no Using any program/script/command, or sending messages of any kind, with the intent \nto interfere with, or disable, a user's terminal session, via any means, locally or via the \nInternet/Intranet/Extranet.  \no Providing information about, or list")

In [11]:
vectorstore = DocArrayInMemorySearch.from_documents(chunks, embedding=embeddings)
retriever = vectorstore.as_retriever()



In [12]:
retriever.invoke('Why my EPF contribution of last months is not reflecting under my EPF passbook?')

[Document(metadata={'source': "D:\\project\\ragollama\\pdfs\\OneDrive_2025-01-13 (2)\\Finance FAQ's\\FAQ's related to exit from company.pdf", 'page': 0}, page_content='ing? \n \nAnswer: Form 16 is generated by end of June in next financial year, kindly drop mail at \nteam.payroll@netsmartz.com from your personal email id mentioning your name and \nPAN number, the same will be shared in June 2024 digitally signed via email.'),
 Document(metadata={'source': "D:\\project\\ragollama\\pdfs\\OneDrive_2025-01-13 (2)\\Finance FAQ's\\Provident fund related FAQ's.pdf", 'page': 0}, page_content='o.11 and that will be shared with Payroll team for your EPF registration. \nFor deduction part:   EPF deduction is based on basic salary and maximum deducted upto \nRs 15000 basic salary , both employer and employee PF deduction is done from employee \nCTC and once opted EPF deduction cannot be stopped. Few examples of EPF deduction \nare as below  \n \nBasic Salary Basic Salary for EPF deduction PF deduc

In [13]:
chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    | model
)


In [14]:
questions = [
    
    "Why my EPF contribution of last months is not reflecting under my EPF passbook?",
    "Where I have to fill the Income tax declaration?",
    "who is leonal messi?"
]

In [15]:
for question in questions:
    print(f"Question: {question}")
    print(f"Answer: {chain.invoke({'question': question})}")
    print()

Question: Why my EPF contribution of last months is not reflecting under my EPF passbook?
Answer: Based on the provided context, it seems that the employee is inquiring about why their EPF contribution from previous months is not reflected in their EPF passbook. The document provides information on how employer EPF part is shown under EPF passbook, and mentions that the amount you can withdraw in cash form is the employee PF contribution and the employer part contribution. However, it does not provide a direct answer to the employee's question about why their previous months' EPF contribution is not reflected in their passbook.

Therefore, I do not know the specific reason why the employee's EPF contribution from previous months is not reflecting in their passbook, as the provided document does not provide a clear explanation for this situation.

Question: Where I have to fill the Income tax declaration?
Answer: Based on the context provided, it appears that the Income Tax Declaration 

In [16]:
#What if I want to go for higher EPF deduction? \nAnswer : For higher EPF deduction, the EPF will be deducted based on your currentbasic \nsalary ( i.e. 24% of your basic salary – 12% Employee Contribution+ 12% Employer \ncontribution) for which ticket needs to be raised on Darwin Helpdesk to us.

In [17]:
question = "What if I want to go for higher EPF deduction explain with example ?"
print(f"Question: {question}")
print(f"Answer: {chain.invoke({'question': question})}")
print()

Question: What if I want to go for higher EPF deduction explain with example ?
Answer: Based on the provided documents, it seems that you are looking for information related to Employees' Provident Fund (EPF) deductions. If you want to go for a higher EPF deduction, here is an explanation and an example based on the documents provided:

Answer: You can opt for a higher EPF deduction by submitting a request to your employer. The amount of EPF deduction is determined by the employer's contribution, which is 12% of the basic salary. However, if you want to contribute more than the mandatory 12%, you can submit a request to your employer to increase the EPF deduction.

For example, let's say your basic salary is Rs 1,00,000 and the employer's contribution is 12% of that amount, which is Rs 12,000 (12% of Rs 1,00,000). If you want to contribute an additional Rs 5,000 towards EPF, your total EPF deduction would be Rs 17,000 (Rs 12,000 + Rs 5,000).

So, if you want to go for a higher EPF dedu

In [18]:
# When we have to submit the proofs for Tax saving documents? \n \nAnswer: The Tax Saving documents are required to be submitted between period of Jan \nto  Feb 2024, company-wide email will be sent in this regard. 

In [20]:
question = "When we have to submit the proofs for Tax saving documents?"
print(f"Question: {question}")
print(f"Answer: {chain.invoke({'question': question})}")
print()

Question: When we have to submit the proofs for Tax saving documents?
Answer: Based on the context provided, it appears that the company is asking when to submit proofs for tax-saving documents. The document mentions that Form 16 will be generated by the end of June in the next financial year and the employee is requested to drop an email at team.payroll@netsmartz.com from their personal email ID mentioning their name and PAN number. This suggests that the company will share the tax-saving documents, including Form 16, with the employees digitally signed via email by June of the next financial year. Therefore, the answer to the question is "By June of the next financial year."



In [11]:
from langchain_community.vectorstores import Chroma

vectorstore = Chroma.from_documents(chunks, embedding=embeddings)

In [12]:
my_path='chroma_db'
vectorstore = Chroma.from_documents(chunks, embedding=embeddings,persist_directory=my_path)



In [13]:
vectorstore.persist()

  vectorstore.persist()


In [15]:
new_vec=Chroma(persist_directory=my_path,embedding_function=embeddings)

In [18]:
retriever=new_vec.as_retriever()

In [19]:
retriever.invoke('FAQ’s on Employee ESI 1. What is the eligibility criteria of ESI deduction? ')

[Document(metadata={'page': 0, 'source': "D:\\project\\ragollama\\pdfs\\OneDrive_2025-01-13 (2)\\Finance FAQ's\\FAQ's related to exit from company.pdf"}, page_content='ing? \n \nAnswer: Form 16 is generated by end of June in next financial year, kindly drop mail at \nteam.payroll@netsmartz.com from your personal email id mentioning your name and \nPAN number, the same will be shared in June 2024 digitally signed via email.'),
 Document(metadata={'page': 1, 'source': "D:\\project\\ragollama\\pdfs\\OneDrive_2025-01-13 (2)\\Finance FAQ's\\Provident fund related FAQ's.pdf"}, page_content='ary. \n6. How employer EPF part is shown under EPF passbook? \nAnswer: Employer EPF part is divided into two parts. Normal Contribution and Pension \nContribution. The amount you can withdraw in cash form is employee PF contribution and \nemployer part contribution. For amount under pension contribution you will get the pension \nonly.  \nFor example:  Employer part of Rs 1800 is divided into two parts i.

In [None]:
import secrets
print(secrets.token_hex(32))

In [23]:
pip install itsdangerous


Collecting itsdangerousNote: you may need to restart the kernel to use updated packages.

  Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Installing collected packages: itsdangerous
Successfully installed itsdangerous-2.2.0


You should consider upgrading via the 'd:\project\ragollama\myvenv\Scripts\python.exe -m pip install --upgrade pip' command.


In [24]:
# from fastapi.sessions import SessionMiddleware
