In [None]:
## All together
!pip install chromadb # is used to save vectors Database
!pip install langchain
!pip install langchain-community
!pip install langchain-google-genai
!pip install pypdf



# Step-1: Import the packages

In [None]:
import os
from getpass import getpass

from langchain.prompts import PromptTemplate # Prompt template
from langchain.vectorstores import Chroma   # Store the vectors
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter # Chunks
from langchain.document_loaders import TextLoader  # Load the text
from langchain.chains import VectorDBQA,RetrievalQA, LLMChain # Chains and Retrival ans
from langchain.retrievers.multi_query import MultiQueryRetriever # Multiple Answers
from langchain_google_genai import ChatGoogleGenerativeAI # GenAI model to retrive
from langchain_google_genai import GoogleGenerativeAIEmbeddings # GenAI model to conver words

## Step-2: Load the data

In [None]:
# Location of the PDF to index.
file_path = "/content/budget_speech.pdf"

# Build the PDF loader and read the file into `data`
# (each item exposes the extracted text as `.page_content`).
loader = PyPDFLoader(file_path)
data = loader.load()

# Step-3: Extract the text from the PDF

In [None]:
# Merge the content of every loaded page into one newline-separated string.
text = "\n".join(page.page_content for page in data)

# Step-4: Divide into chunks

In [None]:
# Chunking configuration: pieces of up to 1000 characters with a 200-character
# overlap, trying the separators in the order listed.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
)

# Break the combined PDF text into a list of chunk strings.
texts = text_splitter.split_text(text)

## Step-5: Set up the models

- One is an embedding model

- One is a chat model

In [None]:
# Set up embeddings
# The API key is a secret: take it from the GOOGLE_API_KEY environment variable
# and prompt for it (without echoing) only when it is not set. Never paste the
# key into the notebook source.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google AI API key: ")

embeddings = GoogleGenerativeAIEmbeddings(
    model='models/embedding-001',
    google_api_key=os.environ["GOOGLE_API_KEY"],
    # NOTE(review): a constructor-level task_type may also be applied when the
    # document chunks are embedded for the vector store - confirm that
    # "retrieval_query" is intended for both documents and queries.
    task_type="retrieval_query"
)


In [None]:
# The safety enums are re-exported by langchain_google_genai, so they are taken
# from the package this notebook already depends on rather than from the
# separate google.generativeai SDK.
from langchain_google_genai import HarmBlockThreshold, HarmCategory

# Apply the BLOCK_LOW_AND_ABOVE threshold to each of the four harm categories.
safety_settings = {
    category: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE
    for category in (
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
    )
}

# The API key is a secret: take it from the GOOGLE_API_KEY environment variable
# and prompt for it (without echoing) only when it is not set. Never paste the
# key into the notebook source.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google AI API key: ")

# Chat model used both for answering and for the multi-query retriever.
chat_model = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=os.environ["GOOGLE_API_KEY"],
    temperature=0.3,
    safety_settings=safety_settings
)


# Step-6: Embed the chunks and store them in the vector DB

In [None]:
# Build a Chroma vector store from the text chunks, using the embedding model
# configured earlier to vectorise them.
vectordb = Chroma.from_texts(
    texts=texts,
    embedding=embeddings,
)

# Step-7: Make the Prompt Template

In [None]:
# Instruction text sent to the chat model. The {context} and {question}
# placeholders are filled in by the PromptTemplate below.
prompt_template = """
## Safety and Respect Come First!

You are programmed to be a helpful and harmless AI. You will not answer requests that promote:

* **Harassment or Bullying:** Targeting individuals or groups with hateful or hurtful language.
* **Hate Speech:**  Content that attacks or demeans others based on race, ethnicity, religion, gender, sexual orientation, disability, or other protected characteristics.
* **Violence or Harm:**  Promoting or glorifying violence, illegal activities, or dangerous behavior.
* **Misinformation and Falsehoods:**  Spreading demonstrably false or misleading information.

**How to Use You:**

1. **Provide Context:** Give me background information on a topic.
2. **Ask Your Question:** Clearly state your question related to the provided context.

**Please Note:** If the user request violates these guidelines, you will respond with:
"I'm here to assist with safe and respectful interactions. Your query goes against my guidelines. Let's try something different that promotes a positive and inclusive environment."
You are a financial expert answering queries based on the provided context.
##  Answering User Question:

Answer the question as precisely as possible using the provided context. The context can be from different topics. Please make sure the context is highly related to the question. If the answer is not in the context, you only say "answer is not in the context".

Context: \n {context}
Question: \n {question}
Answer:
"""

# Wrap the raw text in a PromptTemplate that declares its two input variables.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Step-8: Create the QA chains

In [None]:
# Create the QA
# Multi-query retriever built from the chat model and a vector-store retriever
# configured with k=5.
retriever_from_llm = MultiQueryRetriever.from_llm(
    llm=chat_model,
    retriever=vectordb.as_retriever(search_kwargs={"k": 5}),
)

# "stuff" chain that answers with the custom prompt and also returns the
# source documents alongside the result.
qa_chain = RetrievalQA.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=retriever_from_llm,
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

In [None]:
# Run the query
# The chain takes a dict keyed by "query". A bare {"..."} literal is a Python
# set, which would end up being used as the query value itself.
response = qa_chain.invoke({"query": "Can you say about tax system"})

# Show the answer and how many source chunks backed it instead of dumping
# every retrieved Document; the full payload stays available in `response`.
print(response["result"])
print(f"Source chunks used: {len(response['source_documents'])}")

{'query': {'Can you say about tax system'}, 'result': 'The budget includes information on both direct and indirect taxes. The new tax regime proposes substantial relief with new slabs and tax rates. For example, income up to 4,00,000 is Nil, from 4,00,001 to 8,00,000 is 5 per cent, and so on, up to above 24,00,000 which is 30 per cent. The new structure will substantially reduce the taxes of the middle class. A tax payer in the new regime with an income of 12 lakh will get a benefit of 80,000 in tax. A person having income of 18 lakh will get a benefit of 70,000 in tax.  A person with an income of 25 lakh gets a benefit of 1,10,000.', 'source_documents': [Document(metadata={}, page_content='CONTENTS \n \nPART – A \n Page No. \nIntroduction 1 \nBudget Theme 1 \nAgriculture as the 1st engine 3 \nMSMEs as the 2nd engine 6 \nInvestment as the 3rd engine 8 \nA. Investing in People 8 \nB. Investing in the Economy 10 \nC. Investing in Innovation 14 \nExports as the 4th engine 15 \nReforms as 