In [1]:
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

from langchain_perplexity.chat_models import ChatPerplexity

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Lets read the File Documents

def read_doc(directory):
    file_loader=PyPDFDirectoryLoader(directory)
    documents=file_loader.load()
    return documents

In [4]:
doc=read_doc("documents/")
len(doc)

52

In [5]:
# Divide the doc into chunks

def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap)
    doc= text_splitter.split_documents(docs)
    return docs

In [6]:
documents=chunk_data(docs=doc)
documents

[Document(metadata={'producer': 'Skia/PDF m139 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'budget_speech', 'source': 'documents\\budget_speech.pdf', 'total_pages': 52, 'page': 0, 'page_label': '1'}, page_content='GOVERNMENT  OF  INDIA   \nBUDGET  2024-2025   \nSPEECH   OF   NIRMALA  SITHARAMAN  \nMINISTER\n \nOF\n \nFINANCE\n \n \nJuly  23,  2024  CONTENTS   \nPART  –  A   \nPage  No.   \nIntroduction  1  Global  Context  1  Interim  Budget  2  Budget  Theme  2  \nBudget\n \nPriorities\n \n2\n \n(i)\n \nProductivity\n \nand\n \nresilience\n \nin\n \nAgriculture\n \n \n(ii)  Employment  &  Skilling   \n(iii)  Inclusive  Human  Resource  Development  and  Social  Justice  \n(iv)\n \nManufacturing\n \n&\n \nServices\n \n \n(v)  Urban  Development   \n(vi)  Energy  Security   \n(vii)  Infrastructure   \n(viii)  Innovation,  Research  &  Development   \n(ix)  Next  Generation  Reforms'),
 Document(metadata={'producer': 'Skia/PDF m139 Google Docs Renderer', 'crea

In [7]:
len(documents)

52

In [8]:
# Embeddings Techniques of OpenAI

embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001C4130EA900>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001C4130EB230>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [9]:
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectors = embeddings.embed_query("How are you?")
print(len(vectors))  # vector size

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


384


In [10]:
from pinecone import Pinecone, ServerlessSpec

# Create Pinecone client
pc = Pinecone(api_key="pcsk_2kALoP_5tgVShzqH3QgtKAeKP5Auei5QwmTbppNa8Y8xr2wNmJgWpnSiYFenr7JMxLwQ98")

# Create an index (only once, check before creating)
index_name = "langchainqachatbot"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # depends on your embedding model
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Connect to the index
index = pc.Index(index_name)

In [11]:
# index=Pinecone.from_documents(doc,embeddings,index_name=index_name)

In [12]:
from langchain_pinecone import PineconeVectorStore

# Create vectorstore from documents
index = PineconeVectorStore.from_documents(
    documents=doc,
    embedding=embeddings,
    index_name=index_name
)


In [13]:
## Cosine Similarity Retrieve Results from VectorDB
def retrieve_query(query, k=2):
    matching_results = index.similarity_search(query, k=k)
    return matching_results

In [14]:
llm = ChatPerplexity(
    model="sonar-pro",        # or choose another Perplexity model
    temperature=0.5
)

In [15]:
# Build your QA chain
# Below one is old way of building QA chain, but it is not recommended anymore.
'''from langchain.chains.question_answering import load_qa_chain
chain = load_qa_chain(llm, chain_type="stuff")'''

# New way of building QA chain using `create_stuff_documents_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(
    "Answer the following question based on the given context:\n\n{context}\n\nQuestion: {input}"
)

chain = create_stuff_documents_chain(llm, prompt)


In [16]:
def retrieve_answers(query):
    doc_search = retrieve_query(query)
    print(doc_search)
    response = chain.invoke({"context": doc_search, "input": query})
    return response

In [17]:
our_query = "How much the agriculture target will be increased by how many crore?"
answer = retrieve_answers(our_query)
print(answer)

[Document(id='4018592c-f3b1-4ad6-8229-220ed5270c34', metadata={'creationdate': '', 'creator': 'PyPDF', 'page': 2.0, 'page_label': '3', 'producer': 'Skia/PDF m139 Google Docs Renderer', 'source': 'documents\\budget_speech.pdf', 'title': 'budget_speech', 'total_pages': 52.0}, page_content='2   \nInterim  Budget   \n4.  As  mentioned  in  the  interim  budget,  we  need  to  focus  on  4  major   castes,  \nnamely\n \n‘Garib’\n \n(Poor),\n \n‘Mahilayen’\n \n(Women),\n \n‘Yuva’\n \n(Youth)\n \nand\n  \n‘Annadata’\n \n(Farmer).\n \nFor\n \nAnnadata,\n \nwe\n \nannounced\n \nhigher\n \nMinimum\n \nSupport\n  \nPrices\n \na\n \nmonth\n \nago\n \nfor\n \nall\n \nmajor\n \ncrops,\n \ndelivering\n \non\n \nthe\n \npromise\n \nof\n \nat\n \nleast\n \na\n  \n50\n \nper\n \ncent\n \nmargin\n \nover\n \ncosts.\n \nPradhan\n \nMantri\n \nGarib\n \nKalyan\n \nAnna\n \nYojana\n \nwas\n  \nextended\n \nfor\n \nfive\n \nyears,\n \nbenefitting\n \nmore\n \nthan\n \n80\n \ncrore\n \npeople.\n  \n \n5.  Adm

In [18]:
our_query = "What are the nine key priority areas outlined in the 2024-25 budget, and how do they aim to support India’s development?"
answer = retrieve_answers(our_query)
print(answer)

[Document(id='e4810e2a-a315-4563-b676-50a8f3a31e92', metadata={'creationdate': '', 'creator': 'PyPDF', 'page': 0.0, 'page_label': '1', 'producer': 'Skia/PDF m139 Google Docs Renderer', 'source': 'documents\\budget_speech.pdf', 'title': 'budget_speech', 'total_pages': 52.0}, page_content='GOVERNMENT  OF  INDIA   \nBUDGET  2024-2025   \nSPEECH   OF   NIRMALA  SITHARAMAN  \nMINISTER\n \nOF\n \nFINANCE\n \n \nJuly  23,  2024  CONTENTS   \nPART  –  A   \nPage  No.   \nIntroduction  1  Global  Context  1  Interim  Budget  2  Budget  Theme  2  \nBudget\n \nPriorities\n \n2\n \n(i)\n \nProductivity\n \nand\n \nresilience\n \nin\n \nAgriculture\n \n \n(ii)  Employment  &  Skilling   \n(iii)  Inclusive  Human  Resource  Development  and  Social  Justice  \n(iv)\n \nManufacturing\n \n&\n \nServices\n \n \n(v)  Urban  Development   \n(vi)  Energy  Security   \n(vii)  Infrastructure   \n(viii)  Innovation,  Research  &  Development   \n(ix)  Next  Generation  Reforms'), Document(id='07c07213-af5f-4

In [19]:
our_query = "What are the eligibility, benefits, and anticipated outcomes of the five new employment and skilling schemes announced for India’s youth?"
answer = retrieve_answers(our_query)
print(answer)

[Document(id='e5a7f136-2d5c-4792-9a58-d27a0f447800', metadata={'creationdate': '', 'creator': 'PyPDF', 'page': 28.0, 'page_label': '29', 'producer': 'Skia/PDF m139 Google Docs Renderer', 'source': 'documents\\budget_speech.pdf', 'title': 'budget_speech', 'total_pages': 52.0}, page_content='•\n \nOne\n \ncrore\n \nyouth\n \nto\n \nbe\n \nskilled\n \nby\n \nIndia’s\n \ntop\n \ncompanies\n \nin\n \nfive\n \nyears.\n \n•\n Twelve  months  Prime  Minister’s  Internship  with  monthly  allowance  of   ` 5,000   \n•\n \nApplicable\n \nto\n \nthose\n \nwho\n \nare\n \nnot\n \nemployed\n \nand\n \nnot\n \nengaged\n \nin\n \nfull\n  \ntime\n education.   \n•\n \nYouth\n \naged\n \nbetween\n \n21\n \nand\n \n24\n \nwill\n \nbe\n \neligible\n \nto\n \napply.\n \n \n•\n \nCost\n \nsharing\n \n(per\n \nannum):\n \n \n▪\n \nGovernment\n \n–\n \n`\n54,000\n \ntowards\n \nmonthly\n \nallowance\n \n(plus\n \n`\n6,000\n  grant  for  incidentals)   \n▪\n \nCompany\n \n–\n \nRs\n \n6,000\n \nfrom\n \nCSR\n