## Import Libraries

In [200]:
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import Pinecone
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
import os

## Importing document

In [201]:
# Lets Read the document
def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

In [202]:
doc = read_doc('./document')
len(doc)

266

## Create Chunks

In [203]:
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc = text_splitter.split_documents(docs)
    return docs

In [204]:
documents = chunk_data(docs=doc)
len(documents)

266

## Embeddings

In [205]:
embeddings = OllamaEmbeddings(
    model="llama3.1"
)
embeddings

OllamaEmbeddings(model='llama3.1', base_url=None, client_kwargs={})

In [206]:
input_text = "The meaning of life is 42"
vector = embeddings.embed_query(input_text)
print(vector[:3])

[0.0142708905, 0.0037594007, 0.0086252345]


## PineCone

In [207]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "government"

In [208]:
index = pc.Index(index_name)

In [209]:
if index is not None:
    print(f"Index {index_name} already exists")
else:
    index = pc.create_index(name=index_name, metric="cosine", dimension=4096,
                            spec=ServerlessSpec(cloud="aws", region="us-east-1"))

Index government already exists


In [210]:
vector_store = PineconeVectorStore(index=index, embedding=embeddings, index_name=index_name)

In [211]:
# vector_store.add_documents(documents=doc)

In [212]:
# Cosine Similarity Retreive Results from VectorDB
def retrieve_query(store, query, k=5):
    matching_results = store.similarity_search(query, k=k)
    return matching_results

## Loading Gemini

In [213]:
from dotenv import load_dotenv
load_dotenv()

True

In [214]:
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro", google_api_key=os.getenv("GEMINI_API_KEY"), temperature=0.5)

In [215]:
chain = load_qa_chain(llm,chain_type="stuff")

In [216]:
def retrieve_answers(store, query):
    doc_search = retrieve_query(store, query)
    print(doc_search)
    response = chain.run(input_documents=doc_search, question=query)
    return response

In [217]:
our_query = "Find me msmse schemes by niti ayog"
answer = retrieve_answers(vector_store, our_query)
print(answer)

[Document(id='0e83091b-c99f-43ee-9a8d-9c782d17403d', metadata={'page': 176.0, 'source': 'document/MSME_Schemes.pdf'}, page_content='NITI AAYOG SCHEMES'), Document(id='5917c908-29ec-4999-b27d-ad0cd5b9cad4', metadata={'page': 230.0, 'source': 'document/MSME_Schemes.pdf'}, page_content='background, would suffice for being eligible for \nthis selection under Skills and Placement under \nASDP .\nHow to apply? Eligible entrepreneurs can approach the nearest \nPIAs (Project Implementing Agencies) are for \nprofit or not-for-profit registered entities \nidentified by ASDP  for skilling and placing the \nidentified youth in various sectors as per their \ninterest.\nDescription The Mahatma Gandhi National Rural \nEmployment Guarantee Act, 2005 (MGNREGA) \nwas notified on September 7, 2005.The mandate \nof the Act is to provide at least 100 days of \nguaranteed wage employment in a financial year \nto every rural household whose adult members \nvolunteer to do unskilled manual work.\nMGNREGA  is 