## Import Libraries

In [200]:
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import Pinecone
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
import os

## Importing document

In [201]:
# Lets Read the document
def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

In [202]:
doc = read_doc('./document')
len(doc)

266

## Create Chunks

In [203]:
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc = text_splitter.split_documents(docs)
    return docs

In [204]:
documents = chunk_data(docs=doc)
len(documents)

266

## Embeddings

In [205]:
embeddings = OllamaEmbeddings(
    model="llama3.1"
)
embeddings

OllamaEmbeddings(model='llama3.1', base_url=None, client_kwargs={})

In [206]:
input_text = "The meaning of life is 42"
vector = embeddings.embed_query(input_text)
print(vector[:3])

[0.0142708905, 0.0037594007, 0.0086252345]


## PineCone

In [207]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "government"

In [208]:
index = pc.Index(index_name)

In [209]:
if index is not None:
    print(f"Index {index_name} already exists")
else:
    index = pc.create_index(name=index_name, metric="cosine", dimension=4096,
                            spec=ServerlessSpec(cloud="aws", region="us-east-1"))

Index government already exists


In [210]:
vector_store = PineconeVectorStore(index=index, embedding=embeddings, index_name=index_name)

In [211]:
# vector_store.add_documents(documents=doc)

## Loading Gemini

In [213]:
from dotenv import load_dotenv
load_dotenv()

True

In [218]:
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro", google_api_key=os.getenv("GEMINI_API_KEY"), temperature=0.5)

In [215]:
chain = load_qa_chain(llm,chain_type="stuff")

In [None]:
# Cosine Similarity Retreive Results from VectorDB
def retrieve_query(store, query, k=5):
    matching_results = store.similarity_search(query, k=k)
    return matching_results

In [216]:
def retrieve_answers(store, query):
    doc_search = retrieve_query(store, query)
    print(doc_search)
    response = chain.run(input_documents=doc_search, question=query)
    return response

In [222]:
our_query = "Tell me about some of the msme schemes you know"
answer = retrieve_answers(vector_store, our_query)
print(answer)

[Document(id='67427ca5-b655-4790-b6e8-48f6af93c113', metadata={'page': 17.0, 'source': 'document/MSME_Schemes.pdf'}, page_content='XV.MINISTRY OF CONSUMER AFFAIRS, FOOD AND\nPUBLIC DISTRIBUTION SCHEMES\nXVI. MINISTR Y OF DEFENCE SCHEMES\nXVII. NITI AAYOG SCHEMES \nXVIII. MINISTR Y OF AGRICUL TURE 139\n1.Antyodaya Anna Yojana (AA Y) 140\n2.Private Entrepreneurs Guarantee (PEG) 141\n143\n1.Allotment of Class-V  ‘B’ Army surplus vehicles 144\n2.Allotment of Mother Dairy Milk booths and Safal 145\nshops\n3.Allotment of Regular LPG Distributorship under 18% 146\nquota\n4.Coal Loading and Transportation 147\n5.Coal Tipper Attachment 148\n6.Gopaljee Dairy Milk booths/Milk shops/Retail outlets 149\n7.Gopaljee Farm Fresh 149\n8.Management of CNG station 150\n151\nSelf Employment and Talent Utilisation (SETU) 152\n153\n154\n154\n1.Marketing Research and Information Network 154\n2.Strengthening of Agmark Grading Facilities 154\n3.Development/Strengthening of Agricultural 154\nMarketing Infrastruc