In [10]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [11]:
# Collect every PDF matched by the glob under data/ and parse each one with
# PyPDFLoader; `documents` holds whatever the loader returns for all files.
loader = DirectoryLoader(
    'data/',
    glob="*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

In [52]:
# Break the loaded documents into chunks of at most 500 characters, with a
# 20-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
)
text_chunks = text_splitter.split_documents(documents)

In [53]:
# Sanity check: (number of loaded documents, number of chunks after splitting).
(len(documents), len(text_chunks))

(4505, 40000)

In [54]:
from langchain.embeddings import HuggingFaceEmbeddings

# all-MiniLM-L6-v2 is a general sentence-transformers model, not a BGE model.
# The BGE-specific wrapper prepends a BGE retrieval instruction to every query,
# which this model was not trained with, so queries would be embedded with
# irrelevant extra text. The generic wrapper embeds queries and documents as-is.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [55]:
import os

from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

# load_dotenv() takes the path of a .env file and returns a bool (whether any
# variables were loaded) -- it never returns a secret. Load the default .env
# into the process environment first, then read the key from the environment.
load_dotenv()
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    # Fail here with a clear message instead of passing a bogus key downstream.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or export it "
        "before running this cell."
    )

# Client handle used below to check for / create the index.
pc = Pinecone(api_key=api_key)

In [None]:
index_name = "midster-bot"

# Create the serverless index only when it is not there yet, so re-running
# this cell does not attempt to create it a second time.
index_missing = not pc.has_index(index_name)
if index_missing:
    serverless_spec = ServerlessSpec(cloud='aws', region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the output size of the embedding model in use
        metric='cosine',
        spec=serverless_spec,
    )

In [57]:
from langchain_pinecone import PineconeVectorStore

# Embed every chunk with `embeddings` and upload the vectors to the index
# named by `index_name`; `docsearch` is the resulting vector-store handle.
# NOTE(review): each run of this cell uploads all chunks again -- confirm
# whether re-runs should be guarded to avoid duplicate vectors.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [58]:
documents = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [59]:
retriever = documents.as_retriever(search_type='similarity', search_kwargs={"k" : 3})