### Inserting data into a vector DB (Pinecone) through a data ingestion pipeline

In [None]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone.grpc import PineconeGRPC as Pinecone
from sentence_transformers import SentenceTransformer

In [None]:
# Load the raw PDF files
def read_doc(directory):
    """Load the PDF files found in ``directory``.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        The result of the loader's ``load()`` call for that folder.
    """
    return PyPDFDirectoryLoader(directory).load()

In [None]:
# Split the loaded documents into overlapping chunks
def chunk_data(document, chunk_size=700, chunk_overlap=100):
    """Split ``document`` into smaller pieces with a recursive character splitter.

    Args:
        document: Documents to split, as accepted by ``split_documents``.
        chunk_size: ``chunk_size`` setting forwarded to the splitter.
        chunk_overlap: ``chunk_overlap`` setting forwarded to the splitter.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(document)


In [None]:
# Embed chunks with sentence-transformers/all-MiniLM-L12-v2 (384D dense vectors)
def embedding_chunks(chunk):
    """Encode the text of every chunk into an embedding.

    Args:
        chunk: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The output of ``SentenceTransformer.encode`` for the chunk texts,
        in the same order as ``chunk``.
    """
    encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
    page_texts = [piece.page_content for piece in chunk]
    return encoder.encode(page_texts)

In [None]:
# Pinecone client setup; credentials are read from environment variables
import os

index_name = "ilibrary-backend-documentation-rag"
pinecone_env = os.environ.get("PINECONE_ENVIRONMENT")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
# Load the PDFs stored under ../Data/raw and print how many items were returned.
documents = read_doc("../Data/raw")
print(len(documents))

In [None]:
# Split the loaded documents with the default chunk settings and print the chunk count.
chunks = chunk_data(documents)
print(len(chunks))

In [None]:
# Embed every chunk and print how many embeddings were produced.
embeddings = embedding_chunks(chunks)
print(len(embeddings))

In [None]:
# Connect to the target Pinecone index through its host URL.
host = os.getenv("PINECONE_HOST")
index = pc.Index(host=host)

# Build one upsert record per chunk, pairing each chunk with its embedding.
vectors = []
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
    metadata = {
        "text": chunk.page_content,
        "source": chunk.metadata.get("source"),
        "page": chunk.metadata.get("page"),
    }
    vectors.append({
        "id": f"chunk-{i}",
        "values": embedding.tolist(),
        # Pinecone does not accept null metadata values, so keys whose value
        # is missing are omitted instead of being sent as None.
        "metadata": {
            key: value for key, value in metadata.items() if value is not None
        },
    })

In [None]:
def batch_upsert(index, vectors, namespace, batch_size=500):
    """Upsert ``vectors`` into ``index`` in consecutive slices of ``batch_size``.

    Args:
        index: Object exposing ``upsert(vectors=..., namespace=...)``.
        vectors: Sliceable sequence of records to upload.
        namespace: Namespace forwarded to every ``upsert`` call.
        batch_size: Maximum number of records per call; must be positive.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    # A negative step makes range() empty, which would silently upload
    # nothing; reject invalid sizes up front instead.
    if batch_size <= 0:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )
    for start in range(0, len(vectors), batch_size):
        index.upsert(
            vectors=vectors[start:start + batch_size],
            namespace=namespace,
        )

In [None]:
# Upload all prepared vectors to the "rag-docs" namespace, 500 records per request.
batch_upsert(
    index=index,
    vectors=vectors,
    namespace="rag-docs",
    batch_size=500   # safe value
)

In [None]:
# Similarity search (NOTE(review): the metric is set on the Pinecone index; presumably cosine)
def retrieve_query(query, k=3, namespace="rag-docs"):
    """Embed ``query`` and return the closest stored chunks from Pinecone.

    Args:
        query: Text to search for.
        k: Number of matches to request (``top_k``).
        namespace: Pinecone namespace to search; defaults to the namespace
            used at ingestion time.

    Returns:
        The response of ``index.query`` with metadata included.
    """
    # Loading the transformer is expensive, so keep it on the function
    # object and reuse it for every later query.
    model = getattr(retrieve_query, "_model", None)
    if model is None:
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
        retrieve_query._model = model

    query_embedding = model.encode(query).tolist()
    return index.query(
        vector=query_embedding,
        top_k=k,
        namespace=namespace,
        include_metadata=True,
    )

In [None]:
# Run a sample question through the retriever and print the returned matches.
answer = retrieve_query("What is the role of User class?")
print(answer.matches)