### Inserting data into a vector DB (Pinecone) through a data ingestion pipeline

In [1]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone.grpc import PineconeGRPC as Pinecone
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reading raw pdf
def read_doc(directory):
    """Load the PDFs found under ``directory`` via ``PyPDFDirectoryLoader``.

    Args:
        directory: Path handed unchanged to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever the loader's ``load()`` call returns (the loaded documents).
    """
    return PyPDFDirectoryLoader(directory).load()

In [3]:
# Document splitting and chunking
def chunk_data(document, chunk_size=700, chunk_overlap=100):
    """Split loaded documents into overlapping chunks.

    Args:
        document: The loaded documents, passed to ``split_documents``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` (default 700).
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` (default 100).

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(document)


In [4]:
# Embedding using sentence-transformers/all-MiniLM-L12-v2 = 384D dense vector
def embedding_chunks(chunk):
    """Encode the text of every chunk with the all-MiniLM-L12-v2 model.

    Args:
        chunk: Iterable of chunk objects, each exposing ``page_content``.

    Returns:
        The output of ``SentenceTransformer.encode`` for the list of chunk texts,
        in the same order as ``chunk``.
    """
    # The model is instantiated before the texts are gathered, as in the original flow.
    encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
    page_texts = [piece.page_content for piece in chunk]
    return encoder.encode(page_texts)

In [5]:
# Pinecone config: credentials are read from the environment so no secret lives in the notebook.
import os
pinecone_api_key = os.getenv("PINECONE_API_KEY")  # None when the variable is unset
# NOTE(review): pinecone_env is not used by any of the visible cells — confirm it is still needed.
pinecone_env = os.getenv("PINECONE_ENVIRONMENT")
pc = Pinecone(api_key=pinecone_api_key)
# NOTE(review): index_name is not referenced by the visible cells, which connect via PINECONE_HOST instead.
index_name = "ilibrary-backend-documentation-rag"

In [6]:
# Step 1: load the raw PDFs from ../Data/raw and report how many document objects came back.
documents = read_doc("../Data/raw")
print(len(documents))

77


In [7]:
# Step 2: split the loaded documents with chunk_data's defaults (chunk_size=700, chunk_overlap=100).
chunks = chunk_data(documents)
print(len(chunks))

284


In [8]:
# Step 3: embed every chunk; the printed count should equal the number of chunks above.
embeddings = embedding_chunks(chunks)
print(len(embeddings))

284


In [9]:
# Connect to the target index by host and build one upsert record per chunk.
host = os.getenv("PINECONE_HOST")
if not host:
    # Fail here with a clear message instead of passing None on to the Pinecone client.
    raise RuntimeError("PINECONE_HOST environment variable is not set")
index = pc.Index(host=host)

vectors = []
# chunks and embeddings are parallel sequences (embeddings were computed from chunks),
# so they are paired directly rather than looked up by position.
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
    metadata = {
        "text": chunk.page_content,
        "source": chunk.metadata.get("source"),
        "page": chunk.metadata.get("page"),
    }
    vectors.append({
        # NOTE(review): ids are positional, so re-ingesting changed documents reuses/overwrites ids.
        "id": f"chunk-{i}",
        "values": embedding.tolist(),
        # Pinecone does not accept null metadata values; omit keys the loader did not populate.
        "metadata": {key: value for key, value in metadata.items() if value is not None},
    })

In [10]:
def batch_upsert(index, vectors, namespace, batch_size=500):
    """Upsert ``vectors`` into ``index`` in consecutive batches.

    Args:
        index: Object exposing ``upsert(vectors=..., namespace=...)``.
        vectors: Sequence of records; must support ``len()`` and slicing.
        namespace: Namespace passed to every ``upsert`` call.
        batch_size: Maximum number of records per ``upsert`` call; must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is less than 1. A negative value would
            otherwise produce an empty range and silently upload nothing.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    for start in range(0, len(vectors), batch_size):
        # The final slice may be shorter than batch_size; slicing handles that.
        index.upsert(vectors=vectors[start:start + batch_size], namespace=namespace)

In [11]:
# Upload all prepared vectors into the "rag-docs" namespace — the same namespace retrieve_query reads from.
batch_upsert(
    index=index,
    vectors=vectors,
    namespace="rag-docs",
    batch_size=500   # safe value
)

In [12]:
# Similarity search (the metric is whatever the index was created with — cosine per the original note; confirm)
_QUERY_MODEL_NAME = 'sentence-transformers/all-MiniLM-L12-v2'
_query_model_cache = {}


def _get_query_model():
    """Return the query encoder, loading it on first use and reusing it afterwards."""
    if _QUERY_MODEL_NAME not in _query_model_cache:
        _query_model_cache[_QUERY_MODEL_NAME] = SentenceTransformer(_QUERY_MODEL_NAME)
    return _query_model_cache[_QUERY_MODEL_NAME]


def retrieve_query(query, k=3, namespace="rag-docs"):
    """Embed ``query`` and return the ``k`` closest matches from the index.

    The query is encoded with the same model name used by ``embedding_chunks``,
    so query and stored vectors share one embedding space.

    Args:
        query: Text to search for.
        k: Number of matches to request (``top_k``).
        namespace: Pinecone namespace to search; defaults to "rag-docs".

    Returns:
        The response of ``index.query`` with metadata included; matches are
        available on its ``matches`` attribute.
    """
    # Loading the model is expensive, so it is cached instead of rebuilt per query.
    query_embedding = _get_query_model().encode(query).tolist()
    return index.query(
        vector=query_embedding,
        top_k=k,
        namespace=namespace,
        include_metadata=True,
    )

In [16]:
# Smoke-test retrieval: fetch the default top-3 matches for a sample question and print them.
answer = retrieve_query("What is the role of User class?")
print(answer.matches)

[{'id': 'chunk-233',
 'metadata': {'page': 41.0,
              'source': '..\\Data\\raw\\Internal Knowledge Base - iLibrary '
                        'Backend.pdf',
              'text': '-   User PII: Personally Identifiable Information in '
                      'the `users` table (name, email). \n'
                      '    -   System Integrity: The correctness of the seat '
                      'and booking status. \n'
                      '    -   System Availability: The ability of legitimate '
                      'users to access the service. \n'
                      '-   Goals of an Attacker: \n'
                      '    -   Gain free access to the library (book a seat '
                      'without a valid subscription). \n'
                      "    -   View or steal other users' personal "
                      'information. \n'
                      '    -   Prevent other users from booking seats (Denial '
                      'of Service). \n'
                 