### MongoDB as a vector database

In [22]:
from pymongo import MongoClient
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import os
from dotenv import load_dotenv
load_dotenv()


True

In [23]:
# Load the raw PDF documents
def read_doc(directory):
    """Load the documents in ``directory`` with ``PyPDFDirectoryLoader``.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        The result of the loader's ``load()`` call. In this notebook that is
        a list of ``Document`` objects carrying ``page_content`` and page
        ``metadata``.
    """
    return PyPDFDirectoryLoader(directory).load()

In [24]:
# Split loaded documents into overlapping chunks
def chunk_data(documents, chunk_size=700, chunk_overlap=100):
    """Split ``documents`` into smaller chunks.

    Args:
        documents: Documents to split (as returned by ``read_doc``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The chunks produced by the splitter's ``split_documents`` call.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

In [25]:
# Embed chunks; the default model, sentence-transformers/all-MiniLM-L12-v2, yields 384-dimensional dense vectors
def embedding_chunks(chunks, model_name="sentence-transformers/all-MiniLM-L12-v2"):
    """Encode the text of every chunk into a normalized embedding vector.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` string
            (the chunks returned by ``chunk_data``).
        model_name: SentenceTransformer model identifier. It defaults to the
            model this notebook was built with. Queries must be embedded with
            the same model, and a different model may produce vectors of a
            different dimensionality than the stored ones.

    Returns:
        The value returned by ``SentenceTransformer.encode`` with
        ``convert_to_numpy=True`` and ``normalize_embeddings=True``. In this
        notebook that is a ``numpy.ndarray`` with one row per chunk.
    """
    model = SentenceTransformer(model_name)

    # Preserve chunk order so row i of the result corresponds to chunks[i].
    texts = [chunk.page_content for chunk in chunks]

    return model.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )


In [26]:
# Read the connection string from the environment (.env is loaded by load_dotenv()).
mongo_uri = os.getenv("MONGO_URI")
if not mongo_uri:
    # MongoClient(None) would quietly fall back to its default host instead of failing.
    raise RuntimeError("MONGO_URI is not set; add it to the environment or the .env file.")

client = MongoClient(mongo_uri)
# MongoClient connects lazily, so force a cheap round-trip before reporting success.
client.admin.command("ping")

db = client["vector_db"]
collection = db["documents"]
print("Connection Successfull.")

Connection Successfull.


In [27]:
# Load the raw documents; the path is relative to the notebook's working directory.
documents = read_doc("../Data/raw")
# Bare expression: echoes the whole list as the cell output.
documents

[Document(metadata={'producer': 'WeasyPrint 65.1', 'creator': 'ChatGPT', 'creationdate': '', 'title': 'iLibrary-Backend Internal Documentation', 'author': 'ChatGPT Deep Research', 'source': '..\\Data\\raw\\iLibrary-Backend Internal Documentation.pdf', 'total_pages': 19, 'page': 0, 'page_label': '1'}, page_content='iLibrary-Backend Internal Documentation\n1. Executive Overview\nThe iLibrary Backend is a Spring Boot monolithic application designed as a comprehensive RESTful API for\nmanaging a private study-library system. It addresses typical library needs: user account management, role-\nbased  access,  subscription  plans,  real-time  seat  booking,  and  secure  entry  via  QR  codes.  For\nexample, a student can sign up, purchase a weekly or monthly pass, reserve a study seat for a set duration,\nreceive a QR code by email, and then scan it for library access. All core logic – authentication, payment\nprocessing,  and  seat  management  –  is  implemented  within  a  single  deploya

In [28]:
# Split the loaded documents using chunk_data's default chunk_size/chunk_overlap.
chunks = chunk_data(documents)
# Report how many chunks were produced.
print(len(chunks))

284


In [29]:
# Embed every chunk.
embeddings = embedding_chunks(chunks)
# Sanity check: the count printed here should equal the number of chunks.
print(len(embeddings))

284


In [30]:
# Length of the first embedding vector, i.e. the embedding dimensionality.
print(len(embeddings[0]))

384


In [31]:
# Inspect container and element types before serializing the vectors for MongoDB.
print(type(embeddings))              # container type (numpy.ndarray)
print(embeddings.shape)              # (num_chunks, embedding_dim)
print(type(embeddings[0]))           # type of one embedding row (numpy.ndarray)
print(type(embeddings[0][0]))        # scalar element type (numpy.float32)

<class 'numpy.ndarray'>
(284, 384)
<class 'numpy.ndarray'>
<class 'numpy.float32'>


In [32]:
# zip() silently truncates to the shorter input, so a count mismatch would
# drop chunks without any error; fail loudly instead.
if len(chunks) != len(embeddings):
    raise ValueError(
        f"chunks ({len(chunks)}) and embeddings ({len(embeddings)}) differ in length"
    )

# Build one MongoDB document per chunk: raw text, embedding vector, source metadata.
docs = []
for chunk, embed in zip(chunks, embeddings):
    docs.append({
        "text": chunk.page_content,
        # Convert the numpy float32 row into a plain list of Python floats for storage.
        "embedding": embed.astype(float).tolist(),
        "metadata": chunk.metadata
    })

# Check before the destructive step: with nothing to insert, the collection
# would otherwise be emptied and then left empty when the insert fails.
if not docs:
    raise ValueError("No chunks to insert; refusing to clear the collection.")

# Full refresh: drop all existing documents, then load the new set.
collection.delete_many({})
collection.insert_many(docs)


InsertManyResult([ObjectId('697a45d33ad7216b666ee212'), ObjectId('697a45d33ad7216b666ee213'), ObjectId('697a45d33ad7216b666ee214'), ObjectId('697a45d33ad7216b666ee215'), ObjectId('697a45d33ad7216b666ee216'), ObjectId('697a45d33ad7216b666ee217'), ObjectId('697a45d33ad7216b666ee218'), ObjectId('697a45d33ad7216b666ee219'), ObjectId('697a45d33ad7216b666ee21a'), ObjectId('697a45d33ad7216b666ee21b'), ObjectId('697a45d33ad7216b666ee21c'), ObjectId('697a45d33ad7216b666ee21d'), ObjectId('697a45d33ad7216b666ee21e'), ObjectId('697a45d33ad7216b666ee21f'), ObjectId('697a45d33ad7216b666ee220'), ObjectId('697a45d33ad7216b666ee221'), ObjectId('697a45d33ad7216b666ee222'), ObjectId('697a45d33ad7216b666ee223'), ObjectId('697a45d33ad7216b666ee224'), ObjectId('697a45d33ad7216b666ee225'), ObjectId('697a45d33ad7216b666ee226'), ObjectId('697a45d33ad7216b666ee227'), ObjectId('697a45d33ad7216b666ee228'), ObjectId('697a45d33ad7216b666ee229'), ObjectId('697a45d33ad7216b666ee22a'), ObjectId('697a45d33ad7216b666ee2

### Sample retrieval

In [42]:
def embed_query(query, model_name="sentence-transformers/all-MiniLM-L12-v2"):
    """Embed a single query string into a normalized vector of Python floats.

    The SentenceTransformer model is loaded once per ``model_name`` and cached
    on the function object, so repeated queries do not reload it.

    Args:
        query: The query text to embed.
        model_name: SentenceTransformer model identifier. It must match the
            model used to embed the stored chunks; the default is the model
            used elsewhere in this notebook.

    Returns:
        list[float]: The embedding as built-in ``float`` values, suitable for
        use as ``queryVector`` in a MongoDB aggregation pipeline.
    """
    # Per-function cache; re-running the defining cell resets it.
    models = embed_query.__dict__.setdefault("_models", {})
    model = models.get(model_name)
    if model is None:
        model = models[model_name] = SentenceTransformer(model_name)

    vec = model.encode(query, normalize_embeddings=True)

    # Hard cast: convert every element to a built-in float.
    return [float(x) for x in vec]


In [43]:
def vector_search(query, top_k=5, num_candidates=100):
    """Run a ``$vectorSearch`` aggregation for ``query`` on ``collection``.

    Args:
        query: Query text; it is embedded with ``embed_query``.
        top_k: Number of documents to return (the stage's ``limit``).
        num_candidates: Minimum number of candidates the search considers.
            The value actually sent is raised to ``top_k`` when necessary.

    Returns:
        list: The documents yielded by ``collection.aggregate``.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    query_vector = embed_query(query)

    # $vectorSearch requires limit <= numCandidates; widen the candidate pool
    # instead of letting a large top_k fail on the server.
    candidates = max(num_candidates, top_k)

    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "embedding",
                "queryVector": query_vector,
                "numCandidates": candidates,
                "limit": top_k
            }
        }
    ]

    return list(collection.aggregate(pipeline))



In [46]:
# Example query against the stored document chunks.
query = "How does authentication work in Spring Boot?"

# Retrieve the top five matches and print each one's text, separated by a dashed rule.
results = vector_search(query, top_k=5)
for r in results:
    print(f"Text: {r['text']}")
    print("-" * 50)

Text: 1.  Login Request: The user sends their email and password to `POST /auth/login`. 
2.  Authentication: Spring Security's `AuthenticationManager` uses the custom 
`UserInfoService` to load the user's details (including the hashed password) from the 
database. It compares the provided password with the stored hash. 
3.  Token Generation: If authentication is successful, the `AuthController` calls 
`jwtService.generateToken(username)`. 
4.  Inside `JwtService.generateToken`: 
    a.  It defines the claims for the JWT. A claim is a piece of information asserted about the 
subject. 
        -   `iss` (Issuer): The service that issued the token (e.g., "iLibrary-Backend").
--------------------------------------------------
Text: User Login (/public/login):
Controller receives credentials (username, password). 
It uses Spring Security’s AuthenticationManager to authenticate. The manager loads the user
from DB and compares the BCrypt hash. 
Upon successful authentication, the controller g

In [47]:
# Second example query; note that this rebinds the `query` and `results` globals.
query = "How many entities classes are there in the project?"

# Retrieve the top five matches and print each one's text, separated by a dashed rule.
results = vector_search(query, top_k=5)
for r in results:
    print(f"Text: {r['text']}")
    print("-" * 50)

Text: subscription), AdminController ( /admin), PaymentController ( /payment), and 
WebhookController ( /webhook). These classes orchestrate request handling and use Service
classes.
Models/Entities ( com.ilibrary.model): JPA entity classes representing the database tables (e.g. 
User, Seat, Booking, Subscription, Payment, etc.). These mirror the schema in ER-
Diagram.png
15
• 
• 
• 
1 16
• 
• 
• 
4
--------------------------------------------------
Text: . They often have annotations like @Entity and define relationships (@ManyToOne, 
@OneToMany).
DTOs ( com.ilibrary.dto): Data Transfer Objects used for request and response payloads (e.g. 
SignupRequest, LoginRequest, SeatBookingRequest, etc.). These are distinct from Entities
and often have validation annotations.
Repositories ( com.ilibrary.repository): Interfaces extending JpaRepository for each
Entity. For example, UserRepository, BookingRepository, etc. These provide CRUD database
access. Removing any repository would break datab