Load Pdf and split into chunks

In [1]:
!pip install langchain_community pymupdf
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, pipeline
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
import numpy as np

Defaulting to user installation because normal site-packages is not writeable


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pdf_path = "APJAbdulKalam.pdf" 
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()

In [3]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
split_docs = splitter.split_documents(docs)
documents = [doc.page_content for doc in split_docs]
metadatas = [{"source": doc.metadata.get("page", "unknown")} for doc in split_docs]


Embedding Model - Jina AI

In [4]:
embedding_model_name = "jinaai/jina-embeddings-v2-base-code"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name, trust_remote_code=True)
embedding_model = AutoModel.from_pretrained(embedding_model_name, trust_remote_code=True)

def get_embedding(texts):
    inputs = embedding_tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = embedding_model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.cpu().numpy()



In [5]:
embeddings = get_embedding(documents)
embeddings

array([[ 0.3655887 ,  0.9712121 , -0.7063259 , ..., -0.29313967,
        -0.30289736,  0.19185378],
       [ 0.64828473,  0.24969733, -0.34193242, ...,  0.25982213,
        -0.14664198,  0.14931633],
       [ 1.4192857 ,  0.05153514, -0.3239099 , ...,  0.229352  ,
        -0.4485154 ,  0.1909687 ],
       ...,
       [ 0.04847077,  0.60588354, -0.9268269 , ..., -0.5859429 ,
        -0.10997654,  0.49540496],
       [ 0.4140241 ,  1.2307209 , -0.41796866, ..., -0.4343838 ,
        -0.22568645,  0.9180646 ],
       [ 0.77146244,  1.0265812 , -0.30603454, ..., -0.31390408,
         0.02288594,  1.0789237 ]], dtype=float32)

Qdrant Database

In [6]:
!pip install qdrant-client

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


sudo docker pull qdrant/qdrant

sudo docker run -p 6333:6333 -p 6334:6334 \
    -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
    qdrant/qdrant

In [7]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

# Connect to local Qdrant
client = QdrantClient(host="localhost", port=6333)

client.recreate_collection(
    collection_name="kalam_collection",
    vectors_config=VectorParams(
        size=768,                    # Use correct vector size here
        distance=Distance.COSINE     # Or Distance.DOT / Distance.EUCLID
    )
)

  client.recreate_collection(


True

In [8]:
ids = list(range(len(embeddings)))
vectors = [vec.tolist() for vec in embeddings]
payloads = [{"text": doc} for doc in documents]

client.upsert(
    collection_name="kalam_collection",
    wait=True,
    points=[
        {
            "id": id_,
            "vector": vector,
            "payload": payload
        }
        for id_, vector, payload in zip(ids, vectors, payloads)
    ]
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

Query

In [24]:
query_text = "Why abdul kalam is famous?"
query_embedding = get_embedding([query_text])[0].tolist()  # your embedding function

Qdrant Results

In [25]:
qdrant_search_results = client.search(
    collection_name="kalam_collection",
    query_vector=query_embedding,
    limit=3,            # top 3 results
    with_payload=True   # to get stored text with results
)
qdrant_relevant_docs = [hit.payload['text'] for hit in qdrant_search_results]

for hit in qdrant_search_results:
    print(f"Score: {hit.score:.4f}")
    print(f"Text: {hit.payload['text']}")
    print("-----")


Score: 0.5864
Text: initiated the use of carbon-carbon and carbon-polymer 
materials for production of floor reaction orthosis calipers 
which has reduced the weight of the caliper to 1/10th of the 
original weight during 1995 – 1996. Over 50,000 children have 
been fitted with these calipers.  
Dr. Kalam took up academic pursuit as Professor, 
Technology & Societal Transformation at Anna University, 
Chennai from November 2001 and was involved in teaching
-----
Score: 0.5730
Text: Vision 2020.  He has addressed several children science 
congresses across the country.  
Dr. Kalam is passionate about bringing rural prosperity 
through PURA (Providing Urban Amenities to Rural Areas), in 
which science and technology has to play a key role. Based on 
his diverse experience he has been propagating the concept of 
World 
Knowledge 
Platform 
through 
which 
the 
core
-----
Score: 0.5218
Text: responsible for evolving policies, strategies and missions for 
many development applications. Dr. 

  qdrant_search_results = client.search(


LLM

In [26]:
llm_model_name = "google/flan-t5-base"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)

llm_pipe = pipeline(
    "text2text-generation",
    model=llm_model,
    tokenizer=llm_tokenizer,
    max_length=512,
    device=0 if torch.cuda.is_available() else -1
)


Device set to use cpu


In [27]:
qdrant_context = "\n".join(qdrant_relevant_docs)

print(f"\n🧠 Question: {query_text}")

final_input = f"""You are an expert assistant.
Answer the following question strictly only based on the provided context."

Context:
{qdrant_context}

Question: {query_text}
Answer:"""

response = llm_pipe(final_input)[0]["generated_text"]
print("\n✅Qdrant Answer:\n", response)



🧠 Question: Why abdul kalam is famous?

✅Qdrant Answer:
 Dr. Kalam took up academic pursuit as Professor, Technology & Societal Transformation at Anna University, Chennai from November 2001 and was involved in teaching Vision 2020. He has addressed several children science congresses across the country. Dr. Kalam is passionate about bringing rural prosperity through PURA (Providing Urban Amenities to Rural Areas), in which science and technology has to play a key role. Based on his diverse experience he has been propagating the concept of World Knowledge Platform through which the core responsible for evolving policies, strategies and missions for many development applications. Dr. Kalam was also the Chairman, Ex-officio, of the Scientific Advisory Committee to the Cabinet (SAC-C) and piloted India Millennium Mission 2020. He has addressed several children science congresses across the country. Dr. Kalam took up academic pursuit as Professor, Technology & Societal Transformation at An

Milvus

In [28]:
!pip uninstall grpcio grpcio-status pymilvus -y
!pip install grpcio==1.67.1 grpcio-status==1.67.1
!pip install pymilvus==2.5.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: grpcio 1.67.1
Uninstalling grpcio-1.67.1:
  Successfully uninstalled grpcio-1.67.1
Found existing installation: grpcio-status 1.67.1
Uninstalling grpcio-status-1.67.1:
  Successfully uninstalled grpcio-status-1.67.1
Found existing installation: pymilvus 2.5.0
Uninstalling pymilvus-2.5.0:
  Successfully uninstalled pymilvus-2.5.0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting grpcio==1.67.1
  Using cached grpcio-1.67.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting grpcio-status==1.67.1
  Using cached grpcio_status-1.67.1-py3-none-any.whl.metadata (1.1 kB)
Using cached grpcio-1.67.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
Using cached grpcio_status-1.67.1-py3-none-any.whl (14 kB)
Installing collected packages: grpcio, grpcio-status
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [grpcio-status]
[1A[2K[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chromadb 1.0.10 requires mmh3>=4.0.1, but you have mmh3 3.0.0 which is incompatible.
grpcio-health-checking 1.71.0 requires grpcio>=1.71.0, but you have grpcio 1.67.1 which is incompatible.
gr

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting pymilvus==2.5.0
  Using cached pymilvus-2.5.0-py3-none-any.whl.metadata (5.7 kB)
Using cached pymilvus-2.5.0-py3-none-any.whl (212 kB)
Installing collected packages: pymilvus
Successfully installed pymilvus-2.5.0


wget https://github.com/milvus-io/milvus/releases/download/v2.5.13/milvus-standalone-docker-compose.yml -O docker-compose.yml

sudo docker compose up -d

sudo docker compose down


In [42]:
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility

connections.connect("default", host="localhost", port="19530")

In [44]:
collection_name = "pdf_rag"

if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)

fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=1000),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768)
]

schema = CollectionSchema(fields)
collection = Collection(name=collection_name, schema=schema)

collection.create_index(field_name="embedding", index_params={
    "metric_type": "COSINE", "index_type": "FLAT", "params": {}
})

collection.load()

In [45]:
import time
def safe_insert_data(collection, documents, embeddings, batch_size=20):
    """Insert data in batches with proper error handling"""
    
    total_docs = len(documents)
    print(f"Starting insertion of {total_docs} documents in batches of {batch_size}")
    
    for i in range(0, total_docs, batch_size):
        try:
            # Get batch data
            end_idx = min(i + batch_size, total_docs)
            batch_docs = documents[i:end_idx]
            batch_embs = embeddings[i:end_idx].tolist()
            
            # CRITICAL: Check document length and truncate if necessary
            processed_docs = []
            for doc in batch_docs:
                if len(doc) > 2000:  # Max length we set in schema
                    processed_docs.append(doc[:1990] + "...")  # Truncate with ellipsis
                else:
                    processed_docs.append(doc)
            
            # Insert batch - CORRECT FORMAT: [field1_data, field2_data]
            # Note: Don't include 'id' field data since auto_id=True
            insert_result = collection.insert([processed_docs, batch_embs])
            
            # Flush to ensure data is written
            collection.flush()
            
            print(f"✅ Inserted batch {i+1}-{end_idx}: {len(insert_result.primary_keys)} records")
            
            # Small delay to prevent overwhelming the system
            time.sleep(0.5)
            
        except Exception as e:
            print(f"❌ Failed to insert batch {i+1}-{end_idx}: {e}")
            print(f"Error type: {type(e).__name__}")
            
            # Try to diagnose the issue
            if "string length" in str(e).lower():
                print("💡 Issue: Document too long. Increase max_length in schema or truncate documents.")
            elif "dimension" in str(e).lower():
                print("💡 Issue: Embedding dimension mismatch. Check your embedding model output.")
            elif "connection" in str(e).lower():
                print("💡 Issue: Connection problem. Check if Milvus is running.")
            
            return False
    collection.flush()
    print(f"✅ Total records in collection: {collection.num_entities}")
    return True

In [46]:
# Perform the insertion
success = safe_insert_data(collection, documents, embeddings, batch_size=20)

if success:
    print("🎉 All data inserted successfully!")

Starting insertion of 18 documents in batches of 20
✅ Inserted batch 1-18: 18 records
✅ Total records in collection: 18
🎉 All data inserted successfully!


Milvus Results

In [47]:
search_params = {"metric_type": "COSINE","params": {}}

results = collection.search(data=[query_embedding],anns_field="embedding",param=search_params,limit=3,output_fields=["content"])

print(f"\n🔍 Search results for: '{query_text}'")

milvus_relavant_docs=[]
for i, hit in enumerate(results[0]):
    print(f"Result {i+1}:")
    print(f"Score: {hit.score:.4f}")
    print(f"Content: {hit.entity.get('content')[:]}")
    milvus_relavant_docs.append(hit.entity.get('content')[:])
    print()


🔍 Search results for: 'Why abdul kalam is famous?'
Result 1:
Score: 0.5864
Content: initiated the use of carbon-carbon and carbon-polymer 
materials for production of floor reaction orthosis calipers 
which has reduced the weight of the caliper to 1/10th of the 
original weight during 1995 – 1996. Over 50,000 children have 
been fitted with these calipers.  
Dr. Kalam took up academic pursuit as Professor, 
Technology & Societal Transformation at Anna University, 
Chennai from November 2001 and was involved in teaching

Result 2:
Score: 0.5730
Content: Vision 2020.  He has addressed several children science 
congresses across the country.  
Dr. Kalam is passionate about bringing rural prosperity 
through PURA (Providing Urban Amenities to Rural Areas), in 
which science and technology has to play a key role. Based on 
his diverse experience he has been propagating the concept of 
World 
Knowledge 
Platform 
through 
which 
the 
core

Result 3:
Score: 0.5218
Content: responsible for ev

In [34]:
milvus_context = "\n".join(milvus_relavant_docs)

print(f"\n🧠 Question: {query_text}")

final_input = f"""You are an expert assistant.
Answer the following question strictly only based on the provided context."

Context:
{milvus_context}

Question: {query_text}
Answer:"""

milvus_response = llm_pipe(final_input)[0]["generated_text"]
print("\n✅Milvus Answer:\n", milvus_response)



🧠 Question: Why abdul kalam is famous?

✅Milvus Answer:
 Dr. Kalam took up academic pursuit as Professor, Technology & Societal Transformation at Anna University, Chennai from November 2001 and was involved in teaching Vision 2020. He has addressed several children science congresses across the country. Dr. Kalam is passionate about bringing rural prosperity through PURA (Providing Urban Amenities to Rural Areas), in which science and technology has to play a key role. Based on his diverse experience he has been propagating the concept of World Knowledge Platform through which the core responsible for evolving policies, strategies and missions for many development applications. Dr. Kalam was also the Chairman, Ex-officio, of the Scientific Advisory Committee to the Cabinet (SAC-C) and piloted India Millennium Mission 2020. He has addressed several children science congresses across the country. Dr. Kalam took up academic pursuit as Professor, Technology & Societal Transformation at An

In [35]:
import psycopg
from pgvector.psycopg import register_vector

# Connect to DB and register pgvector
conn = psycopg.connect(
    host='localhost',
    port='5432',
    dbname='pgvector',
    user='postgres',
    password='R.Karthik@04',
    autocommit=True
)

conn.execute('CREATE EXTENSION IF NOT EXISTS vector')
register_vector(conn)

# Drop and create table
conn.execute('DROP TABLE IF EXISTS rag_chunks')
conn.execute('CREATE TABLE rag_chunks (id bigserial PRIMARY KEY, content text, embedding vector(768), metadata text)')

# Store data
cur = conn.cursor()
with cur.copy('COPY rag_chunks (content, embedding, metadata) FROM STDIN WITH (FORMAT BINARY)') as copy:
    copy.set_types(['text', 'vector', 'text'])
    for content, embedding, meta in zip(documents, embeddings, metadatas):
        copy.write_row([content, embedding, str(meta)])

In [36]:
def retrieve_context(query: str, top_k=3):
    query_embedding1 = get_embedding([query])[0]  # shape (768,)

    # Fetch content along with similarity score
    result = conn.execute(
        '''
        SELECT content, embedding <=> %s AS score
        FROM rag_chunks
        ORDER BY score
        LIMIT %s
        ''',
        (np.array(query_embedding1), top_k)
    ).fetchall()

    print(f"\nTop {top_k} chunks for query: '{query}'\n" + "-"*60)
    for i, (content, score) in enumerate(result, start=1):
        print(f"\nChunk {i} (Score: {score:.4f}):\n{content}\n" + "-"*60)

    context = '\n\n'.join([row[0] for row in result])
    return context


In [37]:
pgvector_context=retrieve_context(query_text)


Top 3 chunks for query: 'Why abdul kalam is famous?'
------------------------------------------------------------

Chunk 1 (Score: 0.4136):
initiated the use of carbon-carbon and carbon-polymer 
materials for production of floor reaction orthosis calipers 
which has reduced the weight of the caliper to 1/10th of the 
original weight during 1995 – 1996. Over 50,000 children have 
been fitted with these calipers.  
Dr. Kalam took up academic pursuit as Professor, 
Technology & Societal Transformation at Anna University, 
Chennai from November 2001 and was involved in teaching
------------------------------------------------------------

Chunk 2 (Score: 0.4270):
Vision 2020.  He has addressed several children science 
congresses across the country.  
Dr. Kalam is passionate about bringing rural prosperity 
through PURA (Providing Urban Amenities to Rural Areas), in 
which science and technology has to play a key role. Based on 
his diverse experience he has been propagating the concept o

In [38]:
print(f"\n🧠 Question: {query_text}")

final_input = f"""You are an expert assistant.
Answer the following question strictly only based on the provided context."

Context:
{pgvector_context}

Question: {query_text}
Answer:"""

pgvector_response = llm_pipe(final_input)[0]["generated_text"]
print("\n✅Pgvector Answer:\n", milvus_response)



🧠 Question: Why abdul kalam is famous?

✅Pgvector Answer:
 Dr. Kalam took up academic pursuit as Professor, Technology & Societal Transformation at Anna University, Chennai from November 2001 and was involved in teaching Vision 2020. He has addressed several children science congresses across the country. Dr. Kalam is passionate about bringing rural prosperity through PURA (Providing Urban Amenities to Rural Areas), in which science and technology has to play a key role. Based on his diverse experience he has been propagating the concept of World Knowledge Platform through which the core responsible for evolving policies, strategies and missions for many development applications. Dr. Kalam was also the Chairman, Ex-officio, of the Scientific Advisory Committee to the Cabinet (SAC-C) and piloted India Millennium Mission 2020. He has addressed several children science congresses across the country. Dr. Kalam took up academic pursuit as Professor, Technology & Societal Transformation at 