In [1]:
from langchain.document_loaders import PyPDFLoader

# Load the PDF through LangChain's PyPDFLoader. `document` holds whatever
# loader.load() returns; the recorded output shows a list of Document objects
# carrying 'source' and 'page' metadata.
# NOTE(review): the path is absolute and machine-specific — consider a
# configurable data directory so the notebook runs on other machines.
loader = PyPDFLoader("C:/Users/golla/Downloads/sciencerag.pdf")

document = loader.load()

# Bare expression so the notebook displays the loaded documents.
document

[Document(metadata={'source': 'C:/Users/golla/Downloads/sciencerag.pdf', 'page': 0}, page_content='Photosynthesis in Plants\nPhotosynthesis is a biological process that occurs in plants, algae, and some bacteria, where light\nenergy is \nconverted into chemical energy in the form of glucose (a sugar) and oxygen. This process is vital for\nthe survival \nof life on Earth, as it provides the base for the food chain and releases oxygen into the atmosphere.\n### The Process of Photosynthesis\nPhotosynthesis takes place primarily in the leaves of plants, within specialized organelles called\nchloroplasts.\nThe process can be broken down into two main stages: the light-dependent reactions and the Calvin\ncycle \n(light-independent reactions).\n1. **Light-dependent reactions (Occurs in thylakoid membranes of chloroplasts)**\nThese reactions occur when light is absorbed by chlorophyll pigments in the thylakoid membranes.\nWater molecules are \nsplit into oxygen, protons (H+), and electrons thr

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to cut the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200; presumably measured in characters — confirm).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)

In [3]:
# Split the loaded documents into chunks with the splitter configured earlier.
chunks = text_splitter.split_documents(document)

# Print every chunk followed by a separator line for visual inspection.
for chunk in chunks:
    print(chunk)
    print("----------------")

page_content='Photosynthesis in Plants
Photosynthesis is a biological process that occurs in plants, algae, and some bacteria, where light
energy is 
converted into chemical energy in the form of glucose (a sugar) and oxygen. This process is vital for
the survival 
of life on Earth, as it provides the base for the food chain and releases oxygen into the atmosphere.
### The Process of Photosynthesis
Photosynthesis takes place primarily in the leaves of plants, within specialized organelles called
chloroplasts.
The process can be broken down into two main stages: the light-dependent reactions and the Calvin
cycle 
(light-independent reactions).
1. **Light-dependent reactions (Occurs in thylakoid membranes of chloroplasts)**
These reactions occur when light is absorbed by chlorophyll pigments in the thylakoid membranes.
Water molecules are 
split into oxygen, protons (H+), and electrons through photolysis. The electrons move through the
electron' metadata={'source': 'C:/Users/golla/Downlo

In [4]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both the stored chunks and the search queries.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally — confirm the source is trusted.
embedding = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5',trust_remote_code=True)

  from tqdm.autonotebook import tqdm, trange





In [5]:
# Encode a sample sentence to inspect what the model produces.
em = embedding.encode('My name is hemanth')

In [6]:
# Display the sample embedding (recorded output: a float32 array).
em

array([-0.09014966, -0.32934108,  0.1366087 , ..., -0.21335939,
       -0.52650166, -0.62721777], dtype=float32)

In [7]:
# Embedding length; the recorded value (1024) matches dim=1024 of the Milvus vector field.
len(em)

1024

In [1]:
from pymilvus import connections, Collection, db, utility, DataType,CollectionSchema,FieldSchema

# Connect to the local Milvus server. Every later cell needs this connection,
# so a failure is reported and then re-raised instead of being swallowed;
# otherwise a full run would continue and fail later with a less obvious error.
try:
    connections.connect(host='127.0.0.1', port='19530')
except Exception as e:
    print("Failed to connect:", e)
    raise
else:
    print("Connected to Milvus successfully!")


ModuleNotFoundError: No module named 'pymilvus'

In [59]:
# List the databases visible through the current Milvus connection.
db.list_database()

['default', 'firstdb']

In [60]:
# Select 'firstdb' for the following collection operations
# (presumably switches the active database of the connection — confirm).
db.using_database('firstdb')

In [61]:
# List existing collections; the recorded output is empty at this point.
utility.list_collections()

[]

In [62]:
# Field definitions for the collection that stores the chunk embeddings.
# Primary key; auto_id=True, so no id column is passed when inserting.
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
# Source path taken from the chunk metadata (VARCHAR, max_length=255).
source_field = FieldSchema(name="source", dtype=DataType.VARCHAR,max_length=255, is_primary=False)
# Page number taken from the chunk metadata.
page_field = FieldSchema(name="page", dtype=DataType.INT64)
# Embedding vector; dim=1024 matches the length of the model output checked earlier.
embedding_field = FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR,dim=1024)
# Chunk text (VARCHAR, max_length=2048).
content_field = FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=2048)

# Field order here defines the column order expected by the column-based insert later on.
schema = CollectionSchema(fields=[id_field,source_field, page_field, embedding_field, content_field], description="Collection for storing document embeddings")

In [63]:
# Collection handle named "Science" built from the schema defined earlier
# (presumably created on the server if it does not exist yet — confirm).
collection = Collection(name="Science", schema=schema)


In [65]:
# List collections again; the recorded output now contains 'Science'.
utility.list_collections()

['Science']

In [66]:
# Index configuration for the 'embeddings' vector field.

index_params = {
    'metric_type': 'L2',   # Euclidean distance
    'index_type': 'HNSW',
    # Build-time settings passed to the HNSW index.
    'params': {
        'M': 16,
        'efConstruction': 200
    }
} 

In [68]:
# Build the index described by index_params on the 'embeddings' field.
collection.create_index(field_name='embeddings', index_params=index_params)

Status(code=0, message=)

In [70]:
# Load the collection (presumably required before it can be searched — confirm).
collection.load()

In [71]:
# Display the chunks that are embedded and inserted in the following cells.
chunks

[Document(metadata={'source': 'C:/Users/golla/Downloads/sciencerag.pdf', 'page': 0}, page_content='Photosynthesis in Plants\nPhotosynthesis is a biological process that occurs in plants, algae, and some bacteria, where light\nenergy is \nconverted into chemical energy in the form of glucose (a sugar) and oxygen. This process is vital for\nthe survival \nof life on Earth, as it provides the base for the food chain and releases oxygen into the atmosphere.\n### The Process of Photosynthesis\nPhotosynthesis takes place primarily in the leaves of plants, within specialized organelles called\nchloroplasts.\nThe process can be broken down into two main stages: the light-dependent reactions and the Calvin\ncycle \n(light-independent reactions).\n1. **Light-dependent reactions (Occurs in thylakoid membranes of chloroplasts)**\nThese reactions occur when light is absorbed by chlorophyll pigments in the thylakoid membranes.\nWater molecules are \nsplit into oxygen, protons (H+), and electrons thr

In [72]:
# Column-wise data for the insert: each list has one entry per chunk, in chunk order.
sources = [chunk.metadata['source'] for chunk in chunks]
pages = [chunk.metadata['page'] for chunk in chunks]
page_content = [chunk.page_content for chunk in chunks]
# Encode all chunk texts in one batched call instead of one model call per chunk;
# list(...) turns the returned 2-D array back into one vector per chunk, which is
# the same shape of data the per-chunk comprehension produced.
content_embeddings = list(embedding.encode(page_content))


In [73]:
# Column-based insert: the lists follow the schema field order
# (source, page, embeddings, content); `id` is omitted because the schema
# declares auto_id=True on the primary key.
collection.insert([sources,pages,content_embeddings,page_content])

(insert count: 10, delete count: 0, upsert count: 0, timestamp: 455820572934799363, success count: 10, err count: 0

In [77]:
def search_milvus(query, collection, limit=3, ef=64):
    """Run a vector similarity search for ``query`` against a Milvus collection.

    The query text is embedded with the notebook-level ``embedding`` model and
    compared, using the L2 metric, with the vectors in the ``embeddings`` field.

    Args:
        query: Text to search for.
        collection: Object exposing ``search`` (a Milvus collection that has an
            index on ``embeddings`` and has been loaded).
        limit: Maximum number of hits to return.
        ef: HNSW search-time candidate-list size. It is raised to ``limit``
            when smaller, because Milvus requires ``ef`` to be at least the
            number of requested hits.

    Returns:
        The value returned by ``collection.search``; each hit is requested
        with the ``source``, ``page`` and ``content`` output fields.
    """
    query_embedding = embedding.encode(query)
    # Search-time parameters only: 'M' and 'efConstruction' are index-build
    # settings and belong to create_index, not to the search call.
    search_params = {
        'metric_type': 'L2',  # must match the metric the index was built with
        'params': {'ef': max(ef, limit)},
    }
    results = collection.search(
        data=[query_embedding],
        anns_field='embeddings',
        param=search_params,
        limit=limit,
        output_fields=['source', 'page', 'content'],
    )
    return results

In [84]:
# Run a sample query and keep the 3 closest chunks.
results = search_milvus('What is photosynthesis',collection,3)

In [87]:
# results[0] holds the hits for the single query submitted above;
# print source, page and content of each hit.
for result in results[0]:
    print(result.entity.get('source'),result.entity.get('page'),result.entity.get('content'))

C:/Users/golla/Downloads/sciencerag.pdf 0 Photosynthesis in Plants
Photosynthesis is a biological process that occurs in plants, algae, and some bacteria, where light
energy is 
converted into chemical energy in the form of glucose (a sugar) and oxygen. This process is vital for
the survival 
of life on Earth, as it provides the base for the food chain and releases oxygen into the atmosphere.
### The Process of Photosynthesis
Photosynthesis takes place primarily in the leaves of plants, within specialized organelles called
chloroplasts.
The process can be broken down into two main stages: the light-dependent reactions and the Calvin
cycle 
(light-independent reactions).
1. **Light-dependent reactions (Occurs in thylakoid membranes of chloroplasts)**
These reactions occur when light is absorbed by chlorophyll pigments in the thylakoid membranes.
Water molecules are 
split into oxygen, protons (H+), and electrons through photolysis. The electrons move through the
electron
C:/Users/golla/

In [92]:
def filter_results(results):
    """Flatten the hits of the first query in ``results`` into parallel lists.

    Args:
        results: Indexable search result; only ``results[0]`` is read. Each hit
            must expose ``distance`` and an ``entity`` with a ``get`` method.

    Returns:
        dict with the keys ``distance``, ``page_numbers``, ``contents`` and
        ``sources``; each maps to a list with one entry per hit, in hit order.
    """
    collected = {
        'distance': [],
        'page_numbers': [],
        'contents': [],
        'sources': [],
    }
    # Single pass over the hits, filling every column in step.
    for hit in results[0]:
        collected['distance'].append(hit.distance)
        collected['page_numbers'].append(hit.entity.get('page'))
        collected['contents'].append(hit.entity.get('content'))
        collected['sources'].append(hit.entity.get('source'))
    return collected

In [93]:
# Show the search hits flattened into parallel lists (distance, page, content, source).
filter_results(results)

{'distance': [287.5445251464844, 375.74505615234375, 421.83221435546875],
 'page_numbers': [0, 0, 1],
 'contents': ['Photosynthesis in Plants\nPhotosynthesis is a biological process that occurs in plants, algae, and some bacteria, where light\nenergy is \nconverted into chemical energy in the form of glucose (a sugar) and oxygen. This process is vital for\nthe survival \nof life on Earth, as it provides the base for the food chain and releases oxygen into the atmosphere.\n### The Process of Photosynthesis\nPhotosynthesis takes place primarily in the leaves of plants, within specialized organelles called\nchloroplasts.\nThe process can be broken down into two main stages: the light-dependent reactions and the Calvin\ncycle \n(light-independent reactions).\n1. **Light-dependent reactions (Occurs in thylakoid membranes of chloroplasts)**\nThese reactions occur when light is absorbed by chlorophyll pigments in the thylakoid membranes.\nWater molecules are \nsplit into oxygen, protons (H+),