In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

print(bool(os.getenv("OPENAI_API_KEY")))

True


In [2]:
from docx import Document
from langchain_core.documents import Document as LCDocument

def load_docx(path: str):
    doc = Document(path)
    text = "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
    return [LCDocument(page_content=text)]

documents = load_docx("Aranya_River_Project.docx")

documents[0].page_content[:500]


'The Aranya River Project: A Case Study in Infrastructure, Policy, and Human Impact\n\n1. Background and Origin\nThe Aranya River originates in the eastern slopes of the Vindhya mountain range and flows through three Indian states before merging into the Bay of Bengal. For centuries, it supported agriculture, fisheries, and small riverine trade. Seasonal flooding was common but largely predictable, and local communities adapted their farming cycles accordingly.\nIn 2008, recurring droughts in the dow'

In [3]:
from langchain_text_splitters import TextSplitter

class FixedLineTextSplitter(TextSplitter):
    def __init__(self, lines_per_chunk=5):
        super().__init__()
        self.lines_per_chunk = lines_per_chunk

    def split_text(self, text):
        lines = text.splitlines()
        return [
            "\n".join(lines[i:i + self.lines_per_chunk])
            for i in range(0, len(lines), self.lines_per_chunk)
        ]

splitter = FixedLineTextSplitter(lines_per_chunk=4)
chunks = splitter.split_documents(documents)

for i, chunk in enumerate(chunks):
    print(f"Chunk {i}:")
    print(chunk.page_content)
    print()


Chunk 0:
The Aranya River Project: A Case Study in Infrastructure, Policy, and Human Impact

1. Background and Origin
The Aranya River originates in the eastern slopes of the Vindhya mountain range and flows through three Indian states before merging into the Bay of Bengal. For centuries, it supported agriculture, fisheries, and small riverine trade. Seasonal flooding was common but largely predictable, and local communities adapted their farming cycles accordingly.

Chunk 1:
In 2008, recurring droughts in the downstream regions triggered renewed interest in large-scale water storage and regulation. The central government commissioned a feasibility study to assess whether a multi-purpose dam could address irrigation shortages, generate hydroelectric power, and control floods.
The project was officially named the Aranya Integrated River Management Project (AIRMP).
2. Initial Feasibility Study (2009–2011)
The feasibility study was led by HydroSys Consultants, a public-private consortium.

In [4]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")


vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./chroma_store"
)

vectorstore.persist()



  vectorstore.persist()


In [6]:

query = "What is the Aranya River Project?"

results = vectorstore.similarity_search_with_score(
    query=query,
    k=5
)

for i, (doc, score) in enumerate(results, 1):
    print(f"\n--- Result {i} | Score: {score} ---")
    print(doc.page_content)



--- Result 1 | Score: 0.5425897240638733 ---
The Aranya River Project: A Case Study in Infrastructure, Policy, and Human Impact

1. Background and Origin
The Aranya River originates in the eastern slopes of the Vindhya mountain range and flows through three Indian states before merging into the Bay of Bengal. For centuries, it supported agriculture, fisheries, and small riverine trade. Seasonal flooding was common but largely predictable, and local communities adapted their farming cycles accordingly.

--- Result 2 | Score: 0.7135130763053894 ---
In 2008, recurring droughts in the downstream regions triggered renewed interest in large-scale water storage and regulation. The central government commissioned a feasibility study to assess whether a multi-purpose dam could address irrigation shortages, generate hydroelectric power, and control floods.
The project was officially named the Aranya Integrated River Management Project (AIRMP).
2. Initial Feasibility Study (2009–2011)
The feasib