In [3]:
# Cell 1: Imports and Path Definitions (Corrected for langchain-community)
import os
from langchain_community.document_loaders import PyPDFDirectoryLoader      # <-- UPDATED
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings  # <-- UPDATED
from langchain_community.vectorstores import Chroma                       # <-- UPDATED

print("--- All libraries imported successfully ---")

# Location of the knowledge base files. This is a relative path, so it is
# resolved against the current working directory of the kernel.
KNOWLEDGE_BASE_DIR = "../data/knowledge_base/"

# Echo the resolved absolute path so a wrong working directory is easy to spot.
knowledge_base_abspath = os.path.abspath(KNOWLEDGE_BASE_DIR)
print(f"Knowledge base directory is set to: {knowledge_base_abspath}")

--- All libraries imported successfully ---
Knowledge base directory is set to: g:\MdEhsanulHaqueKanan GitHub projects\RescueVision\rescue-vision-project\data\knowledge_base


In [4]:
# Cell 2: Load Documents from the Knowledge Base
print("Loading documents from the knowledge base...")

# Initialize the loader on the configured knowledge base directory
loader = PyPDFDirectoryLoader(KNOWLEDGE_BASE_DIR)

# Load the documents (Cell 3 reports this same count as "pages")
documents = loader.load()

# Fail fast with an actionable message: without this guard an empty result
# would be reported as a success and then crash on documents[0] below.
if not documents:
    raise FileNotFoundError(
        "No documents could be loaded from the knowledge base directory: "
        f"{os.path.abspath(KNOWLEDGE_BASE_DIR)}"
    )

print(f"--- Successfully loaded {len(documents)} documents. ---")

# Inspect the first document to sanity-check the extracted text
print("\n--- Sample content from the first document: ---")
print(documents[0].page_content[:500]) # Print the first 500 characters

Loading documents from the knowledge base...
--- Successfully loaded 10 documents. ---

--- Sample content from the first document: ---
Policy
304
Sanibel Fire and Rescue District
Fire Policy Manual
Copyright Lexipol, LLC 2025/01/14, All Rights Reserved.
Published with permission by Sanibel Fire and Rescue District***DRAFT*** Urban Search and Rescue (USAR) - 1
Urban Search and Rescue (USAR)
304.1   PURPOSE AND SCOPE
The purpose of this policy is to describe the Federal Emergency Management Agency (FEMA) and
Florida Urban Search and Rescue (USAR) Response Systems as resources for disaster response.
304.2   POLICY
It is the policy


In [5]:
# Cell 3: Chunk the Loaded Documents
print("Splitting documents into smaller chunks...")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # The max number of characters in a chunk
    chunk_overlap=100   # The number of characters to overlap between chunks
)

chunks = text_splitter.split_documents(documents)

# Guard against an empty result (e.g. documents with no extractable text);
# otherwise the sample print below would fail with an opaque IndexError.
if not chunks:
    raise ValueError(
        f"Splitting {len(documents)} pages produced no chunks; "
        "check that the loaded documents actually contain text."
    )

print(f"--- Successfully split {len(documents)} pages into {len(chunks)} chunks. ---")

# Inspect one chunk to sanity-check the splitter settings
print("\n--- Sample content from the first chunk: ---")
print(chunks[0].page_content)

Splitting documents into smaller chunks...
--- Successfully split 10 pages into 27 chunks. ---

--- Sample content from the first chunk: ---
Policy
304
Sanibel Fire and Rescue District
Fire Policy Manual
Copyright Lexipol, LLC 2025/01/14, All Rights Reserved.
Published with permission by Sanibel Fire and Rescue District***DRAFT*** Urban Search and Rescue (USAR) - 1
Urban Search and Rescue (USAR)
304.1   PURPOSE AND SCOPE
The purpose of this policy is to describe the Federal Emergency Management Agency (FEMA) and
Florida Urban Search and Rescue (USAR) Response Systems as resources for disaster response.
304.2   POLICY
It is the policy of the Sanibel Fire and Rescue District to utilize the FEMA and Florida USAR
resources in the event of an urban disaster, as appropriate.
304.3   RESOURCES
USAR is a multi-hazard discipline and may be used for a variety of disasters, including hurricanes,
earthquakes, typhoons, storms, tornadoes, floods, dam failures, technological accidents, terrorist
act

In [6]:
# Cell 4: Create Embeddings and Store in Vector Database
print("Initializing the embedding model...")

# Use a standard, all-purpose embedding model
# This model will be downloaded from Hugging Face the first time you run this
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

print("--- Embedding model initialized. ---")

# Define the path for the persistent vector database
VECTOR_DB_PATH = "../vector_store"

print(f"Creating/loading the vector database at: {os.path.abspath(VECTOR_DB_PATH)}")

# Refuse to build an empty store; the failure is clearer here than inside Chroma.
if not chunks:
    raise ValueError("No chunks available to embed; run the chunking cell first.")

# Build a deterministic id for every chunk: source + page + ordinal within that page.
# Without explicit ids each run generates fresh ones, so re-running this cell
# against the same persist directory would append a duplicate copy of every chunk.
# With stable ids a re-run overwrites the existing entries instead.
# NOTE(review): assumes the loader sets 'source' and 'page' metadata; the fallbacks
# below still yield unique ids if it does not.
chunk_ids = []
chunks_seen_per_page = {}
for chunk in chunks:
    page_key = (chunk.metadata.get("source", "unknown"), chunk.metadata.get("page", "na"))
    ordinal = chunks_seen_per_page.get(page_key, 0)
    chunks_seen_per_page[page_key] = ordinal + 1
    chunk_ids.append(f"{page_key[0]}::page-{page_key[1]}::chunk-{ordinal}")

# Create the Chroma database from the chunks
# This will process all chunks, create embeddings, and save them to disk
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=VECTOR_DB_PATH
)

# _collection is a private attribute of the wrapper; it is used here only to report a count.
print(f"--- Successfully created and saved the vector database with {vectordb._collection.count()} entries. ---")

Initializing the embedding model...


  embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

--- Embedding model initialized. ---
Creating/loading the vector database at: g:\MdEhsanulHaqueKanan GitHub projects\RescueVision\rescue-vision-project\vector_store
--- Successfully created and saved the vector database with 27 entries. ---


In [7]:
# Cell 5: Test the Vector Database with a Sample Query
print("--- Testing the retrieval functionality ---")

# The query we want to ask our knowledge base
query = "What is the procedure for a swiftwater rescue?"

print(f"Query: '{query}'")

# Re-open the persisted database from disk (rather than reusing the in-memory
# handle) so this cell also verifies that the store was actually saved.
db = Chroma(
    persist_directory=VECTOR_DB_PATH,
    embedding_function=embeddings
)

# Number of most-relevant chunks to request; defined once so the search call
# and the printed header cannot drift apart.
TOP_K = 3

# Perform a similarity search
retrieved_docs = db.similarity_search(query, k=TOP_K)

# Report how many chunks were actually returned, which may be fewer than TOP_K.
print(f"\n--- Top {len(retrieved_docs)} most relevant chunks found: ---")
for i, doc in enumerate(retrieved_docs):
    # Show only the file name of the source, not the full path
    source_name = os.path.basename(doc.metadata.get('source', 'Unknown'))
    print(f"\n--- Chunk {i+1} (Source: {source_name}) ---")
    print(doc.page_content)

--- Testing the retrieval functionality ---
Query: 'What is the procedure for a swiftwater rescue?'


  db = Chroma(



--- Top 3 most relevant chunks found: ---

--- Chunk 1 (Source: fema_sar_operations.pdf) ---
4. Provides first aid and Cardiopulmonary
Resuscitation (CPR)
5. Provides:
a. Safety of self and team members
b. Basic medical care of self, team
members, and survivors
c. Simple decontamination of self and
team members
d. Basic ground support capability for
helicopter operations
6. Coordinates with ground vehicles,
watercraft, and aircraft for support,
transportation, and evacuation
7. Operates in environments with and
without infrastructure, including those
affected by disasters and terrorism; with
compromised access to roadways,
utilities, and transportation; and with
limited availability of shelter, food, and
(Continued)
Not Specified
DESCRIPTION (Continued)
water
EDUCATION Not Specified Not Specified Not Specified Not Specified

--- Chunk 2 (Source: fema_sar_logistics.pdf) ---
activities and hazardous material releases.
USAR task forces have four areas of specialization:
(a) Searches - Fi