In [1]:
# notebooks/inspect_stores.ipynb

import os
import sys
import pickle
import chromadb
from langchain.storage import LocalFileStore
from langchain.storage._lc_store import create_kv_docstore
from langchain_huggingface import HuggingFaceEmbeddings

# --- PATH MANAGEMENT ---
# This block makes the notebook runnable from the 'notebooks' directory.
# The project root is taken to be the parent of the current working directory,
# so the kernel must be started from '<project>/notebooks'.
try:
    notebook_dir = os.getcwd()
    PROJECT_ROOT = os.path.abspath(os.path.join(notebook_dir, os.pardir))
    # Make project-local modules importable from this notebook.
    if PROJECT_ROOT not in sys.path:
        sys.path.append(PROJECT_ROOT)
    print(f"Project Root set to: {PROJECT_ROOT}")
except Exception as e:
    print(f"Error setting up paths: {e}")
    # Re-raise instead of calling exit(): inside an IPython kernel exit() only
    # requests a kernel shutdown and does not stop the remaining statements of
    # the cell, which would then run with PROJECT_ROOT undefined.
    raise

# --- 1. CONFIGURATION: Choose what to inspect ---
# The source is compared for equality with each document's "source" metadata;
# the section title only has to be contained in its "section_title" metadata.
SOURCE_FILE_TO_INSPECT = "nexora_thermostat_pro_manual.md"
SECTION_TITLE_TO_INSPECT = "Installation Guide"

# --- 2. Define paths to persistent stores ---
# Every store lives directly under the project root.
DB_PATH, STORE_PATH, PARENT_LIST_PATH = (
    os.path.join(PROJECT_ROOT, entry)
    for entry in ("vector_db", "parent_docstore", "parents.pkl")
)
# NOTE(review): no visible code in this cell uses embedding_model; confirm
# whether a later cell needs it before paying the model load cost here.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# --- 3. Load parent documents from the pickled file ---
print("\nLoading parent documents from 'parents.pkl'...")
try:
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only load a parents.pkl that was produced by this project's own
    # ingestion run, never a file from an untrusted source.
    with open(PARENT_LIST_PATH, 'rb') as f:
        all_parent_docs = pickle.load(f)
except FileNotFoundError:
    print("❌ Error: 'parents.pkl' not found. Please run 'scripts/ingest.py' first.")
    # Re-raise instead of calling exit(): inside an IPython kernel exit() only
    # requests a kernel shutdown and does not stop the cell, so the code below
    # would go on to fail with a NameError on all_parent_docs.
    raise
print(f"Loaded {len(all_parent_docs)} total parent documents.\n")

# --- 4. Find the specific parent document we want to visualize ---
# Pick the first parent whose "source" equals the configured file and whose
# "section_title" contains the configured text; None when nothing matches.
target_parent = next(
    (
        doc
        for doc in all_parent_docs
        if doc.metadata.get("source") == SOURCE_FILE_TO_INSPECT
        and SECTION_TITLE_TO_INSPECT in doc.metadata.get("section_title", "")
    ),
    None,
)

if not target_parent:
    print("❌ Could not find the specified parent document. Check your configuration.")
    # Raise instead of calling exit(): inside an IPython kernel exit() only
    # requests a kernel shutdown and does not stop the cell, so the display
    # code below would go on to dereference target_parent while it is None.
    raise LookupError(
        f"No parent document for source={SOURCE_FILE_TO_INSPECT!r} with a "
        f"section title containing {SECTION_TITLE_TO_INSPECT!r}"
    )

# --- 5. Display the Parent Document ---
# parent_doc_id is kept as a top-level name: the ChromaDB lookup that follows
# filters child chunks by it.
parent_meta = target_parent.metadata
parent_doc_id = parent_meta.get("doc_id")
rule = "=" * 50
print(
    rule,
    "👁️ PARENT DOCUMENT FOUND 👁️",
    rule,
    f"Source: {parent_meta.get('source')}",
    f"Section Title: {parent_meta.get('section_title')}",
    f"Parent Doc ID: {parent_doc_id}\n",
    "--- Parent Content ---",
    target_parent.page_content,
    rule,
    sep="\n",
)

# --- 6. Connect to ChromaDB and find all children with the same doc_id ---
print("\n\n" + "="*50)
print("👶 FINDING ALL ASSOCIATED CHILD CHUNKS FROM VECTOR_DB 👶")
print("="*50)
try:
    client = chromadb.PersistentClient(path=DB_PATH)
    # NOTE(review): "langchain" is presumably the default collection name used
    # by LangChain's Chroma wrapper at ingestion time; confirm against ingest.
    collection = client.get_collection(name="langchain")

    # Fetch every stored chunk whose metadata links it to the parent shown above.
    child_chunks = collection.get(
        where={"doc_id": parent_doc_id},
        include=["metadatas", "documents"]
    )
except Exception as e:
    # Best-effort: report a connection/query failure and leave the cell without
    # a traceback. Only the database calls are guarded, so a bug in the display
    # code below is no longer misreported as a ChromaDB problem.
    print(f"❌ Could not connect to or query ChromaDB. Error: {e}")
else:
    print(f"Found {len(child_chunks['ids'])} child chunks linked to this parent.\n")

    for chunk_number, (metadata, document) in enumerate(
        zip(child_chunks['metadatas'], child_chunks['documents']), start=1
    ):
        # Tolerate a missing metadata entry rather than failing on None.get().
        metadata = metadata or {}
        print(f"--- Child Chunk #{chunk_number} ---")
        print(f"Subsection Title: {metadata.get('subsection_title')}")
        print(f"Linked Doc ID: {metadata.get('doc_id')}")
        print("--- Child Content ---")
        print(document)
        print("-" * 25 + "\n")

  from .autonotebook import tqdm as notebook_tqdm


Project Root set to: c:\Users\karth\nexora-sentiobot

Loading parent documents from 'parents.pkl'...
Loaded 85 total parent documents.

👁️ PARENT DOCUMENT FOUND 👁️
Source: nexora_thermostat_pro_manual.md
Section Title: 4. Installation Guide
Parent Doc ID: 33a9263a-2484-5d3a-93f1-d1db9f8dbb2e

--- Parent Content ---
## 4. Installation Guide
*Tools Required:* Phillips Screwdriver, Drill with small bit (optional), Smartphone with Nexora App.

### 4.1 Removing Your Old Thermostat
1.  **Power Off HVAC:** Go to your home's main electrical panel and turn off the circuit breaker that controls your heating and air conditioning system.
2.  **Remove Old Cover:** Gently pull the cover off your old thermostat. Most models snap off or have small tabs.
3.  **Photograph & Label Wires:** Before disconnecting any wires, take a clear photo of the current wiring configuration. Use the included wire labels to mark each wire according to the terminal it's connected to (e.g., R, C, W, Y, G).
4.  **Disconnect

In [1]:
# notebooks/inspect_summaries.ipynb

# Cell 1: Setup and Configuration
import os
import sys
import pickle
import chromadb
from langchain.storage import LocalFileStore
from langchain.storage._lc_store import create_kv_docstore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.docstore.document import Document

# --- PATH MANAGEMENT ---
# The project root is taken to be the parent of the current working directory,
# so the kernel must be started from '<project>/notebooks'.
try:
    notebook_dir = os.getcwd()
    PROJECT_ROOT = os.path.abspath(os.path.join(notebook_dir, os.pardir))
    # Make project-local modules importable from this notebook.
    if PROJECT_ROOT not in sys.path:
        sys.path.append(PROJECT_ROOT)
    print(f"Project Root set to: {PROJECT_ROOT}")
except Exception as e:
    print(f"Error setting up paths: {e}")
    # Re-raise instead of calling exit(): inside an IPython kernel exit() only
    # requests a kernel shutdown and does not stop the remaining statements of
    # the cell, which would then run with PROJECT_ROOT undefined.
    raise

# --- CONFIGURATION ---
# Edit these two values to choose which document and section to inspect.
SOURCE_FILE_TO_INSPECT = "lumiglow_smart_lighting_manual.md"
SECTION_TITLE_TO_INSPECT = "Technical Specifications"

# --- Define paths to persistent stores ---
# Every store lives directly under the project root.
DB_PATH, STORE_PATH, PARENT_LIST_PATH, SUMMARIES_PATH = (
    os.path.join(PROJECT_ROOT, entry)
    for entry in ("vector_db", "parent_docstore", "parents.pkl", "summaries")
)

print("\nSetup complete. Ready to inspect.")

  from .autonotebook import tqdm as notebook_tqdm


Project Root set to: c:\Users\karth\nexora-sentiobot

Setup complete. Ready to inspect.


In [3]:
# Cell 2: View Parent and its Summary

# 1. Load all parent documents from the pickled file
try:
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only load a parents.pkl that was produced by this project's own
    # ingestion run, never a file from an untrusted source.
    with open(PARENT_LIST_PATH, 'rb') as f:
        all_parent_docs = pickle.load(f)
except FileNotFoundError:
    # Give the same actionable hint as the sibling store-inspection notebook,
    # then re-raise so the rest of the cell does not run without the data.
    print("❌ Error: 'parents.pkl' not found. Please run 'scripts/ingest.py' first.")
    raise

# 2. Find the specific parent document we want to visualize
# Pick the first parent whose "source" equals the configured file and whose
# "section_title" contains the configured text; None when nothing matches.
target_parent = next(
    (
        candidate
        for candidate in all_parent_docs
        if candidate.metadata.get("source") == SOURCE_FILE_TO_INSPECT
        and SECTION_TITLE_TO_INSPECT in candidate.metadata.get("section_title", "")
    ),
    None,
)

if target_parent:
    parent_meta = target_parent.metadata
    parent_doc_id = parent_meta.get("doc_id")
    rule = "=" * 50

    # 3. Display the original Parent Document
    # NOTE(review): the header says "from parent_docstore", but the document
    # shown here comes from the list loaded out of parents.pkl; confirm that
    # the two are expected to hold the same content.
    print(
        rule,
        "👁️ ORIGINAL PARENT DOCUMENT (from parent_docstore) 👁️",
        rule,
        f"Source: {parent_meta.get('source')}",
        f"Section Title: {parent_meta.get('section_title')}",
        f"Doc ID: {parent_doc_id}\n",
        "--- Full Content ---",
        target_parent.page_content,
        rule,
        sep="\n",
    )

    # 4. Find and display the corresponding summary
    # Summaries are looked up as '<doc_id>.txt' inside the summaries folder.
    summary_file_path = os.path.join(SUMMARIES_PATH, f"{parent_doc_id}.txt")

    print(
        "\n\n" + rule,
        "📄 AI-GENERATED SUMMARY (from /summaries folder) 📄",
        rule,
        sep="\n",
    )
    try:
        with open(summary_file_path, 'r', encoding='utf-8') as summary_file:
            summary_content = summary_file.read()
    except FileNotFoundError:
        print("❌ Summary file not found for this document.")
    else:
        print("--- Summary Content ---")
        print(summary_content)
    # The closing rule is printed whether or not a summary file was found.
    print(rule)
else:
    print("❌ Could not find the specified parent document. Check your configuration.")

👁️ ORIGINAL PARENT DOCUMENT (from parent_docstore) 👁️
Source: lumiglow_smart_lighting_manual.md
Section Title: 7. Technical Specifications
Doc ID: 68913b52-8270-5cf3-a7be-d3363b6bf3e2

--- Full Content ---
## 7. Technical Specifications
-   **Wattage:** 9W LED (60W incandescent equivalent)
-   **Luminosity:** 800 Lumens
-   **Connectivity:** Wi-Fi (IEEE 802.11 b/g/n, 2.4GHz), Bluetooth 4.2
-   **Color Options:** 16+ million RGB, Tunable White (2700K - 6500K)
-   **Expected Lifespan:** 25,000 hours
-   **Socket:** E27 Standard Screw Base
-   **Operating Voltage:** 220-240V ~ 50/60Hz

---


📄 AI-GENERATED SUMMARY (from /summaries folder) 📄
--- Summary Content ---
Technical specifications: 9W LED (60W incandescent equivalent), 800 Lumens. Connectivity: Wi-Fi (IEEE 802.11 b/g/n, 2.4GHz), Bluetooth 4.2. Color options: 16+ million RGB, Tunable White (2700K - 6500K). Expected lifespan: 25,000 hours. Socket: E27 Standard Screw Base. Operating voltage: 220-240V ~ 50/60Hz.


In [5]:
# Cell 3: Simulate a Full RAG Retrieval
from langchain_chroma import Chroma
# --- CONFIGURATION ---
# Sample question; choose one that the section being inspected should answer.
SAMPLE_QUERY = "What is the expected lifespan of a LumiGlow bulb?"

# 1. Initialize the components
# The vector store is searched by embedding similarity; the key-value docstore
# is what full parent documents are later fetched from by doc_id.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma(
    embedding_function=embedding_model,
    persist_directory=DB_PATH,
)
byte_store = LocalFileStore(STORE_PATH)
docstore = create_kv_docstore(byte_store)

rule = "=" * 50
print(
    rule,
    f"⚡ SIMULATING RAG PIPELINE FOR QUERY: '{SAMPLE_QUERY}' ⚡",
    rule,
    sep="\n",
)

# 2. Search the vector store to find the most relevant SUMMARY
print("\n--- Step 1: Searching vector store for the best SUMMARY ---")
# NOTE(review): this assumes the vector store holds embeddings of the
# summaries; confirm against the ingestion script.
# Only the single closest match is requested (k=1).
retrieved_summaries = vectorstore.similarity_search(SAMPLE_QUERY, k=1)

if not retrieved_summaries:
    print("❌ No relevant summaries found in the vector store.")
else:
    best_summary = retrieved_summaries[0]
    # The doc_id metadata is the link between a vector-store hit and its parent.
    retrieved_doc_id = best_summary.metadata.get("doc_id")

    print("✅ Best matching summary found:")
    print(f"   Source: {best_summary.metadata.get('source')}")
    print(f"   Section: {best_summary.metadata.get('section_title')}")
    print(f"   Doc ID: {retrieved_doc_id}")
    print("\n--- Summary Content ---")
    print(best_summary.page_content)

    # 3. Use the doc_id from the summary to retrieve the FULL PARENT document
    print("\n\n--- Step 2: Retrieving the FULL PARENT document from the docstore ---")

    # This is the "search-small-retrieve-big" step.
    # A hit without doc_id metadata cannot be looked up: skip the docstore call
    # (None is not a usable key) and fall through to the not-found message.
    if retrieved_doc_id is None:
        final_document = None
    else:
        # mget returns one entry per requested key; only one key is requested.
        final_document = docstore.mget([retrieved_doc_id])[0]

    if not final_document:
        print(f"❌ Could not retrieve parent document with ID: {retrieved_doc_id}")
    else:
        print("✅ Full parent document retrieved successfully!")
        print("   This is the final context that would be sent to the LLM.")
        print("\n--- Full Parent Content ---")
        print(final_document.page_content)
        print("="*50)

⚡ SIMULATING RAG PIPELINE FOR QUERY: 'What is the expected lifespan of a LumiGlow bulb?' ⚡

--- Step 1: Searching vector store for the best SUMMARY ---
✅ Best matching summary found:
   Source: lumiglow_smart_lighting_manual.md
   Section: 7. Technical Specifications
   Doc ID: 68913b52-8270-5cf3-a7be-d3363b6bf3e2

--- Summary Content ---
Technical specifications: 9W LED (60W incandescent equivalent), 800 Lumens. Connectivity: Wi-Fi (IEEE 802.11 b/g/n, 2.4GHz), Bluetooth 4.2. Color options: 16+ million RGB, Tunable White (2700K - 6500K). Expected lifespan: 25,000 hours. Socket: E27 Standard Screw Base. Operating voltage: 220-240V ~ 50/60Hz.


--- Step 2: Retrieving the FULL PARENT document from the docstore ---
✅ Full parent document retrieved successfully!
   This is the final context that would be sent to the LLM.

--- Full Parent Content ---
## 7. Technical Specifications
-   **Wattage:** 9W LED (60W incandescent equivalent)
-   **Luminosity:** 800 Lumens
-   **Connectivity:** Wi-Fi